diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index df953b5d3..6dd98b1cd 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -22,7 +22,7 @@ jobs: - id: get_pr_info if: github.event_name == 'push' continue-on-error: true - uses: nv-gha-runners/get-pr-info@main + uses: nv-gha-runners/get-pr-info@090577647b8ddc4e06e809e264f7881650ecdccf - id: gate shell: bash diff --git a/Cargo.lock b/Cargo.lock index e4057f75c..cc1193267 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3028,6 +3028,7 @@ dependencies = [ "openshell-prover", "openshell-providers", "openshell-tui", + "openshell-vfio", "owo-colors", "prost-types", "rcgen", @@ -3270,6 +3271,14 @@ dependencies = [ "url", ] +[[package]] +name = "openshell-vfio" +version = "0.0.0" +dependencies = [ + "nix", + "tempfile", +] + [[package]] name = "openshell-vm" version = "0.0.0" @@ -3283,11 +3292,13 @@ dependencies = [ "nix", "openshell-bootstrap", "openshell-core", + "openshell-vfio", "rustls", "rustls-pemfile", "serde", "serde_json", "tar", + "tempfile", "thiserror 2.0.18", "tokio", "tokio-rustls", diff --git a/architecture/README.md b/architecture/README.md index 570fce660..008836fca 100644 --- a/architecture/README.md +++ b/architecture/README.md @@ -301,4 +301,6 @@ This opens an interactive SSH session into the sandbox, with all provider creden | [Inference Routing](inference-routing.md) | Transparent interception and sandbox-local routing of AI inference API calls to configured backends. | | [System Architecture](system-architecture.md) | Top-level system architecture diagram with all deployable components and communication flows. | | [Gateway Settings Channel](gateway-settings.md) | Runtime settings channel: two-tier key-value configuration, global policy override, settings registry, CLI/TUI commands. | +| [Custom VM Runtime](custom-vm-runtime.md) | Dual-backend VM runtime (libkrun / QEMU), kernel configuration, and build pipeline. 
| +| [VM GPU Passthrough](vm-gpu-passthrough.md) | VFIO GPU passthrough for VMs: host preparation, safety checks, nvidia driver hardening, and troubleshooting. | | [TUI](tui.md) | Terminal user interface for sandbox interaction. | diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index ce4d0bf39..9963edea8 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -1,18 +1,31 @@ -# Custom libkrunfw VM Runtime +# Custom VM Runtime > Status: Experimental and work in progress (WIP). VM support is under active development and may change. ## Overview -The OpenShell gateway VM uses [libkrun](https://github.com/containers/libkrun) to boot a -lightweight microVM with Apple Hypervisor.framework (macOS) or KVM (Linux). The kernel -is embedded inside `libkrunfw`, a companion library that packages a pre-built Linux kernel. +The OpenShell gateway VM supports two hypervisor backends: -The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or -conntrack support. This is insufficient for Kubernetes pod networking. +- **libkrun** (default) — lightweight VMM using Apple Hypervisor.framework (macOS) or KVM + (Linux). The kernel is embedded inside `libkrunfw`. Uses virtio-MMIO device transport and + gvproxy for user-space networking. +- **QEMU** — Linux-only VMM used for GPU passthrough (VFIO). Uses virtio-PCI device transport, + TAP networking, and requires a separate `vmlinux` kernel and `virtiofsd` for rootfs access. + QEMU binary is not embedded — it must be installed on the host. + +Backend selection is automatic: `--gpu` selects QEMU, otherwise libkrun is used. The `--backend` +flag provides explicit control (`auto`, `libkrun`, `qemu`). + +When `--gpu` is passed, `openshell-vm` automatically binds an eligible GPU to `vfio-pci` +and restores it to the original driver on shutdown. See +[vm-gpu-passthrough.md](vm-gpu-passthrough.md) for the full lifecycle description. 
-The custom libkrunfw runtime adds bridge CNI, iptables/nftables, and conntrack support to -the VM kernel, enabling standard Kubernetes networking. +Both backends share the same guest kernel (built from a single `openshell.kconfig` fragment) +and rootfs. + +The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or +conntrack support. This is insufficient for Kubernetes pod networking. The custom kconfig +adds bridge CNI, iptables/nftables, conntrack, and QEMU compatibility. ## Architecture @@ -20,10 +33,11 @@ the VM kernel, enabling standard Kubernetes networking. graph TD subgraph Host["Host (macOS / Linux)"] BIN[openshell-vm binary] - EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy"] + EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy · rootfs"] CACHE["~/.local/share/openshell/vm-runtime/{version}/"] PROV[Runtime provenance logging] GVP[gvproxy networking proxy] + QEMU_BIN["qemu-system-x86_64 · virtiofsd · vmlinux\n(GPU runtime bundle)"] BIN --> EMB BIN -->|extracts to| CACHE @@ -44,8 +58,9 @@ graph TD INIT --> VAL --> CNI --> EXECA --> PKI --> K3S end - BIN -- "fork + krun_start_enter" --> INIT - GVP -- "virtio-net" --> Guest + BIN -- "libkrun: fork + krun_start_enter" --> INIT + BIN -- "QEMU: qemu-system-x86_64 + virtiofsd" --> INIT + GVP -- "virtio-net (libkrun only)" --> Guest ``` ## Embedded Runtime @@ -67,9 +82,22 @@ these to XDG cache directories with progress bars: └── ... ``` -This eliminates the need for separate bundles or downloads - a single ~120MB binary -provides everything needed to run the VM. Old cache versions are automatically -cleaned up when a new version is extracted. 
+When using QEMU for GPU passthrough, an additional runtime bundle is required alongside +the binary: + +``` +target/debug/openshell-vm.runtime/ (or alongside the installed binary) +├── virtiofsd # virtio-fs daemon +└── vmlinux # extracted guest kernel +``` + +This bundle is built with `mise run vm:bundle-runtime` and is separate from the +embedded runtime because virtiofsd is Linux-only and not embedded in the +self-extracting binary. + +This eliminates the need for separate bundles or downloads for the default (libkrun) +path — a single ~120MB binary provides everything needed. Old cache versions are +automatically cleaned up when a new version is extracted. ### Hybrid Approach @@ -86,6 +114,34 @@ mise run vm:rootfs # Full rootfs (~2GB, includes images) mise run vm:build # Rebuild binary with full rootfs ``` +## Backend Comparison + +| | libkrun (default) | QEMU | +|---|---|---| +| Platforms | macOS (Hypervisor.framework), Linux (KVM) | Linux (KVM) only | +| Device transport | virtio-MMIO | virtio-PCI | +| Networking | gvproxy (user-space, no root needed) | TAP (requires root/CAP_NET_ADMIN) | +| Rootfs delivery | In-process (krun API) | virtiofsd (virtio-fs daemon) | +| Kernel delivery | Embedded in libkrunfw | Separate `vmlinux` file | +| Console | virtio-console (`hvc0`) | 8250 UART (`ttyS0`) | +| Shutdown | Automatic on PID 1 exit | ACPI poweroff (`poweroff -f`) | +| GPU passthrough | Not supported | VFIO PCI | +| Vsock | libkrun built-in | `AF_VSOCK` (kernel `vhost_vsock`) | +| VM control | krun C API | Command-line args | +| Binary source | Embedded in runtime | Host-installed | +| `--exec` mode | Direct init replacement | Wrapper script with ACPI shutdown | +| CLI flag | `--backend libkrun` | `--backend qemu` or `--gpu` | + +### Exec mode differences + +With libkrun, when `--exec <command>` is used, the command replaces the init process and +the VM exits when PID 1 exits. + +With QEMU, the VM does not automatically exit when PID 1 terminates. 
A wrapper init +script is dynamically written to the guest rootfs that mounts necessary filesystems, +executes the user command, captures the exit code, and calls `poweroff -f` to trigger +an ACPI shutdown that the hypervisor detects. + ## Network Profile The VM uses the bridge CNI profile, which requires a custom libkrunfw with bridge and @@ -100,6 +156,26 @@ fast with an actionable error if they are missing. - Service VIPs: functional (ClusterIP, NodePort) - hostNetwork workarounds: not required +### Networking by backend + +- **libkrun**: Uses gvproxy for user-space virtio-net networking. No root privileges + needed. Port forwarding is handled via gvproxy configuration. +- **QEMU**: Uses TAP networking (requires root or CAP_NET_ADMIN). When `--net none` + is passed, networking is disabled entirely (useful for `--exec` mode tests). gvproxy + is not used with QEMU. + +## Guest Init Script + +The init script (`openshell-vm-init.sh`) runs as PID 1 in the guest. After mounting essential filesystems, it performs: + +1. **Kernel cmdline parsing** — exports environment variables passed via the kernel command line (`GPU_ENABLED`, `OPENSHELL_VM_STATE_DISK_DEVICE`, `VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). This runs after `/proc` is mounted so `/proc/cmdline` is available. + +2. **Cgroup v2 controller enablement** — enables `cpu`, `cpuset`, `memory`, `pids`, and `io` controllers in the root cgroup hierarchy (`cgroup.subtree_control`). k3s/kubelet requires these controllers; the `cpu` controller depends on `CONFIG_CGROUP_SCHED` in the kernel. + +3. **Networking** — detects `eth0` and attempts DHCP (via `udhcpc`). On failure, falls back to static IP configuration using `VM_NET_IP` and `VM_NET_GW` from the kernel cmdline (set by the QEMU backend for TAP networking). DNS is configured from `VM_NET_DNS` if set, overriding any stale `/etc/resolv.conf` entries. + +4. 
**Capability validation** — verifies required kernel features (bridge networking, netfilter, cgroups) and fails fast with actionable errors if missing. + ## Runtime Provenance At boot, the openshell-vm binary logs provenance metadata about the loaded runtime bundle: @@ -128,21 +204,46 @@ graph LR BUILD_M["Build libkrunfw.dylib + libkrun.dylib"] end + subgraph GPU["Linux CI (build-gpu-deps.sh)"] + BUILD_GPU["Build virtiofsd\n(for QEMU backend)"] + end + + subgraph NV["Linux CI (build-nvidia-modules.sh)"] + BUILD_NV["Compile NVIDIA .ko against VM kernel"] + end + + subgraph QEMU["Host-installed"] + QEMU_BIN["qemu-system-x86_64\n(not built — must be on host PATH)"] + end + subgraph Output["target/libkrun-build/"] LIB_SO["libkrunfw.so + libkrun.so\n(Linux)"] LIB_DY["libkrunfw.dylib + libkrun.dylib\n(macOS)"] + VIRTIOFSD["virtiofsd\n(QEMU backend)"] + VMLINUX["vmlinux\n(shared by QEMU)"] + NV_KO["nvidia-modules/*.ko\n(GPU builds only)"] end KCONF --> BUILD_L BUILD_L --> LIB_SO + BUILD_L --> VMLINUX + BUILD_L -->|kernel source tree| BUILD_NV + BUILD_NV --> NV_KO KCONF --> BUILD_M BUILD_M --> LIB_DY + BUILD_GPU --> VIRTIOFSD ``` +The `vmlinux` kernel is extracted from the libkrunfw build and reused by QEMU. +Both backends boot the same kernel — the kconfig fragment includes drivers for +both virtio-MMIO (libkrun) and virtio-PCI (QEMU) transports. + ## Kernel Config Fragment The `openshell.kconfig` fragment enables these kernel features on top of the stock -libkrunfw kernel: +libkrunfw kernel. A single kernel binary is shared by both backends (libkrun and +QEMU) — backend-specific drivers coexist safely (the kernel probes whichever +transport the hypervisor provides). 
| Feature | Key Configs | Purpose | |---------|-------------|---------| @@ -158,11 +259,18 @@ libkrunfw kernel: | IP forwarding | `CONFIG_IP_ADVANCED_ROUTER`, `CONFIG_IP_MULTIPLE_TABLES` | Pod-to-pod routing | | IPVS | `CONFIG_IP_VS`, `CONFIG_IP_VS_RR`, `CONFIG_IP_VS_NFCT` | kube-proxy IPVS mode (optional) | | Traffic control | `CONFIG_NET_SCH_HTB`, `CONFIG_NET_CLS_CGROUP` | Kubernetes QoS | -| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS` | Container resource limits | +| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_CGROUP_CPUACCT`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS`, `CONFIG_CGROUP_FREEZER` | Container resource limits | +| Cgroup CPU | `CONFIG_CGROUP_SCHED`, `CONFIG_FAIR_GROUP_SCHED`, `CONFIG_CFS_BANDWIDTH` | cgroup v2 `cpu` controller for k3s/kubelet | | TUN/TAP | `CONFIG_TUN` | CNI plugin support | | Dummy interface | `CONFIG_DUMMY` | Fallback networking | | Landlock | `CONFIG_SECURITY_LANDLOCK` | Filesystem sandboxing support | | Seccomp filter | `CONFIG_SECCOMP_FILTER` | Syscall filtering support | +| PCI / GPU | `CONFIG_PCI`, `CONFIG_PCI_MSI`, `CONFIG_DRM` | GPU passthrough via VFIO | +| Kernel modules | `CONFIG_MODULES`, `CONFIG_MODULE_UNLOAD` | Loading NVIDIA drivers in guest | +| virtio-PCI transport | `CONFIG_VIRTIO_PCI` | QEMU device bus (libkrun uses MMIO) | +| Serial console | `CONFIG_SERIAL_8250`, `CONFIG_SERIAL_8250_CONSOLE` | QEMU console (`ttyS0`) | +| ACPI | `CONFIG_ACPI` | QEMU power management / clean shutdown | +| x2APIC | `CONFIG_X86_X2APIC` | Multi-vCPU support (QEMU uses x2APIC MADT entries) | See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full fragment with inline comments explaining why each option is needed. @@ -189,13 +297,22 @@ The standalone `openshell-vm` binary supports `openshell-vm exec -- `openshell-vm exec` also injects `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` by default so kubectl-style commands work the same way they would inside the VM shell. 
+### Vsock by backend + +- **libkrun**: Uses libkrun's built-in vsock port mapping, which transparently + bridges the guest vsock port to a host Unix socket. +- **QEMU**: Uses `vhost-vsock-pci` with kernel `AF_VSOCK` sockets. The exec + bridge opens a kernel `AF_VSOCK` socket to the guest CID and bridges it to + the same Unix domain socket path used by the other backend. Requires the + `vhost_vsock` kernel module on the host. + ## Build Commands ```bash # One-time setup: download pre-built runtime (~30s) mise run vm:setup -# Build and run +# Build and run (libkrun, default) mise run vm # Build embedded binary with base rootfs (~120MB, recommended) @@ -210,6 +327,29 @@ mise run vm:build # Rebuild binary FROM_SOURCE=1 mise run vm:setup # Build runtime from source mise run vm:build # Then build embedded binary +# Build GPU runtime bundle (Linux only) +mise run vm:bundle-runtime # Builds virtiofsd + extracts vmlinux + +# Validate QEMU host prerequisites +mise run vm:qemu-check + +# Install QEMU if not present (Ubuntu/Debian) +sudo apt install qemu-system-x86 + +# Load vhost-vsock kernel module (required for QEMU vsock) +sudo modprobe vhost_vsock +echo "vhost_vsock" | sudo tee /etc/modules-load.d/vhost_vsock.conf + +# Build with GPU support (Linux x86_64 only) +FROM_SOURCE=1 mise run vm:setup # Build kernel from source (module compilation needs it) +mise run vm:nvidia-modules # Compile NVIDIA .ko files against VM kernel +mise run vm:rootfs -- --base --gpu # Build GPU rootfs with injected kernel modules +mise run vm:build # Rebuild binary with GPU rootfs + +# Run with QEMU backend +openshell-vm --backend qemu # Requires qemu-system-x86_64 on host +openshell-vm --gpu # Auto-selects QEMU for GPU passthrough + # Wipe everything and start over mise run vm:clean ``` @@ -221,20 +361,23 @@ rolling `vm-dev` GitHub Release: ### Kernel Runtime (`release-vm-kernel.yml`) -Builds the custom libkrunfw (kernel firmware), libkrun (VMM), and gvproxy for all -supported platforms. 
Runs on-demand or when the kernel config / pinned versions change. +Builds the custom libkrunfw (kernel firmware), libkrun (VMM), gvproxy, and virtiofsd +for all supported platforms. Runs on-demand or when the kernel config / pinned versions +change. | Platform | Runner | Build Method | |----------|--------|-------------| -| Linux ARM64 | `build-arm64` (self-hosted) | Native `build-libkrun.sh` | -| Linux x86_64 | `build-amd64` (self-hosted) | Native `build-libkrun.sh` | -| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` | +| Linux ARM64 | `build-arm64` (self-hosted) | `build-libkrun.sh` + `build-gpu-deps.sh` | +| Linux x86_64 | `build-amd64` (self-hosted) | `build-libkrun.sh` + `build-gpu-deps.sh` | +| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` (no GPU support) | -Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, and -provenance metadata. +Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, +and provenance metadata. Linux artifacts additionally include virtiofsd and the +extracted `vmlinux` kernel. Each platform builds its own libkrunfw and libkrun natively. The kernel inside -libkrunfw is always Linux regardless of host platform. +libkrunfw is always Linux regardless of host platform. Virtiofsd is +Linux-only (macOS does not support VFIO/KVM passthrough). ### VM Binary (`release-vm-dev.yml`) @@ -263,6 +406,10 @@ macOS binaries produced via osxcross are not codesigned. Users must self-sign: codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - ./openshell-vm ``` +> **Note:** QEMU smoke tests (`vm_boot_smoke.rs`) are gated on `OPENSHELL_VM_BACKEND=qemu`. +> These tests require `qemu-system-x86_64` on the runner and are currently manual-only. +> Run `mise run vm:qemu-check` to validate prerequisites before running QEMU tests. + ## Rollout Strategy 1. 
Custom runtime is embedded by default when building with `mise run vm:build`. diff --git a/architecture/vm-gpu-passthrough.md b/architecture/vm-gpu-passthrough.md new file mode 100644 index 000000000..621e27c0c --- /dev/null +++ b/architecture/vm-gpu-passthrough.md @@ -0,0 +1,470 @@ +# VM GPU Passthrough + +> Status: Experimental and work in progress (WIP). GPU passthrough for the VM backend is under active development. + +## Overview + +OpenShell's VM backend can pass a physical NVIDIA GPU into a microVM using VFIO (Virtual Function I/O). This gives the guest direct access to GPU hardware, enabling CUDA workloads and `nvidia-smi` inside sandboxes without virtualization overhead. + +GPU passthrough uses QEMU (instead of the default libkrun backend) to attach a VFIO device to the VM. The guest sees a real PCI GPU device and loads standard NVIDIA drivers. + +## Architecture + +``` +Host │ Guest (microVM) +──────────────────────────────│─────────────────────────── + NVIDIA GPU (PCI BDF addr) │ nvidia driver + CUDA + ↕ bound to vfio-pci │ ↕ + /dev/vfio/ │ /dev/nvidia* + ↕ │ ↕ + QEMU (VFIO) ────│→ PCI device visible + ↕ │ ↕ + TAP networking │ k3s + device plugin + virtiofsd (rootfs) │ ↕ + │ sandbox pods (nvidia.com/gpu) +``` + +### Backend selection + +| Flag | Backend | GPU attached? | +|------|---------|---------------| +| (none) | libkrun | No | +| `--gpu` | QEMU | Yes | +| `--gpu 0000:41:00.0` | QEMU | Yes | +| `--backend qemu` | QEMU | Optional | + +Auto mode (`--backend auto`, the default) selects QEMU when `--gpu` is used, and libkrun otherwise. + +### Automatic GPU binding + +When `--gpu` is passed (with or without a specific PCI address), the launcher automatically prepares the GPU for VFIO passthrough: + +1. **Probe** — scans `/sys/bus/pci/devices` for NVIDIA devices (vendor `0x10de`). +2. **Safety checks** — for each candidate GPU, verifies it is safe to claim (see below). If any check fails, the launcher refuses to proceed and exits with an actionable error. 
+3. **Bind** — unbinds the selected GPU from the `nvidia` driver and binds it to `vfio-pci`. Also binds any IOMMU group peers to `vfio-pci` for group cleanliness. +4. **Launch** — starts QEMU with the VFIO device attached and sets `GPU_ENABLED=true` in the guest kernel cmdline. +5. **Rebind on shutdown** — when the VM exits (clean shutdown, Ctrl+C, or crash), the launcher rebinds the GPU back to the `nvidia` driver and clears `driver_override`, restoring host GPU access. Cleanup is guaranteed by a `GpuBindGuard` RAII guard that calls restore on drop, covering normal exit, early return, and panic. Only `SIGKILL` (kill -9) bypasses the guard — see Troubleshooting below for manual recovery. + +When a specific PCI address is given (`--gpu 0000:41:00.0`), the launcher targets that exact device. When `--gpu` is used without an address (`auto` mode), the launcher selects the best available GPU using the multi-GPU selection strategy. + +### Safety checks + +All safety checks are hard failures — if any check fails, the launcher prints an error and exits without binding. The one exception is display-manager-related blocking: when the GPU is held by Xorg or a Wayland compositor, the launcher prompts the user interactively to stop the display manager (see Single-GPU caveats). 
+ +| Check | What it detects | Failure behavior | +|-------|----------------|------------------| +| **Display attached** | GPU drives an active DRM framebuffer or is the primary rendering device | Interactive prompt to stop display-manager; error if declined or non-interactive | +| **Active processes** | Processes holding `/dev/nvidia*` file descriptors (CUDA jobs, monitoring) | Error if non-display processes; interactive prompt if only display servers | +| **IOMMU enabled** | `/sys/kernel/iommu_groups/` exists and the GPU has a group assignment | Error: "IOMMU is not enabled — add intel_iommu=on or amd_iommu=on to kernel cmdline" | +| **VFIO modules loaded** | `vfio-pci` and `vfio_iommu_type1` kernel modules are loaded | Error: "vfio-pci kernel module not loaded — run: sudo modprobe vfio-pci" | +| **Permissions** | Write access to sysfs bind/unbind and `/dev/vfio/` | Error: "insufficient permissions — run as root or with CAP_NET_ADMIN" | + +### Multi-GPU selection (`--gpu` auto mode) + +On hosts with multiple NVIDIA GPUs, the launcher selects a GPU using this priority: + +1. **Already on vfio-pci** with a clean IOMMU group — use immediately (no rebind needed). +2. **Idle (no processes, no display)** — preferred for binding. +3. **Skip** GPUs with active displays or running processes. + +If no GPU passes all safety checks, the launcher fails with per-device status listing what blocked each GPU. + +## Host preparation + +The launcher handles GPU driver binding automatically. The host only needs IOMMU and VFIO kernel modules configured. + +### 1. Enable IOMMU + +IOMMU must be enabled in both BIOS/UEFI and the Linux kernel. + +**Intel systems:** + +```shell +# Add to kernel command line (e.g. /etc/default/grub GRUB_CMDLINE_LINUX) +intel_iommu=on iommu=pt +``` + +**AMD systems:** + +```shell +# AMD IOMMU is usually enabled by default; verify or add: +amd_iommu=on iommu=pt +``` + +After editing, run `update-grub` (or equivalent) and reboot. 
Verify IOMMU is active: + +```shell +dmesg | grep -i iommu +# Should show: "DMAR: IOMMU enabled" or "AMD-Vi: AMD IOMMUv2" +``` + +### 2. Load VFIO kernel modules + +```shell +sudo modprobe vfio-pci +sudo modprobe vfio_iommu_type1 + +# Persist across reboots +echo "vfio-pci" | sudo tee /etc/modules-load.d/vfio-pci.conf +echo "vfio_iommu_type1" | sudo tee /etc/modules-load.d/vfio_iommu_type1.conf +``` + +### 3. Device permissions + +The launcher needs root (or `CAP_NET_ADMIN`) to bind/unbind GPU drivers and configure TAP networking: + +```shell +# Option A: run as root (simplest) +sudo openshell-vm --gpu + +# Option B: set udev rules for /dev/vfio/<group> access (still needs sysfs write via root) +echo 'SUBSYSTEM=="vfio", OWNER="root", GROUP="kvm", MODE="0660"' | \ + sudo tee /etc/udev/rules.d/99-vfio.rules +sudo udevadm control --reload-rules +sudo usermod -aG kvm $USER +``` + +### What the launcher does automatically + +When `--gpu` is passed, the launcher performs the following steps that previously required manual intervention: + +1. **Identifies NVIDIA GPUs** via sysfs (`/sys/bus/pci/devices/*/vendor`) +2. **Runs safety checks** — display, active processes, IOMMU, VFIO modules (see Safety checks above) +3. **Unbinds from nvidia** — writes to `/sys/bus/pci/devices/<bdf>/driver/unbind` +4. **Sets driver override** — writes `vfio-pci` to `/sys/bus/pci/devices/<bdf>/driver_override` +5. **Binds to vfio-pci** — writes to `/sys/bus/pci/drivers/vfio-pci/bind` +6. **Handles IOMMU group peers** — binds other devices in the same IOMMU group to `vfio-pci` +7. **On shutdown** — reverses all bindings, clears `driver_override`, rebinds to `nvidia` + +## Single-GPU caveats + +When the host has only one NVIDIA GPU: + +- **Display manager prompt.** When the GPU drives an active display or is held by a display server (Xorg, Wayland compositor), the launcher detects this and prompts the user interactively: + + ```text + WARNING: GPU 0000:2d:00.0 is in use by the display manager. 
+ Display server processes: Xorg (PID 1234) + Active display outputs are connected to this GPU. + + Stopping the display manager will terminate your graphical session. + You will lose access to any open GUI applications. + + The display manager will be restarted automatically when the VM exits. + Stop display-manager and proceed with GPU passthrough? [y/N] + ``` + + If the user confirms, the launcher runs `systemctl stop display-manager`, waits for Xorg to release the GPU, then proceeds with VFIO binding. A `DisplayManagerGuard` ensures that `systemctl start display-manager` is called when the VM exits (clean shutdown, Ctrl+C, error, or panic). In non-interactive mode (stdin is not a TTY), the prompt is skipped and the launcher exits with an error instructing the user to stop the display manager manually. +- **Recovery is automatic.** When the VM exits (clean shutdown, Ctrl+C, or process crash), the launcher rebinds the GPU to the `nvidia` driver, clears `driver_override`, and restarts the display manager if it was stopped. No manual intervention is needed. +- **Process check.** If non-display CUDA processes are also using the GPU (visible via `/dev/nvidia*` file descriptors), the prompt warns about those processes too. The launcher lists all PIDs and process names so the user can make an informed decision. + +## Supported GPUs + +GPU passthrough is validated with NVIDIA data center GPUs. Consumer GPUs may work but are not officially supported (NVIDIA restricts GeForce passthrough in some driver versions). 
+ +| GPU | Architecture | Compute Capability | Status | +|-----|-------------|-------------------|--------| +| A100 | Ampere | 8.0 | Supported | +| A30 | Ampere | 8.0 | Supported | +| H100 | Hopper | 9.0 | Supported | +| H200 | Hopper | 9.0 | Supported | +| L40 | Ada Lovelace | 8.9 | Supported | +| L40S | Ada Lovelace | 8.9 | Supported | +| L4 | Ada Lovelace | 8.9 | Supported | + +## GPU build pipeline + +GPU passthrough requires NVIDIA kernel modules compiled against the VM kernel. The full build pipeline is: + +```shell +# 1. Build kernel from source (needed for module compilation) +FROM_SOURCE=1 mise run vm:setup + +# 2. Compile NVIDIA .ko files against the VM kernel +mise run vm:nvidia-modules + +# 3. Build GPU rootfs and inject kernel modules +mise run vm:rootfs -- --base --gpu + +# 4. Compile binary and package runtime +mise run vm:build +``` + +### NVIDIA kernel module build (`vm:nvidia-modules`) + +The `build-nvidia-modules.sh` script clones [NVIDIA/open-gpu-kernel-modules](https://github.com/NVIDIA/open-gpu-kernel-modules) at the tag pinned by `NVIDIA_DRIVER_TAG` in `pins.env` and compiles the open kernel modules against the VM kernel source tree produced by `build-libkrun.sh`. + +The driver tag must match the exact version of `nvidia-headless-570-open` installed in the guest rootfs. A mismatch causes "API mismatch" errors from `nvidia-smi`. The current pin is `570.211.01`. + +The build produces these modules: + +| Module | Purpose | +|--------|---------| +| `nvidia.ko` | Core GPU driver | +| `nvidia-uvm.ko` | Unified Virtual Memory (CUDA managed memory) | +| `nvidia-modeset.ko` | Display mode setting | +| `nvidia-drm.ko` | DRM/KMS integration | +| `nvidia-peermem.ko` | GPUDirect RDMA (optional) | + +### Module injection (`vm:rootfs --gpu`) + +When `build-rootfs.sh` runs with `--gpu`, it: + +1. Reads `kernel-version.txt` (exported by `build-libkrun.sh`) to determine the kernel release string. +2. 
Copies `.ko` files from `target/libkrun-build/nvidia-modules/` into the rootfs at `/lib/modules/<kernel-version>/kernel/drivers/video/nvidia/`. +3. Runs `depmod` to generate module dependency metadata so `modprobe` works at boot. + +The VM init script loads `nvidia`, `nvidia_uvm`, and `nvidia_modeset` during boot when `GPU_ENABLED=true` is set on the kernel command line. + +## CLI usage + +### Auto-select GPU + +```shell +# openshell-vm binary (VM backend directly) +sudo openshell-vm --gpu + +# openshell CLI (gateway deployment — requires VM backend) +OPENSHELL_GATEWAY_BACKEND=vm sudo openshell gateway start --gpu +``` + +> **Note:** The default gateway backend is Docker (containers). GPU passthrough +> requires the VM backend. Set `OPENSHELL_GATEWAY_BACKEND=vm` (or `microvm`) +> to use the VM path with `openshell gateway start`. + +### Specific PCI address (multi-GPU hosts) + +```shell +sudo openshell-vm --gpu 0000:41:00.0 +``` + +### Backend selection + +The `--backend` flag controls hypervisor selection independently of `--gpu`: + +```shell +sudo openshell-vm --gpu # auto: selects QEMU for GPU +sudo openshell-vm --backend qemu # explicit QEMU, no GPU +sudo openshell-vm --gpu --backend qemu # force QEMU with GPU +sudo openshell-vm --backend libkrun # explicit libkrun (no GPU support) +``` + +### Diagnostics + +When `--gpu` is passed, the launcher runs safety checks before unbinding. If +checks fail, it exits with an actionable error: + +```text +$ sudo openshell-vm --gpu +GPU passthrough blocked by safety checks. + + Detected devices: + 0000:41:00.0: has active display outputs + 0000:42:00.0: in use by PIDs: 12345 (python3), 12400 (nvidia-smi) + + No GPU is available for passthrough. 
+``` + +On a headless server with an idle GPU, the pre-unbind preparation runs first: + +```text +$ sudo openshell-vm --gpu +GPU 0000:41:00.0: disabled nvidia persistence mode +GPU 0000:41:00.0: unloaded nvidia_uvm +GPU 0000:41:00.0: unloaded nvidia_drm +GPU 0000:41:00.0: unloaded nvidia_modeset +GPU 0000:41:00.0: device already unbound after nvidia module cleanup +GPU: binding 0000:41:00.0 for VFIO passthrough +``` + +On shutdown (Ctrl+C or VM exit), the original driver is restored: + +```text +^C +GPU: restoring 0000:41:00.0 (cleanup) +GPU: rebinding 0000:41:00.0 to nvidia +``` + +## VM Networking (QEMU) + +QEMU uses TAP-based networking instead of the gvproxy user-mode networking used by the libkrun backend. This has several implications for connectivity and port forwarding. + +### Network topology + +``` +Host Guest (microVM) +───────────────────────────────────── ────────────────────────── + eth0 (or primary NIC) eth0 (virtio-net) + ↕ ↕ + iptables MASQUERADE ←── NAT ──→ 192.168.249.2/24 + ↕ ↕ default gw 192.168.249.1 + vmtap0 (TAP device) ↕ + 192.168.249.1/24 ←─── L2 bridge ──→ (kernel routes) + ↕ + 127.0.0.1:{port} ←── TCP proxy ──→ {port} (k3s NodePort) +``` + +### How it works + +The QEMU backend configures networking in three layers: + +**1. TAP device and guest IP assignment** + +QEMU creates a TAP device on the host side with IP `192.168.249.1/24`. The guest is assigned `192.168.249.2/24` via kernel command line parameters (`VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). The init script reads these from `/proc/cmdline` and uses them as the static fallback when DHCP is unavailable. + +**2. Host-side NAT and IP forwarding** + +After booting the VM, the launcher: +- Enables IP forwarding (`/proc/sys/net/ipv4/ip_forward`) +- Adds iptables MASQUERADE rules for the `192.168.249.0/24` subnet +- Adds FORWARD rules to allow traffic to/from the VM + +This gives the guest internet access through the host. Rules are cleaned up on VM shutdown. + +**3. 
TCP port forwarding** + +Unlike gvproxy (which provides built-in port forwarding), TAP networking requires explicit port forwarding. The launcher starts a userspace TCP proxy for each port mapping (e.g., `30051:30051`). The proxy binds to `127.0.0.1:{host_port}` and forwards connections to `192.168.249.2:{guest_port}`. + +### DNS resolution + +The launcher detects the host's upstream DNS server using a three-step lookup: + +1. Reads `/etc/resolv.conf` and picks the first nameserver that does not start with `127.` (skipping systemd-resolved's `127.0.0.53` stub and other loopback addresses). +2. If all nameservers in `/etc/resolv.conf` are loopback, falls back to `/run/systemd/resolve/resolv.conf` (the upstream resolv.conf maintained by systemd-resolved). +3. If no non-loopback nameserver is found in either file, falls back to `8.8.8.8`. + +The resolved DNS server is passed to the guest via `VM_NET_DNS=<ip>` on the kernel command line. The init script writes it to `/etc/resolv.conf` inside the guest, unconditionally overriding any stale entries from previous boot cycles. 
+ +### Key constants + +| Constant | Value | Purpose | +|----------|-------|---------| +| `TAP_HOST_IP` | `192.168.249.1` | Host side of the TAP device | +| `TAP_GUEST_IP` | `192.168.249.2` | Guest static IP | +| `TAP_SUBNET` | `192.168.249.0/24` | Subnet for iptables rules | +| `TAP_NETMASK` | `255.255.255.0` | Subnet mask in VM payload | + +### Differences from libkrun/gvproxy networking + +| Feature | libkrun + gvproxy | QEMU + TAP | +|---------|------------------|-----------| +| Network mode | User-mode (SLIRP-like) | Kernel TAP device | +| DHCP | Built-in (gvproxy) | None (static IP via cmdline) | +| Guest IP | `192.168.127.2/24` | `192.168.249.2/24` | +| Port forwarding | Built-in (gvproxy `-forward`) | Userspace TCP proxy | +| Privileges | Unprivileged | Root or `CAP_NET_ADMIN` | +| NAT | Handled by gvproxy | iptables MASQUERADE | +| DNS | gvproxy provides | Host resolver passed via cmdline | + +### Troubleshooting networking + +**"lookup registry-1.docker.io: Try again" (DNS failure)** + +The VM cannot resolve DNS. Check: + +```shell +# Verify the host DNS is non-loopback +grep nameserver /etc/resolv.conf +# If only 127.0.0.53 (systemd-resolved), find the upstream: +resolvectl status | grep 'DNS Servers' + +# Verify iptables rules are in place +sudo iptables -t nat -L POSTROUTING -n -v | grep 192.168.249 +sudo iptables -L FORWARD -n -v | grep 192.168.249 + +# Verify IP forwarding is enabled +cat /proc/sys/net/ipv4/ip_forward +``` + +**Gateway health check fails (port 30051 unreachable)** + +The TCP port forwarder may not have started, or the guest service is not yet listening: + +```shell +# Check if the port forwarder is bound on the host +ss -tlnp | grep 30051 + +# Check if the guest is reachable +ping -c1 192.168.249.2 +``` + +### Host mTLS cache and state disk + +The launcher caches mTLS certificates on the host after the first successful boot (warm boot path). 
If the state disk is deleted or `--reset` is used, the VM generates new PKI that won't match the cached certs. The launcher detects this — when the state disk is freshly created or reset, it clears the stale host mTLS cache and runs the cold-boot PKI fetch path. This prevents `transport error` failures on the gateway health check after a state disk reset. + +## Troubleshooting + +### "no NVIDIA PCI device found" + +The host has no NVIDIA GPU installed, or the PCI device is not visible: + +```shell +lspci -nn | grep -i nvidia +# If empty, the GPU is not detected at the PCI level +``` + +### "has active display outputs" / "in use by display manager" + +The GPU drives a DRM framebuffer or is held by a display server (Xorg, Wayland compositor). If running interactively, the launcher prompts to stop the display manager. If running non-interactively or the user declines, options: + +- Use a different GPU for the monitor (iGPU, secondary card) +- Stop the display manager manually: `sudo systemctl stop display-manager` +- On headless servers, this should not occur — verify with `ls /sys/class/drm/card*/device` + +### "in use by PIDs: ..." + +Active non-display processes hold `/dev/nvidia*` file descriptors. The check is host-wide (across all NVIDIA GPUs, not per-device). The launcher lists the PIDs and process names. Stop those processes before retrying. If the only processes are display servers (Xorg, gnome-shell, etc.), the launcher will offer to stop the display manager instead. + +### "IOMMU not enabled or device has no IOMMU group" + +IOMMU must be enabled in both BIOS/UEFI and kernel cmdline. See Host Preparation above. + +### "VFIO kernel modules not loaded" + +```shell +sudo modprobe vfio-pci +sudo modprobe vfio_iommu_type1 +``` + +### "insufficient sysfs permissions — run as root" + +The launcher needs root to write to sysfs bind/unbind paths. Run with `sudo`. 
+ +### GPU not rebound after crash + +If the launcher process is killed with `SIGKILL` (kill -9), the cleanup handler cannot run and the GPU remains on `vfio-pci`. Manually rebind: + +```shell +PCI_ADDR="0000:41:00.0" +echo "$PCI_ADDR" | sudo tee /sys/bus/pci/devices/$PCI_ADDR/driver/unbind +echo "" | sudo tee /sys/bus/pci/devices/$PCI_ADDR/driver_override +echo "$PCI_ADDR" | sudo tee /sys/bus/pci/drivers/nvidia/bind +``` + +### nvidia driver unbind deadlock (kernel bug) + +Some nvidia driver versions deadlock in their sysfs `unbind` handler — the `write()` syscall to `/sys/bus/pci/drivers/nvidia/unbind` never returns. When this happens, the subprocess enters uninterruptible sleep (D state) and becomes unkillable even by `SIGKILL`. The GPU's PCI subsystem state is corrupted and all subsequent PCI operations on the device hang. Only a host reboot clears this state. + +This is a kernel/nvidia driver bug, not an openshell-vm issue. Three mitigation layers are in place: + +1. **Pre-unbind preparation**: Before the raw sysfs unbind, the launcher disables nvidia persistence mode (`nvidia-smi -pm 0`) and unloads nvidia submodules (`nvidia_uvm`, `nvidia_drm`, `nvidia_modeset`) via `modprobe -r`. This often cascade-removes the base nvidia module entirely, unbinding the device automatically without ever touching the dangerous sysfs path. + +2. **Subprocess isolation with timeout**: All sysfs writes (and the nvidia prep commands) run in a subprocess with a timeout (10s for sysfs, 15s for prep). On timeout, the subprocess is killed and dropped without calling `wait()` — preventing the parent process from being dragged into D-state. + +3. **Post-timeout verification**: If the unbind subprocess times out but the device is actually unbound at the hardware level (which the nvidia bug can cause — the operation completes but the syscall never returns), the launcher detects this and continues with the VFIO bind. 
+ +If you hit this issue repeatedly, check for nvidia driver updates or file a bug with NVIDIA. + +### VM boots but `nvidia-smi` fails inside guest + +- Verify the GPU rootfs includes NVIDIA drivers: `chroot /path/to/rootfs which nvidia-smi` +- Check that NVIDIA kernel modules load: `openshell-vm exec -- lsmod | grep nvidia` +- Inspect dmesg for NVIDIA driver errors: `openshell-vm exec -- dmesg | grep -i nvidia` + +## Related + +- [Custom VM Runtime](custom-vm-runtime.md) — building and customizing the libkrun VM runtime +- [System Architecture](system-architecture.md) — overall OpenShell architecture +- Implementation: + - [`crates/openshell-vfio/src/lib.rs`](../crates/openshell-vfio/src/lib.rs) — GPU binding and VFIO setup + - [`crates/openshell-vm/src/backend/qemu.rs`](../crates/openshell-vm/src/backend/qemu.rs) — QEMU backend diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index be086e534..e62a6e13d 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -26,12 +26,13 @@ const REGISTRY_NAMESPACE_DEFAULT: &str = "openshell"; /// Resolve the raw GPU device-ID list, replacing the `"auto"` sentinel with a /// concrete device ID based on whether CDI is enabled on the daemon. 
///
-/// | Input        | Output                                                       |
-/// |--------------|--------------------------------------------------------------|
-/// | `[]`         | `[]` — no GPU                                                |
-/// | `["legacy"]` | `["legacy"]` — pass through to the non-CDI fallback path     |
-/// | `["auto"]`   | `["nvidia.com/gpu=all"]` if CDI enabled, else `["legacy"]`   |
-/// | `[cdi-ids…]` | unchanged                                                    |
+/// | Input               | Output                                                       |
+/// |---------------------|--------------------------------------------------------------|
+/// | `[]`                | `[]` — no GPU                                                |
+/// | `["vm-passthrough"]`| `["vm-passthrough"]` — GPU via QEMU/VFIO, no Docker device   |
+/// | `["legacy"]`        | `["legacy"]` — pass through to the non-CDI fallback path     |
+/// | `["auto"]`          | `["nvidia.com/gpu=all"]` if CDI enabled, else `["legacy"]`   |
+/// | `[cdi-ids…]`        | unchanged                                                    |
 pub(crate) fn resolve_gpu_device_ids(gpu: &[String], cdi_enabled: bool) -> Vec<String> {
     match gpu {
         [] => vec![],
@@ -622,6 +623,11 @@ pub async fn ensure_container(
     // Docker resolves them against the host CDI spec at /etc/cdi/
     match device_ids {
         [] => {}
+        [id] if id == "vm-passthrough" => {
+            // GPU passthrough is handled by QEMU/VFIO inside the container,
+            // not by Docker. No DeviceRequest needed — GPU_ENABLED=true
+            // (set below) deploys the NVIDIA device plugin in k3s.
+ } [id] if id == "legacy" => { host_config.device_requests = Some(vec![DeviceRequest { driver: Some("nvidia".to_string()), @@ -1436,6 +1442,13 @@ mod tests { ); } + #[test] + fn resolve_gpu_vm_passthrough() { + let ids = vec!["vm-passthrough".to_string()]; + assert_eq!(resolve_gpu_device_ids(&ids, true), ids); + assert_eq!(resolve_gpu_device_ids(&ids, false), ids); + } + #[test] fn resolve_gpu_cdi_ids_passthrough() { let ids = vec!["nvidia.com/gpu=all".to_string()]; diff --git a/crates/openshell-cli/Cargo.toml b/crates/openshell-cli/Cargo.toml index b3a006fdd..20ba1e5f7 100644 --- a/crates/openshell-cli/Cargo.toml +++ b/crates/openshell-cli/Cargo.toml @@ -21,6 +21,7 @@ openshell-policy = { path = "../openshell-policy" } openshell-providers = { path = "../openshell-providers" } openshell-prover = { path = "../openshell-prover" } openshell-tui = { path = "../openshell-tui" } +openshell-vfio = { path = "../openshell-vfio" } serde = { workspace = true } serde_json = { workspace = true } prost-types = { workspace = true } diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 292922411..05d1fb7c1 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -807,18 +807,21 @@ enum GatewayCommands { #[arg(long, env = "OPENSHELL_REGISTRY_TOKEN")] registry_token: Option, - /// Enable NVIDIA GPU passthrough. + /// Enable NVIDIA GPU support for the gateway cluster. /// - /// Passes all host GPUs into the cluster container and deploys the - /// NVIDIA k8s-device-plugin so Kubernetes workloads can request - /// `nvidia.com/gpu` resources. Requires NVIDIA drivers and the - /// NVIDIA Container Toolkit on the host. + /// **Docker path (default):** passes GPUs into the gateway container via + /// the NVIDIA Container Toolkit — CDI when the daemon supports it, else + /// Docker's `--gpus all` — and deploys the NVIDIA device plugin. 
Use
+    /// `--gpu` or `--gpu auto` only; PCI addresses are not valid CDI device
+    /// names on this path.
+    ///
+    /// **MicroVM path:** set `OPENSHELL_GATEWAY_BACKEND=vm` for deployments
+    /// that use the VM gateway. Then you may pass `--gpu` / `--gpu auto` for
+    /// VFIO auto-select, or `--gpu 0000:41:00.0` (PCI BDF) for a specific GPU.
+    /// Requires IOMMU and the GPU bound to `vfio-pci`. See
+    /// `architecture/vm-gpu-passthrough.md`.
+    #[arg(long, num_args = 0..=1, default_missing_value = "auto")]
+    gpu: Option<String>,
 },

 /// Stop the gateway (preserves state).
@@ -1129,10 +1132,9 @@ enum SandboxCommands {
     /// Request GPU resources for the sandbox.
     ///
     /// When no gateway is running, auto-bootstrap starts a GPU-enabled
-    /// gateway using the same automatic injection selection as
-    /// `openshell gateway start --gpu`. GPU intent is also inferred
-    /// automatically for known GPU-designated image names such as
-    /// `nvidia-gpu`.
+    /// gateway using the Docker NVIDIA path (`--gpu auto`), same as
+    /// `openshell gateway start --gpu` without the microVM backend. GPU
+    /// intent is also inferred for known GPU image names (e.g. `nvidia-gpu`).
#[arg(long)]
     gpu: bool,

@@ -1655,12 +1657,11 @@ async fn main() -> Result<()> {
             registry_token,
             gpu,
         } => {
-            let gpu = if gpu {
-                vec!["auto".to_string()]
-            } else {
-                vec![]
+            let gpu = match gpu {
+                Some(val) => vec![val],
+                None => vec![],
             };
-            run::gateway_admin_deploy(
+            let _gpu_guard = run::gateway_admin_deploy(
                 &name,
                 remote.as_deref(),
                 ssh_key.as_deref(),
diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs
index c41b53518..a104ace4d 100644
--- a/crates/openshell-cli/src/run.rs
+++ b/crates/openshell-cli/src/run.rs
@@ -1434,7 +1434,9 @@ pub async fn gateway_admin_deploy(
     registry_username: Option<&str>,
     registry_token: Option<&str>,
     gpu: Vec<String>,
-) -> Result<()> {
+) -> Result<Option<openshell_vfio::GpuBindGuard>> {
+    let (gpu, gpu_guard) = prepare_gateway_deploy_gpu(gpu, remote.as_deref())?;
+
     let location = if remote.is_some() { "remote" } else { "local" };

     // Build remote options once so we can reuse them for the existence check
@@ -1457,7 +1459,7 @@ pub async fn gateway_admin_deploy(
                 "{} Gateway '{name}' is already running.",
                 "✓".green().bold()
             );
-            return Ok(());
+            return Ok(gpu_guard);
         }
     }
 }
@@ -1518,7 +1520,7 @@ pub async fn gateway_admin_deploy(
     save_active_gateway(name)?;
     eprintln!("{} Active gateway set to '{name}'", "✓".green().bold());

-    Ok(())
+    Ok(gpu_guard)
 }

 /// Resolve the remote SSH destination for a gateway.
@@ -5193,6 +5195,125 @@ fn format_timestamp_ms(ms: i64) -> String {
     }
 }

+/// Environment variable selecting the gateway deployment backend for GPU checks.
+///
+/// VFIO sysfs probes apply only to the microVM (`openshell-vm`) deploy path.
+/// The default `openshell gateway start` flow uses Docker with the NVIDIA
+/// Container Toolkit; leave this unset for that path.
+const OPENSHELL_GATEWAY_BACKEND_ENV: &str = "OPENSHELL_GATEWAY_BACKEND";
+
+fn gateway_deploy_uses_vm_backend() -> bool {
+    std::env::var(OPENSHELL_GATEWAY_BACKEND_ENV)
+        .ok()
+        .map(|v| {
+            matches!(
+                v.trim().to_ascii_lowercase().as_str(),
+                "vm" | "microvm" | "openshell-vm"
+            )
+        })
+        .unwrap_or(false)
+}
+
+/// Heuristic: value looks like a PCI domain:bus:dev.fn address (Linux sysfs BDF).
+fn looks_like_pci_bdf(s: &str) -> bool {
+    let s = s.trim();
+    let rest = if let Some((prefix, after_colon)) = s.split_once(':') {
+        if prefix.len() == 4 && prefix.chars().all(|c| c.is_ascii_hexdigit()) {
+            after_colon
+        } else {
+            s
+        }
+    } else {
+        return false;
+    };
+
+    let Some((bus, dev_fn)) = rest.split_once(':') else {
+        return false;
+    };
+    if bus.len() != 2 || !bus.chars().all(|c| c.is_ascii_hexdigit()) {
+        return false;
+    }
+    let Some((dev, func)) = dev_fn.split_once('.') else {
+        return false;
+    };
+    if dev.len() != 2 || !dev.chars().all(|c| c.is_ascii_hexdigit()) {
+        return false;
+    }
+    if func.len() != 1 || !func.chars().all(|c| ('0'..='7').contains(&c)) {
+        return false;
+    }
+    true
+}
+
+/// Validate `--gpu` for `gateway start`, run VFIO checks only for the VM deploy path,
+/// and normalize Docker-path requests to CDI-compatible `auto`.
+fn prepare_gateway_deploy_gpu(
+    gpu: Vec<String>,
+    remote: Option<&str>,
+) -> Result<(Vec<String>, Option<openshell_vfio::GpuBindGuard>)> {
+    if gpu.is_empty() {
+        return Ok((gpu, None));
+    }
+
+    if gateway_deploy_uses_vm_backend() {
+        if remote.is_none() {
+            let guard = check_gpu_readiness(&gpu)?;
+            // Signal that GPU is enabled but passthrough is handled by QEMU/VFIO,
+            // not by Docker CDI. The bootstrap sets GPU_ENABLED=true for the
+            // k3s NVIDIA device plugin but skips Docker DeviceRequests.
+ let updated_gpu = vec!["vm-passthrough".to_string()]; + return Ok((updated_gpu, Some(guard))); + } else { + eprintln!( + "{} Local VFIO GPU probe skipped (--remote): GPU readiness is checked on the remote host during deployment.", + "ℹ".cyan().bold() + ); + } + return Ok((gpu, None)); + } + + let Some(first) = gpu.first() else { + return Ok((gpu, None)); + }; + if first.as_str() != "auto" { + if looks_like_pci_bdf(first) { + return Err(miette!( + "PCI address GPU selection ({first}) is only supported for the microVM gateway backend.\n\n\ + `openshell gateway start` uses Docker by default (NVIDIA Container Toolkit / CDI, or Docker `--gpus all`). \ + Use `--gpu` or `--gpu auto` for that path.\n\n\ + For VFIO passthrough, set {}=vm and follow architecture/vm-gpu-passthrough.md.", + OPENSHELL_GATEWAY_BACKEND_ENV, + )); + } + return Err(miette!( + "Unrecognized --gpu value `{first}` for Docker gateway deploy. Use `--gpu` or `--gpu auto`.", + )); + } + + Ok((vec!["auto".to_string()], None)) +} + +/// Bind a GPU for VFIO passthrough and return an RAII guard that restores it on drop. 
+fn check_gpu_readiness(gpu: &[String]) -> Result<openshell_vfio::GpuBindGuard> {
+    use openshell_vfio::{GpuBindGuard, prepare_gpu_for_passthrough};
+
+    let requested_addr = gpu
+        .first()
+        .filter(|v| v.as_str() != "auto")
+        .map(|v| v.as_str());
+
+    let bind_state = prepare_gpu_for_passthrough(requested_addr).map_err(|e| miette!("{e}"))?;
+
+    eprintln!(
+        "{} GPU {} bound to vfio-pci (was: {})",
+        "✓".green().bold(),
+        bind_state.pci_addr,
+        bind_state.original_driver,
+    );
+
+    Ok(GpuBindGuard::new(bind_state))
+}
+
 #[cfg(test)]
 mod tests {
     use super::{
@@ -5416,6 +5537,16 @@ mod tests {
         assert!(sandbox_should_persist(false, Some(&spec)));
     }

+    #[test]
+    fn looks_like_pci_bdf_recognizes_sysfs_addresses() {
+        assert!(super::looks_like_pci_bdf("0000:41:00.0"));
+        assert!(super::looks_like_pci_bdf("41:00.0"));
+        assert!(super::looks_like_pci_bdf(" 0a:1f.7 "));
+        assert!(!super::looks_like_pci_bdf("auto"));
+        assert!(!super::looks_like_pci_bdf("nvidia.com/gpu=all"));
+        assert!(!super::looks_like_pci_bdf("00:00.8")); // invalid function
+    }
+
     #[test]
     fn image_requests_gpu_matches_known_gpu_image_names() {
         for image in [
diff --git a/crates/openshell-vfio/Cargo.toml b/crates/openshell-vfio/Cargo.toml
new file mode 100644
index 000000000..d4c4f32de
--- /dev/null
+++ b/crates/openshell-vfio/Cargo.toml
@@ -0,0 +1,20 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "openshell-vfio" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true +description = "Host-side NVIDIA GPU VFIO bind/unbind for VM passthrough" + +[dependencies] +nix = { workspace = true } + +[dev-dependencies] +tempfile = "3" + +[lints] +workspace = true diff --git a/crates/openshell-vfio/src/lib.rs b/crates/openshell-vfio/src/lib.rs new file mode 100644 index 000000000..675928db7 --- /dev/null +++ b/crates/openshell-vfio/src/lib.rs @@ -0,0 +1,2935 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Host-side NVIDIA GPU VFIO bind/unbind for VM passthrough. + +//! +//! This module scans Linux sysfs (`/sys/bus/pci/devices`) for NVIDIA GPUs +//! (vendor ID `0x10de`), checks their driver binding, and verifies IOMMU +//! group cleanliness — the prerequisites for passing a physical GPU into +//! a VM via VFIO. +//! +//! Returns per-device readiness for multi-GPU hosts. +//! +//! On non-Linux platforms, probing returns an empty list. + +use std::fmt; +use std::path::PathBuf; +use std::time::Duration; + +/// Per-device readiness state for NVIDIA GPU VFIO passthrough. +/// +/// Each variant represents a distinct readiness state for a single PCI device. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum HostNvidiaVfioReadiness { + /// The current platform does not support VFIO passthrough (non-Linux). + UnsupportedPlatform, + + /// No PCI device with NVIDIA vendor ID (`0x10de`) was found. + NoNvidiaDevice, + + /// An NVIDIA device exists but is bound to the nvidia (or other non-VFIO) driver. + BoundToNvidia, + + /// An NVIDIA device is bound to `vfio-pci` and its IOMMU group is clean — ready for passthrough. 
+ VfioBoundReady, + + /// An NVIDIA device is bound to `vfio-pci` but its IOMMU group contains + /// devices not bound to `vfio-pci`, which prevents safe passthrough. + VfioBoundDirtyGroup, + + /// Some NVIDIA devices are bound to `vfio-pci` while others use + /// a different driver (mixed fleet). + MixedVfioAndOther, +} + +impl fmt::Display for HostNvidiaVfioReadiness { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::UnsupportedPlatform => write!( + f, + "VFIO passthrough is not supported on this platform (Linux required)" + ), + Self::NoNvidiaDevice => write!(f, "no NVIDIA PCI device found"), + Self::BoundToNvidia => { + write!(f, "NVIDIA device found but not bound to vfio-pci driver") + } + Self::VfioBoundReady => write!( + f, + "NVIDIA device bound to vfio-pci and IOMMU group is clean" + ), + Self::VfioBoundDirtyGroup => write!( + f, + "NVIDIA device bound to vfio-pci but IOMMU group contains non-VFIO devices" + ), + Self::MixedVfioAndOther => write!( + f, + "some NVIDIA devices are on vfio-pci while others use a different driver" + ), + } + } +} + +const NVIDIA_VENDOR_ID: &str = "0x10de"; + +#[cfg(target_os = "linux")] +const SYSFS_WRITE_TIMEOUT: Duration = Duration::from_secs(10); + +/// Reject sysfs data containing characters outside the safe set for shell +/// interpolation. All legitimate sysfs writes in this crate use PCI BDF +/// addresses, driver names, or single digits — this blocks anything else. +#[cfg(target_os = "linux")] +fn validate_sysfs_data(data: &str) -> Result<(), std::io::Error> { + if data.is_empty() + || data + .bytes() + .all(|b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b'.' 
|| b == b':') + { + Ok(()) + } else { + Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("sysfs data contains unexpected characters: {data:?}"), + )) + } +} + +#[cfg(target_os = "linux")] +fn sysfs_write_with_timeout( + path: &std::path::Path, + data: &str, + timeout: Duration, +) -> Result<(), std::io::Error> { + use std::process::{Command, Stdio}; + use std::thread; + + if data.is_empty() { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!( + "sysfs_write_with_timeout called with empty data for {}", + path.display() + ), + )); + } + validate_sysfs_data(data)?; + + let mut child = Command::new("sh") + .arg("-c") + .arg(format!( + r#"printf '%s' '{}' > '{}'"#, + data.replace('\'', "'\\''"), + path.display().to_string().replace('\'', "'\\''") + )) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + std::io::Error::new( + e.kind(), + format!( + "failed to spawn sysfs write subprocess for {}: {e}", + path.display() + ), + ) + })?; + + let poll_interval = Duration::from_millis(100); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => { + if status.success() { + return Ok(()); + } + let mut stderr_buf = String::new(); + if let Some(mut stderr) = child.stderr.take() { + use std::io::Read; + let _ = stderr.read_to_string(&mut stderr_buf); + } + let hint = if stderr_buf.contains("Permission denied") { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!( + "sysfs write to {} failed (exit {}){hint}: {stderr_buf}", + path.display(), + status.code().unwrap_or(-1), + ), + )); + } + Ok(None) => { + if start.elapsed() > timeout { + let pid = child.id(); + let _ = child.kill(); + // CRITICAL: Do NOT call child.wait() here. 
If the child + // is stuck in uninterruptible sleep (D-state) — which is + // the nvidia unbind deadlock scenario — wait() will block + // the parent indefinitely, making it unkillable too. + // + // Dropping the Child struct closes pipe handles but does + // NOT wait. The zombie child is reparented to init and + // reaped when/if it eventually exits. + drop(child); + return Err(std::io::Error::new( + std::io::ErrorKind::TimedOut, + format!( + "sysfs write to {} timed out after {:.0}s (subprocess pid {pid}) — \ + possible nvidia driver deadlock. The subprocess may still be \ + stuck in kernel space; a reboot may be required to clear it.", + path.display(), + timeout.as_secs_f64(), + ), + )); + } + thread::sleep(poll_interval); + } + Err(e) => return Err(e), + } + } +} + +/// Check whether a PCI device supports MSI-X by walking the PCI capability +/// list in the sysfs `config` file. MSI-X is capability ID `0x11`. +/// +/// MSI-X support is tracked for informational purposes. QEMU handles +/// devices with or without MSI-X via legacy interrupt emulation fallback. +#[cfg(target_os = "linux")] +pub fn check_msix_support(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + let config_path = sysfs.sys_bus_pci_devices().join(pci_addr).join("config"); + let config = match std::fs::read(&config_path) { + Ok(data) => data, + Err(_) => return false, + }; + + // PCI config space: capability pointer at offset 0x34. + if config.len() < 0x35 { + return false; + } + + // Status register (offset 0x06, bit 4) indicates capability list present. + if config.len() > 0x07 && (config[0x06] & 0x10) == 0 { + return false; + } + + // PCI spec: capability pointers are DWORD-aligned (low 2 bits reserved). + let mut cap_ptr = (config[0x34] & 0xFC) as usize; + // Walk the capability linked list (max 48 iterations to avoid infinite loops). 
+ for _ in 0..48 { + if cap_ptr == 0 || cap_ptr + 1 >= config.len() { + break; + } + let cap_id = config[cap_ptr]; + if cap_id == 0x11 { + return true; + } + cap_ptr = (config[cap_ptr + 1] & 0xFC) as usize; + } + false +} + +#[cfg(not(target_os = "linux"))] +pub fn check_msix_support(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +/// Validates that `addr` matches the PCI BDF format `DDDD:BB:DD.F`. +fn validate_pci_addr(addr: &str) -> Result<(), std::io::Error> { + let bytes = addr.as_bytes(); + let valid = bytes.len() == 12 + && bytes[4] == b':' + && bytes[7] == b':' + && bytes[10] == b'.' + && bytes[..4].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[5..7].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[8..10].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[11] >= b'0' + && bytes[11] <= b'7'; + if valid { + Ok(()) + } else { + Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("invalid PCI address '{addr}': expected DDDD:BB:DD.F format"), + )) + } +} + +/// Probe the host for NVIDIA GPU VFIO readiness by scanning Linux sysfs. +/// +/// Returns a per-device list of `(pci_address, readiness)` tuples for every +/// NVIDIA GPU found. On non-Linux platforms the list is empty. +/// +/// On Linux, walks `/sys/bus/pci/devices/` and for each device: +/// 1. Reads `vendor` to check for NVIDIA (`0x10de`). +/// 2. Reads the `driver` symlink to determine which kernel driver is bound. +/// 3. If bound to `vfio-pci`, inspects the `iommu_group/devices/` directory +/// to verify all group members are also on `vfio-pci`. 
+pub fn probe_host_nvidia_vfio_readiness() -> Vec<(String, HostNvidiaVfioReadiness)> { + #[cfg(not(target_os = "linux"))] + { + Vec::new() + } + + #[cfg(target_os = "linux")] + { + probe_linux_sysfs() + } +} + +#[cfg(target_os = "linux")] +fn probe_linux_sysfs() -> Vec<(String, HostNvidiaVfioReadiness)> { + use std::fs; + use std::path::Path; + + let pci_devices = Path::new("/sys/bus/pci/devices"); + let entries = match fs::read_dir(pci_devices) { + Ok(e) => e, + Err(_) => return Vec::new(), + }; + + let mut results = Vec::new(); + + for entry in entries.filter_map(Result::ok) { + let dev_path = entry.path(); + + let vendor = match fs::read_to_string(dev_path.join("vendor")) { + Ok(v) => v.trim().to_lowercase(), + Err(_) => continue, + }; + + if vendor != NVIDIA_VENDOR_ID { + continue; + } + + let pci_addr = entry.file_name().to_string_lossy().to_string(); + + let driver_link = dev_path.join("driver"); + let driver_name = fs::read_link(&driver_link).ok().and_then(|target| { + target + .file_name() + .map(|name| name.to_string_lossy().to_string()) + }); + + let state = match driver_name.as_deref() { + Some("vfio-pci") => { + let iommu_group_devices = dev_path.join("iommu_group/devices"); + let group_clean = match fs::read_dir(&iommu_group_devices) { + Ok(group_entries) => group_entries.filter_map(Result::ok).all(|ge| { + let peer_path = iommu_group_devices.join(ge.file_name()).join("driver"); + fs::read_link(&peer_path) + .ok() + .and_then(|t| t.file_name().map(|n| n.to_string_lossy().to_string())) + .as_deref() + == Some("vfio-pci") + }), + Err(_) => false, + }; + + if group_clean { + HostNvidiaVfioReadiness::VfioBoundReady + } else { + HostNvidiaVfioReadiness::VfioBoundDirtyGroup + } + } + _ => HostNvidiaVfioReadiness::BoundToNvidia, + }; + + results.push((pci_addr, state)); + } + + results +} + +/// Returns whether any NVIDIA GPU is fully available for VM passthrough. +/// +/// Requires `OPENSHELL_VM_GPU_E2E=1` to activate probing. 
When the env var
+/// is unset or not `"1"`, returns `false` unconditionally so non-GPU CI
+/// runners are never affected.
+///
+/// When activated, checks two conditions:
+/// 1. At least one NVIDIA device reports [`VfioBoundReady`].
+/// 2. The QEMU binary (`qemu-system-x86_64`) exists in `runtime_dir` or on PATH (if provided).
+pub fn nvidia_gpu_available_for_vm_passthrough(runtime_dir: Option<PathBuf>) -> bool {
+    if std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() != Ok("1") {
+        return false;
+    }
+
+    let has_vfio_ready = probe_host_nvidia_vfio_readiness()
+        .iter()
+        .any(|(_, state)| *state == HostNvidiaVfioReadiness::VfioBoundReady);
+
+    if !has_vfio_ready {
+        return false;
+    }
+
+    let has_qemu = runtime_dir
+        .map(|dir| dir.join("qemu-system-x86_64").is_file())
+        .unwrap_or(false);
+    let has_qemu_on_path = std::process::Command::new("qemu-system-x86_64")
+        .arg("--version")
+        .stdout(std::process::Stdio::null())
+        .stderr(std::process::Stdio::null())
+        .status()
+        .is_ok();
+
+    has_qemu || has_qemu_on_path
+}
+
+/// Sysfs root path, defaulting to "/" in production and a temp dir in tests.
+#[derive(Debug, Clone)] +pub struct SysfsRoot(PathBuf); + +impl Default for SysfsRoot { + fn default() -> Self { + Self(PathBuf::from("/")) + } +} + +impl SysfsRoot { + #[cfg(test)] + pub fn new(root: PathBuf) -> Self { + Self(root) + } + + pub fn sys_bus_pci_devices(&self) -> PathBuf { + self.0.join("sys/bus/pci/devices") + } + + pub fn sys_class_drm(&self) -> PathBuf { + self.0.join("sys/class/drm") + } + + pub fn sys_module(&self, module: &str) -> PathBuf { + self.0.join("sys/module").join(module) + } + + pub fn sys_bus_pci_drivers(&self, driver: &str) -> PathBuf { + self.0.join("sys/bus/pci/drivers").join(driver) + } + + pub fn sys_kernel_iommu_groups(&self) -> PathBuf { + self.0.join("sys/kernel/iommu_groups") + } + + fn is_real_sysfs(&self) -> bool { + self.0 == std::path::Path::new("/") + } + + #[cfg(target_os = "linux")] + fn write_sysfs(&self, path: &std::path::Path, data: &str) -> Result<(), std::io::Error> { + if self.is_real_sysfs() { + if data.is_empty() { + // Clearing a sysfs attribute requires a direct write() syscall. + // Shell-based approaches (`printf '%s' '' > file`) produce zero + // bytes of output, and sysfs doesn't support truncation — so the + // kernel store function is never invoked and the attribute keeps + // its old value. A direct write("\n") always works: the kernel + // strips trailing newlines in store functions like + // driver_override_store(), resulting in an empty string that + // clears the attribute. Uses O_WRONLY only (no O_CREAT/O_TRUNC) + // for sysfs compatibility. This path does NOT use the timeout + // wrapper because clearing attributes never hangs — unlike driver + // unbind which can deadlock in nvidia's remove(). 
+ use std::io::Write; + let mut f = std::fs::OpenOptions::new() + .write(true) + .open(path) + .map_err(|e| { + std::io::Error::new( + e.kind(), + format!("failed to open {} for clearing: {e}", path.display()), + ) + })?; + f.write_all(b"\n").map_err(|e| { + std::io::Error::new( + e.kind(), + format!("failed to write newline to {}: {e}", path.display()), + ) + })?; + return Ok(()); + } + sysfs_write_with_timeout(path, data, SYSFS_WRITE_TIMEOUT) + } else { + std::fs::write(path, data).map_err(|e| { + std::io::Error::new(e.kind(), format!("failed to write {}: {e}", path.display())) + }) + } + } +} + +#[cfg(target_os = "linux")] +pub fn check_display_attached(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + use std::fs; + + let drm_dir = sysfs.sys_class_drm(); + let entries = match fs::read_dir(&drm_dir) { + Ok(e) => e, + Err(_) => return false, + }; + + for entry in entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if !name.starts_with("card") || name.contains('-') { + continue; + } + + let card_dir = entry.path(); + let device_link = card_dir.join("device"); + + let target = match fs::read_link(&device_link) { + Ok(t) => t, + Err(_) => continue, + }; + if !target.to_string_lossy().ends_with(pci_addr) { + continue; + } + + let boot_vga_path = card_dir.join("device").join("boot_vga"); + if let Ok(val) = fs::read_to_string(&boot_vga_path) { + if val.trim() == "1" { + return true; + } + } + + if let Ok(sub_entries) = fs::read_dir(&card_dir) { + for sub in sub_entries.filter_map(Result::ok) { + let sub_name = sub.file_name().to_string_lossy().to_string(); + if sub_name.starts_with(&format!("{name}-")) { + if let Ok(status) = fs::read_to_string(sub.path().join("status")) { + if status.trim() == "connected" { + return true; + } + } + } + } + } + } + + false +} + +#[cfg(not(target_os = "linux"))] +pub fn check_display_attached(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +/// Checks 
whether any process on the host has an open handle to an NVIDIA GPU +/// device (`/dev/nvidia*`). This is a host-wide check across ALL NVIDIA GPUs, +/// not scoped to a single PCI address. Returns a list of (pid, comm) pairs. +pub fn check_active_gpu_processes() -> std::io::Result> { + use std::fs; + + let mut result = Vec::new(); + + let proc_dir = match fs::read_dir("/proc") { + Ok(d) => d, + Err(e) => { + return Err(std::io::Error::new( + e.kind(), + format!( + "cannot scan /proc for active GPU processes: {e} — \ + refusing to unbind (fail-closed)" + ), + )); + } + }; + + for proc_entry in proc_dir.filter_map(Result::ok) { + let pid: u32 = match proc_entry.file_name().to_string_lossy().parse() { + Ok(p) => p, + Err(_) => continue, + }; + + let fd_dir = proc_entry.path().join("fd"); + let fds = match fs::read_dir(&fd_dir) { + Ok(d) => d, + Err(_) => continue, + }; + + for fd_entry in fds.filter_map(Result::ok) { + if let Ok(target) = fs::read_link(fd_entry.path()) { + if target.to_string_lossy().starts_with("/dev/nvidia") { + let comm = fs::read_to_string(format!("/proc/{pid}/comm")) + .unwrap_or_default() + .trim() + .to_string(); + result.push((pid, comm)); + break; + } + } + } + } + + Ok(result) +} + +#[cfg(not(target_os = "linux"))] +pub fn check_active_gpu_processes() -> std::io::Result> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub fn check_iommu_enabled(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + let iommu_groups = sysfs.sys_kernel_iommu_groups(); + if !iommu_groups.is_dir() { + return false; + } + sysfs + .sys_bus_pci_devices() + .join(pci_addr) + .join("iommu_group") + .exists() +} + +#[cfg(not(target_os = "linux"))] +pub fn check_iommu_enabled(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub fn check_vfio_modules_loaded(sysfs: &SysfsRoot) -> bool { + sysfs.sys_module("vfio_pci").is_dir() && sysfs.sys_module("vfio_iommu_type1").is_dir() +} + +#[cfg(not(target_os = "linux"))] +pub fn 
check_vfio_modules_loaded(_sysfs: &SysfsRoot) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub fn check_sysfs_permissions(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + use nix::unistd::{AccessFlags, access}; + + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + let driver_override = dev_dir.join("driver_override"); + let unbind = dev_dir.join("driver/unbind"); + let bind = sysfs.sys_bus_pci_drivers("vfio-pci").join("bind"); + + let writable = |path: &std::path::Path| -> bool { access(path, AccessFlags::W_OK).is_ok() }; + + let unbind_ok = !unbind.exists() || writable(&unbind); + writable(&driver_override) && unbind_ok && writable(&bind) +} + +#[cfg(not(target_os = "linux"))] +pub fn check_sysfs_permissions(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub fn current_driver(sysfs: &SysfsRoot, pci_addr: &str) -> Option { + let driver_link = sysfs.sys_bus_pci_devices().join(pci_addr).join("driver"); + std::fs::read_link(&driver_link) + .ok() + .and_then(|target| target.file_name().map(|n| n.to_string_lossy().to_string())) +} + +#[cfg(not(target_os = "linux"))] +pub fn current_driver(_sysfs: &SysfsRoot, _pci_addr: &str) -> Option { + None +} + +/// Nvidia kernel modules that hold internal references to GPU devices and can +/// prevent a clean unbind. Unloaded in order (most-dependent first). +#[cfg(target_os = "linux")] +const NVIDIA_SUBMODULES: &[&str] = &["nvidia_uvm", "nvidia_drm", "nvidia_modeset"]; + +/// Timeout for nvidia prep commands (nvidia-smi, modprobe). These commands +/// can wedge if the nvidia driver is in a bad state. +#[cfg(target_os = "linux")] +const NVIDIA_PREP_TIMEOUT: Duration = Duration::from_secs(15); + +/// Run a command with a timeout. Returns `Some(ExitStatus)` on success, +/// `None` on timeout or spawn failure. On timeout, kills the child and +/// drops it without calling `wait()` (same D-state safety as sysfs writes). 
+#[cfg(target_os = "linux")] +fn run_with_timeout( + mut cmd: std::process::Command, + timeout: Duration, +) -> Option { + use std::thread; + + let mut child = match cmd.spawn() { + Ok(c) => c, + Err(_) => return None, + }; + + let poll_interval = Duration::from_millis(100); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => return Some(status), + Ok(None) => { + if start.elapsed() > timeout { + let _ = child.kill(); + drop(child); + return None; + } + thread::sleep(poll_interval); + } + Err(_) => return None, + } + } +} + +/// Best-effort preparation of the nvidia driver before a raw sysfs unbind. +/// +/// Reduces the probability of the nvidia unbind deadlock by: +/// 1. Disabling persistence mode (nvidia-persistenced holds device refs). +/// 2. Unloading nvidia submodules that keep internal references open. +/// +/// All commands run with a timeout — if `nvidia-smi` or `modprobe` hangs +/// (which can happen when the nvidia driver is in a bad state), the parent +/// process is not blocked. Failures are logged but not fatal. +#[cfg(target_os = "linux")] +fn nvidia_pre_unbind_prep(pci_addr: &str) { + use std::process::{Command, Stdio}; + + // 1. Disable persistence mode via nvidia-smi (if available). + let mut cmd = Command::new("nvidia-smi"); + cmd.args(["-i", pci_addr, "-pm", "0"]) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU {pci_addr}: disabled nvidia persistence mode"); + } + None => { + eprintln!( + "GPU {pci_addr}: nvidia-smi timed out after {:.0}s — skipping persistence mode", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + } + _ => {} + } + + // 2. Unload nvidia submodules that hold device references. + // This is best-effort — modules may be in use by other GPUs. 
+ for module in NVIDIA_SUBMODULES { + let mut cmd = Command::new("modprobe"); + cmd.args(["-r", module]) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU {pci_addr}: unloaded {module}"); + } + None => { + eprintln!( + "GPU {pci_addr}: modprobe -r {module} timed out after {:.0}s", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + } + _ => {} + } + } +} + +/// Reset a PCI device to clear stale IOMMU state after VFIO passthrough. +/// +/// Tries the device's own `reset` file (FLR) first. If that doesn't exist, +/// locates the parent PCI bridge and triggers a secondary bus reset (SBR). +/// Either reset clears stale IOMMU page table entries that would otherwise +/// cause `RmInitAdapter` failures when the nvidia driver initialises. +#[cfg(target_os = "linux")] +fn pci_reset_device(sysfs: &SysfsRoot, pci_addr: &str) { + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + // Try device-level FLR first. + let device_reset = dev_dir.join("reset"); + if device_reset.exists() { + eprintln!("GPU {pci_addr}: performing PCI function-level reset"); + match sysfs.write_sysfs(&device_reset, "1") { + Ok(()) => { + std::thread::sleep(Duration::from_secs(1)); + eprintln!("GPU {pci_addr}: FLR complete"); + return; + } + Err(e) => { + eprintln!("GPU {pci_addr}: FLR failed ({e}), trying bridge SBR"); + } + } + } + + // Fall back to secondary bus reset on the parent bridge. The sysfs + // device path is a symlink whose real path encodes the PCI topology: + // /sys/devices/pci0000:00/0000:00:03.1/0000:2d:00.0 + // The parent directory (0000:00:03.1) is the bridge. 
+ if let Ok(real) = std::fs::canonicalize(&dev_dir) { + if let Some(bridge_dir) = real.parent() { + let bridge_reset = bridge_dir.join("reset"); + if bridge_reset.exists() { + let bridge_name = bridge_dir + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown"); + eprintln!("GPU {pci_addr}: performing secondary bus reset on bridge {bridge_name}"); + if let Err(e) = std::fs::write(&bridge_reset, "1") { + eprintln!("GPU {pci_addr}: bridge SBR failed: {e}"); + } else { + std::thread::sleep(Duration::from_secs(1)); + eprintln!("GPU {pci_addr}: SBR complete"); + } + } + } + } +} + +/// Reload nvidia kernel modules so the driver's sysfs bind file exists. +/// +/// Called during restore to ensure `modprobe nvidia` brings back the driver +/// that `nvidia_pre_unbind_prep` may have unloaded. Loads the base `nvidia` +/// module plus its dependent submodules in the correct order. +#[cfg(target_os = "linux")] +fn nvidia_reload_modules() { + use std::process::{Command, Stdio}; + + // Load in dependency order: base module first, then dependents. + // If the base "nvidia" module fails, skip submodules (they depend on it). 
+ for (i, module) in ["nvidia", "nvidia_modeset", "nvidia_uvm", "nvidia_drm"] + .iter() + .enumerate() + { + let mut cmd = Command::new("modprobe"); + cmd.arg(module) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::piped()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU: loaded {module} for restore"); + } + None => { + eprintln!( + "GPU: modprobe {module} timed out after {:.0}s during restore", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + break; + } + Some(s) => { + eprintln!( + "GPU: modprobe {module} exited {} during restore (non-fatal)", + s.code().unwrap_or(-1) + ); + if i == 0 { + break; + } + } + } + } +} + +#[cfg(target_os = "linux")] +pub fn bind_gpu_to_vfio(sysfs: &SysfsRoot, pci_addr: &str) -> Result { + validate_pci_addr(pci_addr)?; + let drv = current_driver(sysfs, pci_addr); + + if drv.as_deref() == Some("vfio-pci") { + return Ok("vfio-pci".to_string()); + } + + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + if drv.is_some() { + let is_nvidia = drv.as_deref() == Some("nvidia"); + if is_nvidia && sysfs.is_real_sysfs() { + nvidia_pre_unbind_prep(pci_addr); + + // nvidia_pre_unbind_prep may cascade-remove the nvidia module when + // all submodules are unloaded, which automatically unbinds the device. + // Re-check before attempting the sysfs unbind write. + if current_driver(sysfs, pci_addr).is_none() { + eprintln!("GPU {pci_addr}: device already unbound after nvidia module cleanup"); + } else if current_driver(sysfs, pci_addr).as_deref() == Some("vfio-pci") { + return Ok("vfio-pci".to_string()); + } + } + + // Only attempt the sysfs unbind if a driver is still bound. 
+ if current_driver(sysfs, pci_addr).is_some() { + let unbind = dev_dir.join("driver/unbind"); + let unbind_result = sysfs.write_sysfs(&unbind, pci_addr); + + if let Err(ref e) = unbind_result { + if e.kind() == std::io::ErrorKind::TimedOut { + // The nvidia unbind deadlock can complete the unbind at the + // hardware level while the syscall never returns to userspace. + // Check if the device is actually unbound despite the timeout. + if current_driver(sysfs, pci_addr).is_none() { + eprintln!( + "GPU {pci_addr}: sysfs unbind timed out but device is unbound — \ + continuing (zombie subprocess may linger until reboot)" + ); + } else { + return Err(std::io::Error::new( + std::io::ErrorKind::TimedOut, + format!( + "Failed to unbind {pci_addr}: timed out and device is still \ + bound to {}. A reboot may be required.", + drv.as_deref().unwrap_or("unknown"), + ), + )); + } + } else { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to unbind device at {path}{hint}", + path = unbind.display() + ), + )); + } + } + } + } + + let driver_override = dev_dir.join("driver_override"); + if let Err(e) = sysfs.write_sysfs(&driver_override, "vfio-pci") { + if let Some(ref orig) = drv { + let orig_bind = sysfs.sys_bus_pci_drivers(orig).join("bind"); + let _ = sysfs.write_sysfs(&orig_bind, pci_addr); + } + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to write driver_override at {path}{hint}", + path = driver_override.display() + ), + )); + } + + let vfio_bind = sysfs.sys_bus_pci_drivers("vfio-pci").join("bind"); + if let Err(e) = sysfs.write_sysfs(&vfio_bind, pci_addr) { + let _ = sysfs.write_sysfs(&driver_override, ""); + if let Some(ref orig) = drv { + let orig_bind = sysfs.sys_bus_pci_drivers(orig).join("bind"); + let _ = 
sysfs.write_sysfs(&orig_bind, pci_addr); + } + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to bind to vfio-pci at {path}{hint} — is the vfio-pci module loaded?", + path = vfio_bind.display() + ), + )); + } + + // When the device had no driver (e.g. nvidia modules were already unloaded + // from a previous crash, or display-manager was stopped), infer the restore + // target from vendor + PCI class so the right driver is rebound on exit. + let original = match drv { + Some(d) if !d.is_empty() => d, + _ => { + let vendor = std::fs::read_to_string(dev_dir.join("vendor")) + .map(|v| v.trim().to_lowercase()) + .unwrap_or_default(); + let class = std::fs::read_to_string(dev_dir.join("class")) + .map(|c| c.trim().to_lowercase()) + .unwrap_or_default(); + if vendor == NVIDIA_VENDOR_ID && class.starts_with("0x0403") { + // NVIDIA HDA audio companion (HDMI/DP audio) + eprintln!( + "GPU {pci_addr}: no driver was bound, defaulting restore target to snd_hda_intel (audio device)" + ); + "snd_hda_intel".to_string() + } else if vendor == NVIDIA_VENDOR_ID { + eprintln!( + "GPU {pci_addr}: no driver was bound, defaulting restore target to nvidia" + ); + "nvidia".to_string() + } else { + String::new() + } + } + }; + + Ok(original) +} + +#[cfg(not(target_os = "linux"))] +pub fn bind_gpu_to_vfio(_sysfs: &SysfsRoot, _pci_addr: &str) -> Result { + Ok(String::new()) +} + +#[cfg(target_os = "linux")] +pub fn rebind_gpu_to_original( + sysfs: &SysfsRoot, + pci_addr: &str, + original_driver: &str, +) -> Result<(), std::io::Error> { + validate_pci_addr(pci_addr)?; + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + // Restore is best-effort: attempt every step even if earlier ones fail, + // so a partial failure (e.g. unbind succeeds but driver_override clear + // fails) doesn't leave the device in a worse state than before. 
Track + // the first error to return at the end. + let mut first_err: Option = None; + + // Step 1: Unbind from the current driver. Without this, modprobe for + // the original driver fails with "No such device" because the kernel + // still considers the PCI slot claimed. + let cur_drv = current_driver(sysfs, pci_addr); + if cur_drv.as_deref() == Some("vfio-pci") { + let vfio_unbind = sysfs.sys_bus_pci_drivers("vfio-pci").join("unbind"); + if let Err(e) = sysfs.write_sysfs(&vfio_unbind, pci_addr) { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + eprintln!( + "GPU {pci_addr}: failed to unbind from vfio-pci at {}{hint} — continuing restore", + vfio_unbind.display() + ); + if first_err.is_none() { + first_err = Some(std::io::Error::new( + e.kind(), + format!( + "Failed to unbind {pci_addr} from vfio-pci at {path}{hint}", + path = vfio_unbind.display() + ), + )); + } + } + } else if cur_drv.is_some() { + let unbind = dev_dir.join("driver/unbind"); + if let Err(e) = sysfs.write_sysfs(&unbind, pci_addr) { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + eprintln!( + "GPU {pci_addr}: failed to unbind from {} at {}{hint} — continuing restore", + cur_drv.as_deref().unwrap_or("unknown"), + unbind.display() + ); + if first_err.is_none() { + first_err = Some(std::io::Error::new( + e.kind(), + format!( + "Failed to unbind device at {path}{hint}", + path = unbind.display() + ), + )); + } + } + } + + // Step 2: Clear driver_override so modprobe can claim the device. This + // is required even when the device is already unbound — a killed VM + // process can leave driver_override set to "vfio-pci" with no driver + // actually bound. 
+ let driver_override = dev_dir.join("driver_override"); + if let Err(e) = sysfs.write_sysfs(&driver_override, "") { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + eprintln!( + "GPU {pci_addr}: failed to clear driver_override at {}{hint} — continuing restore", + driver_override.display() + ); + if first_err.is_none() { + first_err = Some(std::io::Error::new( + e.kind(), + format!( + "Failed to clear driver_override at {path}{hint}", + path = driver_override.display() + ), + )); + } + } + + // Step 3: PCI device reset to clear stale IOMMU state. + // After VFIO passthrough (especially on AMD-Vi systems), the GPU may + // retain stale IOMMU page table entries. Without a reset, modprobe + // nvidia fails with RmInitAdapter errors and IO_PAGE_FAULTs. + if sysfs.is_real_sysfs() { + pci_reset_device(sysfs, pci_addr); + } + + // Step 4: Reload modules and bind to the original driver. + if !original_driver.is_empty() && original_driver != "none" { + if original_driver == "nvidia" && sysfs.is_real_sysfs() { + nvidia_reload_modules(); + } else if sysfs.is_real_sysfs() { + let _ = std::process::Command::new("modprobe") + .arg(original_driver) + .output(); + } + + // modprobe may have auto-bound the device (now that driver_override is + // cleared). Skip the explicit bind if already on the right driver. 
+ let cur = current_driver(sysfs, pci_addr); + if cur.as_deref() == Some(original_driver) { + eprintln!("GPU {pci_addr}: already bound to {original_driver}"); + } else { + let bind = sysfs.sys_bus_pci_drivers(original_driver).join("bind"); + if let Err(e) = sysfs.write_sysfs(&bind, pci_addr) { + eprintln!( + "GPU {pci_addr}: explicit bind to {original_driver} failed ({e}), \ + falling back to PCI rescan" + ); + let rescan = sysfs.0.join("sys/bus/pci/rescan"); + if let Err(rescan_err) = sysfs.write_sysfs(&rescan, "1") { + eprintln!("GPU {pci_addr}: PCI rescan write failed: {rescan_err}"); + } + std::thread::sleep(Duration::from_secs(1)); + + match current_driver(sysfs, pci_addr) { + None => { + let bind_err = std::io::Error::new( + e.kind(), + format!( + "Failed to restore {pci_addr} to {original_driver}: \ + explicit bind and PCI rescan both failed. \ + Manual fix:\n \ + echo {pci_addr} | sudo tee /sys/bus/pci/drivers/vfio-pci/unbind\n \ + echo | sudo tee /sys/bus/pci/devices/{pci_addr}/driver_override\n \ + sudo modprobe {original_driver}" + ), + ); + if first_err.is_none() { + first_err = Some(bind_err); + } + } + Some(new_drv) => { + eprintln!("GPU {pci_addr}: PCI rescan bound device to {new_drv}"); + } + } + } + } + } else { + let rescan = sysfs.0.join("sys/bus/pci/rescan"); + if let Err(rescan_err) = sysfs.write_sysfs(&rescan, "1") { + eprintln!("GPU {pci_addr}: PCI rescan write failed: {rescan_err}"); + } + } + + if first_err.is_none() { + if current_driver(sysfs, pci_addr).is_none() { + eprintln!( + "GPU {pci_addr}: warning: driver link missing in sysfs after restore \ + (nvidia-smi may still work via character devices). 
\ + To re-create the sysfs link: echo {pci_addr} | sudo tee /sys/bus/pci/drivers/{original_driver}/bind" + ); + } + } + + match first_err { + Some(e) => Err(e), + None => Ok(()), + } +} + +#[cfg(not(target_os = "linux"))] +pub fn rebind_gpu_to_original( + _sysfs: &SysfsRoot, + _pci_addr: &str, + _original_driver: &str, +) -> Result<(), std::io::Error> { + Ok(()) +} + +#[cfg(target_os = "linux")] +pub fn iommu_group_peers(sysfs: &SysfsRoot, pci_addr: &str) -> Result, std::io::Error> { + validate_pci_addr(pci_addr)?; + let iommu_devices = sysfs + .sys_bus_pci_devices() + .join(pci_addr) + .join("iommu_group/devices"); + + let entries = match std::fs::read_dir(&iommu_devices) { + Ok(e) => e, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(vec![]), + Err(e) => return Err(e), + }; + + let mut peers = Vec::new(); + for entry in entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if name != pci_addr { + peers.push(name); + } + } + Ok(peers) +} + +#[cfg(not(target_os = "linux"))] +pub fn iommu_group_peers( + _sysfs: &SysfsRoot, + _pci_addr: &str, +) -> Result, std::io::Error> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub fn bind_iommu_group_peers( + sysfs: &SysfsRoot, + pci_addr: &str, +) -> Result, std::io::Error> { + let peers = iommu_group_peers(sysfs, pci_addr)?; + let mut restore_list = Vec::new(); + + for peer in peers { + match bind_gpu_to_vfio(sysfs, &peer) { + Ok(original) => { + if original != "vfio-pci" { + restore_list.push((peer, original)); + } + } + Err(e) => { + let _ = rebind_iommu_group_peers(sysfs, &restore_list); + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to bind IOMMU peer {peer}: {e}. 
Rolled back {} peer(s).", + restore_list.len() + ), + )); + } + } + } + + Ok(restore_list) +} + +#[cfg(not(target_os = "linux"))] +pub fn bind_iommu_group_peers( + _sysfs: &SysfsRoot, + _pci_addr: &str, +) -> Result, std::io::Error> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub fn rebind_iommu_group_peers( + sysfs: &SysfsRoot, + peers: &[(String, String)], +) -> Result<(), std::io::Error> { + let mut first_err = None; + for (peer_addr, original_driver) in peers { + if let Err(e) = rebind_gpu_to_original(sysfs, peer_addr, original_driver) { + eprintln!("IOMMU peer {peer_addr}: failed to restore to {original_driver}: {e}"); + if first_err.is_none() { + first_err = Some(e); + } + } + } + match first_err { + Some(e) => Err(e), + None => Ok(()), + } +} + +#[cfg(not(target_os = "linux"))] +pub fn rebind_iommu_group_peers( + _sysfs: &SysfsRoot, + _peers: &[(String, String)], +) -> Result<(), std::io::Error> { + Ok(()) +} + +#[cfg(target_os = "linux")] +fn is_iommu_group_clean(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + let peers = match iommu_group_peers(sysfs, pci_addr) { + Ok(p) => p, + Err(_) => return false, + }; + peers + .iter() + .all(|peer| current_driver(sysfs, peer).as_deref() == Some("vfio-pci")) +} + +#[cfg(not(target_os = "linux"))] +fn is_iommu_group_clean(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +/// Discover IOMMU group peers already on vfio-pci (inherited from a previous +/// session) and infer their original driver from the PCI class code so they +/// can be restored on exit. 
+#[cfg(target_os = "linux")] +fn inherited_peer_binds(sysfs: &SysfsRoot, gpu_addr: &str) -> Vec<(String, String)> { + iommu_group_peers(sysfs, gpu_addr) + .unwrap_or_default() + .into_iter() + .filter(|peer| peer != gpu_addr) + .filter_map(|peer| { + if current_driver(sysfs, &peer).as_deref() != Some("vfio-pci") { + return None; + } + let class = + std::fs::read_to_string(sysfs.sys_bus_pci_devices().join(&peer).join("class")) + .unwrap_or_default() + .trim() + .to_lowercase(); + // 0x0403xx = multimedia audio controller — typically snd_hda_intel + let orig = if class.starts_with("0x0403") { + "snd_hda_intel" + } else { + "nvidia" + }; + Some((peer, orig.to_string())) + }) + .collect() +} + +#[cfg(not(target_os = "linux"))] +fn inherited_peer_binds(_sysfs: &SysfsRoot, _gpu_addr: &str) -> Vec<(String, String)> { + vec![] +} + +/// Captures the bind state for a GPU so it can be restored on shutdown. +#[derive(Debug)] +pub struct GpuBindState { + /// PCI address of the GPU that was bound. + pub pci_addr: String, + /// Driver the GPU was on before binding (e.g. "nvidia"). + pub original_driver: String, + /// IOMMU group peers that were rebound, with their original drivers. + pub peer_binds: Vec<(String, String)>, + /// Whether this instance performed the bind (false if GPU was already on vfio-pci). + pub did_bind: bool, + /// Whether the GPU supports MSI-X (informational; QEMU handles both cases). + pub has_msix: bool, +} + +impl GpuBindState { + /// Shell commands to manually restore the GPU and its peers to their + /// original drivers. Useful for printing recovery instructions when + /// the process might be force-killed (SIGKILL). 
+ pub fn recovery_commands(&self) -> String { + let mut cmds = Vec::new(); + + cmds.push(format!( + "echo {} | sudo tee /sys/bus/pci/drivers/vfio-pci/unbind", + self.pci_addr + )); + + for (peer_addr, _) in &self.peer_binds { + cmds.push(format!( + "echo {} | sudo tee /sys/bus/pci/drivers/vfio-pci/unbind", + peer_addr + )); + } + + cmds.push(format!( + "echo | sudo tee /sys/bus/pci/devices/{}/driver_override", + self.pci_addr + )); + + for (peer_addr, _) in &self.peer_binds { + cmds.push(format!( + "echo | sudo tee /sys/bus/pci/devices/{}/driver_override", + peer_addr + )); + } + + if self.original_driver == "nvidia" || self.original_driver.is_empty() { + cmds.push("sudo modprobe nvidia".to_string()); + } + + let mut peer_drivers: Vec<&str> = Vec::new(); + for (_, original_drv) in &self.peer_binds { + if !original_drv.is_empty() + && original_drv != "nvidia" + && !peer_drivers.contains(&original_drv.as_str()) + { + peer_drivers.push(original_drv.as_str()); + } + } + for drv in peer_drivers { + cmds.push(format!("sudo modprobe {drv}")); + } + + cmds.join("\n") + } + + /// Restore the GPU and its IOMMU peers to their original drivers. + pub fn restore(&self) -> Result<(), std::io::Error> { + self.restore_with_sysfs(&SysfsRoot::default()) + } + + pub fn restore_with_sysfs(&self, sysfs: &SysfsRoot) -> Result<(), std::io::Error> { + if !self.did_bind { + return Ok(()); + } + + // Restore IOMMU peers (e.g. the HDA audio companion) BEFORE the GPU. + // nvidia_reload_modules() during GPU restore can claim peer devices + // through nvidia-modeset/nvidia-drm if they're still unbound, racing + // with the snd_hda_intel rebind. Restoring peers first avoids this. 
+ let peer_result = rebind_iommu_group_peers(sysfs, &self.peer_binds); + if let Err(ref e) = peer_result { + eprintln!("GPU: peer restore failed: {e}"); + } + + eprintln!( + "GPU: rebinding {} to {}", + self.pci_addr, self.original_driver + ); + let gpu_result = rebind_gpu_to_original(sysfs, &self.pci_addr, &self.original_driver); + + if let Err(ref gpu_err) = gpu_result { + return Err(std::io::Error::new(gpu_err.kind(), gpu_err.to_string())); + } + peer_result + } +} + +/// RAII guard that restores GPU driver binding when dropped. +/// +/// Ensures the GPU is rebound to its original driver on normal exit, +/// early return (?), or panic. Cannot protect against SIGKILL. +pub struct GpuBindGuard { + state: Option, +} + +impl GpuBindGuard { + pub fn new(state: GpuBindState) -> Self { + Self { state: Some(state) } + } + + /// Take the state out, preventing restore on drop. + pub fn disarm(&mut self) -> Option { + self.state.take() + } + + /// Access the inner bind state, if present. + pub fn state(&self) -> Option<&GpuBindState> { + self.state.as_ref() + } + + /// Get the PCI address of the bound GPU, if any. + pub fn pci_addr(&self) -> Option<&str> { + self.state.as_ref().map(|s| s.pci_addr.as_str()) + } +} + +impl Drop for GpuBindGuard { + fn drop(&mut self) { + if let Some(ref state) = self.state { + eprintln!( + "GPU: restoring {} to {} (cleanup)", + state.pci_addr, state.original_driver + ); + if let Err(e) = state.restore() { + eprintln!("GPU: restore failed: {e}"); + } + } + } +} + +/// Known display server process names (matched against `/proc/PID/comm`). +const DISPLAY_SERVER_NAMES: &[&str] = &[ + "Xorg", + "X", + "Xwayland", + "gnome-shell", + "kwin_wayland", + "kwin_x11", + "sway", + "weston", + "mutter", +]; + +/// Returns `true` if `comm` matches a known display server process name. +pub fn is_display_server_process(comm: &str) -> bool { + DISPLAY_SERVER_NAMES.contains(&comm) +} + +/// Information about display manager processes blocking GPU passthrough. 
+/// +/// Returned by [`detect_display_blocker`] when a GPU that would otherwise +/// be eligible for passthrough is held by Xorg or a Wayland compositor. +#[derive(Debug, Clone)] +pub struct DisplayBlockerInfo { + /// PCI address of the GPU blocked by the display manager. + pub pci_addr: String, + /// Display-server processes holding `/dev/nvidia*` device files open. + pub display_processes: Vec<(u32, String)>, + /// Whether the GPU has active display outputs (DRM connectors). + pub has_active_outputs: bool, + /// Non-display processes also holding `/dev/nvidia*` device files. + /// If non-empty, stopping the display manager alone won't free the GPU. + pub other_processes: Vec<(u32, String)>, +} + +/// Detect whether a display manager is blocking GPU passthrough. +/// +/// Returns `Some(info)` when at least one GPU that would otherwise pass +/// safety checks is blocked by display-server processes (Xorg, Wayland +/// compositor) or has active display outputs. The caller can use this to +/// prompt the user to stop the display manager before retrying. +/// +/// Returns `None` when no display-related blocker is detected (GPUs may +/// still be blocked by other issues like missing IOMMU or permissions). 
+pub fn detect_display_blocker(requested_bdf: Option<&str>) -> Option { + detect_display_blocker_with_sysfs(&SysfsRoot::default(), requested_bdf) +} + +#[cfg(target_os = "linux")] +pub fn detect_display_blocker_with_sysfs( + sysfs: &SysfsRoot, + requested_bdf: Option<&str>, +) -> Option { + let addrs: Vec = match requested_bdf { + Some(bdf) => { + if validate_pci_addr(bdf).is_err() { + return None; + } + vec![bdf.to_string()] + } + None => find_nvidia_gpu_addrs(sysfs), + }; + + if addrs.is_empty() { + return None; + } + + let active_procs = check_active_gpu_processes().unwrap_or_default(); + + let display_procs: Vec<(u32, String)> = active_procs + .iter() + .filter(|(_, comm)| is_display_server_process(comm)) + .cloned() + .collect(); + + let other_procs: Vec<(u32, String)> = active_procs + .iter() + .filter(|(_, comm)| !is_display_server_process(comm)) + .cloned() + .collect(); + + for addr in &addrs { + if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") { + continue; + } + + let has_outputs = check_display_attached(sysfs, addr); + + if has_outputs || !display_procs.is_empty() { + return Some(DisplayBlockerInfo { + pci_addr: addr.clone(), + display_processes: display_procs, + other_processes: other_procs, + has_active_outputs: has_outputs, + }); + } + } + + None +} + +#[cfg(not(target_os = "linux"))] +pub fn detect_display_blocker_with_sysfs( + _sysfs: &SysfsRoot, + _requested_bdf: Option<&str>, +) -> Option { + None +} + +/// Find all NVIDIA GPU PCI addresses (class 0x03xxxx) in sysfs. 
+#[cfg(target_os = "linux")] +fn find_nvidia_gpu_addrs(sysfs: &SysfsRoot) -> Vec { + let pci_dir = sysfs.sys_bus_pci_devices(); + let Ok(entries) = std::fs::read_dir(&pci_dir) else { + return vec![]; + }; + + let mut addrs = Vec::new(); + for entry in entries.filter_map(Result::ok) { + let dev_path = entry.path(); + let vendor = match std::fs::read_to_string(dev_path.join("vendor")) { + Ok(v) => v.trim().to_lowercase(), + Err(_) => continue, + }; + let class = match std::fs::read_to_string(dev_path.join("class")) { + Ok(c) => c.trim().to_lowercase(), + Err(_) => continue, + }; + if vendor == NVIDIA_VENDOR_ID && class.starts_with("0x03") { + addrs.push(entry.file_name().to_string_lossy().to_string()); + } + } + addrs.sort(); + addrs +} + +/// Prepare a GPU for VFIO passthrough: run safety checks, select, and bind. +/// +/// When `requested_bdf` is Some, targets that specific device. +/// When None (auto mode), selects the best available GPU. +/// +/// All safety checks are hard failures — if any check fails, this returns +/// an error and does not bind anything. 
+pub fn prepare_gpu_for_passthrough(
+    requested_bdf: Option<&str>,
+) -> Result<GpuBindState, std::io::Error> {
+    prepare_gpu_with_sysfs(&SysfsRoot::default(), requested_bdf)
+}
+
+/// Same as [`prepare_gpu_for_passthrough`] but against an explicit sysfs
+/// root, so tests can run the full flow inside a tempdir.
+// NOTE(review): the generic parameters on these Result/Vec/Option types were
+// stripped by a text-extraction pass; restored here as
+// `Result<GpuBindState, std::io::Error>` etc. — confirm against the original.
+pub fn prepare_gpu_with_sysfs(
+    sysfs: &SysfsRoot,
+    requested_bdf: Option<&str>,
+) -> Result<GpuBindState, std::io::Error> {
+    match requested_bdf {
+        Some(bdf) => prepare_specific_gpu(sysfs, bdf),
+        None => prepare_auto_gpu(sysfs),
+    }
+}
+
+/// Run all safety checks against one user-requested BDF and bind it.
+/// Unlike auto mode, every failed check is reported for this device.
+fn prepare_specific_gpu(sysfs: &SysfsRoot, bdf: &str) -> Result<GpuBindState, std::io::Error> {
+    validate_pci_addr(bdf)?;
+
+    let dev_dir = sysfs.sys_bus_pci_devices().join(bdf);
+    if !dev_dir.exists() {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::NotFound,
+            format!("PCI device {bdf} not found in sysfs"),
+        ));
+    }
+
+    // Identity checks: must be an NVIDIA (0x10de) display-class device.
+    let vendor = std::fs::read_to_string(dev_dir.join("vendor"))
+        .map(|v| v.trim().to_lowercase())
+        .unwrap_or_default();
+    if vendor != NVIDIA_VENDOR_ID {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::InvalidInput,
+            format!("PCI device {bdf} is not an NVIDIA device (vendor: {vendor})"),
+        ));
+    }
+    let class = std::fs::read_to_string(dev_dir.join("class"))
+        .map(|c| c.trim().to_lowercase())
+        .unwrap_or_default();
+    if !class.starts_with("0x03") {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::InvalidInput,
+            format!("PCI device {bdf} is not a GPU (class: {class})"),
+        ));
+    }
+
+    // MSI-X absence is a warning, not a failure — QEMU falls back to
+    // legacy interrupt emulation.
+    let has_msix = check_msix_support(sysfs, bdf);
+    if !has_msix {
+        eprintln!("GPU {bdf}: no MSI-X support (QEMU will use legacy interrupt emulation)");
+    }
+
+    // Fast path: device already on vfio-pci with a clean IOMMU group
+    // (inherited from a previous session, e.g. after a crash). Skip the
+    // idle/display checks and take ownership for restore-on-exit.
+    if current_driver(sysfs, bdf).as_deref() == Some("vfio-pci") && is_iommu_group_clean(sysfs, bdf)
+    {
+        let peer_binds = inherited_peer_binds(sysfs, bdf);
+        eprintln!(
+            "GPU {bdf}: already on vfio-pci (inherited from previous session), \
+            will restore to nvidia on exit ({} peer(s) also tracked)",
+            peer_binds.len()
+        );
+        return Ok(GpuBindState {
+            pci_addr: bdf.to_string(),
+            // The true original driver is unknowable here; nvidia is the
+            // only sensible restore target for an NVIDIA GPU.
+            original_driver: "nvidia".to_string(),
+            peer_binds,
+            did_bind: true,
+            has_msix,
+        });
+    }
+
+    if check_display_attached(sysfs, bdf) {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::Other,
+            format!("GPU {bdf}: has active display outputs"),
+        ));
+    }
+
+    // Hard failure if idleness cannot be verified — never yank a GPU out
+    // from under running processes.
+    let procs = check_active_gpu_processes().map_err(|e| {
+        std::io::Error::new(
+            e.kind(),
+            format!("GPU {bdf}: cannot verify GPU is idle — {e}"),
+        )
+    })?;
+    if !procs.is_empty() {
+        let desc: Vec<String> = procs
+            .iter()
+            .map(|(pid, comm)| format!("{pid} ({comm})"))
+            .collect();
+        let display_procs: Vec<&str> = procs
+            .iter()
+            .filter(|(_, comm)| is_display_server_process(comm))
+            .map(|(_, comm)| comm.as_str())
+            .collect();
+        let mut msg = format!("GPU {bdf}: in use by PIDs: {}", desc.join(", "));
+        // If a display server (Xorg/gnome-shell/...) holds the GPU, give
+        // the user the exact commands to release and later restore it.
+        if !display_procs.is_empty() {
+            msg.push_str(&format!(
+                "\n\n {} {} a display server \
+                — stop the display manager to release the GPU:\n \
+                sudo systemctl stop display-manager\
+                \n\n The display manager will need to be restarted after the VM exits:\n \
+                sudo systemctl start display-manager",
+                display_procs.join(", "),
+                if display_procs.len() == 1 {
+                    "is"
+                } else {
+                    "are"
+                },
+            ));
+        }
+        return Err(std::io::Error::new(std::io::ErrorKind::Other, msg));
+    }
+
+    if !check_iommu_enabled(sysfs, bdf) {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::Other,
+            format!("GPU {bdf}: IOMMU not enabled or device has no IOMMU group"),
+        ));
+    }
+
+    if !check_vfio_modules_loaded(sysfs) {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::Other,
+            format!("GPU {bdf}: VFIO kernel modules not loaded"),
+        ));
+    }
+
+    if !check_sysfs_permissions(sysfs, bdf) {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::PermissionDenied,
+            format!("GPU {bdf}: insufficient sysfs permissions — run as root"),
+        ));
+    }
+
+    // Bind the GPU first, then every other device in its IOMMU group
+    // (VFIO requires the whole group). If peer binding fails, roll the
+    // GPU back to its original driver before propagating the error.
+    let original_driver = bind_gpu_to_vfio(sysfs, bdf)?;
+    let peer_binds = match bind_iommu_group_peers(sysfs, bdf) {
+        Ok(peers) => peers,
+        Err(e) => {
+            let _ = rebind_gpu_to_original(sysfs, bdf, &original_driver);
+            return Err(e);
+        }
+    };
+
+    Ok(GpuBindState {
+        pci_addr: bdf.to_string(),
+        original_driver,
+        peer_binds,
+        did_bind: true,
+        has_msix,
+    })
+}
+
+/// Auto-select the best GPU for passthrough.
+///
+/// Selection is two-phase: (1) a GPU already on vfio-pci with a clean
+/// IOMMU group (inherited from a previous session) wins outright; (2)
+/// otherwise idle GPUs passing all safety checks are collected and the
+/// best (MSI-X preferred) is bound. Blocked devices are remembered so
+/// the failure message can explain every one of them.
+// NOTE(review): generic parameters on Result/Option/Vec below were stripped
+// by a text-extraction pass and have been restored — confirm against the
+// original. Unlike find_nvidia_gpu_addrs, an unreadable PCI dir is a hard
+// error here (intentional: auto mode must explain why nothing was found).
+fn prepare_auto_gpu(sysfs: &SysfsRoot) -> Result<GpuBindState, std::io::Error> {
+    let pci_dir = sysfs.sys_bus_pci_devices();
+    let entries = std::fs::read_dir(&pci_dir).map_err(|e| {
+        std::io::Error::new(e.kind(), format!("cannot read {}: {e}", pci_dir.display()))
+    })?;
+
+    let mut nvidia_addrs = Vec::new();
+    for entry in entries.filter_map(Result::ok) {
+        let dev_path = entry.path();
+        let vendor = match std::fs::read_to_string(dev_path.join("vendor")) {
+            Ok(v) => v.trim().to_lowercase(),
+            Err(_) => continue,
+        };
+        let class = match std::fs::read_to_string(dev_path.join("class")) {
+            Ok(c) => c.trim().to_lowercase(),
+            Err(_) => continue,
+        };
+        if vendor == NVIDIA_VENDOR_ID && class.starts_with("0x03") {
+            nvidia_addrs.push(entry.file_name().to_string_lossy().to_string());
+        }
+    }
+
+    if nvidia_addrs.is_empty() {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::NotFound,
+            "no NVIDIA PCI device found",
+        ));
+    }
+
+    // Deterministic iteration order regardless of readdir order.
+    nvidia_addrs.sort();
+
+    // Phase 1: prefer GPUs already on vfio-pci with clean IOMMU group.
+    // MSI-X GPUs get slight priority (better interrupt performance).
+    let mut vfio_msix: Option<String> = None;
+    let mut vfio_no_msix: Option<String> = None;
+    for addr in &nvidia_addrs {
+        if current_driver(sysfs, addr).as_deref() == Some("vfio-pci")
+            && is_iommu_group_clean(sysfs, addr)
+        {
+            if check_msix_support(sysfs, addr) {
+                if vfio_msix.is_none() {
+                    vfio_msix = Some(addr.clone());
+                }
+            } else if vfio_no_msix.is_none() {
+                vfio_no_msix = Some(addr.clone());
+            }
+        }
+    }
+    if let Some(addr) = vfio_msix {
+        let peer_binds = inherited_peer_binds(sysfs, &addr);
+        eprintln!(
+            "GPU {addr}: already on vfio-pci (inherited from previous session), \
+            will restore to nvidia on exit ({} peer(s) also tracked)",
+            peer_binds.len()
+        );
+        return Ok(GpuBindState {
+            pci_addr: addr,
+            // Inherited binds restore to nvidia — the actual pre-session
+            // driver is unknowable.
+            original_driver: "nvidia".to_string(),
+            peer_binds,
+            did_bind: true,
+            has_msix: true,
+        });
+    }
+    if let Some(ref addr) = vfio_no_msix {
+        let peer_binds = inherited_peer_binds(sysfs, addr);
+        eprintln!("GPU {addr}: no MSI-X support (QEMU will use legacy interrupt emulation)");
+        eprintln!(
+            "GPU {addr}: already on vfio-pci (inherited from previous session), \
+            will restore to nvidia on exit ({} peer(s) also tracked)",
+            peer_binds.len()
+        );
+        return Ok(GpuBindState {
+            pci_addr: addr.clone(),
+            original_driver: "nvidia".to_string(),
+            peer_binds,
+            did_bind: true,
+            has_msix: false,
+        });
+    }
+
+    // Phase 2: try to bind idle GPUs. Collect eligible candidates, then
+    // pick the best one (MSI-X preferred over non-MSI-X).
+    let mut blocked: Vec<(String, String)> = Vec::new();
+    let mut has_display_blocker = false;
+    // One global process scan — any active GPU process conservatively
+    // blocks every candidate, since per-device attribution is unavailable.
+    let active_procs = check_active_gpu_processes()
+        .map_err(|e| std::io::Error::new(e.kind(), format!("cannot verify GPUs are idle — {e}")))?;
+
+    let mut idle_candidates: Vec<(String, bool)> = Vec::new();
+
+    for addr in &nvidia_addrs {
+        // Still on vfio-pci here ⇒ Phase 1 rejected it, so its IOMMU
+        // group must be dirty.
+        if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") {
+            blocked.push((addr.clone(), "IOMMU group not clean".to_string()));
+            continue;
+        }
+
+        if check_display_attached(sysfs, addr) {
+            has_display_blocker = true;
+            blocked.push((addr.clone(), "has active display outputs".to_string()));
+            continue;
+        }
+
+        if !active_procs.is_empty() {
+            let display_names: Vec<&str> = active_procs
+                .iter()
+                .filter(|(_, comm)| is_display_server_process(comm))
+                .map(|(_, comm)| comm.as_str())
+                .collect();
+            if !display_names.is_empty() {
+                has_display_blocker = true;
+            }
+            let desc: Vec<String> = active_procs
+                .iter()
+                .map(|(pid, comm)| format!("{pid} ({comm})"))
+                .collect();
+            blocked.push((addr.clone(), format!("in use by PIDs: {}", desc.join(", "))));
+            continue;
+        }
+
+        if !check_iommu_enabled(sysfs, addr) {
+            blocked.push((addr.clone(), "IOMMU not enabled".to_string()));
+            continue;
+        }
+
+        if !check_vfio_modules_loaded(sysfs) {
+            blocked.push((addr.clone(), "VFIO modules not loaded".to_string()));
+            continue;
+        }
+
+        if !check_sysfs_permissions(sysfs, addr) {
+            blocked.push((addr.clone(), "insufficient sysfs permissions".to_string()));
+            continue;
+        }
+
+        let has_msix = check_msix_support(sysfs, addr);
+        idle_candidates.push((addr.clone(), has_msix));
+    }
+
+    // Sort: MSI-X candidates first (better interrupt performance).
+ idle_candidates.sort_by_key(|(_, has_msix)| !has_msix); + + for (addr, has_msix) in &idle_candidates { + if !has_msix { + eprintln!("GPU {addr}: no MSI-X support (QEMU will use legacy interrupt emulation)"); + } + eprintln!("GPU: binding {addr} for VFIO passthrough"); + let original_driver = bind_gpu_to_vfio(sysfs, addr)?; + let peer_binds = match bind_iommu_group_peers(sysfs, addr) { + Ok(peers) => peers, + Err(e) => { + let _ = rebind_gpu_to_original(sysfs, addr, &original_driver); + return Err(e); + } + }; + + return Ok(GpuBindState { + pci_addr: addr.clone(), + original_driver, + peer_binds, + did_bind: true, + has_msix: *has_msix, + }); + } + + let mut msg = + String::from("GPU passthrough blocked by safety checks.\n\n Detected devices:\n"); + for (addr, reason) in &blocked { + msg.push_str(&format!(" {addr}: {reason}\n")); + } + if has_display_blocker { + msg.push_str( + "\n A display server is using the GPU. \ + Stop the display manager to release it:\n \ + sudo systemctl stop display-manager\ + \n\n The display manager will be restarted automatically if you use the --gpu flag,\ + \n or manually with: sudo systemctl start display-manager\n", + ); + } + msg.push_str("\n No GPU is available for passthrough."); + + Err(std::io::Error::new(std::io::ErrorKind::Other, msg)) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::path::Path; + + #[test] + #[allow(unsafe_code)] + fn passthrough_gate_is_false_without_env_var() { + // SAFETY: test runs single-threaded; no other thread reads this var. 
+ unsafe { std::env::remove_var("OPENSHELL_VM_GPU_E2E") }; + assert!( + !nvidia_gpu_available_for_vm_passthrough(None), + "gate must return false when OPENSHELL_VM_GPU_E2E is unset" + ); + } + + #[test] + fn probe_returns_no_device_or_readiness_on_typical_ci() { + let results = probe_host_nvidia_vfio_readiness(); + + #[cfg(not(target_os = "linux"))] + assert!(results.is_empty(), "non-Linux should return empty Vec"); + + #[cfg(target_os = "linux")] + { + // CI machines typically have no NVIDIA GPU bound to vfio-pci. + // Accept an empty list or any per-device readiness state. + for (addr, state) in &results { + assert!(!addr.is_empty(), "PCI address should not be empty"); + assert!( + matches!( + state, + HostNvidiaVfioReadiness::BoundToNvidia + | HostNvidiaVfioReadiness::VfioBoundReady + | HostNvidiaVfioReadiness::VfioBoundDirtyGroup + ), + "unexpected per-device readiness state for {addr}: {state:?}" + ); + } + } + } + + #[test] + fn display_impl_is_meaningful() { + let states = [ + HostNvidiaVfioReadiness::UnsupportedPlatform, + HostNvidiaVfioReadiness::NoNvidiaDevice, + HostNvidiaVfioReadiness::BoundToNvidia, + HostNvidiaVfioReadiness::VfioBoundReady, + HostNvidiaVfioReadiness::VfioBoundDirtyGroup, + HostNvidiaVfioReadiness::MixedVfioAndOther, + ]; + for state in &states { + let msg = format!("{state}"); + assert!(!msg.is_empty(), "Display for {state:?} should not be empty"); + } + } + + /// Build a minimal PCI config space (64 bytes) with a capability list + /// containing a single MSI-X entry (cap ID 0x11) so `check_msix_support` + /// sees the device as passthrough-capable. + fn mock_pci_config_with_msix() -> Vec { + let mut cfg = vec![0u8; 64]; + // Status register (offset 0x06): set bit 4 = capabilities list present. + cfg[0x06] = 0x10; + // Capabilities pointer (offset 0x34): first cap at 0x40. + cfg[0x34] = 0x40; + // Extend to include the capability at offset 0x40. + cfg.resize(0x42, 0); + // Cap at 0x40: ID = 0x11 (MSI-X), next = 0x00 (end of list). 
+ cfg[0x40] = 0x11; + cfg[0x41] = 0x00; + cfg + } + + fn mock_pci_device(root: &Path, pci_addr: &str, vendor: &str, driver: Option<&str>) { + use std::fs; + let dev_dir = root.join("sys/bus/pci/devices").join(pci_addr); + fs::create_dir_all(&dev_dir).unwrap(); + fs::write(dev_dir.join("vendor"), vendor).unwrap(); + fs::write(dev_dir.join("class"), "0x030000").unwrap(); + fs::write(dev_dir.join("config"), mock_pci_config_with_msix()).unwrap(); + if let Some(drv) = driver { + let driver_dir = root.join("sys/bus/pci/drivers").join(drv); + fs::create_dir_all(&driver_dir).unwrap(); + #[cfg(unix)] + std::os::unix::fs::symlink(&driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(dev_dir.join("driver_override"), "").unwrap(); + } + + fn mock_drm_card(root: &Path, card: &str, pci_addr: &str, outputs: &[(&str, &str)]) { + use std::fs; + let card_dir = root.join("sys/class/drm").join(card); + fs::create_dir_all(&card_dir).unwrap(); + #[cfg(unix)] + std::os::unix::fs::symlink( + root.join("sys/bus/pci/devices").join(pci_addr), + card_dir.join("device"), + ) + .unwrap(); + for (output, status) in outputs { + let out_dir = card_dir.join(format!("{card}-{output}")); + fs::create_dir_all(&out_dir).unwrap(); + fs::write(out_dir.join("status"), status).unwrap(); + } + } + + fn mock_iommu_group(root: &Path, group_id: u32, members: &[&str]) { + use std::fs; + let group_dir = root.join(format!("sys/kernel/iommu_groups/{group_id}/devices")); + fs::create_dir_all(&group_dir).unwrap(); + for member in members { + let dev_dir = root.join("sys/bus/pci/devices").join(member); + fs::create_dir_all(&dev_dir).unwrap(); + #[cfg(unix)] + { + let iommu_group_target = root.join(format!("sys/kernel/iommu_groups/{group_id}")); + let _ = + std::os::unix::fs::symlink(&iommu_group_target, dev_dir.join("iommu_group")); + let _ = std::os::unix::fs::symlink(&dev_dir, group_dir.join(member)); + } + } + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_detects_active_framebuffer() { 
+ let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + assert!(check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_false_on_headless() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "disconnected")], + ); + assert!(!check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_false_no_drm_card() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + assert!(!check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_check_fails_without_groups_dir() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + assert!(!check_iommu_enabled(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_check_passes_with_group() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + assert!(check_iommu_enabled(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn vfio_modules_loaded_true() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + 
fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + assert!(check_vfio_modules_loaded(&sysfs)); + } + + #[test] + #[cfg(target_os = "linux")] + fn vfio_modules_missing() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + assert!(!check_vfio_modules_loaded(&sysfs)); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_writable() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let bind_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&bind_dir).unwrap(); + fs::write(bind_dir.join("bind"), "").unwrap(); + assert!(check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_readonly_driver_override() { + use std::os::unix::fs::PermissionsExt; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let driver_override = root + .path() + .join("sys/bus/pci/devices/0000:41:00.0/driver_override"); + fs::set_permissions(&driver_override, fs::Permissions::from_mode(0o444)).unwrap(); + assert!(!check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_readonly_bind() { + use std::os::unix::fs::PermissionsExt; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let bind_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&bind_dir).unwrap(); + let bind_path = bind_dir.join("bind"); + fs::write(&bind_path, "").unwrap(); + fs::set_permissions(&bind_path, fs::Permissions::from_mode(0o444)).unwrap(); + 
assert!(!check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + fn mock_bindable_gpu(root: &Path, pci_addr: &str) { + mock_pci_device(root, pci_addr, "0x10de", Some("nvidia")); + let drv_unbind = root.join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + let vfio_dir = root.join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + mock_iommu_group(root, 15, &[pci_addr]); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_gpu_writes_correct_sysfs_paths() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let unbind_content = + fs::read_to_string(root.path().join("sys/bus/pci/drivers/nvidia/unbind")).unwrap(); + assert_eq!(unbind_content, "0000:41:00.0"); + + let override_content = fs::read_to_string( + root.path() + .join("sys/bus/pci/devices/0000:41:00.0/driver_override"), + ) + .unwrap(); + assert_eq!(override_content, "vfio-pci"); + + let bind_content = + fs::read_to_string(root.path().join("sys/bus/pci/drivers/vfio-pci/bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_returns_original_driver() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(result, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_noop_when_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("vfio-pci")); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + 
fs::write(vfio_dir.join("bind"), "").unwrap(); + + let nvidia_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::create_dir_all(nvidia_unbind.parent().unwrap()).unwrap(); + fs::write(&nvidia_unbind, "").unwrap(); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(result, "vfio-pci"); + + let unbind_content = fs::read_to_string(&nvidia_unbind).unwrap(); + assert_eq!( + unbind_content, "", + "nvidia unbind should NOT have been written" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn rebind_clears_driver_override() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_gpu_to_original(&sysfs, "0000:41:00.0", "nvidia").unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + } + + #[test] + #[cfg(target_os = "linux")] + fn rebind_writes_to_original_driver_bind() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = 
fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_gpu_to_original(&sysfs, "0000:41:00.0", "nvidia").unwrap(); + + let bind_content = fs::read_to_string(nvidia_dir.join("bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_peers_listed_correctly() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", None); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let peers = iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(peers, vec!["0000:41:00.1"]); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_peers_bound_together() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", Some("nvidia")); + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let restore = bind_iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!( + restore, + vec![("0000:41:00.1".to_string(), "nvidia".to_string())] + ); + + let override_content = fs::read_to_string( + root.path() + .join("sys/bus/pci/devices/0000:41:00.1/driver_override"), + ) + .unwrap(); + assert_eq!(override_content, "vfio-pci"); + } + + #[test] + #[cfg(target_os = "linux")] + fn peer_restore_rebinds_to_original() { + let root = 
tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", Some("nvidia")); + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let restore = bind_iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.1"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_iommu_group_peers(&sysfs, &restore).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + } + + fn mock_multi_gpu_host(root: &Path) { + // GPU 0: on nvidia, has display attached + mock_bindable_gpu(root, "0000:41:00.0"); + mock_drm_card(root, "card0", "0000:41:00.0", &[("DP-1", "connected")]); + + // GPU 1: on nvidia, idle (no display, no processes) + mock_bindable_gpu(root, "0000:42:00.0"); + + // GPU 2: already on vfio-pci, clean IOMMU group + mock_pci_device(root, "0000:43:00.0", "0x10de", Some("vfio-pci")); + mock_iommu_group(root, 17, &["0000:43:00.0"]); + + fs::create_dir_all(root.join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.join("sys/module/vfio_iommu_type1")).unwrap(); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_prefers_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + 
mock_multi_gpu_host(root.path()); + + let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); + assert_eq!(state.pci_addr, "0000:43:00.0"); + assert!( + state.did_bind, + "inherited vfio-pci should set did_bind=true for restore" + ); + assert_eq!(state.original_driver, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_selects_idle_gpu_when_no_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("nvidia")); + mock_iommu_group(root.path(), 16, &["0000:42:00.0"]); + + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); + assert_eq!(state.pci_addr, "0000:42:00.0"); + assert!(state.did_bind); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_fails_when_all_blocked() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card1", + "0000:42:00.0", + &[("HDMI-1", "connected")], + ); + 
mock_iommu_group(root.path(), 16, &["0000:42:00.0"]); + + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let err = prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("display"), + "error should mention display: {msg}" + ); + assert!( + msg.contains("0000:41:00.0"), + "error should list first GPU: {msg}" + ); + assert!( + msg.contains("0000:42:00.0"), + "error should list second GPU: {msg}" + ); + assert!( + msg.contains("sudo systemctl stop display-manager"), + "error should suggest stopping display-manager: {msg}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_blocked_by_display_includes_restart_hint() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:61:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:61:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 20, &["0000:61:00.0"]); + + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let err = prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("sudo systemctl stop display-manager"), + "error should include display-manager stop command: {msg}" + ); + assert!( + msg.contains("sudo systemctl start display-manager"), + "error should include display-manager restart command: {msg}" + ); + assert!( + msg.contains("0000:61:00.0"), + "error should list the blocked GPU: {msg}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_fails_on_empty_host() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + fs::create_dir_all(root.path().join("sys/bus/pci/devices")).unwrap(); + + let err = 
prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + assert!( + err.to_string().contains("no NVIDIA PCI device found"), + "unexpected error: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_binds_target() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap(); + assert_eq!(state.pci_addr, "0000:41:00.0"); + assert!(state.did_bind); + assert_eq!(state.original_driver, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_validates_format() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("invalid")).unwrap_err(); + assert!( + err.to_string().contains("invalid PCI address"), + "unexpected error: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_fails_display_check() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap_err(); + assert!( + err.to_string().contains("display"), + "error should mention display: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_fails_iommu_check() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap_err(); + assert!( + 
err.to_string().contains("IOMMU"), + "error should mention IOMMU: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn restore_round_trips() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap(); + assert!(state.did_bind); + assert_eq!(state.original_driver, "nvidia"); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + state.restore_with_sysfs(&sysfs).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + + let bind_content = fs::read_to_string(nvidia_dir.join("bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn restore_inherited_vfio_rebinds_to_nvidia() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:43:00.0", "0x10de", Some("vfio-pci")); + mock_iommu_group(root.path(), 17, &["0000:43:00.0"]); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:43:00.0")).unwrap(); + assert!( + 
state.did_bind, + "inherited vfio-pci state should set did_bind=true" + ); + assert_eq!( + state.original_driver, "nvidia", + "inherited vfio-pci should target nvidia for restore" + ); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:43:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_driver_dir).unwrap(); + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + state.restore_with_sysfs(&sysfs).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!( + override_content, "", + "driver_override should be cleared after restore" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_unbound_nvidia_defaults_to_nvidia_driver() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + // Device with no driver bound (simulating post-crash state). 
+ mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!( + result, "nvidia", + "unbound NVIDIA device should default to nvidia as restore driver" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_detected_in_config() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + assert!(check_msix_support(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_absent_msi_only() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + let dev_dir = root.path().join("sys/bus/pci/devices").join("0000:41:00.0"); + fs::create_dir_all(&dev_dir).unwrap(); + // Config with MSI (cap 0x05) only, no MSI-X (0x11). 
+ let mut cfg = vec![0u8; 0x42]; + cfg[0x06] = 0x10; // capabilities list present + cfg[0x34] = 0x40; // cap pointer + cfg[0x40] = 0x05; // MSI capability + cfg[0x41] = 0x00; // end of list + fs::write(dev_dir.join("config"), &cfg).unwrap(); + assert!(!check_msix_support(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_empty_cap_list() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + let dev_dir = root.path().join("sys/bus/pci/devices").join("0000:41:00.0"); + fs::create_dir_all(&dev_dir).unwrap(); + let mut cfg = vec![0u8; 0x40]; + cfg[0x06] = 0x10; // capabilities list present + cfg[0x34] = 0x00; // null cap pointer + fs::write(dev_dir.join("config"), &cfg).unwrap(); + assert!(!check_msix_support(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_circular_cap_list() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + let dev_dir = root.path().join("sys/bus/pci/devices").join("0000:41:00.0"); + fs::create_dir_all(&dev_dir).unwrap(); + // Circular: cap at 0x40 points back to 0x40. + let mut cfg = vec![0u8; 0x42]; + cfg[0x06] = 0x10; + cfg[0x34] = 0x40; + cfg[0x40] = 0x05; // MSI (not MSI-X) + cfg[0x41] = 0x40; // points back to self + fs::write(dev_dir.join("config"), &cfg).unwrap(); + // Should terminate via the 48-iteration guard, not hang. 
+ assert!(!check_msix_support(&sysfs, "0000:41:00.0")); + } + + #[test] + fn guard_has_pci_addr() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + has_msix: true, + }; + let guard = GpuBindGuard::new(state); + assert_eq!(guard.pci_addr(), Some("0000:41:00.0")); + } + + #[test] + fn guard_disarm_returns_state() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + has_msix: true, + }; + let mut guard = GpuBindGuard::new(state); + let taken = guard.disarm(); + assert!(taken.is_some()); + assert_eq!(guard.pci_addr(), None); + } + + #[test] + fn guard_disarm_prevents_double_restore() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + has_msix: true, + }; + let mut guard = GpuBindGuard::new(state); + let _ = guard.disarm(); + let second = guard.disarm(); + assert!(second.is_none()); + } + + #[test] + fn recovery_commands_includes_gpu_and_peers() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![("0000:41:00.1".to_string(), "snd_hda_intel".to_string())], + did_bind: true, + has_msix: true, + }; + let cmds = state.recovery_commands(); + assert!( + cmds.contains("vfio-pci/unbind"), + "should unbind GPU from vfio-pci" + ); + assert!( + cmds.contains("0000:41:00.0"), + "should reference GPU address" + ); + assert!( + cmds.contains("0000:41:00.1"), + "should reference peer address" + ); + assert!( + cmds.contains("driver_override"), + "should clear driver_override" + ); + assert!( + cmds.contains("modprobe nvidia"), + "should reload nvidia modules" + ); + assert!( + cmds.contains("modprobe snd_hda_intel"), + "should reload peer original driver" + ); + } + + #[test] + fn 
guard_drop_noop_when_did_not_bind() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: false, + has_msix: true, + }; + let guard = GpuBindGuard::new(state); + drop(guard); + } + + #[test] + fn guard_drop_on_panic_is_safe() { + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: false, + has_msix: true, + }; + let _guard = GpuBindGuard::new(state); + panic!("test panic"); + })); + assert!(result.is_err()); + } + + #[test] + fn display_server_process_detection() { + assert!(is_display_server_process("Xorg")); + assert!(is_display_server_process("X")); + assert!(is_display_server_process("Xwayland")); + assert!(is_display_server_process("gnome-shell")); + assert!(is_display_server_process("kwin_wayland")); + assert!(is_display_server_process("sway")); + assert!(is_display_server_process("mutter")); + + assert!(!is_display_server_process("firefox")); + assert!(!is_display_server_process("python3")); + assert!(!is_display_server_process("nvidia-smi")); + assert!(!is_display_server_process("cuda_app")); + assert!(!is_display_server_process("")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_blocker_detected_with_active_outputs() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + let info = detect_display_blocker_with_sysfs(&sysfs, Some("0000:41:00.0")); + assert!(info.is_some(), "should detect display blocker"); + let info = info.unwrap(); + assert_eq!(info.pci_addr, "0000:41:00.0"); + assert!(info.has_active_outputs); + } + + #[test] + #[cfg(target_os = "linux")] + fn 
display_blocker_none_when_gpu_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("vfio-pci")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + let info = detect_display_blocker_with_sysfs(&sysfs, Some("0000:41:00.0")); + assert!( + info.is_none(), + "should not detect blocker when GPU is already on vfio-pci" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_blocker_none_on_headless_idle_gpu() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "disconnected")], + ); + + let info = detect_display_blocker_with_sysfs(&sysfs, Some("0000:41:00.0")); + assert!( + info.is_none(), + "headless idle GPU should not trigger display blocker" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_blocker_auto_finds_blocked_gpu() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("vfio-pci")); + + let info = detect_display_blocker_with_sysfs(&sysfs, None); + assert!(info.is_some()); + assert_eq!(info.unwrap().pci_addr, "0000:41:00.0"); + } +} diff --git a/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs b/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs new file mode 100644 index 000000000..08c658f7a --- /dev/null +++ b/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration tests for GPU passthrough on real hardware. +//! +//! Gated by `OPENSHELL_VM_GPU_E2E=1`. On machines without a real GPU, +//! all tests early-return and pass. + +use openshell_vfio::{ + GpuBindGuard, HostNvidiaVfioReadiness, prepare_gpu_for_passthrough, + probe_host_nvidia_vfio_readiness, +}; + +fn gpu_e2e_enabled() -> bool { + std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() == Ok("1") +} + +#[test] +fn nvidia_gpu_passthrough_is_available() { + if !gpu_e2e_enabled() { + eprintln!("OPENSHELL_VM_GPU_E2E not set — skipping GPU passthrough gate test"); + return; + } + assert!( + openshell_vfio::nvidia_gpu_available_for_vm_passthrough(None), + "GPU passthrough gate returned false on a GPU CI runner — \ + check VFIO binding and VM runtime bundle" + ); +} + +#[test] +fn bind_and_rebind_real_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("should find and bind a GPU"); + + let results = probe_host_nvidia_vfio_readiness(); + let (_, readiness) = results + .iter() + .find(|(a, _)| a == &state.pci_addr) + .expect("bound GPU should appear in probe"); + assert_eq!(*readiness, HostNvidiaVfioReadiness::VfioBoundReady); + + state.restore().expect("restore should succeed"); + + let results = probe_host_nvidia_vfio_readiness(); + let (_, readiness) = results + .iter() + .find(|(a, _)| a == &state.pci_addr) + .expect("restored GPU should appear in probe"); + assert_eq!(*readiness, HostNvidiaVfioReadiness::BoundToNvidia); +} + +#[test] +fn safety_checks_pass_on_ci_gpu() { + if !gpu_e2e_enabled() { + return; + } + + // `prepare_gpu_for_passthrough` runs all safety checks internally + // (display-attached, IOMMU enabled, VFIO modules loaded, sysfs + // permissions). Success here validates that the CI GPU is headless, + // IOMMU is on, and VFIO modules are loaded. 
+ let state = prepare_gpu_for_passthrough(None) + .expect("all safety checks should pass on a headless CI GPU"); + assert!(!state.pci_addr.is_empty()); + + state.restore().expect("restore should succeed"); +} + +#[test] +fn guard_restores_on_drop_real_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("should find and bind a GPU"); + let pci_addr = state.pci_addr.clone(); + + let guard = GpuBindGuard::new(state); + drop(guard); + + let output = std::process::Command::new("nvidia-smi") + .arg("--query-gpu=pci.bus_id") + .arg("--format=csv,noheader") + .output() + .expect("nvidia-smi should be available after guard drop"); + assert!( + output.status.success(), + "nvidia-smi failed after guard drop" + ); + + let stdout = String::from_utf8_lossy(&output.stdout); + let normalized_addr = pci_addr.to_uppercase(); + assert!( + stdout.to_uppercase().contains(&normalized_addr), + "nvidia-smi should list the restored GPU {pci_addr}, got: {stdout}" + ); +} + +#[test] +fn auto_select_finds_ci_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("auto-select should find a GPU on CI"); + assert!(!state.pci_addr.is_empty()); + assert!(state.did_bind); + + state.restore().expect("restore should succeed"); +} diff --git a/crates/openshell-vm/Cargo.toml b/crates/openshell-vm/Cargo.toml index 7d74b3139..aa3d85a4a 100644 --- a/crates/openshell-vm/Cargo.toml +++ b/crates/openshell-vm/Cargo.toml @@ -28,6 +28,7 @@ miette = { workspace = true } nix = { workspace = true } openshell-bootstrap = { path = "../openshell-bootstrap" } openshell-core = { path = "../openshell-core" } +openshell-vfio = { path = "../openshell-vfio" } serde = { workspace = true } serde_json = "1" tar = "0.4" @@ -46,5 +47,8 @@ tokio-rustls = { workspace = true } [build-dependencies] zstd = "0.13" +[dev-dependencies] +tempfile = "3" + [lints] workspace = true diff --git a/crates/openshell-vm/build.rs 
b/crates/openshell-vm/build.rs index 33fab9a78..7c709defd 100644 --- a/crates/openshell-vm/build.rs +++ b/crates/openshell-vm/build.rs @@ -12,7 +12,7 @@ //! Environment: //! `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` - Path to compressed artifacts -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::{env, fs}; fn main() { @@ -28,6 +28,7 @@ fn main() { "libkrunfw.5.dylib.zst", "gvproxy.zst", "rootfs.tar.zst", + "rootfs-gpu.tar.zst", ] { println!("cargo:rerun-if-changed={dir}/{name}"); } @@ -68,24 +69,30 @@ fn main() { return; } - // Copy compressed files to OUT_DIR - let files = [ + // Copy compressed files to OUT_DIR. + // Core artifacts are required; rootfs has two variants (base and GPU) and + // the presence of either one is sufficient. + let core_files = [ (format!("{libkrun_name}.zst"), format!("{libkrun_name}.zst")), ( format!("{libkrunfw_name}.zst"), format!("{libkrunfw_name}.zst"), ), ("gvproxy.zst".to_string(), "gvproxy.zst".to_string()), - ("rootfs.tar.zst".to_string(), "rootfs.tar.zst".to_string()), ]; let mut all_found = true; - for (src_name, dst_name) in &files { + let mut total_embedded_size: u64 = 0; + + let copy_artifact = |src_name: &str, + dst_name: &str, + compressed_dir: &Path, + out_dir: &Path, + total: &mut u64| + -> bool { let src_path = compressed_dir.join(src_name); let dst_path = out_dir.join(dst_name); - if src_path.exists() { - // Remove existing file first (may be read-only from previous build) if dst_path.exists() { let _ = fs::remove_file(&dst_path); } @@ -98,25 +105,104 @@ fn main() { ) }); let size = fs::metadata(&dst_path).map(|m| m.len()).unwrap_or(0); + *total += size; println!("cargo:warning=Embedded {src_name}: {size} bytes"); + true } else { + false + } + }; + + for (src_name, dst_name) in &core_files { + if !copy_artifact( + src_name, + dst_name, + &compressed_dir, + &out_dir, + &mut total_embedded_size, + ) { println!( "cargo:warning=Missing compressed artifact: {}", - src_path.display() + 
compressed_dir.join(src_name).display() ); all_found = false; } } + // Rootfs: accept either the base rootfs or the GPU rootfs (or both). + let has_base = copy_artifact( + "rootfs.tar.zst", + "rootfs.tar.zst", + &compressed_dir, + &out_dir, + &mut total_embedded_size, + ); + let has_gpu = copy_artifact( + "rootfs-gpu.tar.zst", + "rootfs-gpu.tar.zst", + &compressed_dir, + &out_dir, + &mut total_embedded_size, + ); + if !has_base && !has_gpu { + println!( + "cargo:warning=Missing rootfs artifact: neither rootfs.tar.zst nor rootfs-gpu.tar.zst found in {}", + compressed_dir.display() + ); + } else if !has_base { + println!( + "cargo:warning=Only rootfs-gpu.tar.zst found (base rootfs.tar.zst absent). \ + This is fine for GPU-only builds; run `mise run vm:setup` to get the base rootfs." + ); + } else if !has_gpu { + println!( + "cargo:warning=Only rootfs.tar.zst found (GPU rootfs-gpu.tar.zst absent). \ + This is fine for non-GPU builds; run `mise run vm:rootfs -- --gpu` to get the GPU rootfs." + ); + } + + // Write empty stubs for any missing rootfs variant so that + // `include_bytes!()` in embedded.rs always resolves. The embedded module + // treats zero-length slices as "not available". + for (found, name) in [ + (has_base, "rootfs.tar.zst"), + (has_gpu, "rootfs-gpu.tar.zst"), + ] { + if !found { + let stub = out_dir.join(name); + if !stub.exists() { + fs::write(&stub, b"") + .unwrap_or_else(|e| panic!("Failed to write stub {name}: {e}")); + } + } + } + if !all_found { println!("cargo:warning=Some artifacts missing. Run: mise run vm:setup"); generate_stub_resources(&out_dir); } + + // Warn when total embedded data approaches the x86_64 small code model limit. + // The default code model uses R_X86_64_PC32 (±2 GiB) relocations; embedding + // blobs that push .rodata past 2 GiB will cause linker failures unless + // RUSTFLAGS="-C code-model=large" is set. The vm:build task does this + // automatically, but direct cargo invocations may not. 
+ const LARGE_BLOB_THRESHOLD: u64 = 1_800_000_000; // ~1.8 GiB + if target_arch == "x86_64" && total_embedded_size > LARGE_BLOB_THRESHOLD { + println!( + "cargo:warning=Total embedded data is {total_embedded_size} bytes ({:.1} GiB).", + total_embedded_size as f64 / (1024.0 * 1024.0 * 1024.0) + ); + println!("cargo:warning=This exceeds the x86_64 small code model limit (~2 GiB)."); + println!( + "cargo:warning=Ensure RUSTFLAGS includes '-C code-model=large' or use `mise run vm:build`." + ); + } } /// Generate stub (empty) resource files so the build can complete. /// The embedded module will fail at runtime if these stubs are used. -fn generate_stub_resources(out_dir: &PathBuf) { +fn generate_stub_resources(out_dir: &Path) { let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let (libkrun_name, libkrunfw_name) = match target_os.as_str() { @@ -129,6 +215,7 @@ fn generate_stub_resources(out_dir: &PathBuf) { format!("{libkrunfw_name}.zst"), "gvproxy.zst".to_string(), "rootfs.tar.zst".to_string(), + "rootfs-gpu.tar.zst".to_string(), ]; for name in &stubs { diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index b3d802292..cedc15d85 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -42,3 +42,37 @@ GVPROXY_VERSION="${GVPROXY_VERSION:-v0.8.8}" # Repo: https://github.com/containers/libkrunfw # Pinned: 2026-03-27 (main branch HEAD at time of pinning) LIBKRUNFW_REF="${LIBKRUNFW_REF:-463f717bbdd916e1352a025b6fb2456e882b0b39}" + +# ── virtiofsd (virtio-fs daemon for QEMU rootfs) ──────────────────────── +# Repo: https://gitlab.com/virtio-fs/virtiofsd +VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" + +# ── NVIDIA GPU support (GPU rootfs variant) ──────────────────────────── +# Driver branch: 570.x (open kernel modules, data-center/workstation) +# +# Compatibility matrix: +# Minimum driver version: 570 (NVIDIA 570.x open kernel modules) +# Minimum compute capability: sm_70 (Volta V100 and newer) +# Supported 
architectures: Volta (V100), Turing (T4, RTX 20xx), +# Ampere (A100, A10, RTX 30xx), +# Hopper (H100, H200), Ada Lovelace (L40S), +# Blackwell (B100, B200) +# Guest architecture: x86_64 only (NVIDIA does not publish +# aarch64 data-center drivers in APT form) +# Host requirements: IOMMU enabled, GPU bound to vfio-pci driver, +# host driver version >= guest driver version +# +# The 570.x branch uses the open kernel module flavour +# (nvidia-headless-570-open), required for data-center GPUs (Turing+). +# Consumer GPUs (GeForce) may work but are not officially supported +# for VFIO passthrough. +NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-570}" +NVIDIA_CONTAINER_TOOLKIT_VERSION="${NVIDIA_CONTAINER_TOOLKIT_VERSION:-1.19.0}" + +# NVIDIA open kernel module source tag (must match nvidia-headless-570-open version). +# Repo: https://github.com/NVIDIA/open-gpu-kernel-modules +# The tag must be the exact driver version so that the compiled kernel modules +# match the userspace libraries installed by nvidia-headless-570-open in the +# rootfs. A mismatch causes "API mismatch" errors from nvidia-smi. 
+# Find the APT version: apt-cache show nvidia-headless-570-open | grep Version +NVIDIA_DRIVER_TAG="${NVIDIA_DRIVER_TAG:-570.211.01}" diff --git a/crates/openshell-vm/runtime/kernel/openshell.kconfig b/crates/openshell-vm/runtime/kernel/openshell.kconfig index b5f0330af..d1244ad32 100644 --- a/crates/openshell-vm/runtime/kernel/openshell.kconfig +++ b/crates/openshell-vm/runtime/kernel/openshell.kconfig @@ -115,6 +115,10 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PIDS=y CONFIG_MEMCG=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_FREEZER=y # ── Disable kernel headers archive (avoids cpio issues in CI) ────────── # CONFIG_IKHEADERS is not set @@ -126,3 +130,58 @@ CONFIG_POSIX_MQUEUE_SYSCTL=y # ── Security features required by the sandbox runtime ─────────────────── CONFIG_SECURITY_LANDLOCK=y CONFIG_SECCOMP_FILTER=y + +# ── PCI / GPU passthrough (harmless for non-GPU boots) ────────────────── +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_DRM=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y + +# MTRR — required dependency for CONFIG_X86_PAT below. +CONFIG_MTRR=y + +# MMU notifier — required by NVIDIA UVM module for GPU memory management. +CONFIG_MMU_NOTIFIER=y + +# PAT (Page Attribute Table) — required for correct GPU memory mapping. +# Without this, the NVIDIA driver compiles a fallback code path in nv-pat.c +# that calls __flush_tlb(), which was removed in kernel 6.12+. All modern +# x86_64 CPUs support PAT; every distro kernel enables it. +CONFIG_X86_PAT=y + +# ── Firmware loading (required for NVIDIA GSP firmware) ────────────────── +# The NVIDIA open kernel modules use request_firmware() to load GSP firmware +# from /lib/firmware/nvidia//. 
Without CONFIG_FW_LOADER, the kernel +# has no firmware loading infrastructure and GPU init fails with: +# NVRM: RmFetchGspRmImages: No firmware image found +# On kernel 6.12+, CONFIG_FW_LOADER includes the sysfs loading interface +# (previously CONFIG_FW_LOADER_SYSFS, now merged). +CONFIG_FW_LOADER=y + +# ── Compressed firmware support ────────────────────────────────────────── +# NVIDIA driver packages (570.x+) ship GSP firmware as compressed files +# (gsp_*.bin.xz). Without decompression support, request_firmware() fails +# to find the firmware even when the files exist in /lib/firmware/. +CONFIG_FW_LOADER_COMPRESS=y +CONFIG_FW_LOADER_COMPRESS_XZ=y +CONFIG_FW_LOADER_COMPRESS_ZSTD=y + +# ── QEMU backend support ───────────────────────────────────────────────── +# QEMU uses virtio-PCI transport (libkrun uses virtio-MMIO). Both drivers +# coexist safely — the kernel probes whichever transport the hypervisor +# provides. +CONFIG_VIRTIO_PCI=y + +# Serial console for QEMU (8250/16550 UART). libkrun uses virtio-console +# which is already enabled in the base config. +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y + +# ACPI support for QEMU power management. Required for `poweroff -f` +# to trigger a clean ACPI shutdown that QEMU detects. +CONFIG_ACPI=y + +# x2APIC support — QEMU uses x2APIC MADT entries for multi-vCPU VMs. +# Without this, only the bootstrap CPU is activated. +CONFIG_X86_X2APIC=y diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index d43046d4f..efcf7ed10 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -18,11 +18,16 @@ # - NO pre-initialized k3s state (cold start on first boot) # First boot will be slower (~30-60s) as k3s initializes and pulls images. # +# With --gpu, installs NVIDIA driver packages and the nvidia-container-toolkit +# into the rootfs, producing a GPU-capable variant. 
The launcher selects this +# rootfs when `--gpu` is passed. Only supported on x86_64 (NVIDIA does not +# publish aarch64 data-center drivers for Ubuntu in this packaging form). +# # Supports aarch64 and x86_64 guest architectures. The target architecture # is auto-detected from the host but can be overridden with --arch. # # Usage: -# ./build-rootfs.sh [--base] [--arch aarch64|x86_64] [output_dir] +# ./build-rootfs.sh [--base] [--gpu] [--arch aarch64|x86_64] [output_dir] # # If output_dir is omitted, the rootfs is built under target/rootfs-build. # @@ -43,12 +48,15 @@ fi # ── Argument parsing ─────────────────────────────────────────────────── BASE_ONLY=false +GPU_BUILD=false GUEST_ARCH="" POSITIONAL_ARGS=() while [[ $# -gt 0 ]]; do case "$1" in --base) BASE_ONLY=true; shift ;; + --gpu) + GPU_BUILD=true; shift ;; --arch) GUEST_ARCH="$2"; shift 2 ;; *) @@ -90,6 +98,14 @@ case "$GUEST_ARCH" in ;; esac +# GPU builds are only supported on x86_64 — NVIDIA does not publish +# aarch64 data-center driver packages in the same APT repository. +if [ "$GPU_BUILD" = true ] && [ "$GUEST_ARCH" != "x86_64" ]; then + echo "ERROR: --gpu is only supported for x86_64 guest architecture." >&2 + echo " Current arch: ${GUEST_ARCH}" >&2 + exit 1 +fi + # Project root (two levels up from crates/openshell-vm/scripts/) PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" DEFAULT_ROOTFS="${PROJECT_ROOT}/target/rootfs-build" @@ -119,12 +135,76 @@ verify_checksum() { fi } +verify_gpu_rootfs() { + local rootfs_dir="$1" + local kernel_version="$2" + local driver_tag="$3" + local driver_version="$4" + + echo "==> Verifying GPU components in rootfs..." + if [ ! -f "${rootfs_dir}/usr/bin/nvidia-smi" ]; then + echo "ERROR: nvidia-smi not found in rootfs." + exit 1 + fi + echo " nvidia-smi: found" + if ls "${rootfs_dir}"/usr/bin/nvidia-container-runtime* >/dev/null 2>&1; then + echo " nvidia-container-runtime: found" + else + echo "WARNING: nvidia-container-runtime not found — GPU pods may not work." 
+ fi + if [ -z "${kernel_version}" ]; then + echo "ERROR: VM_KERNEL_VERSION not set — kernel module injection may have been skipped" >&2 + exit 1 + fi + if [ -d "${rootfs_dir}/lib/modules/${kernel_version}" ]; then + local mod_count + mod_count=$(find "${rootfs_dir}/lib/modules/${kernel_version}" -name "nvidia*.ko" | wc -l) + echo " nvidia kernel modules: ${mod_count} found (kernel ${kernel_version})" + if [ "$mod_count" -eq 0 ]; then + echo "ERROR: no nvidia kernel modules in /lib/modules/${kernel_version}/" + echo " Run: mise run vm:nvidia-modules" + exit 1 + fi + else + echo "ERROR: /lib/modules/${kernel_version}/ not found in rootfs" + echo " Run: mise run vm:nvidia-modules" + exit 1 + fi + local fw_dir="${rootfs_dir}/lib/firmware/nvidia/${driver_tag}" + if [ ! -d "${fw_dir}" ]; then + fw_dir="${rootfs_dir}/usr/lib/firmware/nvidia/${driver_tag}" + fi + if [ -d "${fw_dir}" ]; then + local fw_count + fw_count=$(ls "${fw_dir}"/gsp_*.bin 2>/dev/null | wc -l) + echo " GSP firmware: ${fw_count} files found" + for fw in "${fw_dir}"/gsp_*.bin; do + [ -f "$fw" ] || continue + echo " $(basename "$fw") ($(du -h "$fw" | cut -f1))" + done + if [ "$fw_count" -eq 0 ]; then + echo "ERROR: No GSP firmware files (gsp_*.bin) in ${fw_dir}" >&2 + echo " nvidia-smi will fail with: RmFetchGspRmImages: No firmware image found" >&2 + exit 1 + fi + else + echo "ERROR: GSP firmware directory not found" >&2 + echo " Checked: ${rootfs_dir}/lib/firmware/nvidia/${driver_tag}/" >&2 + echo " and: ${rootfs_dir}/usr/lib/firmware/nvidia/${driver_tag}/" >&2 + echo " Install: nvidia-firmware-${driver_version}-${driver_tag}" >&2 + exit 1 + fi +} + if [ "$BASE_ONLY" = true ]; then echo "==> Building base openshell-vm rootfs" echo " Guest arch: ${GUEST_ARCH}" echo " k3s version: ${K3S_VERSION}" echo " Output: ${ROOTFS_DIR}" echo " Mode: base (no pre-loaded images, cold start)" + if [ "$GPU_BUILD" = true ]; then + echo " GPU: yes (NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit 
${NVIDIA_CONTAINER_TOOLKIT_VERSION})" + fi else echo "==> Building openshell-vm rootfs" echo " Guest arch: ${GUEST_ARCH}" @@ -132,6 +212,9 @@ else echo " Images: ${SERVER_IMAGE}, ${COMMUNITY_SANDBOX_IMAGE}" echo " Output: ${ROOTFS_DIR}" echo " Mode: full (pre-loaded images, pre-initialized)" + if [ "$GPU_BUILD" = true ]; then + echo " GPU: yes (NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION})" + fi fi echo "" @@ -222,38 +305,110 @@ fi docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true echo "==> Building base image..." -docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ - --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' +if [ "$GPU_BUILD" = true ]; then + docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" \ + --build-arg "NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}" \ + --build-arg "NVIDIA_DRIVER_TAG=${NVIDIA_DRIVER_TAG}" \ + --build-arg "NVIDIA_CONTAINER_TOOLKIT_VERSION=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \ + -f - . <<'DOCKERFILE' ARG BASE_IMAGE FROM ${BASE_IMAGE} +ARG NVIDIA_DRIVER_VERSION +ARG NVIDIA_DRIVER_TAG +ARG NVIDIA_CONTAINER_TOOLKIT_VERSION RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ e2fsprogs \ iptables \ iproute2 \ + kmod \ python3 \ busybox-static \ sqlite3 \ util-linux \ zstd \ + gnupg \ + curl \ && rm -rf /var/lib/apt/lists/* # busybox-static provides udhcpc for DHCP inside the VM. RUN mkdir -p /usr/share/udhcpc && \ ln -sf /bin/busybox /sbin/udhcpc RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s +# ── NVIDIA driver and container toolkit ────────────────────────────── +# Add the NVIDIA package repository and install the open kernel module +# flavour of the driver plus nvidia-container-toolkit. The open modules +# are required for data-center GPUs (Turing+ / compute capability >= 7.0). 
+# Userspace packages are pinned to $NVIDIA_DRIVER_TAG so they match the +# kernel modules compiled by build-nvidia-modules.sh. +RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + > /etc/apt/sources.list.d/nvidia-container-toolkit.list +RUN apt-get update && \ + HEADLESS_VER=$(apt-cache madison nvidia-headless-${NVIDIA_DRIVER_VERSION}-open \ + | awk -v tag="${NVIDIA_DRIVER_TAG}" '$3 ~ "^"tag {print $3; exit}') && \ + UTILS_VER=$(apt-cache madison nvidia-utils-${NVIDIA_DRIVER_VERSION} \ + | awk -v tag="${NVIDIA_DRIVER_TAG}" '$3 ~ "^"tag {print $3; exit}') && \ + if [ -z "$HEADLESS_VER" ] || [ -z "$UTILS_VER" ]; then \ + echo "ERROR: No APT package found for driver tag ${NVIDIA_DRIVER_TAG}" >&2; \ + echo " headless: ${HEADLESS_VER:-not found}"; \ + echo " utils: ${UTILS_VER:-not found}"; \ + exit 1; \ + fi && \ + echo "Pinning NVIDIA packages: headless=${HEADLESS_VER} utils=${UTILS_VER}" && \ + apt-get install -y --no-install-recommends \ + nvidia-headless-${NVIDIA_DRIVER_VERSION}-open=${HEADLESS_VER} \ + nvidia-utils-${NVIDIA_DRIVER_VERSION}=${UTILS_VER} \ + nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION}-1 \ + && rm -rf /var/lib/apt/lists/* +# Configure the NVIDIA container runtime as the default for containerd. +RUN nvidia-ctk runtime configure --runtime=containerd --set-as-default DOCKERFILE +else + docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . 
<<'DOCKERFILE' +ARG BASE_IMAGE +FROM ${BASE_IMAGE} +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + e2fsprogs \ + iptables \ + iproute2 \ + python3 \ + busybox-static \ + sqlite3 \ + util-linux \ + zstd \ + && rm -rf /var/lib/apt/lists/* +# busybox-static provides udhcpc for DHCP inside the VM. +RUN mkdir -p /usr/share/udhcpc && \ + ln -sf /bin/busybox /sbin/udhcpc +RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s +DOCKERFILE +fi # Create a container and export the filesystem echo "==> Creating container..." docker create --platform "${DOCKER_PLATFORM}" --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true echo "==> Exporting filesystem..." -# Previous builds may leave overlayfs work/ dirs with permissions that -# prevent rm on macOS. Force-fix permissions before removing. +# Previous builds (especially VM pre-init) may leave root-owned files +# (k3s data, CNI, kubelet) that prevent non-root removal. Try normal +# cleanup first, fall back to sudo if needed. if [ -d "${ROOTFS_DIR}" ]; then + if [ -z "${ROOTFS_DIR}" ] || [ "${ROOTFS_DIR}" = "/" ]; then + echo "ERROR: ROOTFS_DIR is empty or root — refusing to rm -rf" >&2 + exit 1 + fi chmod -R u+rwx "${ROOTFS_DIR}" 2>/dev/null || true - rm -rf "${ROOTFS_DIR}" + if ! rm -rf "${ROOTFS_DIR}" 2>/dev/null; then + echo " Root-owned files detected in ${ROOTFS_DIR}, using sudo to clean..." + sudo rm -rf "${ROOTFS_DIR}" + fi fi mkdir -p "${ROOTFS_DIR}" docker export "${CONTAINER_NAME}" | tar -C "${ROOTFS_DIR}" -xf - @@ -363,6 +518,71 @@ for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do fi done +# ── Inject GPU manifests (when building GPU rootfs) ─────────────────── +# These are deployed by openshell-vm-init.sh when GPU_ENABLED=true. +GPU_MANIFEST_SRC="${SCRIPT_DIR}/gpu-manifests" +GPU_MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/gpu-manifests" +if [ "$GPU_BUILD" = true ] && [ -d "${GPU_MANIFEST_SRC}" ]; then + echo "==> Injecting GPU manifests..." 
+ mkdir -p "${GPU_MANIFEST_DEST}" + GPU_MANIFEST_COPIED=0 + for manifest in "${GPU_MANIFEST_SRC}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${GPU_MANIFEST_DEST}/" + echo " $(basename "$manifest")" + GPU_MANIFEST_COPIED=$((GPU_MANIFEST_COPIED + 1)) + done + # Sentinel only when at least one manifest was staged (empty glob must not create it). + if [ "$GPU_MANIFEST_COPIED" -gt 0 ]; then + echo "gpu" > "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" + else + echo "WARNING: No GPU manifests (*.yaml) found in ${GPU_MANIFEST_SRC}; not writing .rootfs-gpu sentinel." >&2 + fi +fi + +# ── Inject NVIDIA kernel modules (GPU rootfs only) ──────────────────── +# The kernel modules are compiled separately by build-nvidia-modules.sh +# against the VM kernel source tree. We inject them here so modprobe +# can load nvidia.ko at VM boot time. +if [ "$GPU_BUILD" = true ]; then + NVIDIA_MODULES_DIR="${PROJECT_ROOT}/target/libkrun-build/nvidia-modules" + + # Read the kernel version exported by build-libkrun.sh. + KERNEL_VERSION_FILE="${PROJECT_ROOT}/target/libkrun-build/kernel-version.txt" + if [ -f "$KERNEL_VERSION_FILE" ]; then + VM_KERNEL_VERSION="$(cat "$KERNEL_VERSION_FILE")" + else + echo "ERROR: kernel-version.txt not found at ${KERNEL_VERSION_FILE}" >&2 + echo " Run: FROM_SOURCE=1 mise run vm:setup" >&2 + exit 1 + fi + + MODULE_DEST="${ROOTFS_DIR}/lib/modules/${VM_KERNEL_VERSION}/kernel/drivers/video/nvidia" + + if [ -d "${NVIDIA_MODULES_DIR}" ] && ls "${NVIDIA_MODULES_DIR}"/*.ko >/dev/null 2>&1; then + echo "==> Injecting NVIDIA kernel modules (kernel ${VM_KERNEL_VERSION})..." + mkdir -p "${MODULE_DEST}" + cp "${NVIDIA_MODULES_DIR}"/*.ko "${MODULE_DEST}/" + for mod in "${MODULE_DEST}"/*.ko; do + echo " $(basename "$mod") ($(du -h "$mod" | cut -f1))" + done + + # Generate module dependency metadata so modprobe works. 
+ KERNEL_DIR_NAME="$(grep '^KERNEL_VERSION' "${PROJECT_ROOT}/target/libkrun-build/libkrunfw/Makefile" | head -1 | awk '{print $3}')" + SYSTEM_MAP="${PROJECT_ROOT}/target/libkrun-build/libkrunfw/${KERNEL_DIR_NAME}/System.map" + if [ -f "$SYSTEM_MAP" ]; then + depmod -a -b "${ROOTFS_DIR}" -F "$SYSTEM_MAP" "${VM_KERNEL_VERSION}" + else + depmod -a -b "${ROOTFS_DIR}" "${VM_KERNEL_VERSION}" + fi + echo " depmod: module dependencies generated" + else + echo "ERROR: NVIDIA kernel modules not found at ${NVIDIA_MODULES_DIR}" >&2 + echo " Run: tasks/scripts/vm/build-nvidia-modules.sh" >&2 + exit 1 + fi +fi + # ── Base mode: mark rootfs type and skip pre-loading ─────────────────── if [ "$BASE_ONLY" = true ]; then @@ -384,10 +604,22 @@ if [ "$BASE_ONLY" = true ]; then exit 1 fi + if [ "$GPU_BUILD" = true ]; then + if [ ! -f "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" ]; then + echo "ERROR: GPU sentinel file not found in rootfs." + exit 1 + fi + verify_gpu_rootfs "${ROOTFS_DIR}" "${VM_KERNEL_VERSION:-}" "${NVIDIA_DRIVER_TAG}" "${NVIDIA_DRIVER_VERSION}" + fi + echo "" echo "==> Base rootfs ready at: ${ROOTFS_DIR}" echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" - echo " Type: base (cold start, images pulled on demand)" + if [ "$GPU_BUILD" = true ]; then + echo " Type: base + GPU (cold start, NVIDIA driver ${NVIDIA_DRIVER_VERSION})" + else + echo " Type: base (cold start, images pulled on demand)" + fi echo "" echo "Note: First boot will take ~30-60s as k3s initializes." echo " Container images will be pulled from registries on first use." @@ -475,6 +707,15 @@ for manifest in "${MANIFEST_DEST}"/*.yaml; do cp "$manifest" "${INIT_MANIFESTS}/" done +# GPU manifests: same pre-init path as other auto-deploy manifests so k3s +# sees them during cluster bake (not only under /opt/openshell/gpu-manifests). 
+if [ "$GPU_BUILD" = true ] && [ -d "${GPU_MANIFEST_DEST}" ]; then + for manifest in "${GPU_MANIFEST_DEST}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${INIT_MANIFESTS}/" + done +fi + # Patch HelmChart for local images and VM settings. HELMCHART="${INIT_MANIFESTS}/openshell-helmchart.yaml" if [ -f "$HELMCHART" ]; then @@ -589,6 +830,7 @@ else fi # Pre-initialize directly on virtio-fs. Runtime boots attach a separate # block-backed state disk and seed it from the rootfs on first launch. +rm -f "${ROOTFS_DIR}-console.log" 2>/dev/null || sudo rm -f "${ROOTFS_DIR}-console.log" 2>/dev/null || true OPENSHELL_VM_DISABLE_STATE_DISK=1 "${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" --reset & VM_PID=$! @@ -599,6 +841,13 @@ cleanup_vm() { kill "${VM_PID}" 2>/dev/null || true wait "${VM_PID}" 2>/dev/null || true fi + # Kill orphaned gvproxy processes left by the VM (holds port 30051). + local gvproxy_pids + gvproxy_pids=$(pgrep -f "gvproxy.*listen-qemu" 2>/dev/null || true) + if [ -n "$gvproxy_pids" ]; then + echo " Killing orphaned gvproxy: $gvproxy_pids" + kill $gvproxy_pids 2>/dev/null || true + fi } trap cleanup_vm EXIT @@ -616,15 +865,16 @@ for i in $(seq 1 120); do sleep 1 done -# Wait for containerd to be ready. +# Wait for containerd to be ready. The first boot after a --reset may +# need extra time for k3s to extract its data dir and start containerd. echo " Waiting for containerd..." -for i in $(seq 1 60); do +for i in $(seq 1 180); do if vm_exec k3s ctr version >/dev/null 2>&1; then echo " Containerd ready (${i}s)" break fi - if [ "$i" -eq 60 ]; then - echo "ERROR: containerd did not become ready in 60s" + if [ "$i" -eq 180 ]; then + echo "ERROR: containerd did not become ready in 180s" exit 1 fi sleep 1 @@ -669,8 +919,8 @@ done # per-boot layer extraction that previously added ~3-5s per container. echo " Pre-unpacking container images..." 
for img in \ - "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" \ - "ghcr.io/nvidia/openshell/gateway:latest"; do + "${COMMUNITY_SANDBOX_IMAGE}" \ + "${SERVER_IMAGE}"; do if vm_exec k3s ctr -n k8s.io images ls -q 2>/dev/null | grep -qF "$img"; then echo " unpacking: $img" vm_exec k3s ctr -n k8s.io run --rm "$img" "pre-unpack-$(date +%s)" true 2>/dev/null || true @@ -741,10 +991,18 @@ if [ ! -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then exit 1 fi +# ── GPU verification (full mode) ────────────────────────────────────── +if [ "$GPU_BUILD" = true ]; then + verify_gpu_rootfs "${ROOTFS_DIR}" "${VM_KERNEL_VERSION:-}" "${NVIDIA_DRIVER_TAG}" "${NVIDIA_DRIVER_VERSION}" +fi + echo "" echo "==> Rootfs ready at: ${ROOTFS_DIR}" echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" echo " Pre-initialized: $(cat "${ROOTFS_DIR}/opt/openshell/.initialized" 2>/dev/null || echo 'no')" +if [ "$GPU_BUILD" = true ]; then + echo " GPU: NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION}" +fi # Show k3s data size K3S_DATA="${ROOTFS_DIR}/var/lib/rancher/k3s" diff --git a/crates/openshell-vm/scripts/gpu-manifests/README.md b/crates/openshell-vm/scripts/gpu-manifests/README.md new file mode 100644 index 000000000..c72deb1aa --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/README.md @@ -0,0 +1,41 @@ +# GPU Rootfs Manifests + +These Kubernetes manifests are injected into the VM rootfs when +`build-rootfs.sh --gpu` is used. During a **full** rootfs build they are +also copied into the k3s auto-deploy manifest directory so they are +applied at pre-init time. + +**Phase 2:** deployment from `openshell-vm-init.sh` when +`GPU_ENABLED=true` is not implemented yet; that path will copy or +reconcile these manifests at VM boot. 
+ +## NVIDIA Driver Compatibility + +| Property | Value | +|---|---| +| Driver branch | 570.x (open kernel modules) | +| Minimum compute capability | sm_70 (Volta V100 and newer) | +| Container toolkit | nvidia-container-toolkit 1.17.x | +| Device plugin Helm chart | 0.18.2 | + +### Why open kernel modules? + +The 570.x open kernel modules are required for data-center GPUs +(Volta, Turing, Ampere, Hopper, Blackwell). They are the +NVIDIA-recommended driver for passthrough and container workloads. +Consumer GPUs (GeForce) prior to Turing (sm_75) are **not supported** +with open modules — use the proprietary driver branch if needed. + +### Host requirements + +- IOMMU enabled in BIOS and kernel (`intel_iommu=on` or `amd_iommu=on`) +- GPU bound to `vfio-pci` driver on the host +- `/dev/vfio/vfio` and `/dev/vfio/` accessible +- Host NVIDIA driver version >= 570 (must match or exceed guest driver) + +### Files + +- `nvidia-device-plugin.yaml` — HelmChart CR that deploys the NVIDIA + k8s-device-plugin via the k3s Helm controller. +- `nvidia-runtime-class.yaml` — RuntimeClass object so pods can use + `runtimeClassName: nvidia`. diff --git a/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml b/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml new file mode 100644 index 000000000..c1cbeaa8a --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# HelmChart CR for auto-deploying the NVIDIA k8s-device-plugin via k3s Helm controller. +# +# This manifest is copied into /var/lib/rancher/k3s/server/manifests/ by the +# VM init script when GPU_ENABLED=true. It is the VM-specific equivalent of +# deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml used by the +# Docker-based gateway. 
+# +# The chart installs: +# - NVIDIA device plugin DaemonSet (advertises nvidia.com/gpu resources) +# +# NFD and GFD are disabled; the device plugin's default nodeAffinity +# (which requires nvidia.com/gpu.present=true) is overridden to empty +# so it schedules on any node without requiring NFD/GFD labels. +# +# CDI injection mode: the device plugin uses deviceListStrategy=cdi-cri so that +# devices are injected via CDI hooks before container start. Sandbox pods only +# need the nvidia.com/gpu resource request — no runtimeClassName is required. +# +# k3s auto-detects nvidia-container-runtime on PATH and registers the "nvidia" +# RuntimeClass automatically, so no manual RuntimeClass manifest is needed. + +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: nvidia-device-plugin + namespace: kube-system +spec: + repo: https://nvidia.github.io/k8s-device-plugin + chart: nvidia-device-plugin + version: "0.18.2" + targetNamespace: nvidia-device-plugin + createNamespace: true + valuesContent: |- + runtimeClassName: nvidia + deviceListStrategy: cdi-cri + deviceIDStrategy: index + cdi: + nvidiaHookPath: /usr/bin/nvidia-cdi-hook + nvidiaDriverRoot: "/" + gfd: + enabled: false + nfd: + enabled: false + affinity: null diff --git a/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml b/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml new file mode 100644 index 000000000..fe2ccbd6e --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# RuntimeClass for NVIDIA GPU workloads. +# Deployed alongside the device plugin when GPU_ENABLED=true. +# Pods requesting nvidia.com/gpu resources should set +# runtimeClassName: nvidia to use the NVIDIA container runtime. 
+--- +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia diff --git a/crates/openshell-vm/scripts/openshell-vm-init.sh b/crates/openshell-vm/scripts/openshell-vm-init.sh index 1cb686a31..ab871e334 100755 --- a/crates/openshell-vm/scripts/openshell-vm-init.sh +++ b/crates/openshell-vm/scripts/openshell-vm-init.sh @@ -46,6 +46,31 @@ mkdir -p /sys/fs/cgroup mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null & wait +# ── Parse kernel cmdline for env vars ───────────────────────────────── +# The QEMU backend passes environment variables via kernel cmdline +# (KEY=VALUE tokens). These are not automatically exported to init. +# Must run after /proc is mounted. +if [ -f /proc/cmdline ]; then + for token in $(cat /proc/cmdline); do + case "$token" in + GPU_ENABLED=*|OPENSHELL_VM_STATE_DISK_DEVICE=*|VM_NET_IP=*|VM_NET_GW=*|VM_NET_DNS=*) + export "$token" + ;; + esac + done +fi + +# Enable cgroup v2 controllers in the root cgroup hierarchy. +# k3s/kubelet requires cpu, cpuset, memory, and pids controllers. +# The kernel must have CONFIG_CGROUP_SCHED=y for the cpu controller. +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + for ctrl in cpu cpuset memory pids io; do + if grep -qw "$ctrl" /sys/fs/cgroup/cgroup.controllers; then + echo "+$ctrl" > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true + fi + done +fi + ts "filesystems mounted" # ── Networking ────────────────────────────────────────────────────────── @@ -97,20 +122,26 @@ DHCP_SCRIPT # -n: exit if no lease, -T 1: 1s between retries, -t 3: 3 retries # -A 1: wait 1s before first retry (aggressive for local gvproxy) if ! 
udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1; then - ts "WARNING: DHCP failed, falling back to static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + STATIC_IP="${VM_NET_IP:-192.168.127.2}" + STATIC_GW="${VM_NET_GW:-192.168.127.1}" + ts "WARNING: DHCP failed, falling back to static config ($STATIC_IP gw $STATIC_GW)" + ip addr add "${STATIC_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "$STATIC_GW" 2>/dev/null || true fi else - # Fallback to static config if no DHCP client available. - ts "no DHCP client, using static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + STATIC_IP="${VM_NET_IP:-192.168.127.2}" + STATIC_GW="${VM_NET_GW:-192.168.127.1}" + ts "no DHCP client, using static config ($STATIC_IP gw $STATIC_GW)" + ip addr add "${STATIC_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "$STATIC_GW" 2>/dev/null || true fi - # Ensure DNS is configured. DHCP should have set /etc/resolv.conf, - # but if it didn't (or static fallback was used), provide a default. - if [ ! -s /etc/resolv.conf ]; then + # Ensure DNS is configured. When VM_NET_DNS is set (TAP networking), + # always use it — the rootfs may have a stale resolv.conf from a + # previous gvproxy run that points to an unreachable gateway. + if [ -n "${VM_NET_DNS:-}" ]; then + echo "nameserver $VM_NET_DNS" > /etc/resolv.conf + elif [ ! -s /etc/resolv.conf ]; then echo "nameserver 8.8.8.8" > /etc/resolv.conf echo "nameserver 8.8.4.4" >> /etc/resolv.conf fi @@ -248,12 +279,20 @@ find /run -name '*.sock' -delete 2>/dev/null || true # start; clear it so k3s doesn't fail node re-registration validation. rm -f /var/lib/rancher/k3s/server/cred/node-passwd 2>/dev/null || true +# Clean stale k3s TLS certificates from previous boots. 
If k3s crashes +# mid-write it can leave partially-written (0-byte or non-PEM) cert files +# that cause "tls: failed to find any PEM data in certificate input" on +# restart. Wiping the TLS directory forces k3s to regenerate self-signed +# certs on startup. This is safe for both cold and warm boots — the certs +# are ephemeral per-cluster and recreated automatically by k3s. +rm -rf /var/lib/rancher/k3s/server/tls 2>/dev/null || true + # Clean stale containerd runtime state from previous boots. # -# The rootfs persists across VM restarts via virtio-fs. The overlayfs -# snapshotter now lives on the host-backed state disk when present, so -# snapshot data and meta.db persist across boots. We only clean runtime -# state (shim PIDs, sockets) that becomes stale when the VM restarts. +# The rootfs persists across VM restarts via virtio-fs. The snapshotter +# (overlayfs on state disk, native on virtiofs) persists across boots, +# so snapshot data and meta.db survive. We only clean runtime state +# (shim PIDs, sockets) that becomes stale when the VM restarts. if [ -d "$CONTAINERD_DIR" ]; then # Remove runtime task state (stale shim PIDs, sockets from dead processes). rm -rf "${CONTAINERD_DIR}/io.containerd.runtime.v2.task" 2>/dev/null || true @@ -265,24 +304,27 @@ if [ -d "$CONTAINERD_DIR" ]; then # Clean stale ingest temp files from the content store. rm -rf "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" 2>/dev/null || true mkdir -p "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" - # meta.db and overlayfs snapshots persist across boots on virtio-fs. - # No need to delete meta.db — snapshot metadata remains valid since - # the snapshotter directory is no longer backed by volatile tmpfs. + # meta.db and snapshots persist across boots. ts "cleaned containerd runtime state (meta.db + snapshots preserved)" fi rm -rf /run/k3s 2>/dev/null || true -# Ensure the overlayfs snapshotter directory exists. 
The snapshotter -# runs directly on virtio-fs, so layer data and snapshot metadata -# persist across VM restarts. This eliminates the need to re-import -# image tarballs and re-extract layers on every boot, significantly -# reducing sandbox creation time. -OVERLAYFS_DIR="${CONTAINERD_DIR}/io.containerd.snapshotter.v1.overlayfs" -mkdir -p "$OVERLAYFS_DIR" +# Select snapshotter based on the backing filesystem. overlayfs requires +# filesystem features (redirect_dir xattrs) that virtiofs does not +# support. When containerd lives on the block-backed state disk (ext4), +# overlayfs works and provides efficient layer sharing. On virtiofs +# (no state disk), fall back to the native snapshotter which uses +# simple directory copies and works on any POSIX filesystem. if [ "$STATE_DISK_ACTIVE" = true ]; then - ts "overlayfs snapshotter on block-backed containerd state" + SNAPSHOTTER="overlayfs" + OVERLAYFS_DIR="${CONTAINERD_DIR}/io.containerd.snapshotter.v1.overlayfs" + mkdir -p "$OVERLAYFS_DIR" + ts "snapshotter: overlayfs on block-backed containerd state" else - ts "overlayfs snapshotter on virtio-fs (persistent)" + SNAPSHOTTER="native" + NATIVE_DIR="${CONTAINERD_DIR}/io.containerd.snapshotter.v1.native" + mkdir -p "$NATIVE_DIR" + ts "snapshotter: native on virtio-fs (overlayfs unsupported on virtiofs)" fi ts "stale artifacts cleaned" @@ -366,6 +408,69 @@ if [ "$_caps_ok" = false ]; then exit 1 fi +# ── GPU: NVIDIA driver and device plugin ───────────────────────────── +# When the VM is launched with --gpu, the Rust launcher passes +# GPU_ENABLED=true. Load the NVIDIA kernel modules, verify the device +# is visible via nvidia-smi, and confirm that the container runtime is +# available before k3s starts. + +if [ "${GPU_ENABLED:-false}" = "true" ]; then + ts "GPU mode enabled — loading NVIDIA drivers" + + if ! 
command -v modprobe >/dev/null 2>&1; then + echo "FATAL: modprobe not found — the kmod package is missing from the GPU rootfs" >&2 + echo "Fix: add 'kmod' to the apt-get install list in build-rootfs.sh and rebuild" >&2 + exit 1 + fi + + # ── Stage NVIDIA GSP firmware onto tmpfs for reliable loading ───── + # The kernel's request_firmware() calls kernel_read_file_from_path_initns() + # which must read the full firmware blob (64MB+ for GSP) through the VFS + # layer. On virtiofs (FUSE-based), each read is a round-trip through the + # virtio ring to virtiofsd. This can fail or stall on non-DAX virtiofs + # configurations (QEMU vhost-user-fs-pci without cache-size). + # + # Copying firmware to /run (tmpfs) eliminates the FUSE path entirely — + # kernel_read_file() reads directly from page cache backed by RAM. + NVIDIA_FW_SRC="/lib/firmware/nvidia" + NVIDIA_FW_TMPFS="/run/firmware/nvidia" + if [ -d "$NVIDIA_FW_SRC" ]; then + mkdir -p "/run/firmware" + cp -a "$NVIDIA_FW_SRC" "/run/firmware/" + ts "staged NVIDIA firmware to tmpfs ($(du -sh "$NVIDIA_FW_TMPFS" | cut -f1))" + + if [ -f /sys/module/firmware_class/parameters/path ]; then + echo -n "/run/firmware" > /sys/module/firmware_class/parameters/path + ts "firmware_class.path set to /run/firmware" + fi + else + echo "WARNING: NVIDIA firmware directory not found at $NVIDIA_FW_SRC" >&2 + echo " modprobe nvidia will likely fail with: RmFetchGspRmImages: No firmware image found" >&2 + fi + + modprobe nvidia || { echo "FATAL: failed to load nvidia kernel module" >&2; exit 1; } + modprobe nvidia_uvm || { echo "FATAL: failed to load nvidia_uvm kernel module" >&2; exit 1; } + modprobe nvidia_modeset || { echo "FATAL: failed to load nvidia_modeset kernel module" >&2; exit 1; } + ts "NVIDIA kernel modules loaded" + + # Firmware is now in kernel memory; free the tmpfs copy. + rm -rf /run/firmware 2>/dev/null || true + + if ! 
nvidia-smi > /dev/null 2>&1; then + echo "FATAL: GPU_ENABLED=true but nvidia-smi failed — GPU not visible to guest" >&2 + echo "Check: VFIO passthrough, IOMMU groups, guest kernel modules" >&2 + exit 1 + fi + ts "nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)" + + if command -v nvidia-container-runtime >/dev/null 2>&1; then + ts "nvidia-container-runtime: $(command -v nvidia-container-runtime)" + else + echo "FATAL: nvidia-container-runtime not found — GPU pods will fail" >&2 + exit 1 + fi +fi + # ── Deploy bundled manifests (cold boot only) ─────────────────────────── # On pre-initialized rootfs, manifests are already in place from the # build-time k3s boot. Skip this entirely for fast startup. @@ -411,6 +516,29 @@ else ts "skipping manifest deploy (pre-initialized)" fi +# ── GPU manifests (device plugin, runtime class) ───────────────────── +# Deployed on every boot (not just cold boot) so the device plugin is +# always present when GPU_ENABLED=true. Mirrors cluster-entrypoint.sh. +if [ "${GPU_ENABLED:-false}" = "true" ]; then + GPU_MANIFESTS="/opt/openshell/gpu-manifests" + if [ ! -d "$GPU_MANIFESTS" ]; then + echo "FATAL: GPU_ENABLED=true but GPU manifests directory missing: $GPU_MANIFESTS" >&2 + exit 1 + fi + mkdir -p "$K3S_MANIFESTS" + _gpu_manifest_deployed=false + for manifest in "$GPU_MANIFESTS"/*.yaml; do + [ -f "$manifest" ] || continue + _gpu_manifest_deployed=true + cp "$manifest" "$K3S_MANIFESTS/" + ts "deployed GPU manifest: $(basename "$manifest")" + done + if [ "$_gpu_manifest_deployed" = false ]; then + echo "FATAL: GPU_ENABLED=true but no YAML manifests found in $GPU_MANIFESTS" >&2 + exit 1 + fi +fi + # Patch manifests for VM deployment constraints. 
HELMCHART="$K3S_MANIFESTS/openshell-helmchart.yaml" if [ -f "$HELMCHART" ]; then @@ -737,9 +865,9 @@ K3S_ARGS=( --node-ip="$NODE_IP" --kube-apiserver-arg=bind-address=0.0.0.0 --resolv-conf=/etc/resolv.conf - --tls-san=localhost,127.0.0.1,10.0.2.15,192.168.127.2 + --tls-san="localhost,127.0.0.1,10.0.2.15,192.168.127.2,$NODE_IP" --flannel-backend=none - --snapshotter=overlayfs + --snapshotter="$SNAPSHOTTER" --kube-proxy-arg=proxy-mode=nftables --kube-proxy-arg=nodeport-addresses=0.0.0.0/0 # virtio-fs passthrough reports the host disk usage, which is @@ -755,7 +883,7 @@ K3S_ARGS=( # container create after an image import may still be slow if # containerd needs to extract layers. 10m is a conservative safety # margin; typical operations complete much faster with persistent - # overlayfs snapshots. + # snapshots (overlayfs on state disk, native on virtiofs). --kubelet-arg=runtime-request-timeout=10m ) @@ -803,30 +931,51 @@ setsid sh -c ' ' & fi -# ── Clear stale kine bootstrap lock ───────────────────────────────────── -# k3s uses kine with a SQLite backend at state.db. When k3s starts, kine -# sets a bootstrap lock row; if k3s is killed before completing bootstrap -# (SIGKILL, host crash, power loss), the lock persists and the next k3s -# instance hangs forever on: -# "Bootstrap key already locked — waiting for data to be populated by -# another server" +# ── Kine database health check ─────────────────────────────────────────── +# k3s uses kine with a SQLite backend at state.db. Two failure modes: +# +# 1. Page-level corruption (SQLITE_CORRUPT) — from a killed VM mid-write. +# Detected via PRAGMA quick_check; the DB is removed so k3s starts fresh. +# The host-side recover_corrupt_kine_db() in exec.rs only checks the +# virtiofs path, so it misses corruption on the state disk (--gpu). +# This in-VM check is the authoritative corruption gate. # -# We clear the lock row before starting k3s so that a warm boot with -# persistent state.db succeeds. 
If state.db doesn't exist (first boot or -# --reset), this is a harmless no-op. If state.db is corrupt, sqlite3 -# fails silently (|| true) and the host-side corruption check in exec.rs -# will have already removed the file. +# 2. Stale bootstrap lock — kine sets a lock row on startup; if k3s is +# killed before completing bootstrap, the lock persists and the next +# instance hangs on "Bootstrap key already locked". Cleared via DELETE. KINE_DB="/var/lib/rancher/k3s/server/db/state.db" if [ -f "$KINE_DB" ]; then - ts "clearing stale kine bootstrap lock (if any)" - # If sqlite3 fails (corrupt DB, missing binary), log the failure. - # The host-side corruption check in exec.rs handles the corrupt case, - # but we should still know about it. - if ! sqlite3 "$KINE_DB" "DELETE FROM kine WHERE name LIKE '/bootstrap/%';" 2>/dev/null; then - ts "WARNING: failed to clear kine bootstrap lock — k3s may hang if DB is corrupt" + # When the state disk is in use, the kine DB lives on the block device, + # not on the virtiofs rootfs. The host-side recover_corrupt_kine_db() + # in exec.rs can only check the virtiofs path, so it misses corruption + # on the state disk. Run a quick_check here inside the VM where the + # bind-mount is active and the DB is at its final runtime path. + _kine_corrupt=false + if command -v sqlite3 >/dev/null 2>&1; then + _qc_result=$(sqlite3 "$KINE_DB" "PRAGMA quick_check;" 2>&1) || _kine_corrupt=true + if [ "$_kine_corrupt" = false ] && [ "$_qc_result" != "ok" ]; then + _kine_corrupt=true + fi + else + # No sqlite3 binary — can't verify, try to proceed. + ts "WARNING: sqlite3 not available, skipping kine DB integrity check" fi - if ! 
sqlite3 "$KINE_DB" "PRAGMA wal_checkpoint(TRUNCATE);" 2>/dev/null; then - ts "WARNING: failed to checkpoint kine WAL" + + if [ "$_kine_corrupt" = true ]; then + ts "WARNING: kine database is corrupt ($_qc_result), removing for clean boot" + rm -f "$KINE_DB" "${KINE_DB}-wal" "${KINE_DB}-shm" + ts "corrupt kine DB removed — k3s will recreate from manifests" + else + ts "clearing stale kine bootstrap lock (if any)" + if ! sqlite3 "$KINE_DB" "DELETE FROM kine WHERE name LIKE '/bootstrap/%';" 2>/dev/null; then + ts "WARNING: failed to clear kine bootstrap lock — removing DB for safety" + rm -f "$KINE_DB" "${KINE_DB}-wal" "${KINE_DB}-shm" + fi + if [ -f "$KINE_DB" ]; then + if ! sqlite3 "$KINE_DB" "PRAGMA wal_checkpoint(TRUNCATE);" 2>/dev/null; then + ts "WARNING: failed to checkpoint kine WAL" + fi + fi fi fi diff --git a/crates/openshell-vm/src/backend/libkrun.rs b/crates/openshell-vm/src/backend/libkrun.rs new file mode 100644 index 000000000..3ab2d6631 --- /dev/null +++ b/crates/openshell-vm/src/backend/libkrun.rs @@ -0,0 +1,470 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! libkrun hypervisor backend. +//! +//! Implements [`VmBackend`] using the libkrun C API for lightweight microVMs. +//! This is the original backend — on macOS it uses Hypervisor.framework, +//! on Linux it uses KVM. + +use std::ffi::CString; +use std::path::Path; +use std::time::Instant; + +use super::{VmBackend, setup_gvproxy_port_forwarding, start_gvproxy}; +use crate::exec::{clear_vm_runtime_state, write_vm_runtime_state}; +use crate::{ + GvproxyGuard, NetBackend, StateDiskConfig, VmConfig, VmError, VsockPort, bootstrap_gateway, + c_string_array, check, ffi, gateway_host_port, health, path_to_cstring, vm_rootfs_key, +}; + +/// libkrun hypervisor backend. 
+pub struct LibkrunBackend; + +impl VmBackend for LibkrunBackend { + fn launch(&self, config: &VmConfig) -> Result { + launch_libkrun(config) + } +} + +/// VM context wrapping the libkrun FFI context ID. +struct VmContext { + krun: &'static ffi::LibKrun, + ctx_id: u32, +} + +impl VmContext { + fn create(log_level: u32) -> Result { + let krun = ffi::libkrun()?; + unsafe { + check( + (krun.krun_init_log)( + ffi::KRUN_LOG_TARGET_DEFAULT, + crate::clamp_log_level(log_level), + ffi::KRUN_LOG_STYLE_AUTO, + ffi::KRUN_LOG_OPTION_NO_ENV, + ), + "krun_init_log", + )?; + } + + let ctx_id = unsafe { (krun.krun_create_ctx)() }; + if ctx_id < 0 { + return Err(VmError::Krun { + func: "krun_create_ctx", + code: ctx_id, + }); + } + + Ok(Self { + krun, + ctx_id: ctx_id as u32, + }) + } + + fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib), + "krun_set_vm_config", + ) + } + } + + fn set_root(&self, rootfs: &Path) -> Result<(), VmError> { + let rootfs_c = path_to_cstring(rootfs)?; + unsafe { + check( + (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()), + "krun_set_root", + ) + } + } + + fn add_state_disk(&self, state_disk: &StateDiskConfig) -> Result<(), VmError> { + let Some(add_disk3) = self.krun.krun_add_disk3 else { + return Err(VmError::HostSetup( + "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support" + .to_string(), + )); + }; + + let block_id_c = CString::new(state_disk.block_id.as_str())?; + let disk_path_c = path_to_cstring(&state_disk.path)?; + unsafe { + check( + add_disk3( + self.ctx_id, + block_id_c.as_ptr(), + disk_path_c.as_ptr(), + ffi::KRUN_DISK_FORMAT_RAW, + false, + false, + crate::state_disk_sync_mode(), + ), + "krun_add_disk3", + ) + } + } + + fn set_workdir(&self, workdir: &str) -> Result<(), VmError> { + let workdir_c = CString::new(workdir)?; + unsafe { + check( + (self.krun.krun_set_workdir)(self.ctx_id, 
workdir_c.as_ptr()), + "krun_set_workdir", + ) + } + } + + fn disable_implicit_vsock(&self) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_disable_implicit_vsock)(self.ctx_id), + "krun_disable_implicit_vsock", + ) + } + } + + fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), + "krun_add_vsock", + ) + } + } + + #[cfg(target_os = "macos")] + fn add_net_unixgram( + &self, + socket_path: &Path, + mac: &[u8; 6], + features: u32, + flags: u32, + ) -> Result<(), VmError> { + let sock_c = path_to_cstring(socket_path)?; + unsafe { + check( + (self.krun.krun_add_net_unixgram)( + self.ctx_id, + sock_c.as_ptr(), + -1, + mac.as_ptr(), + features, + flags, + ), + "krun_add_net_unixgram", + ) + } + } + + #[allow(dead_code)] + fn add_net_unixstream( + &self, + socket_path: &Path, + mac: &[u8; 6], + features: u32, + ) -> Result<(), VmError> { + let sock_c = path_to_cstring(socket_path)?; + unsafe { + check( + (self.krun.krun_add_net_unixstream)( + self.ctx_id, + sock_c.as_ptr(), + -1, + mac.as_ptr(), + features, + 0, + ), + "krun_add_net_unixstream", + ) + } + } + + fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { + let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); + let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; + unsafe { + check( + (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), + "krun_set_port_map", + ) + } + } + + fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { + let socket_c = path_to_cstring(&port.socket_path)?; + unsafe { + check( + (self.krun.krun_add_vsock_port2)( + self.ctx_id, + port.port, + socket_c.as_ptr(), + port.listen, + ), + "krun_add_vsock_port2", + ) + } + } + + fn set_console_output(&self, path: &Path) -> Result<(), VmError> { + let console_c = path_to_cstring(path)?; + unsafe { + check( + (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), + 
"krun_set_console_output", + ) + } + } + + fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { + let exec_c = CString::new(exec_path)?; + let argv_strs: Vec<&str> = args.iter().map(String::as_str).collect(); + let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?; + let env_strs: Vec<&str> = env.iter().map(String::as_str).collect(); + let (_env_owners, env_ptrs) = c_string_array(&env_strs)?; + + unsafe { + check( + (self.krun.krun_set_exec)( + self.ctx_id, + exec_c.as_ptr(), + argv_ptrs.as_ptr(), + env_ptrs.as_ptr(), + ), + "krun_set_exec", + ) + } + } + + fn start_enter(&self) -> i32 { + unsafe { (self.krun.krun_start_enter)(self.ctx_id) } + } +} + +impl Drop for VmContext { + fn drop(&mut self) { + unsafe { + let ret = (self.krun.krun_free_ctx)(self.ctx_id); + if ret < 0 { + eprintln!( + "warning: krun_free_ctx({}) failed with code {ret}", + self.ctx_id + ); + } + } + } +} + +/// Launch a VM using the libkrun backend. +/// +/// This contains the VM-specific configuration, networking, fork/exec, +/// signal forwarding, bootstrap, and cleanup logic that was previously +/// inline in `lib.rs::launch()`. +#[allow(clippy::similar_names)] +fn launch_libkrun(config: &VmConfig) -> Result { + let launch_start = Instant::now(); + + let vm = VmContext::create(config.log_level)?; + vm.set_vm_config(config.vcpus, config.mem_mib)?; + vm.set_root(&config.rootfs)?; + if let Some(state_disk) = &config.state_disk { + vm.add_state_disk(state_disk)?; + } + vm.set_workdir(&config.workdir)?; + + let mut gvproxy_guard: Option = None; + let mut gvproxy_api_sock: Option = None; + + match &config.net { + NetBackend::Tsi => {} + NetBackend::None => { + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + eprintln!("Networking: disabled (no TSI, no virtio-net)"); + } + NetBackend::Gvproxy { .. 
} => { + let gvproxy_setup = start_gvproxy(config, launch_start)?; + + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; + + const NET_FEATURE_CSUM: u32 = 1 << 0; + const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; + const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; + const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; + const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; + const NET_FEATURE_HOST_UFO: u32 = 1 << 14; + const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM + | NET_FEATURE_GUEST_CSUM + | NET_FEATURE_GUEST_TSO4 + | NET_FEATURE_GUEST_UFO + | NET_FEATURE_HOST_TSO4 + | NET_FEATURE_HOST_UFO; + + #[cfg(target_os = "linux")] + vm.add_net_unixstream(&gvproxy_setup.net_sock, &mac, COMPAT_NET_FEATURES)?; + #[cfg(target_os = "macos")] + { + const NET_FLAG_VFKIT: u32 = 1 << 0; + vm.add_net_unixgram( + &gvproxy_setup.net_sock, + &mac, + COMPAT_NET_FEATURES, + NET_FLAG_VFKIT, + )?; + } + + eprintln!( + "Networking: gvproxy (virtio-net) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + gvproxy_api_sock = Some(gvproxy_setup.api_sock); + gvproxy_guard = Some(gvproxy_setup.guard); + } + } + + if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { + vm.set_port_map(&config.port_map)?; + } + + for vsock_port in &config.vsock_ports { + if let Some(parent) = vsock_port.socket_path.parent() { + std::fs::create_dir_all(parent).map_err(|e| { + VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display())) + })?; + } + let _ = std::fs::remove_file(&vsock_port.socket_path); + vm.add_vsock_port(vsock_port)?; + } + + let console_log = config.console_output.clone().unwrap_or_else(|| { + config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs))) + }); + vm.set_console_output(&console_log)?; + + let mut env: Vec = if config.env.is_empty() { + vec![ + "HOME=/root", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + 
"TERM=xterm", + ] + .into_iter() + .map(ToOwned::to_owned) + .collect() + } else { + config.env.clone() + }; + if let Some(state_disk) = &config.state_disk + && !env + .iter() + .any(|entry| entry.starts_with("OPENSHELL_VM_STATE_DISK_DEVICE=")) + { + env.push(format!( + "OPENSHELL_VM_STATE_DISK_DEVICE={}", + state_disk.guest_device + )); + } + if config.gpu_enabled { + env.push("GPU_ENABLED=true".to_string()); + } + vm.set_exec(&config.exec_path, &config.args, &env)?; + + // Fork and enter the VM + let boot_start = Instant::now(); + eprintln!("Booting microVM..."); + + let pid = unsafe { libc::fork() }; + match pid { + -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())), + 0 => { + let ret = vm.start_enter(); + eprintln!("krun_start_enter failed: {ret}"); + std::process::exit(1); + } + _ => { + if !config.is_exec_mode() { + let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id); + if let Err(err) = + write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid, false) + { + unsafe { + libc::kill(pid, libc::SIGTERM); + } + drop(gvproxy_guard); + clear_vm_runtime_state(&config.rootfs); + return Err(err); + } + } + eprintln!( + "VM started (child pid {pid}) [{:.1}s]", + boot_start.elapsed().as_secs_f64() + ); + for pm in &config.port_map { + let host_port = pm.split(':').next().unwrap_or(pm); + eprintln!(" port {pm} -> http://localhost:{host_port}"); + } + eprintln!("Console output: {}", console_log.display()); + + if let Some(ref api_sock) = gvproxy_api_sock { + setup_gvproxy_port_forwarding(api_sock, &config.port_map)?; + } + + if !config.is_exec_mode() && !config.port_map.is_empty() { + let gateway_port = gateway_host_port(config); + bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; + health::wait_for_gateway_ready(gateway_port, &config.gateway_name, config.gpu_enabled)?; + } + + eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); + eprintln!("Press Ctrl+C to stop."); + + 
crate::CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); + unsafe { + libc::signal( + libc::SIGINT, + crate::forward_signal as *const () as libc::sighandler_t, + ); + libc::signal( + libc::SIGTERM, + crate::forward_signal as *const () as libc::sighandler_t, + ); + } + + let mut status: libc::c_int = 0; + unsafe { + libc::waitpid(pid, &raw mut status, 0); + } + crate::CHILD_PID.store(0, std::sync::atomic::Ordering::Relaxed); + + if !config.is_exec_mode() { + clear_vm_runtime_state(&config.rootfs); + } + if let Some(mut guard) = gvproxy_guard + && let Some(mut child) = guard.disarm() + { + let _ = child.kill(); + let _ = child.wait(); + eprintln!("gvproxy stopped"); + } + + if libc::WIFEXITED(status) { + let code = libc::WEXITSTATUS(status); + eprintln!("VM exited with code {code}"); + return Ok(code); + } else if libc::WIFSIGNALED(status) { + let sig = libc::WTERMSIG(status); + eprintln!("VM killed by signal {sig}"); + return Ok(128 + sig); + } + + Ok(status) + } + } +} diff --git a/crates/openshell-vm/src/backend/mod.rs b/crates/openshell-vm/src/backend/mod.rs new file mode 100644 index 000000000..0fe4abab1 --- /dev/null +++ b/crates/openshell-vm/src/backend/mod.rs @@ -0,0 +1,618 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! VM backend abstraction layer. +//! +//! Defines the [`VmBackend`] trait that all hypervisor backends implement, +//! and shared infrastructure (gvproxy startup, networking helpers) used by +//! the libkrun and QEMU backends. + +pub mod libkrun; +pub mod qemu; + +use std::os::unix::net::UnixStream; +use std::path::{Path, PathBuf}; +use std::time::{Duration, Instant}; + +use crate::{ + GvproxyGuard, NetBackend, VmConfig, VmError, gvproxy_expose, gvproxy_socket_dir, + kill_stale_gvproxy, kill_stale_gvproxy_by_port, pick_gvproxy_ssh_port, vm_rootfs_key, +}; + +/// Trait implemented by each hypervisor backend (libkrun, QEMU). 
+pub trait VmBackend { + /// Launch a VM with the given configuration. + /// + /// Returns the VM exit code. + fn launch(&self, config: &VmConfig) -> Result<i32, VmError>; +} + +/// Result of starting a gvproxy instance, used by both backends. +pub(crate) struct GvproxySetup { + pub(crate) guard: GvproxyGuard, + pub(crate) api_sock: PathBuf, + pub(crate) net_sock: PathBuf, +} + +/// Start gvproxy for the given configuration. +/// +/// Shared between libkrun and QEMU backends. Handles stale process +/// cleanup, socket setup, and process spawning with exponential backoff +/// waiting for the network socket. +pub(crate) fn start_gvproxy( + config: &VmConfig, + launch_start: Instant, +) -> Result<GvproxySetup, VmError> { + let binary = match &config.net { + NetBackend::Gvproxy { binary } => binary, + _ => { + return Err(VmError::HostSetup( + "start_gvproxy called without Gvproxy net backend".into(), + )); + } + }; + + if !binary.exists() { + return Err(VmError::BinaryNotFound { + path: binary.display().to_string(), + hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), + }); + } + + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let rootfs_key = vm_rootfs_key(&config.rootfs); + let sock_base = gvproxy_socket_dir(&config.rootfs)?; + let net_sock = sock_base.with_extension("v"); + let api_sock = sock_base.with_extension("a"); + + kill_stale_gvproxy(&config.rootfs); + for pm in &config.port_map { + if let Some(host_port) = pm.split(':').next().and_then(|p| p.parse::<u16>().ok()) { + kill_stale_gvproxy_by_port(host_port); + } + } + + let _ = std::fs::remove_file(&net_sock); + let _ = std::fs::remove_file(&api_sock); + let krun_sock = sock_base.with_extension("v-krun.sock"); + let _ = std::fs::remove_file(&krun_sock); + + eprintln!("Starting gvproxy: {}", binary.display()); + let ssh_port = pick_gvproxy_ssh_port()?; + let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); + let gvproxy_log_file = std::fs::File::create(&gvproxy_log) + 
.map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; + + #[cfg(target_os = "linux")] + let (gvproxy_net_flag, gvproxy_net_url) = + ("-listen-qemu", format!("unix://{}", net_sock.display())); + #[cfg(target_os = "macos")] + let (gvproxy_net_flag, gvproxy_net_url) = ( + "-listen-vfkit", + format!("unixgram://{}", net_sock.display()), + ); + + let child = std::process::Command::new(binary) + .arg(gvproxy_net_flag) + .arg(&gvproxy_net_url) + .arg("-listen") + .arg(format!("unix://{}", api_sock.display())) + .arg("-ssh-port") + .arg(ssh_port.to_string()) + .stdout(std::process::Stdio::null()) + .stderr(gvproxy_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; + + eprintln!( + "gvproxy started (pid {}, ssh port {}) [{:.1}s]", + child.id(), + ssh_port, + launch_start.elapsed().as_secs_f64() + ); + + { + let deadline = Instant::now() + Duration::from_secs(5); + let mut interval = Duration::from_millis(5); + while !net_sock.exists() { + if Instant::now() >= deadline { + return Err(VmError::Fork( + "gvproxy socket did not appear within 5s".to_string(), + )); + } + std::thread::sleep(interval); + interval = (interval * 2).min(Duration::from_millis(100)); + } + } + + Ok(GvproxySetup { + guard: GvproxyGuard::new(child), + api_sock, + net_sock, + }) +} + +/// Set up port forwarding via the gvproxy HTTP API. +/// +/// Translates `host:guest` port map entries into gvproxy expose calls. 
+pub(crate) fn setup_gvproxy_port_forwarding( + api_sock: &Path, + port_map: &[String], +) -> Result<(), VmError> { + let fwd_start = Instant::now(); + { + let deadline = Instant::now() + Duration::from_secs(2); + let mut interval = Duration::from_millis(5); + while !api_sock.exists() { + if Instant::now() >= deadline { + eprintln!("warning: gvproxy API socket not ready after 2s, attempting anyway"); + break; + } + std::thread::sleep(interval); + interval = (interval * 2).min(Duration::from_millis(200)); + } + } + + let guest_ip = "192.168.127.2"; + + for pm in port_map { + let parts: Vec<&str> = pm.split(':').collect(); + let (host_port, guest_port) = match parts.len() { + 2 => (parts[0], parts[1]), + 1 => (parts[0], parts[0]), + _ => { + eprintln!(" skipping invalid port mapping: {pm}"); + continue; + } + }; + + let expose_body = format!( + r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# + ); + + let mut expose_ok = false; + let mut retry_interval = Duration::from_millis(100); + let expose_deadline = Instant::now() + Duration::from_secs(10); + loop { + match gvproxy_expose(api_sock, &expose_body) { + Ok(()) => { + eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); + expose_ok = true; + break; + } + Err(e) => { + if Instant::now() >= expose_deadline { + eprintln!(" port {host_port}: {e} (retries exhausted)"); + break; + } + std::thread::sleep(retry_interval); + retry_interval = (retry_interval * 2).min(Duration::from_secs(1)); + } + } + } + if !expose_ok { + return Err(VmError::HostSetup(format!( + "failed to forward port {host_port} via gvproxy" + ))); + } + } + eprintln!( + "Port forwarding ready [{:.1}s]", + fwd_start.elapsed().as_secs_f64() + ); + + Ok(()) +} + +// ── TAP networking constants ──────────────────────────────────────────── +// The QEMU backend uses 192.168.249.1/24 on the host side of the TAP +// device. The guest uses .2 with the host as its gateway. + +/// Fixed MAC for the guest TAP interface. 
Only one VM runs per host. +pub(crate) const GUEST_MAC: &str = "5a:94:ef:e4:0c:ee"; + +pub(crate) const TAP_HOST_IP: &str = "192.168.249.1"; +pub(crate) const TAP_GUEST_IP: &str = "192.168.249.2"; +pub(crate) const TAP_SUBNET: &str = "192.168.249.0/24"; + +/// Wait for a Unix socket to appear on the filesystem. +pub(crate) fn wait_for_socket( + socket_path: &Path, + label: &str, + timeout: Duration, +) -> Result<(), VmError> { + let deadline = Instant::now() + timeout; + let mut interval = Duration::from_millis(10); + + while !socket_path.exists() { + if Instant::now() >= deadline { + return Err(VmError::HostSetup(format!( + "{label} socket did not appear within {}s: {}", + timeout.as_secs(), + socket_path.display(), + ))); + } + std::thread::sleep(interval); + interval = (interval * 2).min(Duration::from_millis(200)); + } + + Ok(()) +} + +/// Run a command, returning an error if it fails. +pub(crate) fn run_cmd(cmd: &str, args: &[&str]) -> Result<(), VmError> { + let output = std::process::Command::new(cmd) + .args(args) + .output() + .map_err(|e| VmError::HostSetup(format!("{cmd}: {e}")))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(VmError::HostSetup(format!( + "{cmd} {}: {}", + args.join(" "), + stderr.trim() + ))); + } + + Ok(()) +} + +/// Escape a string for use in a shell script. +/// +/// Uses an allowlist of safe characters; anything outside the list gets +/// single-quoted. Single quotes inside the value are escaped with the +/// standard `'\''` idiom. +pub(crate) fn shell_escape(s: &str) -> String { + if s.is_empty() { + return "''".to_string(); + } + if s.bytes().all(|b| { + matches!(b, + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' + | b'_' | b'-' | b'.' | b'/' | b':' | b'@' | b'=' + ) + }) { + return s.to_string(); + } + format!("'{}'", s.replace('\'', "'\\''")) +} + +/// Parse a DNS server from resolv.conf content. 
+/// +/// Returns the first non-`127.x.x.x` nameserver, or `8.8.8.8` if none found. +pub(crate) fn parse_dns_server(content: &str) -> String { + content + .lines() + .filter(|line| line.starts_with("nameserver")) + .filter_map(|line| line.split_whitespace().nth(1)) + .find(|ip| !ip.starts_with("127.")) + .map(String::from) + .unwrap_or_else(|| "8.8.8.8".to_string()) +} + +/// Read the host's primary DNS server. +/// +/// Checks `/etc/resolv.conf` first. If every nameserver there is a loopback +/// address (e.g. systemd-resolved's `127.0.0.53`), falls back to the +/// upstream resolv.conf at `/run/systemd/resolve/resolv.conf` which +/// contains the real upstream nameservers. Final fallback is `8.8.8.8`. +pub(crate) fn host_dns_server() -> String { + for path in &["/etc/resolv.conf", "/run/systemd/resolve/resolv.conf"] { + if let Ok(content) = std::fs::read_to_string(path) { + let server = parse_dns_server(&content); + if server != "8.8.8.8" { + return server; + } + } + } + "8.8.8.8".to_string() +} + +// ── Kernel command line ───────────────────────────────────────────────── + +/// Build the kernel command line shared by all backends that use virtiofs +/// rootfs and the standard init path. +pub(crate) fn build_kernel_cmdline( + config: &VmConfig, + effective_exec_path: &str, + use_tap_net: bool, +) -> String { + let mut parts = vec![ + "console=ttyS0".to_string(), + "root=rootfs".to_string(), + "rootfstype=virtiofs".to_string(), + "rw".to_string(), + "panic=-1".to_string(), + format!("init={effective_exec_path}"), + ]; + + if config.gpu_enabled && config.vfio_device.is_some() { + parts.push("GPU_ENABLED=true".to_string()); + // Tell the kernel firmware loader to search /lib/firmware explicitly. + // The init script stages firmware to tmpfs and overrides this via + // sysfs, but the cmdline provides an early fallback so + // request_firmware() can find GSP blobs on the virtiofs rootfs even + // before init runs the staging logic. 
+ parts.push("firmware_class.path=/lib/firmware".to_string()); + } + if let Some(state_disk) = &config.state_disk { + parts.push(format!( + "OPENSHELL_VM_STATE_DISK_DEVICE={}", + state_disk.guest_device + )); + } + for var in &config.env { + if var.contains('=') && !var.contains(' ') && !var.contains('"') { + parts.push(var.clone()); + } + } + + if use_tap_net { + parts.push(format!("VM_NET_IP={TAP_GUEST_IP}")); + parts.push(format!("VM_NET_GW={TAP_HOST_IP}")); + parts.push(format!("VM_NET_DNS={}", host_dns_server())); + } + + parts.join(" ") +} + +// ── TAP host networking ───────────────────────────────────────────────── + +/// Set up host-side networking so the guest can reach the internet via TAP. +/// +/// 1. Enable IP forwarding (saving the original value for teardown) +/// 2. MASQUERADE outbound traffic from the VM subnet +/// 3. Allow forwarding to/from the VM subnet +/// +/// Returns the original value of `ip_forward` so the caller can restore it. +pub(crate) fn setup_tap_host_networking() -> Result { + let original_ip_forward = std::fs::read_to_string("/proc/sys/net/ipv4/ip_forward") + .map(|s| s.trim().to_string()) + .unwrap_or_else(|_| "0".to_string()); + + std::fs::write("/proc/sys/net/ipv4/ip_forward", "1") + .map_err(|e| VmError::HostSetup(format!("enable IP forwarding: {e}")))?; + + let _ = run_cmd( + "iptables", + &[ + "-t", + "nat", + "-D", + "POSTROUTING", + "-s", + TAP_SUBNET, + "!", + "-d", + TAP_SUBNET, + "-j", + "MASQUERADE", + ], + ); + run_cmd( + "iptables", + &[ + "-t", + "nat", + "-A", + "POSTROUTING", + "-s", + TAP_SUBNET, + "!", + "-d", + TAP_SUBNET, + "-j", + "MASQUERADE", + ], + )?; + + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-s", TAP_SUBNET, "-j", "ACCEPT"], + ); + run_cmd( + "iptables", + &["-A", "FORWARD", "-s", TAP_SUBNET, "-j", "ACCEPT"], + )?; + + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-d", TAP_SUBNET, "-j", "ACCEPT"], + ); + run_cmd( + "iptables", + &["-A", "FORWARD", "-d", TAP_SUBNET, "-j", 
"ACCEPT"], + )?; + + eprintln!("host networking: IP forwarding + NAT masquerade for {TAP_SUBNET}"); + Ok(original_ip_forward) +} + +/// Remove the iptables rules added by [`setup_tap_host_networking`] and +/// restore the original `ip_forward` sysctl value. +pub(crate) fn teardown_tap_host_networking(original_ip_forward: &str) { + let _ = run_cmd( + "iptables", + &[ + "-t", + "nat", + "-D", + "POSTROUTING", + "-s", + TAP_SUBNET, + "!", + "-d", + TAP_SUBNET, + "-j", + "MASQUERADE", + ], + ); + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-s", TAP_SUBNET, "-j", "ACCEPT"], + ); + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-d", TAP_SUBNET, "-j", "ACCEPT"], + ); + if original_ip_forward != "1" { + let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", original_ip_forward); + } + eprintln!( + "host networking: cleaned up iptables rules, restored ip_forward={original_ip_forward}" + ); +} + +// ── TCP port forwarding ───────────────────────────────────────────────── + +/// Start a background TCP proxy that forwards `127.0.0.1:{host_port}` +/// to `{guest_ip}:{guest_port}`. +/// +/// Each accepted connection spawns two threads for bidirectional copy. +/// The listener thread runs until the process exits. 
+pub(crate) fn start_tcp_port_forwarder( + host_port: u16, + guest_ip: &str, + guest_port: u16, +) -> Result<(), VmError> { + use std::net::{TcpListener, TcpStream}; + + let listener = TcpListener::bind(("127.0.0.1", host_port)) + .map_err(|e| VmError::HostSetup(format!("bind port forwarder on :{host_port}: {e}")))?; + + let guest_addr = format!("{guest_ip}:{guest_port}"); + eprintln!("port forwarder: 127.0.0.1:{host_port} -> {guest_addr}"); + + std::thread::spawn(move || { + for stream in listener.incoming() { + let client = match stream { + Ok(s) => s, + Err(_) => continue, + }; + + let addr = guest_addr.clone(); + std::thread::spawn(move || { + if let Ok(remote) = TcpStream::connect(&addr) { + forward_tcp_bidirectional(client, remote); + } + }); + } + }); + + Ok(()) +} + +/// Copy data bidirectionally between two TCP streams until either side closes. +fn forward_tcp_bidirectional(client: std::net::TcpStream, remote: std::net::TcpStream) { + let Ok(mut client_r) = client.try_clone() else { + return; + }; + let mut client_w = client; + let Ok(mut remote_r) = remote.try_clone() else { + return; + }; + let mut remote_w = remote; + + std::thread::spawn(move || { + let _ = std::io::copy(&mut client_r, &mut remote_w); + }); + std::thread::spawn(move || { + let _ = std::io::copy(&mut remote_r, &mut client_w); + }); +} + +// ── Bidirectional Unix stream bridge ──────────────────────────────────── + +/// Spawn two threads that copy data between two Unix streams. 
+pub(crate) fn bridge_bidirectional(client: UnixStream, guest: UnixStream) { + let Ok(mut client_r) = client.try_clone() else { + return; + }; + let mut client_w = client; + let Ok(mut guest_r) = guest.try_clone() else { + return; + }; + let mut guest_w = guest; + + std::thread::spawn(move || { + let _ = std::io::copy(&mut client_r, &mut guest_w); + }); + std::thread::spawn(move || { + let _ = std::io::copy(&mut guest_r, &mut client_w); + }); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_dns_server_returns_first_non_loopback() { + let content = "nameserver 10.0.0.1\nnameserver 8.8.8.8\n"; + assert_eq!(parse_dns_server(content), "10.0.0.1"); + } + + #[test] + fn parse_dns_server_skips_systemd_resolved() { + let content = "nameserver 127.0.0.53\nnameserver 1.1.1.1\n"; + assert_eq!(parse_dns_server(content), "1.1.1.1"); + } + + #[test] + fn parse_dns_server_skips_all_loopback_variants() { + let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\nnameserver 172.16.0.1\n"; + assert_eq!(parse_dns_server(content), "172.16.0.1"); + } + + #[test] + fn parse_dns_server_falls_back_when_only_loopback() { + let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\n"; + assert_eq!(parse_dns_server(content), "8.8.8.8"); + } + + #[test] + fn parse_dns_server_handles_empty_content() { + assert_eq!(parse_dns_server(""), "8.8.8.8"); + } + + #[test] + fn parse_dns_server_ignores_comments_and_other_lines() { + let content = "# Generated by NetworkManager\nsearch example.com\nnameserver 10.1.2.3\n"; + assert_eq!(parse_dns_server(content), "10.1.2.3"); + } + + #[test] + fn shell_escape_empty_string() { + assert_eq!(shell_escape(""), "''"); + } + + #[test] + fn shell_escape_simple_string() { + assert_eq!(shell_escape("hello"), "hello"); + } + + #[test] + fn shell_escape_string_with_single_quotes() { + assert_eq!(shell_escape("it's"), "'it'\\''s'"); + } + + #[test] + fn shell_escape_string_with_spaces() { + assert_eq!(shell_escape("hello world"), "'hello 
world'"); + } + + #[test] + fn shell_escape_string_with_double_quotes() { + assert_eq!(shell_escape(r#"say "hi""#), r#"'say "hi"'"#); + } + + #[test] + fn shell_escape_string_with_backslash() { + assert_eq!(shell_escape("path\\to"), "'path\\to'"); + } +} diff --git a/crates/openshell-vm/src/backend/qemu.rs b/crates/openshell-vm/src/backend/qemu.rs new file mode 100644 index 000000000..10a9d7149 --- /dev/null +++ b/crates/openshell-vm/src/backend/qemu.rs @@ -0,0 +1,1048 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! QEMU backend for GPU passthrough VMs. +//! +//! Uses QEMU's command-line interface with KVM acceleration and VFIO device +//! passthrough. This backend is Linux-only and requires a separate kernel +//! image (`vmlinux`) and `virtiofsd` for the root filesystem. +//! +//! QEMU handles VFIO devices with or without MSI-X capability, falling +//! back to legacy interrupt emulation when MSI-X is unavailable. + +use std::os::unix::net::UnixStream; +use std::os::unix::process::CommandExt; +use std::path::{Path, PathBuf}; +use std::time::{Duration, Instant}; + +use super::{ + GUEST_MAC, TAP_GUEST_IP, TAP_HOST_IP, VmBackend, bridge_bidirectional, build_kernel_cmdline, + run_cmd, setup_tap_host_networking, shell_escape, start_tcp_port_forwarder, + teardown_tap_host_networking, wait_for_socket, +}; +use crate::exec::{ + VM_EXEC_VSOCK_PORT, clear_vm_runtime_state, vm_exec_socket_path, write_vm_runtime_state, +}; +use crate::{NetBackend, VmConfig, VmError, vm_rootfs_key}; + +const VSOCK_GUEST_CID: u32 = 3; +const QEMU_BINARY_NAME: &str = "qemu-system-x86_64"; + +/// QEMU hypervisor backend for GPU passthrough. +pub struct QemuBackend { + qemu_binary: PathBuf, + vmlinux: PathBuf, + virtiofsd: PathBuf, +} + +impl QemuBackend { + /// Create a new QEMU backend, validating required binaries. 
+ pub fn new() -> Result<Self, VmError> { + let runtime_dir = crate::configured_runtime_dir()?; + + let qemu_binary = { + let bundled = runtime_dir.join(QEMU_BINARY_NAME); + if bundled.is_file() { + bundled + } else { + find_in_path(QEMU_BINARY_NAME).ok_or_else(|| VmError::BinaryNotFound { + path: bundled.display().to_string(), + hint: "QEMU backend requires qemu-system-x86_64. Install QEMU or set OPENSHELL_VM_RUNTIME_DIR".to_string(), + })? + } + }; + + let vmlinux = runtime_dir.join("vmlinux"); + if !vmlinux.is_file() { + return Err(VmError::BinaryNotFound { + path: vmlinux.display().to_string(), + hint: "QEMU backend requires a vmlinux kernel. Run the GPU build pipeline" + .to_string(), + }); + } + + let virtiofsd = runtime_dir.join("virtiofsd"); + if !virtiofsd.is_file() { + return Err(VmError::BinaryNotFound { + path: virtiofsd.display().to_string(), + hint: "QEMU backend requires virtiofsd. Run the GPU build pipeline".to_string(), + }); + } + + // Verify vhost-vsock is available. QEMU's vhost-vsock-pci device + // needs /dev/vhost-vsock (provided by the vhost_vsock kernel module). + // A plain AF_VSOCK socket() can succeed with just the vsock module, + // but connect() will fail with ENODEV if vhost_vsock isn't loaded. 
+ if !Path::new("/dev/vhost-vsock").exists() { + return Err(VmError::HostSetup( + "/dev/vhost-vsock not found.\n\ + QEMU backend requires the vhost_vsock kernel module.\n\ + Fix: sudo modprobe vhost_vsock" + .to_string(), + )); + } + { + let fd = unsafe { libc::socket(libc::AF_VSOCK, libc::SOCK_STREAM, 0) }; + if fd < 0 { + let err = std::io::Error::last_os_error(); + return Err(VmError::HostSetup(format!( + "AF_VSOCK socket creation failed: {err}\n\ + QEMU backend requires the vhost_vsock kernel module.\n\ + Fix: sudo modprobe vhost_vsock" + ))); + } + unsafe { libc::close(fd) }; + } + + Ok(Self { + qemu_binary, + vmlinux, + virtiofsd, + }) + } +} + +impl VmBackend for QemuBackend { + fn launch(&self, config: &VmConfig) -> Result<i32, VmError> { + launch_qemu(self, config) + } +} + +/// Search `$PATH` for a binary by name. +fn find_in_path(name: &str) -> Option<PathBuf> { + let path_var = std::env::var_os("PATH")?; + for dir in std::env::split_paths(&path_var) { + let candidate = dir.join(name); + if candidate.is_file() { + return Some(candidate); + } + } + None +} + +const TAP_DEVICE_NAME: &str = "vmtap0"; + +/// Create and configure the TAP device before QEMU starts. +/// +/// QEMU with `script=no` expects the TAP device to already exist. +fn setup_tap_device() -> Result<(), VmError> { + // Clean up stale TAP device from a previous crashed run. + if Path::new(&format!("/sys/class/net/{TAP_DEVICE_NAME}")).exists() { + eprintln!("TAP device {TAP_DEVICE_NAME} already exists, removing stale device"); + let _ = run_cmd("ip", &["link", "delete", TAP_DEVICE_NAME]); + } + run_cmd( + "ip", + &["tuntap", "add", "dev", TAP_DEVICE_NAME, "mode", "tap"], + )?; + run_cmd( + "ip", + &[ + "addr", + "add", + &format!("{TAP_HOST_IP}/24"), + "dev", + TAP_DEVICE_NAME, + ], + )?; + run_cmd("ip", &["link", "set", TAP_DEVICE_NAME, "up"])?; + eprintln!("TAP device {TAP_DEVICE_NAME} created with {TAP_HOST_IP}"); + Ok(()) +} + +/// Remove the TAP device created by [`setup_tap_device`]. 
+fn teardown_tap_device() { + let _ = run_cmd("ip", &["link", "delete", TAP_DEVICE_NAME]); + eprintln!("TAP device {TAP_DEVICE_NAME} removed"); +} + +// ── Build QEMU command-line arguments ─────────────────────────────────── + +fn build_qemu_args( + backend: &QemuBackend, + config: &VmConfig, + effective_exec_path: &str, + vfio_device: Option<&str>, + virtiofsd_sock: &Path, + state_disk_path: Option<&Path>, + use_tap_net: bool, + guest_cid: u32, + console_log: &Path, +) -> Vec { + let mut args = Vec::new(); + + // Machine, CPU, resources + args.extend([ + "-machine".into(), + "q35,accel=kvm".into(), + "-cpu".into(), + "host".into(), + "-smp".into(), + config.vcpus.to_string(), + "-m".into(), + format!("{}M", config.mem_mib), + ]); + + // Kernel + args.extend(["-kernel".into(), backend.vmlinux.display().to_string()]); + + let cmdline = build_kernel_cmdline(config, effective_exec_path, use_tap_net); + args.extend(["-append".into(), cmdline]); + + // virtiofs rootfs + args.extend([ + "-chardev".into(), + format!("socket,id=vfsock,path={}", virtiofsd_sock.display()), + "-device".into(), + "vhost-user-fs-pci,chardev=vfsock,tag=rootfs".into(), + "-object".into(), + format!( + "memory-backend-file,id=mem,size={}M,mem-path=/dev/shm,share=on", + config.mem_mib + ), + "-numa".into(), + "node,memdev=mem".into(), + ]); + + // State disk + if let Some(disk_path) = state_disk_path { + args.extend([ + "-drive".into(), + format!("file={},format=raw,if=virtio", disk_path.display()), + ]); + } + + // PCIe root ports — Q35's pcie.0 root bus does not support + // hotplugging. VFIO and vhost-vsock-pci need dedicated root ports + // to initialize correctly under the Q35 PCIe topology. + // virtio-net-pci and vhost-user-fs-pci are QEMU-emulated devices + // that work directly on the root bus without dedicated root ports. 
+ const PCIE_SLOT_VFIO: u8 = 1; + const PCIE_SLOT_VSOCK: u8 = 2; + + // VFIO device passthrough + if let Some(bdf) = vfio_device { + args.extend([ + "-device".into(), + format!("pcie-root-port,id=vfio-rp,chassis={PCIE_SLOT_VFIO},slot={PCIE_SLOT_VFIO}"), + "-device".into(), + format!("vfio-pci,host={bdf},bus=vfio-rp"), + ]); + } + + // vsock + args.extend([ + "-device".into(), + format!("pcie-root-port,id=vsock-rp,chassis={PCIE_SLOT_VSOCK},slot={PCIE_SLOT_VSOCK}"), + "-device".into(), + format!("vhost-vsock-pci,guest-cid={guest_cid},bus=vsock-rp"), + ]); + + // TAP networking + if use_tap_net { + args.extend([ + "-netdev".into(), + "tap,id=net0,ifname=vmtap0,script=no,downscript=no".into(), + "-device".into(), + format!("virtio-net-pci,netdev=net0,mac={GUEST_MAC}"), + ]); + } + + // Console / display — disable monitor explicitly to prevent + // stdin from being interpreted as monitor commands. + args.extend([ + "-serial".into(), + format!("file:{}", console_log.display()), + "-display".into(), + "none".into(), + "-monitor".into(), + "none".into(), + "-no-reboot".into(), + ]); + + args +} + +// ── Launch ────────────────────────────────────────────────────────────── + +#[allow(clippy::similar_names)] +fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result { + let launch_start = Instant::now(); + + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let rootfs_key = vm_rootfs_key(&config.rootfs); + + let sock_dir = PathBuf::from(format!("/tmp/ovm-qemu-{}", std::process::id())); + if let Ok(entries) = std::fs::read_dir("/tmp") { + for entry in entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if name.starts_with("ovm-qemu-") && entry.path() != sock_dir { + let is_stale = name + .strip_prefix("ovm-qemu-") + .and_then(|pid_str| pid_str.parse::().ok()) + .map(|pid| unsafe { libc::kill(pid, 0) } != 0) + .unwrap_or(true); + if is_stale { + let _ = 
std::fs::remove_dir_all(entry.path()); + } + } + } + } + std::fs::create_dir_all(&sock_dir).map_err(|e| { + VmError::HostSetup(format!("create socket dir {}: {e}", sock_dir.display())) + })?; + + let virtiofsd_sock_path = sock_dir.join("virtiofsd.sock"); + let console_log = config + .console_output + .clone() + .unwrap_or_else(|| run_dir.join(format!("{rootfs_key}-console.log"))); + + let _ = std::fs::remove_file(&virtiofsd_sock_path); + + // Start virtiofsd + eprintln!("Starting virtiofsd: {}", backend.virtiofsd.display()); + let virtiofsd_log = run_dir.join(format!("{rootfs_key}-virtiofsd.log")); + let virtiofsd_log_file = std::fs::File::create(&virtiofsd_log) + .map_err(|e| VmError::Fork(format!("create virtiofsd log: {e}")))?; + + let mut virtiofsd_cmd = std::process::Command::new(&backend.virtiofsd); + virtiofsd_cmd + .arg(format!("--socket-path={}", virtiofsd_sock_path.display())) + .arg(format!("--shared-dir={}", config.rootfs.display())) + .arg("--cache=always") + .stdout(std::process::Stdio::null()) + .stderr(virtiofsd_log_file); + #[allow(unsafe_code)] + unsafe { + virtiofsd_cmd.pre_exec(|| { + libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); + Ok(()) + }); + } + let mut virtiofsd_child = virtiofsd_cmd + .spawn() + .map_err(|e| VmError::Fork(format!("start virtiofsd: {e}")))?; + + let virtiofsd_pid = virtiofsd_child.id() as i32; + crate::VIRTIOFSD_PID.store(virtiofsd_pid, std::sync::atomic::Ordering::Relaxed); + + eprintln!( + "virtiofsd started (pid {virtiofsd_pid}) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + + wait_for_socket(&virtiofsd_sock_path, "virtiofsd", Duration::from_secs(5))?; + + let use_tap_net = !matches!(config.net, NetBackend::None); + + // Build exec wrapper for --exec mode + let is_exec_mode = config.is_exec_mode(); + let wrapper_path = config.rootfs.join("tmp/qemu-exec-wrapper.sh"); + let effective_exec_path; + if is_exec_mode { + let args_str = config + .args + .iter() + .map(|a| shell_escape(a)) + .collect::>() + 
.join(" "); + + let env_str = config + .env + .iter() + .map(|v| format!("export {}", shell_escape(v))) + .collect::>() + .join("\n"); + + let wrapper = format!( + "#!/bin/sh\n\ + mount -t proc proc /proc 2>/dev/null\n\ + mount -t sysfs sysfs /sys 2>/dev/null\n\ + mount -t devtmpfs devtmpfs /dev 2>/dev/null\n\ + {env_str}\n\ + cd {workdir}\n\ + {exec} {args}\n\ + RC=$?\n\ + if command -v poweroff >/dev/null 2>&1; then\n\ + poweroff -f\n\ + elif [ -x /usr/bin/busybox ]; then\n\ + /usr/bin/busybox poweroff -f\n\ + else\n\ + echo o > /proc/sysrq-trigger\n\ + fi\n\ + exit $RC\n", + env_str = env_str, + workdir = shell_escape(&config.workdir), + exec = shell_escape(&config.exec_path), + args = args_str, + ); + + if let Some(parent) = wrapper_path.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| VmError::HostSetup(format!("create wrapper dir: {e}")))?; + } + std::fs::write(&wrapper_path, &wrapper) + .map_err(|e| VmError::HostSetup(format!("write exec wrapper: {e}")))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let _ = std::fs::set_permissions(&wrapper_path, std::fs::Permissions::from_mode(0o755)); + } + effective_exec_path = "/tmp/qemu-exec-wrapper.sh".to_string(); + } else { + effective_exec_path = config.exec_path.clone(); + } + + // Build QEMU command line + let state_disk_path = config.state_disk.as_ref().map(|sd| sd.path.as_path()); + let qemu_args = build_qemu_args( + backend, + config, + &effective_exec_path, + config.vfio_device.as_deref(), + &virtiofsd_sock_path, + state_disk_path, + use_tap_net, + VSOCK_GUEST_CID, + &console_log, + ); + + // Create TAP device before QEMU starts (QEMU with script=no expects it). 
+ if use_tap_net { + setup_tap_device()?; + } + + // Spawn QEMU + eprintln!("Starting QEMU: {}", backend.qemu_binary.display()); + let qemu_log = run_dir.join(format!("{rootfs_key}-qemu.log")); + let qemu_log_file = std::fs::File::create(&qemu_log) + .map_err(|e| VmError::Fork(format!("create QEMU log: {e}")))?; + + let mut qemu_cmd = std::process::Command::new(&backend.qemu_binary); + qemu_cmd + .args(&qemu_args) + .stdout(std::process::Stdio::null()) + .stderr(qemu_log_file); + #[allow(unsafe_code)] + unsafe { + qemu_cmd.pre_exec(|| { + libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); + Ok(()) + }); + } + let mut qemu_child = qemu_cmd + .spawn() + .map_err(|e| VmError::Fork(format!("start QEMU: {e}")))?; + + let qemu_pid = qemu_child.id() as i32; + eprintln!( + "QEMU started (pid {qemu_pid}) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + + // Install signal handlers immediately so SIGTERM during the long + // gateway bootstrap (30-120s) forwards to QEMU instead of killing + // the parent via the default handler (which skips Drop and leaves + // the GPU bound to vfio-pci). + // + // We use sigaction with SA_RESTART so that the wait() syscall in the + // main thread auto-restarts after the handler returns, rather than + // failing with EINTR. This prevents a second signal from killing the + // process before cleanup runs. 
+ crate::CHILD_PID.store(qemu_pid, std::sync::atomic::Ordering::Relaxed); + unsafe { + let mut sa: libc::sigaction = std::mem::zeroed(); + sa.sa_sigaction = crate::forward_signal as *const () as libc::sighandler_t; + sa.sa_flags = libc::SA_RESTART; + libc::sigemptyset(&raw mut sa.sa_mask); + libc::sigaction(libc::SIGTERM, &sa, std::ptr::null_mut()); + libc::sigaction(libc::SIGINT, &sa, std::ptr::null_mut()); + } + + // Set up host-side TAP networking + let mut original_ip_forward: Option = None; + if use_tap_net { + match setup_tap_host_networking() { + Ok(orig) => original_ip_forward = Some(orig), + Err(e) => { + eprintln!("WARNING: host networking setup failed: {e}"); + eprintln!(" The VM may not have internet access."); + } + } + } + + // Start AF_VSOCK exec bridge + let exec_socket = vm_exec_socket_path(&config.rootfs); + start_vsock_exec_bridge_af_vsock( + &exec_socket, + VSOCK_GUEST_CID, + VM_EXEC_VSOCK_PORT, + qemu_child.id(), + )?; + + // Write runtime state (vsock_bridge: true — uses AF_VSOCK bridging) + if !config.is_exec_mode() { + if let Err(err) = write_vm_runtime_state(&config.rootfs, qemu_pid, &console_log, None, true) + { + let _ = qemu_child.kill(); + let _ = qemu_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + if use_tap_net { + teardown_tap_device(); + } + clear_vm_runtime_state(&config.rootfs); + return Err(err); + } + } + + // TCP port forwarding for TAP networking + if use_tap_net { + for pm in &config.port_map { + let parts: Vec<&str> = pm.split(':').collect(); + if parts.len() == 2 { + if let (Ok(hp), Ok(gp)) = (parts[0].parse::(), parts[1].parse::()) { + if let Err(e) = start_tcp_port_forwarder(hp, TAP_GUEST_IP, gp) { + let _ = qemu_child.kill(); + let _ = qemu_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let Some(ref orig) = original_ip_forward { + 
teardown_tap_host_networking(orig); + } + if use_tap_net { + teardown_tap_device(); + } + clear_vm_runtime_state(&config.rootfs); + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + return Err(e); + } + } + } + } + } + + for pm in &config.port_map { + let host_port = pm.split(':').next().unwrap_or(pm); + eprintln!(" port {pm} -> http://localhost:{host_port}"); + } + eprintln!("Console output: {}", console_log.display()); + + // Gateway bootstrap and health check + if !config.is_exec_mode() && !config.port_map.is_empty() { + let gateway_port = crate::gateway_host_port(config); + if let Err(e) = crate::bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port) + .and_then(|_| crate::health::wait_for_gateway_ready(gateway_port, &config.gateway_name, config.gpu_enabled)) + { + let _ = qemu_child.kill(); + let _ = qemu_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + if use_tap_net { + teardown_tap_device(); + } + clear_vm_runtime_state(&config.rootfs); + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + return Err(e); + } + } + + eprintln!("Ready [{:.1}s total]", launch_start.elapsed().as_secs_f64()); + eprintln!("Press Ctrl+C to stop."); + + // Wait for QEMU to exit. SA_RESTART ensures the wait() syscall + // auto-restarts after our signal handler runs, so QEMU gets a + // chance to shut down gracefully before we proceed to cleanup. + let status = qemu_child + .wait() + .map_err(|e| VmError::HostSetup(format!("wait for QEMU: {e}")))?; + + // Clear all signal-related atomics now that QEMU has exited. 
+ crate::CHILD_PID.store(0, std::sync::atomic::Ordering::Relaxed); + crate::VIRTIOFSD_PID.store(0, std::sync::atomic::Ordering::Relaxed); + + let was_shutdown = crate::SHUTDOWN_REQUESTED.load(std::sync::atomic::Ordering::Relaxed); + if was_shutdown { + eprintln!("Shutdown signal received, running explicit cleanup..."); + } + + // ── Explicit cleanup (does NOT rely on Drop) ────────────────── + // + // This runs whether QEMU exited normally or was signalled. The + // signal handler forwarded SIGTERM to the process group, but we + // still need to clean up host-side state. + + // 1. Kill virtiofsd + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + eprintln!("virtiofsd stopped"); + + // 2. Tear down TAP device + if use_tap_net { + teardown_tap_device(); + } + + // 3. Tear down host networking (iptables) + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + + // 4. Clean up runtime state files + if !config.is_exec_mode() { + clear_vm_runtime_state(&config.rootfs); + } + + // 5. Clean up socket directories and temporary files + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + if is_exec_mode { + let _ = std::fs::remove_file(&wrapper_path); + } + + let code = status.code().unwrap_or(1); + eprintln!("VM exited with code {code}"); + Ok(code) +} + +// ── AF_VSOCK exec bridge ──────────────────────────────────────────────── + +/// Start a background bridge: exec Unix socket → guest AF_VSOCK. +/// +/// QEMU uses kernel `vhost-vsock-pci` which exposes guest vsock via the +/// kernel's `AF_VSOCK` address family. We connect directly to the guest +/// CID and port using raw `AF_VSOCK` sockets. 
+fn start_vsock_exec_bridge_af_vsock(
+    exec_socket: &Path,
+    guest_cid: u32,
+    guest_port: u32,
+    qemu_pid: u32,
+) -> Result<(), VmError> {
+    use std::os::unix::net::UnixListener;
+
+    if let Some(parent) = exec_socket.parent() {
+        std::fs::create_dir_all(parent).map_err(|e| {
+            VmError::HostSetup(format!("create exec bridge dir {}: {e}", parent.display()))
+        })?;
+    }
+    let _ = std::fs::remove_file(exec_socket);
+
+    let listener = UnixListener::bind(exec_socket).map_err(|e| {
+        VmError::HostSetup(format!(
+            "bind vsock exec bridge {}: {e}",
+            exec_socket.display()
+        ))
+    })?;
+
+    eprintln!(
+        "vsock exec bridge (AF_VSOCK): {} → CID {} port {}",
+        exec_socket.display(),
+        guest_cid,
+        guest_port,
+    );
+
+    std::thread::spawn(move || {
+        af_vsock_bridge_accept_loop(listener, guest_cid, guest_port, qemu_pid);
+    });
+
+    Ok(())
+}
+
+/// Connect to a guest vsock port via kernel AF_VSOCK.
+///
+/// Returns the connected socket wrapped as a `UnixStream`. The `UnixStream`
+/// type is used solely for its `Read`/`Write` trait impls which delegate to
+/// raw `read()`/`write()` syscalls — address-family-specific methods like
+/// `peer_addr()` must not be called on the returned stream.
+fn connect_af_vsock(cid: u32, port: u32) -> std::io::Result<UnixStream> {
+    use std::os::unix::io::FromRawFd;
+
+    let fd = unsafe { libc::socket(libc::AF_VSOCK, libc::SOCK_STREAM, 0) };
+    if fd < 0 {
+        return Err(std::io::Error::last_os_error());
+    }
+
+    let addr = libc::sockaddr_vm {
+        svm_family: libc::AF_VSOCK as u16,
+        svm_reserved1: 0,
+        svm_port: port,
+        svm_cid: cid,
+        svm_zero: [0; 4],
+    };
+
+    let ret = unsafe {
+        libc::connect(
+            fd,
+            std::ptr::from_ref(&addr).cast::<libc::sockaddr>(),
+            size_of::<libc::sockaddr_vm>() as libc::socklen_t,
+        )
+    };
+
+    if ret < 0 {
+        let err = std::io::Error::last_os_error();
+        unsafe { libc::close(fd) };
+        return Err(err);
+    }
+
+    // SAFETY: fd is a valid, connected socket. We wrap it as UnixStream
+    // purely for Read/Write access used by bridge_bidirectional().
+ Ok(unsafe { UnixStream::from_raw_fd(fd) }) +} + +/// Whether a vsock connect error is transient (expected during VM boot). +/// +/// The guest exec agent takes time to start, and the vhost-vsock transport +/// may not be fully initialized when QEMU first launches. These errors +/// resolve on their own once the guest is ready. +fn is_transient_vsock_error(e: &std::io::Error) -> bool { + if e.kind() == std::io::ErrorKind::ConnectionRefused { + return true; + } + match e.raw_os_error() { + Some(code) => { + code == libc::ENODEV // vsock transport not ready + || code == libc::EHOSTUNREACH // guest CID not reachable yet + || code == libc::ECONNRESET // connection reset during startup + || code == libc::ETIMEDOUT // connect timed out + } + None => false, + } +} + +/// Accept loop for the AF_VSOCK bridge background thread. +/// +/// Connection failures during boot are expected — the guest exec agent +/// isn't listening yet. We keep retrying since the bootstrap caller has +/// its own 120s timeout. If the QEMU process exits, we stop immediately +/// rather than retrying against a dead CID for 120s. +fn af_vsock_bridge_accept_loop( + listener: std::os::unix::net::UnixListener, + guest_cid: u32, + port: u32, + qemu_pid: u32, +) { + // Give QEMU time to initialize the vhost-vsock-pci device and register + // the CID with the kernel transport before accepting connections. 
+ std::thread::sleep(Duration::from_secs(2)); + + let mut fatal_failures: u32 = 0; + let mut logged_transient = false; + + for stream in listener.incoming() { + if !is_process_alive(qemu_pid) { + eprintln!("vsock bridge: QEMU (pid {qemu_pid}) exited, stopping bridge"); + return; + } + + let client = match stream { + Ok(s) => s, + Err(e) => { + eprintln!("vsock bridge: accept: {e}"); + continue; + } + }; + + match connect_af_vsock(guest_cid, port) { + Ok(guest) => { + fatal_failures = 0; + bridge_bidirectional(client, guest); + } + Err(e) if is_transient_vsock_error(&e) => { + if !is_process_alive(qemu_pid) { + eprintln!( + "vsock bridge: QEMU (pid {qemu_pid}) exited — \ + check console log for VM boot errors" + ); + return; + } + if !logged_transient { + eprintln!( + "vsock bridge: guest not ready on CID {guest_cid} port {port} ({e}), \ + will keep retrying..." + ); + logged_transient = true; + } + std::thread::sleep(Duration::from_secs(1)); + } + Err(e) => { + fatal_failures += 1; + if fatal_failures <= 2 { + eprintln!("vsock bridge: AF_VSOCK connect failed: {e}"); + } + if fatal_failures >= 5 { + eprintln!("vsock bridge: too many AF_VSOCK failures, stopping bridge"); + return; + } + std::thread::sleep(Duration::from_secs(1)); + } + } + } +} + +fn is_process_alive(pid: u32) -> bool { + unsafe { libc::kill(pid as i32, 0) == 0 } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_backend() -> QemuBackend { + QemuBackend { + qemu_binary: "/usr/bin/qemu-system-x86_64".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + } + } + + fn base_config() -> VmConfig { + VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: false, 
+ gpu_has_msix: false, + vfio_device: None, + backend: crate::VmBackendChoice::Qemu, + } + } + + #[test] + fn build_qemu_args_basic() { + let backend = test_backend(); + let config = base_config(); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!(args.contains(&"-machine".to_string())); + assert!(args.contains(&"q35,accel=kvm".to_string())); + assert!(args.contains(&"-cpu".to_string())); + assert!(args.contains(&"host".to_string())); + assert!(args.contains(&"-smp".to_string())); + assert!(args.contains(&"4".to_string())); + assert!(args.contains(&"-m".to_string())); + assert!(args.contains(&"8192M".to_string())); + assert!(args.contains(&"-monitor".to_string())); + assert!(args.contains(&"none".to_string())); + assert!(args.contains(&"-no-reboot".to_string())); + assert!(!args.iter().any(|a| a.contains("vfio-pci"))); + assert!(!args.iter().any(|a| a.contains("tap"))); + assert!( + args.iter() + .any(|a| a.contains("pcie-root-port,id=vsock-rp")), + "args should contain PCIe root port for vsock: {args:?}" + ); + assert!( + args.iter() + .any(|a| a.contains("vhost-vsock-pci,guest-cid=3,bus=vsock-rp")), + "args should contain vsock on root port: {args:?}" + ); + } + + #[test] + fn build_qemu_args_with_vfio() { + let backend = test_backend(); + let mut config = base_config(); + config.gpu_enabled = true; + config.vfio_device = Some("0000:41:00.0".into()); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!( + args.iter() + .any(|a| a.contains("vfio-pci,host=0000:41:00.0,bus=vfio-rp")), + "args should contain VFIO device on root port: {args:?}" + ); + assert!( + args.iter().any(|a| a.contains("pcie-root-port,id=vfio-rp")), + "args should 
contain PCIe root port for VFIO: {args:?}" + ); + } + + #[test] + fn build_qemu_args_with_tap_net() { + let backend = test_backend(); + let mut config = base_config(); + config.net = NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }; + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + true, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!( + args.iter().any(|a| a.contains("tap,id=net0")), + "args should contain TAP netdev: {args:?}" + ); + assert!( + args.iter() + .any(|a| a.contains("virtio-net-pci,netdev=net0")), + "args should contain virtio-net device: {args:?}" + ); + } + + #[test] + fn build_qemu_args_without_net() { + let backend = test_backend(); + let config = base_config(); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!( + !args.iter().any(|a| a.contains("tap")), + "args should not contain TAP: {args:?}" + ); + assert!( + !args.iter().any(|a| a.contains("virtio-net")), + "args should not contain virtio-net: {args:?}" + ); + } + + #[test] + fn build_qemu_args_gpu_enabled_cmdline() { + let backend = test_backend(); + let mut config = base_config(); + config.gpu_enabled = true; + config.vfio_device = Some("0000:41:00.0".into()); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + let append_idx = args.iter().position(|a| a == "-append").unwrap(); + let cmdline = &args[append_idx + 1]; + assert!( + cmdline.contains("GPU_ENABLED=true"), + "cmdline should contain GPU_ENABLED=true: {cmdline}" + ); + assert!( + cmdline.contains("firmware_class.path=/lib/firmware"), + "cmdline should contain firmware_class.path for GPU: {cmdline}" 
+ ); + } + + #[test] + fn transient_vsock_errors_classified_correctly() { + // Kind-based: ConnectionRefused + let refused = std::io::Error::from(std::io::ErrorKind::ConnectionRefused); + assert!( + is_transient_vsock_error(&refused), + "ConnectionRefused should be transient" + ); + + // OS-error-based transient codes + let enodev = std::io::Error::from_raw_os_error(libc::ENODEV); + assert!( + is_transient_vsock_error(&enodev), + "ENODEV should be transient" + ); + + let ehostunreach = std::io::Error::from_raw_os_error(libc::EHOSTUNREACH); + assert!( + is_transient_vsock_error(&ehostunreach), + "EHOSTUNREACH should be transient" + ); + + let econnreset = std::io::Error::from_raw_os_error(libc::ECONNRESET); + assert!( + is_transient_vsock_error(&econnreset), + "ECONNRESET should be transient" + ); + + let etimedout = std::io::Error::from_raw_os_error(libc::ETIMEDOUT); + assert!( + is_transient_vsock_error(&etimedout), + "ETIMEDOUT should be transient" + ); + + // Non-transient errors + let eperm = std::io::Error::from_raw_os_error(libc::EPERM); + assert!( + !is_transient_vsock_error(&eperm), + "EPERM should not be transient" + ); + + let eacces = std::io::Error::from_raw_os_error(libc::EACCES); + assert!( + !is_transient_vsock_error(&eacces), + "EACCES should not be transient" + ); + + let other = std::io::Error::new(std::io::ErrorKind::Other, "something else"); + assert!( + !is_transient_vsock_error(&other), + "ErrorKind::Other should not be transient" + ); + } +} diff --git a/crates/openshell-vm/src/embedded.rs b/crates/openshell-vm/src/embedded.rs index 731f34b10..6a4a2d3f6 100644 --- a/crates/openshell-vm/src/embedded.rs +++ b/crates/openshell-vm/src/embedded.rs @@ -26,6 +26,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.5.dylib.zst")); pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + 
pub const ROOTFS_GPU: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs-gpu.tar.zst")); pub const LIBKRUN_NAME: &str = "libkrun.dylib"; pub const LIBKRUNFW_NAME: &str = "libkrunfw.5.dylib"; } @@ -36,6 +37,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.so.5.zst")); pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + pub const ROOTFS_GPU: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs-gpu.tar.zst")); pub const LIBKRUN_NAME: &str = "libkrun.so"; pub const LIBKRUNFW_NAME: &str = "libkrunfw.so.5"; } @@ -46,6 +48,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.so.5.zst")); pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + pub const ROOTFS_GPU: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs-gpu.tar.zst")); pub const LIBKRUN_NAME: &str = "libkrun.so"; pub const LIBKRUNFW_NAME: &str = "libkrunfw.so.5"; } @@ -61,6 +64,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = &[]; pub const GVPROXY: &[u8] = &[]; pub const ROOTFS: &[u8] = &[]; + pub const ROOTFS_GPU: &[u8] = &[]; pub const LIBKRUN_NAME: &str = "libkrun"; pub const LIBKRUNFW_NAME: &str = "libkrunfw"; } @@ -232,11 +236,16 @@ pub fn cleanup_old_rootfs() -> Result<(), VmError> { cleanup_old_versions_in_base(&base, ¤t_version_dir) } -/// Check if the rootfs is embedded (non-empty). +/// Check if the base rootfs is embedded (non-empty). pub fn has_embedded_rootfs() -> bool { !resources::ROOTFS.is_empty() } +/// Check if the GPU rootfs is embedded (non-empty). 
+pub fn has_embedded_gpu_rootfs() -> bool { + !resources::ROOTFS_GPU.is_empty() +} + // ── Internal helpers ─────────────────────────────────────────────────────── /// Build a cache key that combines the version string with a short content diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs index 6195556e1..e7fe27e12 100644 --- a/crates/openshell-vm/src/exec.rs +++ b/crates/openshell-vm/src/exec.rs @@ -48,6 +48,21 @@ fn safe_remove_dir_all(path: &Path) -> Result { pub const VM_EXEC_VSOCK_PORT: u32 = 10_777; +/// How to connect to the VM exec agent. +/// +/// libkrun bridges each guest vsock port to a host Unix socket via +/// `krun_add_vsock_port2`. QEMU uses kernel AF_VSOCK via vhost-vsock-pci, +/// bridged through a host Unix socket by the exec bridge thread. +#[derive(Debug, Clone)] +pub enum VsockConnectMode { + /// Connect via a host Unix socket (libkrun per-port bridging). + UnixSocket(PathBuf), + /// Connect via a vsock proxy bridge (QEMU AF_VSOCK). + /// The path points to a bridged Unix socket that connects to + /// guest CID 3, port [`VM_EXEC_VSOCK_PORT`]. + VsockBridge(PathBuf), +} + const VM_STATE_NAME: &str = "vm-state.json"; const VM_LOCK_NAME: &str = "vm.lock"; const KUBECONFIG_ENV: &str = "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"; @@ -72,6 +87,10 @@ pub struct VmRuntimeState { /// PID of the gvproxy process (if networking uses gvproxy). #[serde(default, skip_serializing_if = "Option::is_none")] pub gvproxy_pid: Option, + /// Whether this VM uses vsock-bridge mode (QEMU AF_VSOCK) vs + /// Unix socket mode (libkrun). Defaults to false for backward compat. 
+ #[serde(default, skip_serializing_if = "std::ops::Not::not")] + pub vsock_bridge: bool, } #[derive(Debug, Serialize)] @@ -132,6 +151,7 @@ pub fn write_vm_runtime_state( pid: i32, console_log: &Path, gvproxy_pid: Option, + vsock_bridge: bool, ) -> Result<(), VmError> { let state = VmRuntimeState { pid, @@ -141,6 +161,7 @@ pub fn write_vm_runtime_state( console_log: console_log.to_path_buf(), started_at_ms: now_ms()?, gvproxy_pid, + vsock_bridge, }; let path = vm_state_path(rootfs); let bytes = serde_json::to_vec_pretty(&state) @@ -154,8 +175,10 @@ pub fn write_vm_runtime_state( pub fn clear_vm_runtime_state(rootfs: &Path) { let state_path = vm_state_path(rootfs); + let lock_path = vm_lock_path(rootfs); let socket_path = vm_exec_socket_path(rootfs); let _ = fs::remove_file(state_path); + let _ = fs::remove_file(lock_path); let _ = fs::remove_file(socket_path); } @@ -285,6 +308,13 @@ pub fn reset_runtime_state(rootfs: &Path, gateway_name: &str) -> Result<(), VmEr /// create a fresh database on startup and cluster state will be re-applied from /// the auto-deploy manifests in `server/manifests/`. /// +/// **Limitation — state disk:** When a state disk is configured (common with +/// `--gpu`), the kine DB lives inside the raw disk image, not on the virtiofs +/// rootfs. This host-side check only sees the virtiofs path and cannot detect +/// corruption on the state disk. The init script (`openshell-vm-init.sh`) runs +/// `PRAGMA quick_check` inside the VM where the state disk is mounted, catching +/// corruption that this function misses. +/// /// **Stale bootstrap locks** (a kine application-level issue where a killed k3s /// server leaves a lock row that causes the next instance to hang) are handled /// separately by the init script (`openshell-vm-init.sh`), which runs @@ -358,6 +388,10 @@ fn remove_kine_db_files(db_path: &Path) -> Result<(), VmError> { /// automatically. 
This provides a reliable guard against two VM processes /// sharing the same rootfs — even if the state file is deleted. /// +/// When the lock file already contains a PID from a previous holder that +/// is no longer alive, a warning is logged and any stale VM state files +/// are cleaned up proactively. +/// /// Returns `Ok(File)` on success. The caller must keep the `File` alive /// for as long as the VM is running. pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { @@ -383,14 +417,13 @@ pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { if rc != 0 { let err = std::io::Error::last_os_error(); if err.raw_os_error() == Some(libc::EWOULDBLOCK) { - // Another process holds the lock — read its PID for diagnostics. + // Another process holds the flock. Read the PID recorded in + // the file for diagnostics — but verify it's still alive, + // because the file may contain a stale PID from a crashed + // predecessor while a different process now holds the flock. let holder_pid = fs::read_to_string(&lock_path).unwrap_or_default(); let holder_pid = holder_pid.trim(); - return Err(VmError::RuntimeState(format!( - "another process (pid {holder_pid}) is using rootfs {}. \ - Stop the running VM first", - rootfs.display() - ))); + return Err(stale_lock_error(rootfs, holder_pid, &lock_path)); } return Err(VmError::RuntimeState(format!( "lock rootfs {}: {err}", @@ -398,7 +431,11 @@ pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { ))); } - // Lock acquired — write our PID (truncate first, then write). + // Lock acquired — check for stale state from a crashed predecessor. + // Read the previous PID before we overwrite it. + cleanup_stale_state_on_lock_acquire(rootfs, &lock_path); + + // Write our PID (truncate first, then write). // This is informational only; the flock is the real guard. let _ = file.set_len(0); { @@ -409,6 +446,56 @@ pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { Ok(file) } +/// Build an appropriate error when flock returns EWOULDBLOCK. 
+///
+/// If the PID recorded in the lock file is dead, the flock holder is a
+/// different (unknown) process — provide enhanced diagnostics so the user
+/// isn't misled by a stale PID.
+fn stale_lock_error(rootfs: &Path, recorded_pid: &str, _lock_path: &Path) -> VmError {
+    if let Ok(pid) = recorded_pid.parse::<i32>() {
+        if pid > 0 && !process_alive(pid) {
+            return VmError::RuntimeState(format!(
+                "rootfs {} is locked, but the recorded holder (pid {pid}) is dead. \
+                 A different openshell-vm process likely holds the lock. \
+                 Check for running openshell-vm processes (`ps aux | grep openshell-vm`) \
+                 and stop them before retrying.",
+                rootfs.display(),
+            ));
+        }
+    }
+    VmError::RuntimeState(format!(
+        "another process (pid {recorded_pid}) is using rootfs {}. \
+         Stop the running VM first",
+        rootfs.display()
+    ))
+}
+
+/// After successfully acquiring the flock, check whether the lock file
+/// contained a PID from a dead process (crash recovery). If so, log a
+/// warning and clean up stale VM state/socket files.
+fn cleanup_stale_state_on_lock_acquire(rootfs: &Path, lock_path: &Path) {
+    let prev_contents = fs::read_to_string(lock_path).unwrap_or_default();
+    let Ok(prev_pid) = prev_contents.trim().parse::<i32>() else {
+        return;
+    };
+    if prev_pid <= 0 || process_alive(prev_pid) {
+        return;
+    }
+
+    eprintln!("Warning: cleaning up stale lock from dead process (pid {prev_pid})");
+
+    let state_path = vm_state_path(rootfs);
+    if let Ok(bytes) = fs::read(&state_path) {
+        if let Ok(state) = serde_json::from_slice::<VmRuntimeState>(&bytes) {
+            if !process_alive(state.pid) {
+                eprintln!("  Removing stale VM state (pid {})", state.pid);
+                let _ = fs::remove_file(&state_path);
+                let _ = fs::remove_file(vm_exec_socket_path(rootfs));
+            }
+        }
+    }
+}
+
+/// Check whether the rootfs lock file is currently held by another process.
/// /// Returns `Ok(())` if the lock is free (or can be acquired), and an @@ -431,11 +518,7 @@ fn check_rootfs_lock_free(rootfs: &Path) -> Result<(), VmError> { if err.raw_os_error() == Some(libc::EWOULDBLOCK) { let holder_pid = fs::read_to_string(&lock_path).unwrap_or_default(); let holder_pid = holder_pid.trim(); - return Err(VmError::RuntimeState(format!( - "another process (pid {holder_pid}) is using rootfs {}. \ - Stop the running VM first", - rootfs.display() - ))); + return Err(stale_lock_error(rootfs, holder_pid, &lock_path)); } } else { // We acquired the lock — release it immediately since we're only probing. @@ -446,35 +529,35 @@ fn check_rootfs_lock_free(rootfs: &Path) -> Result<(), VmError> { } pub fn ensure_vm_not_running(rootfs: &Path) -> Result<(), VmError> { - // Primary guard: check the flock. This works even if the state file - // has been deleted, because the kernel holds the lock until the - // owning process exits. + // The flock is the definitive guard: the kernel releases it + // automatically when the owning process exits (even via SIGKILL). + // If this succeeds, no VM process holds the rootfs. check_rootfs_lock_free(rootfs)?; - // Secondary guard: check the state file for any stale state. - match load_vm_runtime_state(Some(rootfs)) { - Ok(state) => Err(VmError::RuntimeState(format!( - "VM is already running (pid {}) with exec socket {}", - state.pid, - state.socket_path.display() - ))), - Err(VmError::RuntimeState(message)) - if message.starts_with("read VM runtime state") - || message.starts_with("VM is not running") => - { - clear_vm_runtime_state(rootfs); - Ok(()) - } - Err(err) => Err(err), - } + // Flock is free — no VM process holds the rootfs lock. Any remaining + // state file is stale (from a killed/crashed VM or PID reuse by an + // unrelated process). Clean it up unconditionally. 
+ clear_vm_runtime_state(rootfs); + Ok(()) } pub fn exec_running_vm(options: VmExecOptions) -> Result { let state = load_vm_runtime_state(options.rootfs.as_deref())?; - let mut stream = UnixStream::connect(&state.socket_path).map_err(|e| { + + let connect_mode = if state.vsock_bridge { + VsockConnectMode::VsockBridge(state.socket_path.clone()) + } else { + VsockConnectMode::UnixSocket(state.socket_path.clone()) + }; + + let socket_path = match &connect_mode { + VsockConnectMode::UnixSocket(p) | VsockConnectMode::VsockBridge(p) => p, + }; + + let mut stream = UnixStream::connect(socket_path).map_err(|e| { VmError::Exec(format!( "connect to VM exec socket {}: {e}", - state.socket_path.display() + socket_path.display() )) })?; let mut writer = stream diff --git a/crates/openshell-vm/src/health.rs b/crates/openshell-vm/src/health.rs index 096a35d1f..ce9a10169 100644 --- a/crates/openshell-vm/src/health.rs +++ b/crates/openshell-vm/src/health.rs @@ -76,20 +76,60 @@ async fn grpc_health_check(gateway_port: u16, gateway_name: &str) -> Result<(), } } +/// Default health check timeout for standard (non-GPU) VMs. +const DEFAULT_HEALTH_TIMEOUT_SECS: u64 = 90; + +/// Extended health check timeout for GPU-enabled VMs. +/// +/// Cold boot with GPU passthrough involves pulling container images (no layer +/// cache on a fresh state disk) and loading NVIDIA drivers/firmware, which +/// legitimately takes longer than a standard VM boot. +const GPU_HEALTH_TIMEOUT_SECS: u64 = 240; + +/// Initial poll interval between health check attempts. +const INITIAL_POLL_INTERVAL_SECS: u64 = 2; + +/// Maximum poll interval (exponential backoff cap). +const MAX_POLL_INTERVAL_SECS: u64 = 10; + +/// How often to emit a progress log line during the health check wait. +const PROGRESS_LOG_INTERVAL_SECS: u64 = 15; + /// Wait for the gateway service to be fully ready by polling the gRPC health endpoint. 
/// /// This replaces the TCP-only probe with a proper gRPC health check that verifies /// the service is actually responding to requests, not just accepting connections. /// +/// When `gpu_enabled` is true, the timeout is extended to accommodate cold-boot +/// scenarios where container image pulls and NVIDIA driver/firmware loading push +/// total startup well past the standard 90-second window. +/// +/// Uses exponential backoff between retry attempts (2s initial, 10s cap) to +/// avoid hammering the endpoint while still detecting readiness promptly. +/// /// Returns `Ok(())` when the gateway is confirmed healthy, or `Err` if the health /// check fails or times out. Falls back to TCP probe if mTLS materials aren't /// available yet. -pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<(), VmError> { +pub fn wait_for_gateway_ready( + gateway_port: u16, + gateway_name: &str, + gpu_enabled: bool, +) -> Result<(), VmError> { let start = std::time::Instant::now(); - let timeout = Duration::from_secs(90); - let poll_interval = Duration::from_secs(1); + let timeout_secs = if gpu_enabled { + GPU_HEALTH_TIMEOUT_SECS + } else { + DEFAULT_HEALTH_TIMEOUT_SECS + }; + let timeout = Duration::from_secs(timeout_secs); + let mut poll_interval = Duration::from_secs(INITIAL_POLL_INTERVAL_SECS); + let max_poll_interval = Duration::from_secs(MAX_POLL_INTERVAL_SECS); + let progress_interval = Duration::from_secs(PROGRESS_LOG_INTERVAL_SECS); - eprintln!("Waiting for gateway gRPC health check..."); + eprintln!( + "Waiting for gateway gRPC health check (timeout {timeout_secs}s{})...", + if gpu_enabled { ", GPU mode" } else { "" } + ); // Create a runtime for async health checks let rt = match tokio::runtime::Builder::new_current_thread() @@ -103,7 +143,16 @@ pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<( } }; + let mut attempt: u32 = 0; + let mut last_progress_log = start; + // The initial value is never read (overwritten on 
each loop iteration before + // the progress log), but we need a valid String to satisfy the borrow checker. + #[allow(unused_assignments)] + let mut last_error = String::new(); + loop { + attempt += 1; + // Try gRPC health check let result = rt.block_on(async { tokio::time::timeout( @@ -119,26 +168,40 @@ pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<( return Ok(()); } Ok(Err(e)) => { + last_error = e.clone(); // gRPC call completed but failed if start.elapsed() >= timeout { return Err(VmError::Bootstrap(format!( - "gateway health check failed after {:.0}s: {e}", + "gateway health check failed after {:.0}s (attempt {attempt}): {e}", timeout.as_secs_f64() ))); } } Err(_) => { + last_error = "health probe timed out".to_string(); // Timeout on the health check itself if start.elapsed() >= timeout { return Err(VmError::Bootstrap(format!( - "gateway health check timed out after {:.0}s", + "gateway health check timed out after {:.0}s (attempt {attempt})", timeout.as_secs_f64() ))); } } } + // Periodic progress logging so operators know the check is still running + if last_progress_log.elapsed() >= progress_interval { + eprintln!( + " health check: attempt {attempt}, elapsed {:.0}s/{timeout_secs}s ({last_error})", + start.elapsed().as_secs_f64() + ); + last_progress_log = std::time::Instant::now(); + } + std::thread::sleep(poll_interval); + + // Exponential backoff: double the interval up to the cap + poll_interval = std::cmp::min(poll_interval * 2, max_poll_interval); } } @@ -146,11 +209,18 @@ pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<( fn wait_for_tcp_only( gateway_port: u16, timeout: Duration, - poll_interval: Duration, + mut poll_interval: Duration, ) -> Result<(), VmError> { let start = std::time::Instant::now(); + let max_poll_interval = Duration::from_secs(MAX_POLL_INTERVAL_SECS); + let progress_interval = Duration::from_secs(PROGRESS_LOG_INTERVAL_SECS); + let timeout_secs = timeout.as_secs(); + 
let mut attempt: u32 = 0; + let mut last_progress_log = start; loop { + attempt += 1; + if host_tcp_probe(gateway_port) { eprintln!( "Service reachable (TCP) [{:.1}s]", @@ -161,12 +231,22 @@ fn wait_for_tcp_only( if start.elapsed() >= timeout { return Err(VmError::Bootstrap(format!( - "gateway TCP probe failed after {:.0}s", + "gateway TCP probe failed after {:.0}s (attempt {attempt})", timeout.as_secs_f64() ))); } + // Periodic progress logging + if last_progress_log.elapsed() >= progress_interval { + eprintln!( + " TCP probe: attempt {attempt}, elapsed {:.0}s/{timeout_secs}s", + start.elapsed().as_secs_f64() + ); + last_progress_log = std::time::Instant::now(); + } + std::thread::sleep(poll_interval); + poll_interval = std::cmp::min(poll_interval * 2, max_poll_interval); } } diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index 2b78a7669..15e2cbde6 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -14,6 +14,7 @@ #![allow(unsafe_code)] +pub mod backend; mod embedded; mod exec; mod ffi; @@ -22,12 +23,13 @@ mod health; use std::ffi::CString; use std::path::{Path, PathBuf}; use std::ptr; -use std::time::Instant; +use std::time::{Duration, Instant}; pub use exec::{ - VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, acquire_rootfs_lock, clear_vm_runtime_state, - ensure_vm_not_running, exec_capture, exec_running_vm, recover_corrupt_kine_db, - reset_runtime_state, vm_exec_socket_path, vm_state_path, write_vm_runtime_state, + VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, VsockConnectMode, acquire_rootfs_lock, + clear_vm_runtime_state, ensure_vm_not_running, exec_capture, exec_running_vm, + recover_corrupt_kine_db, reset_runtime_state, vm_exec_socket_path, vm_state_path, + write_vm_runtime_state, }; // ── Error type ───────────────────────────────────────────────────────── @@ -45,6 +47,22 @@ pub enum VmError { )] RootfsNotFound { path: String }, + /// The GPU rootfs directory does not exist. 
+ #[error( + "GPU rootfs not found: {path}\n\ + The --gpu flag requires a rootfs built with GPU support (NVIDIA drivers,\n\ + nvidia-container-toolkit, and GPU manifests).\n\ + Build one with:\n\ + \x20 mise run vm:rootfs -- --base --gpu\n\ + \x20 mise run vm:build\n\ + Or manually:\n\ + \x20 - Place rootfs-gpu.tar.zst in the openshell-vm.runtime/ sidecar directory\n\ + \x20 - Or set OPENSHELL_VM_GPU_ROOTFS_TARBALL=/path/to/rootfs-gpu.tar.zst\n\ + \x20 - Or copy the extracted rootfs to: {path}\n\ + \x20 - Or use: openshell-vm --gpu --rootfs " + )] + GpuRootfsNotFound { path: String }, + /// A path contained invalid UTF-8. #[error("path is not valid UTF-8: {0}")] InvalidPath(String), @@ -98,6 +116,18 @@ fn check(ret: i32, func: &'static str) -> Result<(), VmError> { // ── Configuration ────────────────────────────────────────────────────── +/// Hypervisor backend selection. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum VmBackendChoice { + /// Auto-select: QEMU when a VFIO device is configured, libkrun otherwise. + #[default] + Auto, + /// Force the libkrun backend. + Libkrun, + /// Force the QEMU backend (Linux-only, supports VFIO GPU passthrough). + Qemu, +} + /// Networking backend for the microVM. #[derive(Debug, Clone)] pub enum NetBackend { @@ -202,9 +232,28 @@ pub struct VmConfig { /// Optional host-backed raw block image for mutable guest state. pub state_disk: Option, + + /// Whether GPU passthrough is enabled for this VM. + pub gpu_enabled: bool, + + /// Whether the GPU supports MSI-X. Retained for informational purposes + /// but no longer affects backend selection (QEMU handles both cases). + pub gpu_has_msix: bool, + + /// VFIO PCI device address for GPU passthrough (e.g. `0000:41:00.0`). + /// When set, the QEMU backend is used instead of libkrun. + pub vfio_device: Option, + + /// Hypervisor backend override. Defaults to [`VmBackendChoice::Auto`]. 
+ pub backend: VmBackendChoice, } impl VmConfig { + /// Returns true when the VM runs in exec mode (one-shot command) rather than gateway mode. + pub(crate) fn is_exec_mode(&self) -> bool { + self.exec_path != "/srv/openshell-vm-init.sh" + } + /// Default gateway configuration: boots k3s server inside the VM. /// /// Runs `/srv/openshell-vm-init.sh` which mounts essential filesystems, @@ -245,6 +294,10 @@ impl VmConfig { reset: false, gateway_name: format!("{GATEWAY_NAME_PREFIX}-default"), state_disk: Some(state_disk), + gpu_enabled: false, + gpu_has_msix: true, + vfio_device: None, + backend: VmBackendChoice::Auto, } } } @@ -277,6 +330,130 @@ pub fn named_rootfs_dir(instance_name: &str) -> Result { .join("rootfs")) } +/// Resolve the GPU rootfs path for a named instance. +/// +/// Layout: `$XDG_DATA_HOME/openshell/openshell-vm/{version}/instances/{name}/rootfs-gpu` +/// +/// The GPU rootfs is built separately with `build-rootfs.sh --gpu` and is +/// never embedded (too large with NVIDIA drivers). If it doesn't exist, +/// callers should return [`VmError::GpuRootfsNotFound`]. +pub fn named_gpu_rootfs_dir(instance_name: &str) -> Result { + let name = sanitize_instance_name(instance_name)?; + let base = openshell_bootstrap::paths::openshell_vm_base_dir() + .map_err(|e| VmError::RuntimeState(format!("resolve openshell-vm base dir: {e}")))?; + Ok(base + .join(env!("CARGO_PKG_VERSION")) + .join("instances") + .join(name) + .join("rootfs-gpu")) +} + +/// Ensure a GPU rootfs exists for the named instance. +/// +/// When the GPU rootfs directory doesn't exist, looks for a +/// `rootfs-gpu.tar.zst` tarball in these locations (in order): +/// +/// 1. Sidecar runtime dir: `/openshell-vm.runtime/rootfs-gpu.tar.zst` +/// 2. Environment variable: `OPENSHELL_VM_GPU_ROOTFS_TARBALL` +/// +/// If found, extracts to the instance `rootfs-gpu` path. This mirrors the +/// pattern used by [`ensure_named_rootfs`] for the standard rootfs. 
+/// +/// Validates that the rootfs contains the `.rootfs-gpu` sentinel written +/// by `build-rootfs.sh --gpu`, catching the case where a regular rootfs +/// was accidentally placed at the `rootfs-gpu` path. +pub fn ensure_gpu_rootfs(instance_name: &str) -> Result { + let gpu_rootfs = named_gpu_rootfs_dir(instance_name)?; + if !gpu_rootfs.is_dir() { + if let Some(tarball) = find_gpu_rootfs_tarball() { + extract_gpu_rootfs_tarball(&tarball, &gpu_rootfs)?; + } else { + return Err(VmError::GpuRootfsNotFound { + path: gpu_rootfs.display().to_string(), + }); + } + } + + let sentinel = gpu_rootfs.join("opt/openshell/.rootfs-gpu"); + if !sentinel.is_file() { + return Err(VmError::GpuRootfsNotFound { + path: format!( + "{} (directory exists but missing .rootfs-gpu sentinel — \ + was it built with --gpu?)", + gpu_rootfs.display() + ), + }); + } + + eprintln!("GPU rootfs: {}", gpu_rootfs.display()); + Ok(gpu_rootfs) +} + +const GPU_ROOTFS_TARBALL_ENV: &str = "OPENSHELL_VM_GPU_ROOTFS_TARBALL"; +const GPU_ROOTFS_TARBALL_NAME: &str = "rootfs-gpu.tar.zst"; + +/// Search for a GPU rootfs tarball in known locations. +fn find_gpu_rootfs_tarball() -> Option { + // 1. Sidecar runtime dir next to the binary + if let Ok(exe) = std::env::current_exe() { + if let Some(exe_dir) = exe.parent() { + let sidecar = exe_dir + .join("openshell-vm.runtime") + .join(GPU_ROOTFS_TARBALL_NAME); + if sidecar.is_file() { + return Some(sidecar); + } + } + } + + // 2. Environment variable override + if let Some(path) = std::env::var_os(GPU_ROOTFS_TARBALL_ENV) { + let path = PathBuf::from(path); + if path.is_file() { + return Some(path); + } + } + + None +} + +/// Extract a `rootfs-gpu.tar.zst` tarball into the given destination directory. 
+fn extract_gpu_rootfs_tarball(tarball: &Path, dest: &Path) -> Result<(), VmError> { + eprintln!( + "Extracting GPU rootfs...\n source: {}\n dest: {}", + tarball.display(), + dest.display() + ); + + let file = std::fs::File::open(tarball).map_err(|e| { + VmError::HostSetup(format!( + "open GPU rootfs tarball {}: {e}", + tarball.display() + )) + })?; + + let decoder = zstd::Decoder::new(std::io::BufReader::new(file)).map_err(|e| { + VmError::HostSetup(format!( + "create zstd decoder for {}: {e}", + tarball.display() + )) + })?; + + std::fs::create_dir_all(dest).map_err(|e| { + VmError::HostSetup(format!("create GPU rootfs dir {}: {e}", dest.display())) + })?; + + let mut archive = tar::Archive::new(decoder); + archive.unpack(dest).map_err(|e| { + // Clean up partial extraction + let _ = std::fs::remove_dir_all(dest); + VmError::HostSetup(format!("extract GPU rootfs tarball: {e}")) + })?; + + eprintln!(" GPU rootfs extracted to {}", dest.display()); + Ok(()) +} + /// Ensure a named instance rootfs exists, extracting from the embedded /// rootfs tarball on first use. /// @@ -365,7 +542,9 @@ fn sanitize_instance_name(name: &str) -> Result { /// Build a null-terminated C string array from a slice of strings. /// /// Returns both the `CString` owners (to keep them alive) and the pointer array. 
-fn c_string_array(strings: &[&str]) -> Result<(Vec, Vec<*const libc::c_char>), VmError> { +pub(crate) fn c_string_array( + strings: &[&str], +) -> Result<(Vec, Vec<*const libc::c_char>), VmError> { let owned: Vec = strings .iter() .map(|s| CString::new(*s)) @@ -570,7 +749,7 @@ fn extract_json_string(json: &str, key: &str) -> Option { map.get(key)?.as_str().map(ToOwned::to_owned) } -fn clamp_log_level(level: u32) -> u32 { +pub(crate) fn clamp_log_level(level: u32) -> u32 { match level { 0 => ffi::KRUN_LOG_LEVEL_OFF, 1 => ffi::KRUN_LOG_LEVEL_ERROR, @@ -581,258 +760,29 @@ fn clamp_log_level(level: u32) -> u32 { } } -struct VmContext { - krun: &'static ffi::LibKrun, - ctx_id: u32, -} - -impl VmContext { - fn create(log_level: u32) -> Result { - let krun = ffi::libkrun()?; - unsafe { - check( - (krun.krun_init_log)( - ffi::KRUN_LOG_TARGET_DEFAULT, - clamp_log_level(log_level), - ffi::KRUN_LOG_STYLE_AUTO, - ffi::KRUN_LOG_OPTION_NO_ENV, - ), - "krun_init_log", - )?; - } - - let ctx_id = unsafe { (krun.krun_create_ctx)() }; - if ctx_id < 0 { - return Err(VmError::Krun { - func: "krun_create_ctx", - code: ctx_id, - }); - } - - Ok(Self { - krun, - ctx_id: ctx_id as u32, - }) - } - - fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib), - "krun_set_vm_config", - ) - } - } - - fn set_root(&self, rootfs: &Path) -> Result<(), VmError> { - let rootfs_c = path_to_cstring(rootfs)?; - unsafe { - check( - (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()), - "krun_set_root", - ) - } - } - - fn add_state_disk(&self, state_disk: &StateDiskConfig) -> Result<(), VmError> { - let Some(add_disk3) = self.krun.krun_add_disk3 else { - return Err(VmError::HostSetup( - "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support" - .to_string(), - )); - }; - - let block_id_c = CString::new(state_disk.block_id.as_str())?; - let disk_path_c = 
path_to_cstring(&state_disk.path)?; - unsafe { - check( - add_disk3( - self.ctx_id, - block_id_c.as_ptr(), - disk_path_c.as_ptr(), - ffi::KRUN_DISK_FORMAT_RAW, - false, - false, - state_disk_sync_mode(), - ), - "krun_add_disk3", - ) - } - } - - fn set_workdir(&self, workdir: &str) -> Result<(), VmError> { - let workdir_c = CString::new(workdir)?; - unsafe { - check( - (self.krun.krun_set_workdir)(self.ctx_id, workdir_c.as_ptr()), - "krun_set_workdir", - ) - } - } - - fn disable_implicit_vsock(&self) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_disable_implicit_vsock)(self.ctx_id), - "krun_disable_implicit_vsock", - ) - } - } - - fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), - "krun_add_vsock", - ) - } - } - - #[cfg(target_os = "macos")] - fn add_net_unixgram( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - flags: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixgram)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - flags, - ), - "krun_add_net_unixgram", - ) - } - } - - #[allow(dead_code)] // FFI binding for future use (e.g. 
Linux networking) - fn add_net_unixstream( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixstream)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - 0, - ), - "krun_add_net_unixstream", - ) - } - } - - fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { - let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); - let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; - unsafe { - check( - (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), - "krun_set_port_map", - ) - } - } - - fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { - let socket_c = path_to_cstring(&port.socket_path)?; - unsafe { - check( - (self.krun.krun_add_vsock_port2)( - self.ctx_id, - port.port, - socket_c.as_ptr(), - port.listen, - ), - "krun_add_vsock_port2", - ) - } - } - - fn set_console_output(&self, path: &Path) -> Result<(), VmError> { - let console_c = path_to_cstring(path)?; - unsafe { - check( - (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), - "krun_set_console_output", - ) - } - } - - fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { - let exec_c = CString::new(exec_path)?; - let argv_strs: Vec<&str> = args.iter().map(String::as_str).collect(); - let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?; - let env_strs: Vec<&str> = env.iter().map(String::as_str).collect(); - let (_env_owners, env_ptrs) = c_string_array(&env_strs)?; - - unsafe { - check( - (self.krun.krun_set_exec)( - self.ctx_id, - exec_c.as_ptr(), - argv_ptrs.as_ptr(), - env_ptrs.as_ptr(), - ), - "krun_set_exec", - ) - } - } - - fn start_enter(&self) -> i32 { - unsafe { (self.krun.krun_start_enter)(self.ctx_id) } - } -} - -impl Drop for VmContext { - fn drop(&mut self) { - unsafe { - let ret = 
(self.krun.krun_free_ctx)(self.ctx_id); - if ret < 0 { - eprintln!( - "warning: krun_free_ctx({}) failed with code {ret}", - self.ctx_id - ); - } - } - } -} - /// RAII guard that kills and waits on a gvproxy child process when dropped. /// /// This prevents orphaned gvproxy processes when early `?` returns in the /// launch function cause the child to be dropped before cleanup code runs. /// Call [`GvproxyGuard::disarm`] to take ownership of the child when it /// should outlive the guard (i.e., after a successful fork). -struct GvproxyGuard { +pub(crate) struct GvproxyGuard { child: Option, } impl GvproxyGuard { - fn new(child: std::process::Child) -> Self { + pub(crate) fn new(child: std::process::Child) -> Self { Self { child: Some(child) } } /// Take the child out of the guard, preventing it from being killed on drop. /// Use this after the launch is successful and the parent will manage cleanup. - fn disarm(&mut self) -> Option { + pub(crate) fn disarm(&mut self) -> Option { self.child.take() } /// Get the child's PID without disarming. - fn id(&self) -> Option { + pub(crate) fn id(&self) -> Option { self.child.as_ref().map(std::process::Child::id) } } @@ -852,7 +802,7 @@ impl Drop for GvproxyGuard { /// /// Sends a raw HTTP/1.1 POST request over the unix socket to avoid /// depending on `curl` being installed on the host. -fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { +pub(crate) fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { use std::io::{Read, Write}; use std::os::unix::net::UnixStream; @@ -908,7 +858,7 @@ fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { /// runtime state. If the state file was deleted (e.g. the user ran /// `rm -rf` on the data directory), we fall back to killing any gvproxy /// process holding the target ports. 
-fn kill_stale_gvproxy(rootfs: &Path) { +pub(crate) fn kill_stale_gvproxy(rootfs: &Path) { kill_stale_gvproxy_by_state(rootfs); } @@ -929,7 +879,7 @@ fn kill_stale_gvproxy_by_state(rootfs: &Path) { /// /// Used as a fallback when the VM state file is missing (e.g. after the /// user deleted the data directory while a VM was running). -fn kill_stale_gvproxy_by_port(port: u16) { +pub(crate) fn kill_stale_gvproxy_by_port(port: u16) { // Use lsof to find PIDs listening on the target port. let output = std::process::Command::new("lsof") .args(["-ti", &format!(":{port}")]) @@ -953,23 +903,54 @@ fn kill_stale_gvproxy_by_port(port: u16) { fn kill_gvproxy_pid(gvproxy_pid: u32) { let pid_i32 = gvproxy_pid as libc::pid_t; let is_alive = unsafe { libc::kill(pid_i32, 0) } == 0; - if is_alive { - // Verify the process is actually gvproxy before killing. - // Without this check, PID reuse could cause us to kill an - // unrelated process. - if !is_process_named(pid_i32, "gvproxy") { - eprintln!( - "Stale gvproxy pid {gvproxy_pid} is no longer gvproxy (PID reused), skipping kill" - ); + if !is_alive { + return; + } + + if !is_process_named(pid_i32, "gvproxy") { + eprintln!( + "Stale gvproxy pid {gvproxy_pid} is no longer gvproxy (PID reused), skipping kill" + ); + return; + } + + unsafe { + libc::kill(pid_i32, libc::SIGTERM); + } + eprintln!("Killing stale gvproxy process (pid {gvproxy_pid})..."); + + // Wait up to 2 seconds for graceful shutdown, then escalate to SIGKILL. + let deadline = Instant::now() + Duration::from_secs(2); + loop { + std::thread::sleep(Duration::from_millis(50)); + if unsafe { libc::kill(pid_i32, 0) } != 0 { + eprintln!("Stale gvproxy (pid {gvproxy_pid}) terminated"); + std::thread::sleep(Duration::from_millis(100)); return; } - unsafe { - libc::kill(pid_i32, libc::SIGTERM); + if Instant::now() >= deadline { + break; } - eprintln!("Killed stale gvproxy process (pid {gvproxy_pid})"); - // Brief pause for the port to be released. 
- std::thread::sleep(std::time::Duration::from_millis(200)); } + + eprintln!("gvproxy (pid {gvproxy_pid}) did not exit after SIGTERM, sending SIGKILL"); + unsafe { + libc::kill(pid_i32, libc::SIGKILL); + } + + // Wait for the process to be reaped (up to 2 more seconds). + let kill_deadline = Instant::now() + Duration::from_secs(2); + loop { + std::thread::sleep(Duration::from_millis(50)); + if unsafe { libc::kill(pid_i32, 0) } != 0 { + break; + } + if Instant::now() >= kill_deadline { + eprintln!("warning: gvproxy (pid {gvproxy_pid}) still alive after SIGKILL"); + break; + } + } + std::thread::sleep(Duration::from_millis(100)); } /// Check whether a process with the given PID has the expected name. @@ -1009,7 +990,7 @@ fn is_process_named(_pid: libc::pid_t, _expected: &str) -> bool { false } -fn vm_rootfs_key(rootfs: &Path) -> String { +pub(crate) fn vm_rootfs_key(rootfs: &Path) -> String { let name = rootfs .file_name() .and_then(|part| part.to_str()) @@ -1078,7 +1059,7 @@ fn ensure_state_disk_image(state_disk: &StateDiskConfig) -> Result<(), VmError> Ok(()) } -fn state_disk_sync_mode() -> u32 { +pub(crate) fn state_disk_sync_mode() -> u32 { #[cfg(target_os = "macos")] { ffi::KRUN_SYNC_RELAXED @@ -1126,12 +1107,13 @@ fn secure_socket_base(subdir: &str) -> Result { dir.display() ))); } - // Verify ownership matches current user. + // Verify ownership matches current user. Root (uid 0) can safely + // use any directory, so skip this check under sudo / as root. 
#[cfg(unix)] { use std::os::unix::fs::MetadataExt as _; - let uid = unsafe { libc::getuid() }; - if meta.uid() != uid { + let uid = unsafe { libc::geteuid() }; + if uid != 0 && meta.uid() != uid { return Err(VmError::HostSetup(format!( "socket directory {} is owned by uid {} but we are uid {} — refusing to use it", dir.display(), @@ -1154,7 +1136,7 @@ fn secure_socket_base(subdir: &str) -> Result { Ok(dir) } -fn gvproxy_socket_dir(rootfs: &Path) -> Result { +pub(crate) fn gvproxy_socket_dir(rootfs: &Path) -> Result { let dir = secure_socket_base("ovm-gv")?; // macOS unix socket path limit is tight (~104 bytes). Keep paths very short. @@ -1162,16 +1144,44 @@ fn gvproxy_socket_dir(rootfs: &Path) -> Result { Ok(dir.join(id)) } -fn gateway_host_port(config: &VmConfig) -> u16 { - config - .port_map - .first() - .and_then(|pm| pm.split(':').next()) - .and_then(|port| port.parse::().ok()) - .unwrap_or(DEFAULT_GATEWAY_PORT) +/// Validate that a VFIO PCI address matches the BDF format `DDDD:BB:DD.F`. +/// +/// Rejects strings containing `/`, `..`, or non-hex characters to prevent +/// path traversal when the address is interpolated into sysfs paths. +fn validate_vfio_address(addr: &str) -> Result<(), VmError> { + let bytes = addr.as_bytes(); + if bytes.len() == 12 + && bytes[4] == b':' + && bytes[7] == b':' + && bytes[10] == b'.' + && bytes[..4].iter().all(u8::is_ascii_hexdigit) + && bytes[5..7].iter().all(u8::is_ascii_hexdigit) + && bytes[8..10].iter().all(u8::is_ascii_hexdigit) + && bytes[11].is_ascii_digit() + && bytes[11] <= b'7' + { + return Ok(()); + } + Err(VmError::HostSetup(format!( + "invalid VFIO PCI address '{addr}': expected BDF format DDDD:BB:DD.F (e.g. 
0000:41:00.0)" + ))) } -fn pick_gvproxy_ssh_port() -> Result { +pub(crate) fn gateway_host_port(config: &VmConfig) -> u16 { + for pm in &config.port_map { + let parts: Vec<&str> = pm.split(':').collect(); + if parts.len() == 2 { + if let Ok(guest) = parts[1].parse::() { + if guest == GUEST_GATEWAY_NODEPORT { + return parts[0].parse::().unwrap_or(DEFAULT_GATEWAY_PORT); + } + } + } + } + DEFAULT_GATEWAY_PORT +} + +pub(crate) fn pick_gvproxy_ssh_port() -> Result { let listener = std::net::TcpListener::bind(("127.0.0.1", 0)) .map_err(|e| VmError::HostSetup(format!("allocate gvproxy ssh port on localhost: {e}")))?; let port = listener @@ -1182,7 +1192,7 @@ fn pick_gvproxy_ssh_port() -> Result { Ok(port) } -fn path_to_cstring(path: &Path) -> Result { +pub(crate) fn path_to_cstring(path: &Path) -> Result { let s = path .to_str() .ok_or_else(|| VmError::InvalidPath(path.display().to_string()))?; @@ -1236,7 +1246,7 @@ pub fn launch(config: &VmConfig) -> Result { #[cfg(target_os = "linux")] check_kvm_access()?; - if config.exec_path == "/srv/openshell-vm-init.sh" { + if !config.is_exec_mode() { ensure_vm_not_running(&config.rootfs)?; } @@ -1245,7 +1255,7 @@ pub fn launch(config: &VmConfig) -> Result { // is killed (even SIGKILL), the OS releases the lock automatically. // This prevents a second launch or rootfs rebuild from corrupting a // running VM's filesystem via virtio-fs. - let _rootfs_lock = if config.exec_path == "/srv/openshell-vm-init.sh" { + let _rootfs_lock = if !config.is_exec_mode() { Some(acquire_rootfs_lock(&config.rootfs)?) } else { None @@ -1257,7 +1267,7 @@ pub fn launch(config: &VmConfig) -> Result { // every normal boot (not --reset, which wipes k3s/server/ entirely). // Must happen after the lock so we know no other VM process is using // the rootfs. 
- if !config.reset && config.exec_path == "/srv/openshell-vm-init.sh" { + if !config.reset && !config.is_exec_mode() { recover_corrupt_kine_db(&config.rootfs)?; } @@ -1277,11 +1287,22 @@ pub fn launch(config: &VmConfig) -> Result { state_disk.path.display() ))); } - if let Some(state_disk) = &config.state_disk { + let fresh_state_disk = if let Some(state_disk) = &config.state_disk { + let existed_before = state_disk.path.is_file(); ensure_state_disk_image(state_disk)?; + !existed_before + } else { + false + }; + + // When the state disk is freshly created (deleted by user, --reset, or + // first boot), the VM will generate new PKI. Clear any cached host-side + // mTLS certs so `bootstrap_gateway` runs the cold-boot PKI fetch path + // instead of using stale certs that won't match the new VM CA. + if fresh_state_disk || config.reset { + clear_warm_boot_certs(&config.gateway_name); } - let launch_start = Instant::now(); eprintln!("rootfs: {}", config.rootfs.display()); if let Some(state_disk) = &config.state_disk { eprintln!( @@ -1292,421 +1313,58 @@ pub fn launch(config: &VmConfig) -> Result { } eprintln!("vm: {} vCPU(s), {} MiB RAM", config.vcpus, config.mem_mib); - // The runtime is embedded in the binary and extracted on first use. - // Can be overridden via OPENSHELL_VM_RUNTIME_DIR for development. - let runtime_gvproxy = resolve_runtime_bundle()?; - let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { - VmError::HostSetup(format!( - "runtime bundle file has no parent directory: {}", - runtime_gvproxy.display() - )) - })?; - configure_runtime_loader_env(runtime_dir)?; raise_nofile_limit(); - // ── Log runtime provenance ───────────────────────────────────── - // After configuring the loader, trigger library loading so that - // provenance is captured before we proceed with VM configuration. 
- let _ = ffi::libkrun()?; - log_runtime_provenance(runtime_dir); - - // ── Configure the microVM ────────────────────────────────────── + // ── Dispatch to the appropriate backend ───────────────────────── - let vm = VmContext::create(config.log_level)?; - vm.set_vm_config(config.vcpus, config.mem_mib)?; - vm.set_root(&config.rootfs)?; - if let Some(state_disk) = &config.state_disk { - vm.add_state_disk(state_disk)?; + enum SelectedBackend { + Libkrun, + Qemu, } - vm.set_workdir(&config.workdir)?; - - // Networking setup — use a drop guard so gvproxy is killed if we - // return early via `?` before reaching the parent's cleanup code. - let mut gvproxy_guard: Option = None; - let mut gvproxy_api_sock: Option = None; - match &config.net { - NetBackend::Tsi => { - // Default TSI — no special setup needed. - } - NetBackend::None => { - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - eprintln!("Networking: disabled (no TSI, no virtio-net)"); - } - NetBackend::Gvproxy { binary } => { - if !binary.exists() { - return Err(VmError::BinaryNotFound { - path: binary.display().to_string(), - hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), - }); + let selected = match config.backend { + VmBackendChoice::Libkrun => SelectedBackend::Libkrun, + VmBackendChoice::Qemu => SelectedBackend::Qemu, + VmBackendChoice::Auto => { + if config.gpu_enabled || config.vfio_device.is_some() { + SelectedBackend::Qemu + } else { + SelectedBackend::Libkrun } + } + }; - // Create temp socket paths - let run_dir = config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .to_path_buf(); - let rootfs_key = vm_rootfs_key(&config.rootfs); - let sock_base = gvproxy_socket_dir(&config.rootfs)?; - let net_sock = sock_base.with_extension("v"); - let api_sock = sock_base.with_extension("a"); - - // Kill any stale gvproxy process from a previous run. 
- // First try via the saved PID in the state file, then fall - // back to killing any gvproxy holding our target ports (covers - // the case where the state file was deleted). - kill_stale_gvproxy(&config.rootfs); - for pm in &config.port_map { - if let Some(host_port) = pm.split(':').next().and_then(|p| p.parse::().ok()) { - kill_stale_gvproxy_by_port(host_port); - } - } + match selected { + SelectedBackend::Qemu => { + #[cfg(not(target_os = "linux"))] + return Err(VmError::HostSetup( + "QEMU backend requires Linux with KVM".into(), + )); - // Clean stale sockets (including the -krun.sock file that - // libkrun creates as its datagram endpoint on macOS). - let _ = std::fs::remove_file(&net_sock); - let _ = std::fs::remove_file(&api_sock); - let krun_sock = sock_base.with_extension("v-krun.sock"); - let _ = std::fs::remove_file(&krun_sock); - - // Start gvproxy - eprintln!("Starting gvproxy: {}", binary.display()); - let ssh_port = pick_gvproxy_ssh_port()?; - let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); - let gvproxy_log_file = std::fs::File::create(&gvproxy_log) - .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; - - // On Linux, gvproxy uses QEMU mode (SOCK_STREAM) since the vfkit - // unixgram scheme is macOS/vfkit-specific. On macOS, use vfkit mode. 
#[cfg(target_os = "linux")] - let (gvproxy_net_flag, gvproxy_net_url) = - ("-listen-qemu", format!("unix://{}", net_sock.display())); - #[cfg(target_os = "macos")] - let (gvproxy_net_flag, gvproxy_net_url) = ( - "-listen-vfkit", - format!("unixgram://{}", net_sock.display()), - ); - - let child = std::process::Command::new(binary) - .arg(gvproxy_net_flag) - .arg(&gvproxy_net_url) - .arg("-listen") - .arg(format!("unix://{}", api_sock.display())) - .arg("-ssh-port") - .arg(ssh_port.to_string()) - .stdout(std::process::Stdio::null()) - .stderr(gvproxy_log_file) - .spawn() - .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; - - eprintln!( - "gvproxy started (pid {}, ssh port {}) [{:.1}s]", - child.id(), - ssh_port, - launch_start.elapsed().as_secs_f64() - ); - - // Wait for the socket to appear (exponential backoff: 5ms → 100ms). { - let deadline = Instant::now() + std::time::Duration::from_secs(5); - let mut interval = std::time::Duration::from_millis(5); - while !net_sock.exists() { - if Instant::now() >= deadline { - return Err(VmError::Fork( - "gvproxy socket did not appear within 5s".to_string(), - )); - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(100)); + if let Some(ref addr) = config.vfio_device { + validate_vfio_address(addr)?; } + let qemu_backend = backend::qemu::QemuBackend::new()?; + backend::VmBackend::launch(&qemu_backend, config) } - - // Disable implicit TSI and add virtio-net via gvproxy - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - // This MAC matches gvproxy's default static DHCP lease for - // 192.168.127.2. Using a different MAC can cause the gVisor - // network stack to misroute or drop packets. 
- let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; - - // COMPAT_NET_FEATURES from libkrun.h - const NET_FEATURE_CSUM: u32 = 1 << 0; - const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; - const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; - const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; - const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; - const NET_FEATURE_HOST_UFO: u32 = 1 << 14; - const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM - | NET_FEATURE_GUEST_CSUM - | NET_FEATURE_GUEST_TSO4 - | NET_FEATURE_GUEST_UFO - | NET_FEATURE_HOST_TSO4 - | NET_FEATURE_HOST_UFO; - - // On Linux use unixstream (SOCK_STREAM) to connect to gvproxy's - // QEMU listener. On macOS use unixgram (SOCK_DGRAM) with the vfkit - // magic byte for the vfkit listener. - #[cfg(target_os = "linux")] - vm.add_net_unixstream(&net_sock, &mac, COMPAT_NET_FEATURES)?; - #[cfg(target_os = "macos")] - { - const NET_FLAG_VFKIT: u32 = 1 << 0; - vm.add_net_unixgram(&net_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; - } - - eprintln!( - "Networking: gvproxy (virtio-net) [{:.1}s]", - launch_start.elapsed().as_secs_f64() - ); - gvproxy_guard = Some(GvproxyGuard::new(child)); - gvproxy_api_sock = Some(api_sock); } - } - - // Port mapping (TSI only) - if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { - vm.set_port_map(&config.port_map)?; - } - - for vsock_port in &config.vsock_ports { - if let Some(parent) = vsock_port.socket_path.parent() { - std::fs::create_dir_all(parent).map_err(|e| { - VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display())) + SelectedBackend::Libkrun => { + let runtime_gvproxy = resolve_runtime_bundle()?; + let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { + VmError::HostSetup(format!( + "runtime bundle file has no parent directory: {}", + runtime_gvproxy.display() + )) })?; - } - // libkrun returns EEXIST if the socket file is already present from a - // previous run. Remove any stale socket before registering the port. 
- let _ = std::fs::remove_file(&vsock_port.socket_path); - vm.add_vsock_port(vsock_port)?; - } - - // Console output - let console_log = config.console_output.clone().unwrap_or_else(|| { - config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs))) - }); - vm.set_console_output(&console_log)?; - - // envp: use provided env or minimal defaults - let mut env: Vec = if config.env.is_empty() { - vec![ - "HOME=/root", - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm", - ] - .into_iter() - .map(ToOwned::to_owned) - .collect() - } else { - config.env.clone() - }; - if let Some(state_disk) = &config.state_disk - && !env - .iter() - .any(|entry| entry.starts_with("OPENSHELL_VM_STATE_DISK_DEVICE=")) - { - env.push(format!( - "OPENSHELL_VM_STATE_DISK_DEVICE={}", - state_disk.guest_device - )); - } - vm.set_exec(&config.exec_path, &config.args, &env)?; + configure_runtime_loader_env(runtime_dir)?; - // ── Fork and enter the VM ────────────────────────────────────── - // - // krun_start_enter() never returns — it calls exit() when the guest - // process exits. We fork so the parent can monitor and report. 
- - let boot_start = Instant::now(); - eprintln!("Booting microVM..."); - - let pid = unsafe { libc::fork() }; - match pid { - -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())), - 0 => { - // Child process: enter the VM (never returns on success) - let ret = vm.start_enter(); - eprintln!("krun_start_enter failed: {ret}"); - std::process::exit(1); - } - _ => { - // Parent: wait for child - if config.exec_path == "/srv/openshell-vm-init.sh" { - let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id); - if let Err(err) = - write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid) - { - unsafe { - libc::kill(pid, libc::SIGTERM); - } - // Guard drop will kill gvproxy automatically - drop(gvproxy_guard); - clear_vm_runtime_state(&config.rootfs); - return Err(err); - } - } - eprintln!( - "VM started (child pid {pid}) [{:.1}s]", - boot_start.elapsed().as_secs_f64() - ); - for pm in &config.port_map { - let host_port = pm.split(':').next().unwrap_or(pm); - eprintln!(" port {pm} -> http://localhost:{host_port}"); - } - eprintln!("Console output: {}", console_log.display()); - - // Set up gvproxy port forwarding via its HTTP API. - // The port_map entries use the same "host:guest" format - // as TSI, but here we translate them into gvproxy expose - // calls targeting the guest IP (192.168.127.2). - // - // Instead of a fixed 500ms sleep, poll the API socket with - // exponential backoff (5ms → 200ms, ~1s total budget). - if let Some(ref api_sock) = gvproxy_api_sock { - let fwd_start = Instant::now(); - // Wait for the API socket to appear (it lags slightly - // behind the vfkit data socket). 
- { - let deadline = Instant::now() + std::time::Duration::from_secs(2); - let mut interval = std::time::Duration::from_millis(5); - while !api_sock.exists() { - if Instant::now() >= deadline { - eprintln!( - "warning: gvproxy API socket not ready after 2s, attempting anyway" - ); - break; - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(200)); - } - } - - let guest_ip = "192.168.127.2"; - - for pm in &config.port_map { - let parts: Vec<&str> = pm.split(':').collect(); - let (host_port, guest_port) = match parts.len() { - 2 => (parts[0], parts[1]), - 1 => (parts[0], parts[0]), - _ => { - eprintln!(" skipping invalid port mapping: {pm}"); - continue; - } - }; - - let expose_body = format!( - r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# - ); - - // Retry with exponential backoff — gvproxy's internal - // netstack may not be ready immediately after socket creation. - let mut expose_ok = false; - let mut retry_interval = std::time::Duration::from_millis(100); - let expose_deadline = Instant::now() + std::time::Duration::from_secs(10); - loop { - match gvproxy_expose(api_sock, &expose_body) { - Ok(()) => { - eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); - expose_ok = true; - break; - } - Err(e) => { - if Instant::now() >= expose_deadline { - eprintln!(" port {host_port}: {e} (retries exhausted)"); - break; - } - std::thread::sleep(retry_interval); - retry_interval = - (retry_interval * 2).min(std::time::Duration::from_secs(1)); - } - } - } - if !expose_ok { - return Err(VmError::HostSetup(format!( - "failed to forward port {host_port} via gvproxy" - ))); - } - } - eprintln!( - "Port forwarding ready [{:.1}s]", - fwd_start.elapsed().as_secs_f64() - ); - } - - // Bootstrap the OpenShell control plane and wait for the - // service to be reachable. Only for the gateway preset, and - // only when port forwarding is configured (i.e. 
the gateway - // is reachable from the host). During rootfs pre-init builds, - // no --port is specified so there is nothing to health-check - // — the build script has its own kubectl-based readiness - // checks inside the VM. - if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { - // Bootstrap stores host-side metadata and mTLS creds. - // With pre-baked rootfs (Path 1) this reads PKI directly - // from virtio-fs — no kubectl or port forwarding needed. - // Cold boot (Path 2) writes secret manifests into the - // k3s auto-deploy directory via virtio-fs. - let gateway_port = gateway_host_port(config); - bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; - - // Wait for the gRPC health check to pass. This ensures - // the service is fully operational, not just accepting - // TCP connections. The health check confirms the full - // path (gvproxy → kube-proxy nftables → pod:8080) and - // that the gRPC service is responding to requests. - health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; - } - - eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); - eprintln!("Press Ctrl+C to stop."); - - // Forward signals to child - unsafe { - libc::signal( - libc::SIGINT, - forward_signal as *const () as libc::sighandler_t, - ); - libc::signal( - libc::SIGTERM, - forward_signal as *const () as libc::sighandler_t, - ); - CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); - } - - let mut status: libc::c_int = 0; - unsafe { - libc::waitpid(pid, &raw mut status, 0); - } - - // Clean up gvproxy — disarm the guard and do explicit cleanup - // so we can print the "stopped" message. 
- if config.exec_path == "/srv/openshell-vm-init.sh" { - clear_vm_runtime_state(&config.rootfs); - } - if let Some(mut guard) = gvproxy_guard - && let Some(mut child) = guard.disarm() - { - let _ = child.kill(); - let _ = child.wait(); - eprintln!("gvproxy stopped"); - } - - if libc::WIFEXITED(status) { - let code = libc::WEXITSTATUS(status); - eprintln!("VM exited with code {code}"); - return Ok(code); - } else if libc::WIFSIGNALED(status) { - let sig = libc::WTERMSIG(status); - eprintln!("VM killed by signal {sig}"); - return Ok(128 + sig); - } + let _ = ffi::libkrun()?; + log_runtime_provenance(runtime_dir); - Ok(status) + let libkrun_backend = backend::libkrun::LibkrunBackend; + backend::VmBackend::launch(&libkrun_backend, config) } } } @@ -1716,6 +1374,9 @@ pub fn launch(config: &VmConfig) -> Result { /// Default gateway port: host port mapped to the `OpenShell` `NodePort` (30051). const DEFAULT_GATEWAY_PORT: u16 = 30051; +/// The NodePort the OpenShell gateway listens on inside the VM. +pub const GUEST_GATEWAY_NODEPORT: u16 = 30051; + /// Bootstrap the `OpenShell` control plane after k3s is ready. /// /// Two paths: @@ -1727,7 +1388,11 @@ const DEFAULT_GATEWAY_PORT: u16 = 30051; /// 2. **First boot / post-reset**: poll the exec agent to `cat` each PEM file /// from `/opt/openshell/pki/` until the files exist (PKI generation has /// finished), then store them in `~/.config/openshell/gateways//mtls/`. -fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Result<(), VmError> { +pub(crate) fn bootstrap_gateway( + rootfs: &Path, + gateway_name: &str, + gateway_port: u16, +) -> Result<(), VmError> { let bootstrap_start = Instant::now(); let metadata = openshell_bootstrap::GatewayMetadata { @@ -1761,7 +1426,7 @@ fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Re // drift check and the host already has valid certs. If the agent // isn't reachable we skip silently rather than blocking boot for // 30s. 
- match fetch_pki_over_exec(&exec_socket, std::time::Duration::from_secs(5)) { + match fetch_pki_over_exec(&exec_socket, Duration::from_secs(5)) { Ok(bundle) => { if let Err(e) = sync_host_certs_if_stale(gateway_name, &bundle) { eprintln!("Warning: cert sync check failed: {e}"); @@ -1788,7 +1453,7 @@ fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Re // We poll the exec agent with `cat ` for each PEM file until they // exist, retrying to handle the window between VM boot and PKI generation. eprintln!("Waiting for VM to generate PKI..."); - let pki_bundle = fetch_pki_over_exec(&exec_socket, std::time::Duration::from_secs(120)) + let pki_bundle = fetch_pki_over_exec(&exec_socket, Duration::from_secs(120)) .map_err(|e| VmError::Bootstrap(format!("VM did not produce PKI within 120s: {e}")))?; eprintln!("PKI ready — storing client certs on host..."); @@ -1829,7 +1494,7 @@ const PKI_FILES: &[(&str, &str)] = &[ /// and PKI generation completing. fn fetch_pki_over_exec( exec_socket: &Path, - timeout: std::time::Duration, + timeout: Duration, ) -> Result { let deadline = Instant::now() + timeout; @@ -1837,7 +1502,7 @@ fn fetch_pki_over_exec( match try_read_pki_files(exec_socket) { Ok(bundle) => return Ok(bundle), Err(_) if Instant::now() < deadline => { - std::thread::sleep(std::time::Duration::from_millis(500)); + std::thread::sleep(Duration::from_millis(500)); } Err(e) => { return Err(VmError::Bootstrap(format!( @@ -1921,6 +1586,31 @@ fn is_warm_boot(gateway_name: &str) -> bool { true } +/// Remove cached mTLS certs from the host so the next `bootstrap_gateway` +/// call treats this as a cold boot and fetches fresh PKI from the VM. +/// +/// Called when the state disk is freshly created or `--reset` is used, +/// since the VM will generate new PKI that won't match stale host certs. 
+fn clear_warm_boot_certs(gateway_name: &str) { + let Ok(home) = std::env::var("HOME") else { + return; + }; + let config_base = + std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); + let mtls_dir = PathBuf::from(&config_base) + .join("openshell/gateways") + .join(gateway_name) + .join("mtls"); + + if mtls_dir.is_dir() { + if let Err(e) = std::fs::remove_dir_all(&mtls_dir) { + eprintln!("Warning: failed to clear stale mTLS certs: {e}"); + } else { + eprintln!("Cleared stale host mTLS certs"); + } + } +} + /// Compare the CA cert on the rootfs (authoritative source) against the /// host-side copy. If they differ, re-copy all client certs from the rootfs. /// @@ -1956,15 +1646,38 @@ fn sync_host_certs_if_stale( Ok(()) } -static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); +pub(crate) static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); + +pub(crate) static VIRTIOFSD_PID: std::sync::atomic::AtomicI32 = + std::sync::atomic::AtomicI32::new(0); -extern "C" fn forward_signal(_sig: libc::c_int) { +/// Set to `true` by the signal handler when a shutdown signal (SIGTERM/SIGINT) +/// is received. The main thread checks this after `qemu_child.wait()` returns +/// to ensure cleanup runs even if the wait was interrupted. +pub(crate) static SHUTDOWN_REQUESTED: std::sync::atomic::AtomicBool = + std::sync::atomic::AtomicBool::new(false); + +/// Signal handler that forwards SIGTERM to child processes and sets the +/// shutdown flag. Only calls async-signal-safe functions (libc::kill, +/// atomic stores). No heap allocation, no println, no mutex. +pub(crate) extern "C" fn forward_signal(_sig: libc::c_int) { + SHUTDOWN_REQUESTED.store(true, std::sync::atomic::Ordering::Relaxed); + + // Always send SIGTERM to each child individually. The process-group + // approach (kill(-pgid)) is unreliable because setpgid() in QEMU's + // pre_exec silently fails — QEMU stays in its parent's group. 
let pid = CHILD_PID.load(std::sync::atomic::Ordering::Relaxed); if pid > 0 { unsafe { libc::kill(pid, libc::SIGTERM); } } + let vfsd_pid = VIRTIOFSD_PID.load(std::sync::atomic::Ordering::Relaxed); + if vfsd_pid > 0 { + unsafe { + libc::kill(vfsd_pid, libc::SIGTERM); + } + } } #[cfg(test)] @@ -2082,4 +1795,85 @@ mod tests { let _ = fs::remove_dir_all(&dir); } + + #[test] + fn auto_selects_qemu_for_gpu() { + enum SelectedBackend { + Libkrun, + Qemu, + } + + let select = |backend: VmBackendChoice, gpu_enabled: bool| match backend { + VmBackendChoice::Libkrun => SelectedBackend::Libkrun, + VmBackendChoice::Qemu => SelectedBackend::Qemu, + VmBackendChoice::Auto => { + if gpu_enabled { + SelectedBackend::Qemu + } else { + SelectedBackend::Libkrun + } + } + }; + + assert!(matches!( + select(VmBackendChoice::Auto, true), + SelectedBackend::Qemu + )); + assert!(matches!( + select(VmBackendChoice::Auto, false), + SelectedBackend::Libkrun + )); + assert!(matches!( + select(VmBackendChoice::Qemu, false), + SelectedBackend::Qemu + )); + } + + fn config_with_port_map(port_map: Vec) -> VmConfig { + VmConfig { + rootfs: PathBuf::from("/tmp/fake-rootfs"), + vcpus: 1, + mem_mib: 512, + exec_path: "/bin/true".to_string(), + args: vec![], + env: vec![], + workdir: "/".to_string(), + port_map, + vsock_ports: vec![], + log_level: 0, + console_output: None, + net: NetBackend::Tsi, + reset: false, + gateway_name: "test".to_string(), + state_disk: None, + gpu_enabled: false, + gpu_has_msix: false, + vfio_device: None, + backend: VmBackendChoice::Auto, + } + } + + #[test] + fn gateway_host_port_default_mapping() { + let cfg = config_with_port_map(vec!["30051:30051".to_string()]); + assert_eq!(gateway_host_port(&cfg), 30051); + } + + #[test] + fn gateway_host_port_no_gateway_mapping_returns_default() { + let cfg = config_with_port_map(vec!["6443:6443".to_string(), "8080:8080".to_string()]); + assert_eq!(gateway_host_port(&cfg), DEFAULT_GATEWAY_PORT); + } + + #[test] + fn 
gateway_host_port_finds_remapped_gateway() { + let cfg = config_with_port_map(vec!["6443:6443".to_string(), "9999:30051".to_string()]); + assert_eq!(gateway_host_port(&cfg), 9999); + } + + #[test] + fn gateway_host_port_empty_port_map() { + let cfg = config_with_port_map(vec![]); + assert_eq!(gateway_host_port(&cfg), DEFAULT_GATEWAY_PORT); + } } diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs index bb9d854b1..9241db908 100644 --- a/crates/openshell-vm/src/main.rs +++ b/crates/openshell-vm/src/main.rs @@ -17,8 +17,9 @@ //! codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - target/debug/openshell-vm //! ``` -use std::io::IsTerminal; +use std::io::{BufRead, IsTerminal}; use std::path::PathBuf; +use std::time::Duration; use clap::{Parser, Subcommand, ValueHint}; @@ -92,6 +93,16 @@ struct Cli { /// unclean shutdown. #[arg(long)] reset: bool, + + /// Enable GPU passthrough. Optionally specify a PCI address + /// (e.g. `0000:41:00.0`). Uses QEMU backend with VFIO. + #[arg(long, num_args = 0..=1, default_missing_value = "auto")] + gpu: Option, + + /// Hypervisor backend: "auto" (default), "libkrun", or "qemu". + /// Auto selects QEMU when --gpu is set, and libkrun otherwise. + #[arg(long, default_value = "auto")] + backend: String, } #[derive(Subcommand)] @@ -158,6 +169,19 @@ fn main() { } } + #[cfg(target_os = "linux")] + { + #[allow(unsafe_code)] + let ret = unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) }; + if ret != 0 { + eprintln!( + "warning: prctl(PR_SET_PDEATHSIG) failed: {} — \ + signal propagation through sudo may not work", + std::io::Error::last_os_error() + ); + } + } + tracing_subscriber::fmt::init(); let cli = Cli::parse(); @@ -175,6 +199,102 @@ fn main() { } } +/// RAII guard that restarts the display manager when dropped. +/// +/// Created when the user confirms stopping the display manager for GPU +/// passthrough. 
On drop (normal exit, error, or panic), restarts the +/// service so the user's graphical session is restored. +struct DisplayManagerGuard; + +impl DisplayManagerGuard { + fn stop_display_manager() -> Result> { + eprintln!("Stopping display-manager..."); + let status = std::process::Command::new("systemctl") + .args(["stop", "display-manager"]) + .status()?; + if !status.success() { + return Err(format!( + "failed to stop display-manager (exit {})", + status.code().unwrap_or(-1) + ) + .into()); + } + eprintln!("display-manager stopped"); + // Give Xorg time to release GPU device handles. + std::thread::sleep(Duration::from_secs(2)); + Ok(Self) + } +} + +impl Drop for DisplayManagerGuard { + fn drop(&mut self) { + eprintln!("Restarting display-manager..."); + match std::process::Command::new("systemctl") + .args(["start", "display-manager"]) + .status() + { + Ok(s) if s.success() => eprintln!("display-manager restarted"), + Ok(s) => eprintln!( + "warning: display-manager restart failed (exit {})", + s.code().unwrap_or(-1) + ), + Err(e) => eprintln!("warning: could not restart display-manager: {e}"), + } + } +} + +/// Prompt the user to stop the display manager for GPU passthrough. +/// +/// Returns `true` if the user confirms. Always returns `false` when stdin +/// is not a terminal (non-interactive mode). 
+fn prompt_display_manager_stop(info: &openshell_vfio::DisplayBlockerInfo) -> bool { + if !std::io::stdin().is_terminal() { + return false; + } + + eprintln!(); + eprintln!( + "WARNING: GPU {} is in use by the display manager.", + info.pci_addr + ); + if !info.display_processes.is_empty() { + let procs: Vec = info + .display_processes + .iter() + .map(|(pid, comm)| format!("{comm} (PID {pid})")) + .collect(); + eprintln!(" Display server processes: {}", procs.join(", ")); + } + if info.has_active_outputs { + eprintln!(" Active display outputs are connected to this GPU."); + } + eprintln!(); + eprintln!("Stopping the display manager will terminate your graphical session."); + eprintln!("You will lose access to any open GUI applications."); + if !info.other_processes.is_empty() { + let procs: Vec = info + .other_processes + .iter() + .map(|(pid, comm)| format!("{comm} (PID {pid})")) + .collect(); + eprintln!(); + eprintln!( + "Other non-display processes are also using the GPU: {}", + procs.join(", ") + ); + eprintln!("These will also lose GPU access."); + } + eprintln!(); + eprintln!("The display manager will be restarted automatically when the VM exits."); + eprint!("Stop display-manager and proceed with GPU passthrough? [y/N] "); + + let mut input = String::new(); + if std::io::stdin().lock().read_line(&mut input).is_err() { + return false; + } + matches!(input.trim().to_lowercase().as_str(), "y" | "yes") +} + fn run(cli: Cli) -> Result> { if let Some(GatewayCommand::PrepareRootfs { force }) = &cli.command { let rootfs = openshell_vm::prepare_rootfs(cli.rootfs.clone(), &cli.name, *force)?; @@ -196,12 +316,16 @@ fn run(cli: Cli) -> Result> { return Err("openshell-vm exec requires a command when stdin is not a TTY".into()); } } + let exec_rootfs = if let Some(explicit) = cli.rootfs { + explicit + } else if cli.gpu.is_some() { + openshell_vm::named_gpu_rootfs_dir(&cli.name)? + } else { + openshell_vm::named_rootfs_dir(&cli.name)? 
+ }; return Ok(openshell_vm::exec_running_vm( openshell_vm::VmExecOptions { - rootfs: Some( - cli.rootfs - .unwrap_or(openshell_vm::named_rootfs_dir(&cli.name)?), - ), + rootfs: Some(exec_rootfs), command, workdir, env, @@ -223,12 +347,101 @@ fn run(cli: Cli) -> Result> { } }; - let rootfs = cli - .rootfs - .map_or_else(|| openshell_vm::ensure_named_rootfs(&cli.name), Ok)?; + let rootfs = if let Some(explicit) = cli.rootfs { + Ok(explicit) + } else if cli.gpu.is_some() { + openshell_vm::ensure_gpu_rootfs(&cli.name) + } else { + openshell_vm::ensure_named_rootfs(&cli.name) + }?; let gateway_name = openshell_vm::gateway_name(&cli.name)?; + // Check if the display manager is blocking GPU passthrough and offer + // to stop it interactively. The guard restarts display-manager on exit. + let _display_manager_guard: Option = if cli.gpu.is_some() { + let requested_bdf = match cli.gpu.as_deref() { + Some(addr) if addr != "auto" => Some(addr), + _ => None, + }; + + if let Some(blocker) = openshell_vfio::detect_display_blocker(requested_bdf) { + if prompt_display_manager_stop(&blocker) { + Some(DisplayManagerGuard::stop_display_manager()?) 
+ } else { + return Err(format!( + "GPU passthrough aborted: GPU {} is in use by the display manager.\n\ + To proceed, stop it manually before launching the VM:\n \ + sudo systemctl stop display-manager", + blocker.pci_addr + ) + .into()); + } + } else { + None + } + } else { + None + }; + + let (gpu_enabled, vfio_device, gpu_has_msix, _gpu_guard) = match cli.gpu { + Some(ref addr) if addr != "auto" => { + let state = openshell_vfio::prepare_gpu_for_passthrough(Some(addr))?; + let bdf = state.pci_addr.clone(); + let has_msix = state.has_msix; + ( + true, + Some(bdf), + has_msix, + Some(openshell_vfio::GpuBindGuard::new(state)), + ) + } + Some(_) => { + let state = openshell_vfio::prepare_gpu_for_passthrough(None)?; + let bdf = state.pci_addr.clone(); + let has_msix = state.has_msix; + ( + true, + Some(bdf), + has_msix, + Some(openshell_vfio::GpuBindGuard::new(state)), + ) + } + None => (false, None, true, None), + }; + + if let Some(ref guard) = _gpu_guard { + if let Some(state) = guard.state() { + if state.did_bind { + eprintln!( + "\nGPU recovery: if this process is force-killed (kill -9), \ + restore your GPU with:\n{}", + state.recovery_commands() + ); + } + } + } + + let backend_choice = match cli.backend.as_str() { + "qemu" => openshell_vm::VmBackendChoice::Qemu, + "libkrun" => { + if gpu_enabled { + return Err( + "--backend libkrun is incompatible with --gpu (libkrun does not support \ + VFIO passthrough). Use --backend auto or --backend qemu." 
+ .into(), + ); + } + openshell_vm::VmBackendChoice::Libkrun + } + "auto" => openshell_vm::VmBackendChoice::Auto, + other => { + return Err( + format!("unknown --backend: {other} (expected: auto, libkrun, qemu)").into(), + ); + } + }; + let mut config = if let Some(exec_path) = cli.exec { openshell_vm::VmConfig { rootfs, @@ -246,11 +459,27 @@ fn run(cli: Cli) -> Result> { reset: cli.reset, gateway_name, state_disk: None, + gpu_enabled, + gpu_has_msix, + vfio_device, + backend: backend_choice, } } else { let mut c = openshell_vm::VmConfig::gateway(rootfs); if !cli.port.is_empty() { c.port_map = cli.port; + let has_gateway = c.port_map.iter().any(|pm| { + pm.split(':').nth(1).and_then(|p| p.parse::().ok()) + == Some(openshell_vm::GUEST_GATEWAY_NODEPORT) + }); + if !has_gateway { + let gw_port = openshell_vm::GUEST_GATEWAY_NODEPORT; + c.port_map.push(format!("{gw_port}:{gw_port}")); + eprintln!( + "Auto-added gateway port mapping {gw_port}:{gw_port} \ + (required for health check and CLI access)" + ); + } } if let Some(v) = cli.vcpus { c.vcpus = v; @@ -261,6 +490,10 @@ fn run(cli: Cli) -> Result> { c.net = net_backend; c.reset = cli.reset; c.gateway_name = gateway_name; + c.gpu_enabled = gpu_enabled; + c.gpu_has_msix = gpu_has_msix; + c.vfio_device = vfio_device; + c.backend = backend_choice; if state_disk_disabled() { c.state_disk = None; } diff --git a/crates/openshell-vm/tests/vm_boot_smoke.rs b/crates/openshell-vm/tests/vm_boot_smoke.rs new file mode 100644 index 000000000..f16027129 --- /dev/null +++ b/crates/openshell-vm/tests/vm_boot_smoke.rs @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Non-GPU boot smoke tests for the QEMU backend. +//! +//! Boots a VM **without** VFIO/GPU passthrough and verifies the kernel boots +//! and init runs. This catches backend regressions on regular CI runners +//! that lack GPU hardware. +//! +//! 
Gated on `OPENSHELL_VM_BACKEND` — set to `qemu` to run the tests. +//! Skipped when the env var is absent. +//! +//! Requires the VM runtime bundle (vmlinux, virtiofsd, rootfs, and the +//! backend binary) to be installed. Set `OPENSHELL_VM_RUNTIME_DIR` or run +//! `mise run vm:bundle-runtime` first. +//! +//! Run explicitly: +//! +//! ```sh +//! OPENSHELL_VM_BACKEND=qemu cargo test -p openshell-vm --test vm_boot_smoke +//! ``` + +#![allow(unsafe_code)] + +use std::process::{Command, Stdio}; +use std::time::Duration; + +const GATEWAY: &str = env!("CARGO_BIN_EXE_openshell-vm"); + +fn runtime_bundle_dir() -> std::path::PathBuf { + std::path::Path::new(GATEWAY) + .parent() + .expect("openshell-vm binary has no parent") + .join("openshell-vm.runtime") +} + +fn require_bundle() { + let bundle = runtime_bundle_dir(); + if !bundle.is_dir() { + panic!( + "VM runtime bundle not found at {}. Run `mise run vm:bundle-runtime` first.", + bundle.display() + ); + } +} + +fn skip_unless_qemu() -> bool { + if std::env::var("OPENSHELL_VM_BACKEND").as_deref() != Ok("qemu") { + eprintln!("OPENSHELL_VM_BACKEND != qemu — skipping"); + return true; + } + false +} + +#[test] +fn qemu_exec_exits_cleanly() { + if skip_unless_qemu() { + return; + } + require_bundle(); + + let mut child = Command::new(GATEWAY) + .args(["--backend", "qemu", "--net", "none", "--exec", "/bin/true"]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + let timeout = Duration::from_secs(30); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => { + assert!( + status.success(), + "qemu --exec /bin/true exited with {status}" + ); + return; + } + Ok(None) => { + if start.elapsed() > timeout { + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGKILL) }; + let _ = child.wait(); + panic!("QEMU VM did not exit within {timeout:?}"); + } + std::thread::sleep(Duration::from_millis(500)); + } + Err(e) => panic!("error 
waiting for openshell-vm: {e}"), + } + } +} + +#[test] +fn qemu_boots_without_gpu() { + if skip_unless_qemu() { + return; + } + require_bundle(); + + if !nix_is_root() { + eprintln!("skipping full gateway boot — requires root for TAP networking"); + return; + } + + let mut child = Command::new(GATEWAY) + .args(["--backend", "qemu"]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + let addr: std::net::SocketAddr = ([127, 0, 0, 1], 30051).into(); + let timeout = Duration::from_secs(180); + let start = std::time::Instant::now(); + let mut reachable = false; + + while start.elapsed() < timeout { + if std::net::TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { + reachable = true; + break; + } + std::thread::sleep(Duration::from_secs(2)); + } + + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGTERM) }; + let _ = child.wait(); + + assert!( + reachable, + "QEMU VM service on port 30051 not reachable within {timeout:?}" + ); +} + +fn nix_is_root() -> bool { + unsafe { libc::geteuid() == 0 } +} diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index b7e854677..59b133629 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -56,6 +56,7 @@ COPY crates/openshell-router/Cargo.toml crates/openshell-router/Cargo.toml COPY crates/openshell-sandbox/Cargo.toml crates/openshell-sandbox/Cargo.toml COPY crates/openshell-server/Cargo.toml crates/openshell-server/Cargo.toml COPY crates/openshell-tui/Cargo.toml crates/openshell-tui/Cargo.toml +COPY crates/openshell-vfio/Cargo.toml crates/openshell-vfio/Cargo.toml COPY crates/openshell-vm/Cargo.toml crates/openshell-vm/Cargo.toml COPY crates/openshell-core/build.rs crates/openshell-core/build.rs COPY proto/ proto/ @@ -73,6 +74,7 @@ RUN mkdir -p \ crates/openshell-sandbox/src \ crates/openshell-server/src \ crates/openshell-tui/src \ + crates/openshell-vfio/src \ crates/openshell-vm/src && \ touch 
crates/openshell-bootstrap/src/lib.rs && \ printf 'fn main() {}\n' > crates/openshell-cli/src/main.rs && \ @@ -89,6 +91,7 @@ RUN mkdir -p \ touch crates/openshell-server/src/lib.rs && \ printf 'fn main() {}\n' > crates/openshell-server/src/main.rs && \ touch crates/openshell-tui/src/lib.rs && \ + touch crates/openshell-vfio/src/lib.rs && \ touch crates/openshell-vm/src/lib.rs && \ printf 'fn main() {}\n' > crates/openshell-vm/src/main.rs diff --git a/tasks/scripts/vm/build-gpu-deps.sh b/tasks/scripts/vm/build-gpu-deps.sh new file mode 100755 index 000000000..7265a06c3 --- /dev/null +++ b/tasks/scripts/vm/build-gpu-deps.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build GPU passthrough dependencies for the QEMU backend. +# +# Builds virtiofsd from source. +# These are only needed on Linux for VFIO GPU passthrough. +# +# Artifacts produced: +# virtiofsd — filesystem daemon used by the QEMU backend +# +# The vmlinux kernel is extracted separately by build-libkrun.sh during +# the kernel build step. +# +# QEMU's own binary (qemu-system-x86_64) must be installed on the host +# separately — it is not built or downloaded by this script. +# Run `mise run vm:qemu-check` to validate QEMU prerequisites. 
+# +# Usage: +# ./build-gpu-deps.sh [--output-dir ] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/_lib.sh" +ROOT="$(vm_lib_root)" + +source "${ROOT}/crates/openshell-vm/pins.env" 2>/dev/null || true + +VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" +OUTPUT_DIR="${ROOT}/target/libkrun-build" + +while [[ $# -gt 0 ]]; do + case "$1" in + --output-dir) OUTPUT_DIR="$2"; shift 2 ;; + *) echo "Unknown argument: $1" >&2; exit 1 ;; + esac +done + +if [ "$(uname -s)" != "Linux" ]; then + echo "Error: GPU passthrough is Linux-only" >&2 + exit 1 +fi + +mkdir -p "$OUTPUT_DIR" + +HOST_ARCH="$(uname -m)" +case "$HOST_ARCH" in + aarch64) VIRTIOFSD_ARCH="aarch64" ;; + x86_64) VIRTIOFSD_ARCH="x86_64" ;; + *) echo "Error: Unsupported architecture: ${HOST_ARCH}" >&2; exit 1 ;; +esac + +echo "==> Building virtiofsd ${VIRTIOFSD_VERSION} from source..." +VIRTIOFSD_SRC="$(mktemp -d)" +VIRTIOFSD_TARBALL_URL="https://gitlab.com/virtio-fs/virtiofsd/-/archive/${VIRTIOFSD_VERSION}/virtiofsd-${VIRTIOFSD_VERSION}.tar.gz" +curl -fsSL "$VIRTIOFSD_TARBALL_URL" | tar -xzf - -C "$VIRTIOFSD_SRC" --strip-components=1 +rm -f "${VIRTIOFSD_SRC}/Cargo.lock" + +CARGO_CMD="cargo" +if command -v mise &>/dev/null; then + CARGO_CMD="mise exec -- cargo" +fi +# Prevent external CARGO_TARGET_DIR from redirecting build output away from +# the local temp directory (e.g. Cursor sandbox sets this globally). 
+unset CARGO_TARGET_DIR +$CARGO_CMD build --release --manifest-path "${VIRTIOFSD_SRC}/Cargo.toml" +cp "${VIRTIOFSD_SRC}/target/release/virtiofsd" "${OUTPUT_DIR}/virtiofsd" +chmod +x "${OUTPUT_DIR}/virtiofsd" +rm -rf "$VIRTIOFSD_SRC" +echo " Built: virtiofsd" + +echo "" +echo "==> GPU passthrough binaries ready in ${OUTPUT_DIR}" +ls -lah "${OUTPUT_DIR}/virtiofsd" 2>/dev/null || true diff --git a/tasks/scripts/vm/build-libkrun.sh b/tasks/scripts/vm/build-libkrun.sh index 9e2217f50..c2a1a6d76 100755 --- a/tasks/scripts/vm/build-libkrun.sh +++ b/tasks/scripts/vm/build-libkrun.sh @@ -210,9 +210,25 @@ if [ -f openshell.kconfig ]; then # Re-run olddefconfig to fill in any new symbols introduced by the fragment. make -C "${KERNEL_SOURCES}" ARCH="${KARCH}" olddefconfig + # Force-enable hidden Kconfig bools required by out-of-tree NVIDIA modules. + # CONFIG_MMU_NOTIFIER is a hidden bool (no prompt) that can only be + # activated via "select" from another in-tree option. olddefconfig and + # syncconfig both strip it if nothing selects it. NVIDIA UVM needs it for + # GPU memory management. We patch the DRM Kconfig (already enabled as + # CONFIG_DRM=y) to select MMU_NOTIFIER, then re-run olddefconfig so the + # dependency chain (INTERVAL_TREE) is resolved properly. + if ! grep -q "select MMU_NOTIFIER" "${KERNEL_SOURCES}/drivers/gpu/drm/Kconfig"; then + sed -i '/^menuconfig DRM$/,/^[[:space:]]*select VIDEO/ { + /^[[:space:]]*select VIDEO/a\ +\tselect MMU_NOTIFIER + }' "${KERNEL_SOURCES}/drivers/gpu/drm/Kconfig" + echo " Patched DRM Kconfig to select MMU_NOTIFIER" + fi + make -C "${KERNEL_SOURCES}" ARCH="${KARCH}" olddefconfig + # Verify that the key options were actually applied. 
all_ok=true - for opt in CONFIG_BRIDGE CONFIG_NETFILTER CONFIG_NF_NAT; do + for opt in CONFIG_BRIDGE CONFIG_NETFILTER CONFIG_NF_NAT CONFIG_X86_PAT CONFIG_MMU_NOTIFIER CONFIG_FW_LOADER; do val="$(grep "^${opt}=" "${KERNEL_SOURCES}/.config" 2>/dev/null || true)" if [ -n "$val" ]; then echo " ${opt}: ${val#*=}" @@ -239,6 +255,25 @@ make -j"$(nproc)" cp libkrunfw.so* "$OUTPUT_DIR/" echo " Built: $(ls "$OUTPUT_DIR"/libkrunfw.so* | xargs -n1 basename | tr '\n' ' ')" +# Copy vmlinux kernel image for QEMU GPU passthrough. +# This is the uncompressed kernel built by libkrunfw's kernel build. +if [ -f "${KERNEL_SOURCES}/vmlinux" ]; then + cp "${KERNEL_SOURCES}/vmlinux" "$OUTPUT_DIR/vmlinux" + echo " Copied vmlinux for QEMU GPU passthrough" +elif [ -f "vmlinux" ]; then + cp "vmlinux" "$OUTPUT_DIR/vmlinux" + echo " Copied vmlinux for QEMU GPU passthrough" +else + echo " Warning: vmlinux not found in kernel build tree (GPU passthrough will not be available)" >&2 +fi + +# Export kernel release string for downstream scripts (nvidia modules, rootfs). +# Uses kernelrelease (includes CONFIG_LOCALVERSION) so that module vermagic, +# rootfs module path, and the kernel's uname -r all agree. +KERNEL_RELEASE="$(make -s -C "${KERNEL_SOURCES}" kernelrelease)" +echo "${KERNEL_RELEASE}" > "${OUTPUT_DIR}/kernel-version.txt" +echo " Exported kernel version: ${KERNEL_RELEASE}" + cd "$BUILD_DIR" # ── Build libkrun (VMM) ───────────────────────────────────────────────── diff --git a/tasks/scripts/vm/build-nvidia-modules.sh b/tasks/scripts/vm/build-nvidia-modules.sh new file mode 100755 index 000000000..064c4bb0c --- /dev/null +++ b/tasks/scripts/vm/build-nvidia-modules.sh @@ -0,0 +1,182 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build NVIDIA open kernel modules against the VM kernel source tree. 
+#
+# Clones the NVIDIA open-gpu-kernel-modules repo at a pinned driver tag
+# and compiles the kernel modules against the kernel built by
+# build-libkrun.sh. The resulting .ko files are placed in the output
+# directory for injection into the GPU rootfs by build-rootfs.sh.
+#
+# Prerequisites:
+#   - Kernel source tree built by build-libkrun.sh
+#     (target/libkrun-build/libkrunfw/linux-<version>/)
+#   - Build tools: make, gcc
+#
+# Usage:
+#   ./build-nvidia-modules.sh [--output-dir <dir>]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/_lib.sh"
+ROOT="$(vm_lib_root)"
+
+source "${ROOT}/crates/openshell-vm/pins.env" 2>/dev/null || true
+
+NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-570}"
+
+BUILD_DIR="${ROOT}/target/libkrun-build"
+OUTPUT_DIR="${BUILD_DIR}/nvidia-modules"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --output-dir) OUTPUT_DIR="$2"; shift 2 ;;
+    *) echo "Unknown argument: $1" >&2; exit 1 ;;
+  esac
+done
+
+if [ "$(uname -s)" != "Linux" ]; then
+  echo "Error: NVIDIA GPU module build is Linux-only" >&2
+  exit 1
+fi
+
+HOST_ARCH="$(uname -m)"
+if [ "$HOST_ARCH" != "x86_64" ]; then
+  echo "Error: NVIDIA GPU passthrough is only supported on x86_64 (got: ${HOST_ARCH})" >&2
+  exit 1
+fi
+
+# ── Locate the kernel source tree ────────────────────────────────────────
+
+LIBKRUNFW_DIR="${BUILD_DIR}/libkrunfw"
+if [ ! -f "${LIBKRUNFW_DIR}/Makefile" ]; then
+  echo "ERROR: libkrunfw not found at ${LIBKRUNFW_DIR}" >&2
+  echo "       The GPU module build requires the kernel source tree." >&2
+  echo "       Run: FROM_SOURCE=1 mise run vm:setup" >&2
+  exit 1
+fi
+
+KERNEL_DIR_NAME="$(grep '^KERNEL_VERSION' "${LIBKRUNFW_DIR}/Makefile" | head -1 | awk '{print $3}')"
+KERNEL_SOURCES="${LIBKRUNFW_DIR}/${KERNEL_DIR_NAME}"
+
+if [ ! -f "${KERNEL_SOURCES}/.config" ]; then
+  echo "ERROR: Kernel source tree not found at ${KERNEL_SOURCES}" >&2
+  echo "       Run: FROM_SOURCE=1 mise run vm:setup" >&2
+  exit 1
+fi
+
+if [ !
-f "${KERNEL_SOURCES}/Module.symvers" ]; then + echo "ERROR: Kernel tree at ${KERNEL_SOURCES} is missing Module.symvers." >&2 + echo " The kernel must have been fully built." >&2 + echo " Run: FROM_SOURCE=1 mise run vm:setup" >&2 + exit 1 +fi + +# Use kernelrelease to get the full version string (includes CONFIG_LOCALVERSION). +KERNEL_VERSION="$(make -s -C "${KERNEL_SOURCES}" kernelrelease)" +echo "==> Building NVIDIA ${NVIDIA_DRIVER_VERSION} kernel modules for kernel ${KERNEL_VERSION}" +echo " Kernel source: ${KERNEL_SOURCES}" +echo " Output: ${OUTPUT_DIR}" +echo "" + +# ── Prepare kernel tree for out-of-tree module builds ──────────────────── + +echo "==> Preparing kernel tree for external module builds..." +make -C "${KERNEL_SOURCES}" modules_prepare -j"$(nproc)" + +# ── Clone or reuse NVIDIA open-gpu-kernel-modules ──────────────────────── + +NVIDIA_DRIVER_TAG="${NVIDIA_DRIVER_TAG:-}" +if [ -z "${NVIDIA_DRIVER_TAG}" ]; then + echo "ERROR: NVIDIA_DRIVER_TAG not set in pins.env or environment." >&2 + echo " This must be the exact driver version tag matching the" >&2 + echo " nvidia-headless-${NVIDIA_DRIVER_VERSION}-open APT package." >&2 + echo " Find it: apt-cache show nvidia-headless-${NVIDIA_DRIVER_VERSION}-open | grep Version" >&2 + echo " Example: NVIDIA_DRIVER_TAG=570.86.16" >&2 + exit 1 +fi + +NVIDIA_SRC="${BUILD_DIR}/open-gpu-kernel-modules" + +if [ -d "${NVIDIA_SRC}" ]; then + EXISTING_TAG="$(git -C "${NVIDIA_SRC}" describe --tags --exact-match HEAD 2>/dev/null || true)" + if [ "${EXISTING_TAG}" = "${NVIDIA_DRIVER_TAG}" ]; then + echo "==> Using cached NVIDIA source (tag ${NVIDIA_DRIVER_TAG})" + else + echo "==> NVIDIA source tag mismatch (have: ${EXISTING_TAG:-unknown}, want: ${NVIDIA_DRIVER_TAG}), re-cloning..." + rm -rf "${NVIDIA_SRC}" + fi +fi + +if [ ! -d "${NVIDIA_SRC}" ]; then + echo "==> Cloning NVIDIA open-gpu-kernel-modules (tag ${NVIDIA_DRIVER_TAG})..." 
+ git clone --depth 1 --branch "${NVIDIA_DRIVER_TAG}" \ + https://github.com/NVIDIA/open-gpu-kernel-modules.git "${NVIDIA_SRC}" +fi + +# ── Build the kernel modules ───────────────────────────────────────────── + +echo "" +echo "==> Compiling NVIDIA kernel modules (this may take 2-5 minutes)..." +make -C "${NVIDIA_SRC}" -j"$(nproc)" modules \ + SYSSRC="${KERNEL_SOURCES}" \ + KERNEL_UNAME="${KERNEL_VERSION}" + +# ── Collect built modules ──────────────────────────────────────────────── + +mkdir -p "${OUTPUT_DIR}" + +# The NVIDIA kbuild produces modules at deterministic paths under kernel-open/. +declare -A MODULE_PATHS=( + [nvidia.ko]="kernel-open/nvidia.ko" + [nvidia-uvm.ko]="kernel-open/nvidia-uvm.ko" + [nvidia-modeset.ko]="kernel-open/nvidia-modeset.ko" + [nvidia-drm.ko]="kernel-open/nvidia-drm.ko" + [nvidia-peermem.ko]="kernel-open/nvidia-peermem.ko" +) + +EXPECTED_MODULES=(nvidia.ko nvidia-uvm.ko nvidia-modeset.ko nvidia-drm.ko nvidia-peermem.ko) + +for mod in "${EXPECTED_MODULES[@]}"; do + src_path="${NVIDIA_SRC}/${MODULE_PATHS[$mod]}" + if [ -f "$src_path" ]; then + cp "$src_path" "${OUTPUT_DIR}/" + echo " Built: $mod ($(du -h "$src_path" | cut -f1))" + fi +done + +# Normalize permissions. +chmod 644 "${OUTPUT_DIR}"/*.ko 2>/dev/null || true + +# nvidia-peermem.ko is optional (GPUDirect RDMA); the other four are required. +REQUIRED_MODULES=(nvidia.ko nvidia-uvm.ko nvidia-modeset.ko nvidia-drm.ko) +for mod in "${REQUIRED_MODULES[@]}"; do + if [ ! -f "${OUTPUT_DIR}/${mod}" ]; then + echo "ERROR: Required module ${mod} was not produced by the build." >&2 + echo " Check build output above for compilation errors." >&2 + exit 1 + fi +done + +echo "" +echo "==> NVIDIA modules ready at ${OUTPUT_DIR}" +ls -lah "${OUTPUT_DIR}/"*.ko + +# Verify module vermagic matches the kernel. +echo "" +echo "==> Verifying module compatibility..." 
+if command -v modinfo &>/dev/null; then + VERMAGIC="$(modinfo -F vermagic "${OUTPUT_DIR}/nvidia.ko" 2>/dev/null || true)" + if [ -n "$VERMAGIC" ]; then + echo " vermagic: ${VERMAGIC}" + if echo "$VERMAGIC" | grep -q "^${KERNEL_VERSION} "; then + echo " OK: modules match kernel ${KERNEL_VERSION}" + else + echo " ERROR: vermagic does not start with ${KERNEL_VERSION}" >&2 + echo " Modules will fail to load in the VM." >&2 + exit 1 + fi + fi +fi diff --git a/tasks/scripts/vm/build-rootfs-tarball.sh b/tasks/scripts/vm/build-rootfs-tarball.sh index 76e4f6297..d41b2ff25 100755 --- a/tasks/scripts/vm/build-rootfs-tarball.sh +++ b/tasks/scripts/vm/build-rootfs-tarball.sh @@ -9,36 +9,43 @@ # 2. Compresses it to a zstd tarball for embedding # # Usage: -# ./build-rootfs-tarball.sh [--base] +# ./build-rootfs-tarball.sh [--base] [--gpu] # # Options: # --base Build a base rootfs (~200-300MB) without pre-loaded images. # First boot will be slower but binary size is much smaller. # Default: full rootfs with pre-loaded images (~2GB+). +# --gpu Include NVIDIA drivers and nvidia-container-toolkit for GPU +# passthrough. Only supported on x86_64. # -# The resulting tarball is placed at target/vm-runtime-compressed/rootfs.tar.zst -# for inclusion in the embedded binary build. +# The resulting tarball is placed at: +# target/vm-runtime-compressed/rootfs.tar.zst (standard) +# target/vm-runtime-compressed/rootfs-gpu.tar.zst (--gpu) set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" ROOTFS_BUILD_DIR="${ROOT}/target/rootfs-build" OUTPUT_DIR="${ROOT}/target/vm-runtime-compressed" -OUTPUT="${OUTPUT_DIR}/rootfs.tar.zst" # Parse arguments BASE_ONLY=false +GPU=false for arg in "$@"; do case "$arg" in --base) BASE_ONLY=true ;; + --gpu) + GPU=true + ;; --help|-h) - echo "Usage: $0 [--base]" + echo "Usage: $0 [--base] [--gpu]" echo "" echo "Options:" echo " --base Build base rootfs (~200-300MB) without pre-loaded images" echo " First boot will be slower but binary size is much smaller" + echo " --gpu Include NVIDIA drivers for GPU passthrough (x86_64 only)" exit 0 ;; *) @@ -63,28 +70,33 @@ if ! docker info &>/dev/null; then exit 1 fi +ROOTFS_ARGS=() +MODE_DESC="full (pre-loaded images, pre-initialized, ~2GB+)" if [ "$BASE_ONLY" = true ]; then - echo "==> Building BASE rootfs for embedding" - echo " Build dir: ${ROOTFS_BUILD_DIR}" - echo " Output: ${OUTPUT}" - echo " Mode: base (no pre-loaded images, ~200-300MB)" - echo "" - - # Build base rootfs - echo "==> Step 1/2: Building base rootfs..." - "${ROOT}/crates/openshell-vm/scripts/build-rootfs.sh" --base "${ROOTFS_BUILD_DIR}" + ROOTFS_ARGS+=(--base) + MODE_DESC="base (no pre-loaded images, ~200-300MB)" +fi +if [ "$GPU" = true ]; then + ROOTFS_ARGS+=(--gpu) + MODE_DESC="${MODE_DESC}, GPU (NVIDIA drivers included)" +fi + +# GPU rootfs gets a distinct tarball name so both can coexist in the output dir +if [ "$GPU" = true ]; then + OUTPUT="${OUTPUT_DIR}/rootfs-gpu.tar.zst" else - echo "==> Building FULL rootfs for embedding" - echo " Build dir: ${ROOTFS_BUILD_DIR}" - echo " Output: ${OUTPUT}" - echo " Mode: full (pre-loaded images, pre-initialized, ~2GB+)" - echo "" - - # Build full rootfs - echo "==> Step 1/2: Building full rootfs (this may take 10-15 minutes)..." 
- "${ROOT}/crates/openshell-vm/scripts/build-rootfs.sh" "${ROOTFS_BUILD_DIR}" + OUTPUT="${OUTPUT_DIR}/rootfs.tar.zst" fi +echo "==> Building rootfs for embedding" +echo " Build dir: ${ROOTFS_BUILD_DIR}" +echo " Output: ${OUTPUT}" +echo " Mode: ${MODE_DESC}" +echo "" + +echo "==> Step 1/2: Building rootfs..." +"${ROOT}/crates/openshell-vm/scripts/build-rootfs.sh" "${ROOTFS_ARGS[@]}" "${ROOTFS_BUILD_DIR}" + # Compress to tarball echo "" echo "==> Step 2/2: Compressing rootfs to tarball..." @@ -107,10 +119,13 @@ echo "" echo "==> Rootfs tarball created successfully!" echo " Output: ${OUTPUT}" echo " Compressed: $(du -sh "${OUTPUT}" | cut -f1)" +TYPE_DESC="full (first boot ~3-5s, images pre-loaded)" if [ "$BASE_ONLY" = true ]; then - echo " Type: base (first boot ~30-60s, images pulled on demand)" -else - echo " Type: full (first boot ~3-5s, images pre-loaded)" + TYPE_DESC="base (first boot ~30-60s, images pulled on demand)" +fi +if [ "$GPU" = true ]; then + TYPE_DESC="${TYPE_DESC}, GPU" fi +echo " Type: ${TYPE_DESC}" echo "" echo "Next step: mise run vm:build" diff --git a/tasks/scripts/vm/bundle-vm-runtime.sh b/tasks/scripts/vm/bundle-vm-runtime.sh index 6c21e511d..83d53dcac 100755 --- a/tasks/scripts/vm/bundle-vm-runtime.sh +++ b/tasks/scripts/vm/bundle-vm-runtime.sh @@ -46,6 +46,9 @@ TARGETS=( "${ROOT}/target/release" ) +COMPRESSED_DIR="${ROOT}/target/vm-runtime-compressed" +GPU_ROOTFS_TARBALL="${COMPRESSED_DIR}/rootfs-gpu.tar.zst" + for target_dir in "${TARGETS[@]}"; do # Only stage if the binary exists (avoid creating orphan runtime dirs) if [ ! -f "${target_dir}/openshell-vm" ] && [ ! 
-f "${target_dir}/openshell-vm.d" ]; then @@ -61,5 +64,11 @@ for target_dir in "${TARGETS[@]}"; do install -m 0755 "$file" "${runtime_dir}/${name}" done + # Stage the GPU rootfs tarball if it was built + if [ -f "${GPU_ROOTFS_TARBALL}" ]; then + install -m 0644 "${GPU_ROOTFS_TARBALL}" "${runtime_dir}/rootfs-gpu.tar.zst" + echo "staged GPU rootfs tarball in ${runtime_dir}" + fi + echo "staged runtime bundle in ${runtime_dir}" done diff --git a/tasks/scripts/vm/compress-vm-runtime.sh b/tasks/scripts/vm/compress-vm-runtime.sh index efada8a2e..69e1d5658 100755 --- a/tasks/scripts/vm/compress-vm-runtime.sh +++ b/tasks/scripts/vm/compress-vm-runtime.sh @@ -91,11 +91,23 @@ if [ -z "${VM_RUNTIME_TARBALL:-}" ] && _check_compressed_artifacts "$OUTPUT_DIR" for f in "${OUTPUT_DIR}"/*.zst; do [ -f "$f" ] || continue name="$(basename "${f%.zst}")" - # Skip rootfs tarball — bundle-vm-runtime.sh doesn't need it - [[ "$name" == rootfs.tar ]] && continue + # Skip rootfs tarballs — bundle-vm-runtime.sh doesn't need them + [[ "$name" == rootfs.tar || "$name" == rootfs-gpu.tar ]] && continue zstd -d "$f" -o "${WORK_DIR}/${name}" -f -q chmod 0755 "${WORK_DIR}/${name}" done + # GPU passthrough binaries live in libkrun-build but are not part of the + # core compressed set. Copy them into WORK_DIR so bundle-vm-runtime.sh + # stages them alongside the core libraries. 
+ _BUILD_DIR="${ROOT}/target/libkrun-build" + for gpu_bin in vmlinux virtiofsd; do + if [ -f "${_BUILD_DIR}/${gpu_bin}" ]; then + cp "${_BUILD_DIR}/${gpu_bin}" "$WORK_DIR/" + chmod 0755 "${WORK_DIR}/${gpu_bin}" + echo " Included GPU binary: ${gpu_bin}" + fi + done + echo " Decompressed files:" ls -lah "$WORK_DIR" @@ -126,8 +138,9 @@ if [ -n "${VM_RUNTIME_TARBALL:-}" ]; then echo "" compress_dir "$WORK_DIR" "$OUTPUT_DIR" - # Check for rootfs tarball (built separately) + # Check for rootfs tarballs (built separately) ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs.tar.zst" + GPU_ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs-gpu.tar.zst" if [ -f "$ROOTFS_TARBALL" ]; then echo " rootfs.tar.zst: $(du -h "$ROOTFS_TARBALL" | cut -f1) (pre-built)" else @@ -135,6 +148,9 @@ if [ -n "${VM_RUNTIME_TARBALL:-}" ]; then echo "Note: rootfs.tar.zst not found." echo " To build one, run: mise run vm:rootfs -- --base" fi + if [ -f "$GPU_ROOTFS_TARBALL" ]; then + echo " rootfs-gpu.tar.zst: $(du -h "$GPU_ROOTFS_TARBALL" | cut -f1) (pre-built)" + fi echo "" echo "==> Compressed artifacts in ${OUTPUT_DIR}:" @@ -256,6 +272,14 @@ case "$(uname -s)-$(uname -m)" in "https://github.com/containers/gvisor-tap-vsock/releases/download/${GVPROXY_VERSION}/gvproxy-linux-${GVPROXY_ARCH}" chmod +x "$WORK_DIR/gvproxy" fi + + # GPU passthrough binaries (optional — included when present in libkrun-build) + for gpu_bin in vmlinux virtiofsd; do + if [ -f "${BUILD_DIR}/${gpu_bin}" ]; then + cp "${BUILD_DIR}/${gpu_bin}" "$WORK_DIR/" + echo " Included GPU binary: ${gpu_bin}" + fi + done ;; *) @@ -272,16 +296,20 @@ ls -lah "$WORK_DIR" echo "" compress_dir "$WORK_DIR" "$OUTPUT_DIR" -# Check for rootfs tarball (built separately by build-rootfs-tarball.sh) +# Check for rootfs tarballs (built separately by build-rootfs-tarball.sh) ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs.tar.zst" +GPU_ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs-gpu.tar.zst" if [ -f "$ROOTFS_TARBALL" ]; then echo " rootfs.tar.zst: $(du -h "$ROOTFS_TARBALL" | cut -f1) (pre-built)" 
else echo "" echo "Note: rootfs.tar.zst not found." - echo " To build one, run: mise run vm:rootfs -- --base" - echo " Without it, the binary will still work but require the rootfs" - echo " to be built separately on first run." + echo " To build one, run: mise run vm:rootfs -- --base" + echo " Without it, the binary will still work but require the rootfs" + echo " to be built separately on first run." +fi +if [ -f "$GPU_ROOTFS_TARBALL" ]; then + echo " rootfs-gpu.tar.zst: $(du -h "$GPU_ROOTFS_TARBALL" | cut -f1) (pre-built)" fi echo "" diff --git a/tasks/scripts/vm/download-kernel-runtime.sh b/tasks/scripts/vm/download-kernel-runtime.sh index 8f0427af9..5e60d3c75 100755 --- a/tasks/scripts/vm/download-kernel-runtime.sh +++ b/tasks/scripts/vm/download-kernel-runtime.sh @@ -81,11 +81,11 @@ DOWNLOAD_DIR="${ROOT}/target/vm-runtime-download" mkdir -p "$DOWNLOAD_DIR" "$OUTPUT_DIR" echo "==> Downloading ${TARBALL_NAME} from ${RELEASE_TAG}..." +rm -f "${DOWNLOAD_DIR}/${TARBALL_NAME}" gh release download "${RELEASE_TAG}" \ --repo "${REPO}" \ --pattern "${TARBALL_NAME}" \ - --dir "${DOWNLOAD_DIR}" \ - --clobber + --dir "${DOWNLOAD_DIR}" if [ ! -f "${DOWNLOAD_DIR}/${TARBALL_NAME}" ]; then echo "Error: Download failed — ${TARBALL_NAME} not found." 
>&2 diff --git a/tasks/scripts/vm/package-vm-runtime.sh b/tasks/scripts/vm/package-vm-runtime.sh index f97eec870..7f5e908c6 100755 --- a/tasks/scripts/vm/package-vm-runtime.sh +++ b/tasks/scripts/vm/package-vm-runtime.sh @@ -84,6 +84,13 @@ case "$PLATFORM" in versioned="$(ls "${PACKAGE_DIR}"/libkrunfw.so.5.* 2>/dev/null | head -n1 || true)" [ -n "$versioned" ] && cp "$versioned" "${PACKAGE_DIR}/libkrunfw.so.5" fi + # GPU passthrough binaries (optional — only included if present) + for gpu_bin in vmlinux virtiofsd; do + if [ -f "${BUILD_DIR}/${gpu_bin}" ]; then + cp "${BUILD_DIR}/${gpu_bin}" "${PACKAGE_DIR}/" + echo " Included GPU passthrough binary: ${gpu_bin}" + fi + done ;; darwin-aarch64) cp "${BUILD_DIR}/libkrun.dylib" "${PACKAGE_DIR}/" diff --git a/tasks/scripts/vm/qemu-check.sh b/tasks/scripts/vm/qemu-check.sh new file mode 100755 index 000000000..8629ff276 --- /dev/null +++ b/tasks/scripts/vm/qemu-check.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Validate QEMU host prerequisites for GPU passthrough. +# +# Checks that qemu-system-x86_64, vhost-vsock support, and required +# runtime artifacts (vmlinux, virtiofsd) are available. 
+#
+# Usage:
+#   ./qemu-check.sh
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/_lib.sh"
+ROOT="$(vm_lib_root)"
+
+RUNTIME_DIR="${ROOT}/target/libkrun-build"
+
+pass=0
+fail=0
+
+# NOTE: avoid ((pass++)) here — with pass=0 the arithmetic command evaluates
+# to 0 and returns exit status 1, which aborts the script under `set -e`.
+ok() { echo " [OK] $1"; pass=$((pass+1)); }
+miss() { echo " [MISS] $1"; fail=$((fail+1)); }
+
+echo "==> QEMU host prerequisite check"
+echo ""
+
+# ── qemu-system-x86_64 ──────────────────────────────────────────────────
+
+echo "--- QEMU binary ---"
+if command -v qemu-system-x86_64 &>/dev/null; then
+  version="$(qemu-system-x86_64 --version | head -n1)"
+  ok "qemu-system-x86_64 found: ${version}"
+else
+  miss "qemu-system-x86_64 not found (install: sudo apt install qemu-system-x86)"
+fi
+
+# ── vhost-vsock ──────────────────────────────────────────────────────────
+
+echo "--- vhost-vsock ---"
+if [ -e /dev/vhost-vsock ]; then
+  ok "/dev/vhost-vsock exists"
+elif lsmod 2>/dev/null | grep -q vhost_vsock; then
+  ok "vhost_vsock module loaded (but /dev/vhost-vsock missing — check permissions)"
+else
+  miss "vhost_vsock not loaded (hint: sudo modprobe vhost_vsock)"
+fi
+
+# ── Runtime artifacts ────────────────────────────────────────────────────
+
+echo "--- Runtime artifacts (${RUNTIME_DIR}) ---"
+
+if [ -f "${RUNTIME_DIR}/vmlinux" ]; then
+  ok "vmlinux found"
+else
+  miss "vmlinux not found (run: FROM_SOURCE=1 mise run vm:setup)"
+fi
+
+if [ -f "${RUNTIME_DIR}/virtiofsd" ]; then
+  ok "virtiofsd found"
+else
+  miss "virtiofsd not found (run: mise run vm:gpu-deps)"
+fi
+
+# ── Summary ──────────────────────────────────────────────────────────────
+
+echo ""
+echo "==> Summary: ${pass} passed, ${fail} missing"
+
+if [ "$fail" -gt 0 ]; then
+  echo ""
+  echo "Fix the missing prerequisites above before running QEMU GPU passthrough."
+  exit 1
+fi
+
+echo ""
+echo "All QEMU prerequisites satisfied."
+exit 0 diff --git a/tasks/scripts/vm/sync-vm-rootfs.sh b/tasks/scripts/vm/sync-vm-rootfs.sh index 727a9dd18..2c22e360b 100755 --- a/tasks/scripts/vm/sync-vm-rootfs.sh +++ b/tasks/scripts/vm/sync-vm-rootfs.sh @@ -141,6 +141,22 @@ fi patch_vm_helmchart "${MANIFEST_DST}/openshell-helmchart.yaml" patch_vm_helmchart "${ROOTFS_DIR}/var/lib/rancher/k3s/server/manifests/openshell-helmchart.yaml" +# ── GPU manifests ────────────────────────────────────────────────────── +# Only sync if the rootfs was built with --gpu (sentinel file present). +GPU_MANIFEST_SRC="${ROOT}/crates/openshell-vm/scripts/gpu-manifests" +GPU_MANIFEST_DST="${ROOTFS_DIR}/opt/openshell/gpu-manifests" +if [ -f "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" ] && [ -d "${GPU_MANIFEST_SRC}" ]; then + mkdir -p "${GPU_MANIFEST_DST}" + for manifest in "${GPU_MANIFEST_SRC}"/*.yaml; do + [ -f "$manifest" ] || continue + base=$(basename "$manifest") + if ! cmp -s "$manifest" "${GPU_MANIFEST_DST}/${base}" 2>/dev/null; then + cp "$manifest" "${GPU_MANIFEST_DST}/${base}" + echo " updated: /opt/openshell/gpu-manifests/${base}" + fi + done +fi + # ── Gateway image tarball ────────────────────────────────────────────── # The VM rootfs airgap-imports openshell/gateway:dev from k3s/agent/images/. # Keep that tarball in sync with the local Docker image so `mise run e2e:vm` diff --git a/tasks/scripts/vm/vm-setup.sh b/tasks/scripts/vm/vm-setup.sh index e7ae06d08..8afd3883d 100755 --- a/tasks/scripts/vm/vm-setup.sh +++ b/tasks/scripts/vm/vm-setup.sh @@ -81,6 +81,11 @@ if [ "$FROM_SOURCE" = "1" ]; then linux-*) # Linux: build both libkrunfw and libkrun in one go "${ROOT}/tasks/scripts/vm/build-libkrun.sh" + if [ "${GPU:-0}" = "1" ]; then + echo "" + echo "==> Building GPU passthrough dependencies..." 
+ "${ROOT}/tasks/scripts/vm/build-gpu-deps.sh" + fi ;; esac echo "" diff --git a/tasks/vm.toml b/tasks/vm.toml index ca06b08c1..8c5fd1afc 100644 --- a/tasks/vm.toml +++ b/tasks/vm.toml @@ -5,6 +5,10 @@ # # Workflow: # mise run vm:setup # one-time: download pre-built runtime (~30s) +# # (with FROM_SOURCE=1: builds kernel + libkrun + GPU deps) +# mise run vm:gpu-deps # (standalone) build GPU passthrough binaries separately +# mise run vm:nvidia-modules # (GPU only) build NVIDIA kernel modules +# mise run vm:rootfs -- --base --gpu # build GPU rootfs with NVIDIA drivers # mise run vm # build + run the VM # mise run vm:clean # wipe everything and start over # @@ -26,7 +30,14 @@ run = [ description = "Build the openshell-vm binary with embedded runtime" run = [ "tasks/scripts/vm/compress-vm-runtime.sh", - "OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed cargo build -p openshell-vm", + """ + # The embedded rootfs.tar.zst can exceed 2 GiB, which overflows x86_64's + # default small code model (R_X86_64_PC32 ±2 GiB limit). Use the large + # code model so include_bytes!() blobs of any size link correctly. 
+ RUSTFLAGS="${RUSTFLAGS:-} -C code-model=large" \ + OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed \ + cargo build -p openshell-vm + """, "tasks/scripts/vm/codesign-openshell-vm.sh", "tasks/scripts/vm/bundle-vm-runtime.sh", ] @@ -39,6 +50,18 @@ run = "tasks/scripts/vm/vm-setup.sh" description = "Build the VM rootfs tarball (use -- --base for lightweight)" run = "tasks/scripts/vm/build-rootfs-tarball.sh" +["vm:nvidia-modules"] +description = "Build NVIDIA kernel modules for GPU VM rootfs (requires FROM_SOURCE=1 vm:setup)" +run = "tasks/scripts/vm/build-nvidia-modules.sh" + +["vm:gpu-deps"] +description = "Build GPU passthrough dependencies (virtiofsd) for the QEMU backend" +run = "tasks/scripts/vm/build-gpu-deps.sh" + +["vm:qemu-check"] +description = "Validate QEMU host prerequisites for GPU passthrough" +run = "tasks/scripts/vm/qemu-check.sh" + ["vm:clean"] description = "Remove all VM cached artifacts (runtime, rootfs, builds)" run = "tasks/scripts/vm/vm-clean.sh"