diff --git a/README.md b/README.md index e34346c2..6a34436a 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,13 @@ install -m 0600 /tmp/aks-flex-node-config.json /etc/aks-flex-node/config.json cat /etc/aks-flex-node/config.json ``` -After reviewing the config, bootstrap the node. This installs the long-running agent service and starts the local Kubernetes worker environment. Use a standard `022` umask so bootstrap-created nspawn rootfs paths remain traversable by non-root service users such as `dbus`; the config file remains `0600`. +After reviewing the config, run preflight checks. Preflight is non-mutating and validates host prerequisites, API server reachability, rootfs image reachability, and bootstrap artifact sources before bootstrap changes the host. + +```bash +aks-flex-node preflight --config /etc/aks-flex-node/config.json +``` + +Then bootstrap the node. This installs the long-running agent service and starts the local Kubernetes worker environment. Use a standard `022` umask so bootstrap-created nspawn rootfs paths remain traversable by non-root service users such as `dbus`; the config file remains `0600`. 
```bash umask 022 diff --git a/cmd/aks-flex-node/main.go b/cmd/aks-flex-node/main.go index 2582c7eb..511c84f3 100644 --- a/cmd/aks-flex-node/main.go +++ b/cmd/aks-flex-node/main.go @@ -11,6 +11,7 @@ import ( "github.com/spf13/cobra" "github.com/Azure/AKSFlexNode/pkg/cmd/daemon" + "github.com/Azure/AKSFlexNode/pkg/cmd/preflight" "github.com/Azure/AKSFlexNode/pkg/cmd/reset" "github.com/Azure/AKSFlexNode/pkg/cmd/start" "github.com/Azure/AKSFlexNode/pkg/cmd/token" @@ -25,6 +26,7 @@ func main() { } rootCmd.AddCommand(start.NewCommand()) + rootCmd.AddCommand(preflight.NewCommand()) rootCmd.AddCommand(daemon.NewCommand()) rootCmd.AddCommand(reset.NewCommand()) rootCmd.AddCommand(version.NewCommand()) diff --git a/docs/labs/README.md b/docs/labs/README.md index fa03c5d1..bb079eb3 100644 --- a/docs/labs/README.md +++ b/docs/labs/README.md @@ -18,6 +18,7 @@ Before starting a lab, prepare: - [Private AKS Cluster With Unmanaged Cilium And Cross-Region Flex Node](aks-private-cluster-cilium.md) - Private AKS with `--network-plugin none`, unmanaged Cilium, and VXLAN pod networking. - [Private AKS Cluster With Unbounded-Net And Cross-Region Flex Node](aks-private-cluster-unbounded-net.md) - Private AKS with `--network-plugin none`, `unbounded-net`, and private-L3 site peering. - [Public AKS Cluster With Unbounded-Net And Cross-Region VNet-Peered Flex Node](aks-public-cluster-unbounded-net-vnet-peering.md) - Public AKS with `--network-plugin none`, `unbounded-net`, and private-L3 site peering over cross-region VNet peering. +- [AKS Flex Node With Offline Bootstrap Artifacts](aks-public-cluster-offline-bootstrap.md) - Flex VM bootstrap from local host packages, a mirrored rootfs image, and filesystem or local-registry offline artifacts. The walkthrough uses a public AKS cluster, but the same flow applies to private clusters with API reachability. 
- [Public AKS Cluster With Unbounded-Net WireGuard Flex Node](aks-public-cluster-unbounded-net-wireguard.md) - Public AKS with `--network-plugin none`, `unbounded-net`, and WireGuard gateway connectivity without VNet peering. ## Topic Matrix @@ -31,7 +32,8 @@ Before starting a lab, prepare: | Private AKS API access | [Private AKS with unmanaged Cilium](aks-private-cluster-cilium.md), [Private AKS with unbounded-net](aks-private-cluster-unbounded-net.md) | | Cross-region VNet peering | [Private AKS with unmanaged Cilium](aks-private-cluster-cilium.md), [Private AKS with unbounded-net](aks-private-cluster-unbounded-net.md), [Public AKS with unbounded-net VNet peering](aks-public-cluster-unbounded-net-vnet-peering.md) | | `unbounded-net` CNI | [Private AKS with unbounded-net](aks-private-cluster-unbounded-net.md), [Public AKS with unbounded-net VNet peering](aks-public-cluster-unbounded-net-vnet-peering.md), [Public AKS with unbounded-net WireGuard](aks-public-cluster-unbounded-net-wireguard.md) | -| Public AKS API access | [Public AKS with unbounded-net VNet peering](aks-public-cluster-unbounded-net-vnet-peering.md), [Public AKS with unbounded-net WireGuard](aks-public-cluster-unbounded-net-wireguard.md) | +| Public AKS API access | [Public AKS with unbounded-net VNet peering](aks-public-cluster-unbounded-net-vnet-peering.md), [Offline bootstrap artifacts walkthrough](aks-public-cluster-offline-bootstrap.md), [Public AKS with unbounded-net WireGuard](aks-public-cluster-unbounded-net-wireguard.md) | | WireGuard gateway connectivity | [Public AKS with unbounded-net WireGuard](aks-public-cluster-unbounded-net-wireguard.md) | | No VNet peering | [Public AKS with unbounded-net WireGuard](aks-public-cluster-unbounded-net-wireguard.md) | -| Private-L3 `SitePeering` | [Private AKS with unbounded-net](aks-private-cluster-unbounded-net.md), [Public AKS with unbounded-net VNet peering](aks-public-cluster-unbounded-net-vnet-peering.md) | +| Offline bootstrap artifacts | [AKS 
Flex Node with offline bootstrap artifacts](aks-public-cluster-offline-bootstrap.md) | +| Private-L3 `SitePeering` | [Private AKS with unbounded-net](aks-private-cluster-unbounded-net.md), [Public AKS with unbounded-net VNet peering](aks-public-cluster-unbounded-net-vnet-peering.md), [Offline bootstrap artifacts walkthrough](aks-public-cluster-offline-bootstrap.md) | diff --git a/docs/labs/aks-public-cluster-offline-bootstrap.md b/docs/labs/aks-public-cluster-offline-bootstrap.md new file mode 100644 index 00000000..da0c68ff --- /dev/null +++ b/docs/labs/aks-public-cluster-offline-bootstrap.md @@ -0,0 +1,479 @@ +# AKS Flex Node With Offline Bootstrap Artifacts + +This lab shows how to join a Flex Node when the bootstrap binaries are served from an offline artifact source instead of public upstream URLs. The walkthrough uses a public AKS API server and the [public AKS + unbounded-net + VNet peering lab](aks-public-cluster-unbounded-net-vnet-peering.md) as the network and cluster base, but the same offline artifact flow also works with a private AKS cluster when the target VM can resolve and reach the private API endpoint. The lab changes the Flex VM bootstrap flow to use: + +- Host prerequisites installed before the node is isolated. +- A mirrored rootfs OCI image, either as a local OCI layout or in a registry reachable from the target VM. +- A mirrored Unbounded bootstrap artifact bundle, either: + - as files on the target VM, or + - as an OCI artifact in a local registry running on the target VM. +- `bootstrap.offlineArtifacts.source` in the AKS Flex Node config. + +The goal is to prove that bootstrap does not fetch Kubernetes, CRI, CNI, or crictl artifacts from public upstream endpoints such as `dl.k8s.io` or `github.com/kubernetes-sigs/cri-tools`. + +## Prerequisites + +Start with the base public VNet-peered lab through these sections: + +1. 
[Create Resource Groups And Networks](aks-public-cluster-unbounded-net-vnet-peering.md#create-resource-groups-and-networks) +2. [Create A Public No-CNI AKS Cluster](aks-public-cluster-unbounded-net-vnet-peering.md#create-a-public-no-cni-aks-cluster) +3. [Install Unbounded-Net](aks-public-cluster-unbounded-net-vnet-peering.md#install-unbounded-net) +4. [Create Sites And Mesh Peering](aks-public-cluster-unbounded-net-vnet-peering.md#create-sites-and-mesh-peering) +5. Create the Flex VM and verify SSH access. + +You also need the following tools in the connected preparation environment. The preparation environment can be the target VM before egress is restricted, or a separate staging host that can copy files into the target VM. + +- `az`, `kubectl`, `jq`, `ssh`, `scp` +- `aks-flex-node` installed on the target VM +- `oras` for copying OCI images/artifacts and pulling filesystem artifacts. +- `podman` or Docker on the target VM if using the local registry mode. + +This lab uses these example artifact versions: + +```bash +KUBERNETES_VERSION="1.35.0" +KUBERNETES_VERSION_V="v${KUBERNETES_VERSION#v}" +ROOTFS_IMAGE_UPSTREAM="ghcr.io/azure/agent-ubuntu2404:v20260619" +ARTIFACT_TAG="alpha-0cd4fe2-k8s-${KUBERNETES_VERSION_V}" +ARTIFACT_BUNDLE_UPSTREAM="ghcr.io/azure/unbounded/bootstrap-artifacts:${ARTIFACT_TAG}" +``` + +Use a bundle tag that matches the Kubernetes version you configure for the Flex Node. AKS Flex Node accepts `components.kubernetes` with or without a leading `v`; the offline artifact template expands `.KubernetesVersion` with the leading `v`. In offline mode, the artifact manifest supplies the artifact versions used by bootstrap. + +## Install Host Prerequisites On The Target VM + +Offline artifact mode treats missing host packages as fatal during preflight because bootstrap cannot rely on public package repositories after the host is isolated. 
Install the host prerequisites while the target VM still has package repository access, or install them from your own internal package mirror. + +On the target VM: + +```bash +sudo apt-get update +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \ + systemd-container \ + curl \ + nftables \ + util-linux +``` + +Install lab tooling separately. `jq` is used to patch the generated config, and local registry mode needs a container runtime such as `podman`: + +```bash +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y jq podman +``` + +Install `oras` if it is not already available. This lab uses `oras` for both the rootfs image copy and the bootstrap artifact bundle copy/pull, so `skopeo` is not required: + +```bash +ORAS_VERSION="1.3.2" +curl -fsSL "https://github.com/oras-project/oras/releases/download/v${ORAS_VERSION}/oras_${ORAS_VERSION}_linux_amd64.tar.gz" \ + -o /tmp/oras.tar.gz +sudo tar -C /usr/local/bin -xzf /tmp/oras.tar.gz oras +oras version +``` + +## Generate A Baseline AKS Flex Node Config + +On your workstation, create the bootstrap-token RBAC and generate a node config. This follows the same pattern as the base lab and quickstart. + +```bash +AKS_RG="<aks-resource-group>" +CLUSTER_NAME="<aks-cluster-name>" +SUBSCRIPTION_ID="<subscription-id>" +AGENT_POOL_NAME="${AGENT_POOL_NAME:-aksflexnodes}" + +scripts/aks-flex-config setup-node-rbac \ + --resource-group "$AKS_RG" \ + --cluster-name "$CLUSTER_NAME" \ + --subscription "$SUBSCRIPTION_ID" + +scripts/aks-flex-config generate-node-config \ + --resource-group "$AKS_RG" \ + --cluster-name "$CLUSTER_NAME" \ + --subscription "$SUBSCRIPTION_ID" \ + --agent-pool-name "$AGENT_POOL_NAME" \ + --bootstrap-token \ + --output ./aks-flex-node-config.json +``` + +If the Flex VM has multiple private IPs, or if you want to pin the node IP, set `node.kubelet.nodeIP` in the generated config. + +## Option A: Use A Filesystem Bootstrap Artifact Bundle + +In filesystem mode, the target VM reads bootstrap artifacts from a local directory or `file://` URL. 
The rootfs is also staged on the filesystem as a local OCI image layout and referenced with `oci-layout://`. + +### Stage The Rootfs Image As A Local OCI Layout + +Mirror the rootfs image into a local OCI layout while the VM still has egress, or perform this on a connected staging host and copy the OCI layout directory into the VM. + +On the target VM: + +```bash +ROOTFS_LAYOUT_DIR="/opt/aks-flex-node/offline/images/agent-ubuntu2404" +ROOTFS_IMAGE_LOCAL="oci-layout://${ROOTFS_LAYOUT_DIR}:v20260619" + +sudo mkdir -p "$(dirname "$ROOTFS_LAYOUT_DIR")" +sudo chown "$(id -u):$(id -g)" "$(dirname "$ROOTFS_LAYOUT_DIR")" + +oras copy \ + --to-oci-layout \ + "$ROOTFS_IMAGE_UPSTREAM" \ + "${ROOTFS_LAYOUT_DIR}:v20260619" +``` + +### Pull The Bootstrap Artifact Bundle To Files + +Pull the Unbounded bootstrap artifact bundle into the directory that the agent will read. The directory name is versioned so the config can use the `.KubernetesVersion` template value. + +```bash +ARTIFACTS_ROOT="/opt/aks-flex-node/offline/bootstrap-artifacts" +ARTIFACTS_DIR="${ARTIFACTS_ROOT}/${KUBERNETES_VERSION_V}" + +sudo mkdir -p "$ARTIFACTS_DIR" +sudo chown "$(id -u):$(id -g)" "$ARTIFACTS_DIR" + +oras pull \ + --output "$ARTIFACTS_DIR" \ + "$ARTIFACT_BUNDLE_UPSTREAM" + +find "$ARTIFACTS_DIR" -maxdepth 4 -type f | sort | head -50 +``` + +The directory should contain `manifest.json` and the paths referenced by that manifest, such as Kubernetes binaries, checksums, containerd, runc, CNI, and crictl artifacts. 
+ +### Patch The Config For Filesystem Mode + +On your workstation: + +```bash +ROOTFS_IMAGE_LOCAL="oci-layout:///opt/aks-flex-node/offline/images/agent-ubuntu2404:v20260619" +ARTIFACTS_SOURCE="file:///opt/aks-flex-node/offline/bootstrap-artifacts/{{ .KubernetesVersion }}" + +jq \ + --arg kubernetesVersion "$KUBERNETES_VERSION" \ + --arg rootfsImage "$ROOTFS_IMAGE_LOCAL" \ + --arg offlineSource "$ARTIFACTS_SOURCE" \ + '.components = (.components // {}) + | .components.kubernetes = $kubernetesVersion + | .bootstrap = (.bootstrap // {}) + | .bootstrap.ociImage = $rootfsImage + | .bootstrap.offlineArtifacts.source = $offlineSource' \ + ./aks-flex-node-config.json > ./aks-flex-node-config.offline-files.json +``` + +The relevant fields in the rendered config should look like this: + +```json +{ + "components": { + "kubernetes": "1.35.0" + }, + "bootstrap": { + "ociImage": "oci-layout:///opt/aks-flex-node/offline/images/agent-ubuntu2404:v20260619", + "offlineArtifacts": { + "source": "file:///opt/aks-flex-node/offline/bootstrap-artifacts/{{ .KubernetesVersion }}" + } + } +} +``` + +Keep the rest of the generated config, including `azure`, `node`, `networking`, and authentication fields. + +Copy the config to the target VM: + +```bash +TARGET_HOST="<user>@<target-vm-host>" +scp ./aks-flex-node-config.offline-files.json "$TARGET_HOST:/tmp/aks-flex-node-config.json" +``` + +## Option B: Use A Local OCI Registry For Rootfs And Bootstrap Artifacts + +In OCI registry mode, both the rootfs image and the bootstrap artifact bundle are mirrored into an unauthenticated registry reachable from the target VM. This is usually the best model for larger offline or restricted-egress environments. 
+ +### Start The Local Registry + +On the target VM: + +```bash +sudo podman run -d --name aks-flex-offline-registry --restart=always \ + -p 127.0.0.1:5000:5000 \ + docker.io/library/registry:2 + +curl -fsS http://127.0.0.1:5000/v2/ >/dev/null +``` + +### Mirror The Rootfs Image And Bootstrap Artifact Bundle + +While the target VM still has egress to the public artifact sources: + +```bash +ROOTFS_IMAGE_LOCAL="127.0.0.1:5000/aks-flex/rootfs/agent-ubuntu2404:v20260619" +ARTIFACT_BUNDLE_LOCAL="127.0.0.1:5000/aks-flex/bootstrap-artifacts:${ARTIFACT_TAG}" + +oras copy \ + --to-plain-http \ + "$ROOTFS_IMAGE_UPSTREAM" \ + "$ROOTFS_IMAGE_LOCAL" + +oras copy \ + --to-plain-http \ + "$ARTIFACT_BUNDLE_UPSTREAM" \ + "$ARTIFACT_BUNDLE_LOCAL" +``` + +If the target VM is never allowed to reach public registries, perform the mirror in a connected staging environment, move the registry contents or exported OCI artifacts into the target network, and restore them into the local registry before continuing. 
+ +### Patch The Config For Local Registry Mode + +On your workstation: + +```bash +ROOTFS_IMAGE_LOCAL="127.0.0.1:5000/aks-flex/rootfs/agent-ubuntu2404:v20260619" +ARTIFACTS_SOURCE="oci://127.0.0.1:5000/aks-flex/bootstrap-artifacts:alpha-0cd4fe2-k8s-{{ .KubernetesVersion }}" + +jq \ + --arg kubernetesVersion "$KUBERNETES_VERSION" \ + --arg rootfsImage "$ROOTFS_IMAGE_LOCAL" \ + --arg offlineSource "$ARTIFACTS_SOURCE" \ + '.components = (.components // {}) + | .components.kubernetes = $kubernetesVersion + | .bootstrap = (.bootstrap // {}) + | .bootstrap.ociImage = $rootfsImage + | .bootstrap.offlineArtifacts.source = $offlineSource' \ + ./aks-flex-node-config.json > ./aks-flex-node-config.offline-registry.json +``` + +The relevant fields in the rendered config should look like this: + +```json +{ + "components": { + "kubernetes": "1.35.0" + }, + "bootstrap": { + "ociImage": "127.0.0.1:5000/aks-flex/rootfs/agent-ubuntu2404:v20260619", + "offlineArtifacts": { + "source": "oci://127.0.0.1:5000/aks-flex/bootstrap-artifacts:alpha-0cd4fe2-k8s-{{ .KubernetesVersion }}" + } + } +} +``` + +Keep the rest of the generated config, including `azure`, `node`, `networking`, and authentication fields. + +Copy the config to the target VM: + +```bash +TARGET_HOST="<user>@<target-vm-host>" +scp ./aks-flex-node-config.offline-registry.json "$TARGET_HOST:/tmp/aks-flex-node-config.json" +``` + +## Optional: Restrict Egress After Staging + +After the host packages, rootfs image, and bootstrap artifact bundle are staged locally, restrict egress using your preferred mechanism, such as NSG rules, Azure Firewall, UDRs, or private-only routing. + +The target VM still needs to reach: + +- The AKS API server over HTTPS 443. +- Any control-plane or node networking paths required by the base VNet peering lab. +- The local registry on `127.0.0.1:5000` if using local registry mode. +- The local rootfs OCI layout path if using filesystem mode. 
+ +For a strict validation, block public artifact endpoints such as: + +```text +dl.k8s.io +github.com +objects.githubusercontent.com +github-releases.githubusercontent.com +pkg-containers.githubusercontent.com +ghcr.io +storage.googleapis.com +packages.microsoft.com +mcr.microsoft.com +registry.k8s.io +``` + +### Preflight-Only Host-Level Block With nftables + +For a quick preflight-only sanity check, add temporary host firewall rules on the target VM after all packages, images, and artifacts are staged. This blocks the currently resolved IPs for common public artifact hosts while leaving the AKS API server and the loopback registry reachable. + +Do not rely on these host-level `nftables` rules as the strict bootstrap isolation mechanism. During `start`, AKS Flex Node installs and starts an nftables reset unit that runs `flush ruleset`, so rules created before bootstrap are intentionally removed. Use the NSG or Azure Firewall option below for isolation that remains in effect during bootstrap. 
+ +```bash +BLOCKED_ARTIFACT_HOSTS=( + dl.k8s.io + github.com + objects.githubusercontent.com + github-releases.githubusercontent.com + pkg-containers.githubusercontent.com + ghcr.io + storage.googleapis.com + packages.microsoft.com + mcr.microsoft.com + registry.k8s.io +) + +sudo nft add table inet aksflex_offline 2>/dev/null || true +sudo nft 'add set inet aksflex_offline blocked_v4 { type ipv4_addr; flags interval; }' 2>/dev/null || true +sudo nft 'add chain inet aksflex_offline output { type filter hook output priority 0; policy accept; }' 2>/dev/null || true +sudo nft 'add rule inet aksflex_offline output ip daddr @blocked_v4 tcp dport { 80, 443 } reject' 2>/dev/null || true + +for host in "${BLOCKED_ARTIFACT_HOSTS[@]}"; do + getent ahostsv4 "$host" | awk '{print $1}' | sort -u | while read -r ip; do + sudo nft add element inet aksflex_offline blocked_v4 "{ ${ip} }" 2>/dev/null || true + done +done + +sudo nft list table inet aksflex_offline +``` + +These rules are intentionally temporary and IP-based. Public artifact hostnames often use CDNs, so DNS answers can change. Re-run the resolver loop if the validation runs much later. For strict bootstrap validation, use subnet-level controls such as NSG rules or Azure Firewall application rules. + +Remove the temporary rules with: + +```bash +sudo nft delete table inet aksflex_offline +``` + +### Subnet-Level Block With NSG Rules + +For stronger validation in Azure, use NSG rules after staging. NSGs cannot deny arbitrary FQDNs, so the strict pattern is to allow only required destinations, then deny outbound Internet. For a public AKS API server, resolve the API FQDN and allow that IP on TCP 443 before adding the deny rule. 
+ +```bash +VM_RG="<flex-vm-resource-group>" +NSG_NAME="<flex-vm-subnet-nsg-name>" +AKS_API_FQDN="$(az aks show -g "$AKS_RG" -n "$CLUSTER_NAME" --query fqdn -o tsv)" +AKS_API_IP="$(getent ahostsv4 "$AKS_API_FQDN" | awk 'NR==1 {print $1}')" + +az network nsg rule create \ + -g "$VM_RG" \ + --nsg-name "$NSG_NAME" \ + -n allow-aks-api \ + --priority 300 \ + --direction Outbound \ + --access Allow \ + --protocol Tcp \ + --source-address-prefixes '*' \ + --destination-address-prefixes "$AKS_API_IP" \ + --destination-port-ranges 443 + +az network nsg rule create \ + -g "$VM_RG" \ + --nsg-name "$NSG_NAME" \ + -n deny-internet-outbound \ + --priority 4000 \ + --direction Outbound \ + --access Deny \ + --protocol '*' \ + --source-address-prefixes '*' \ + --destination-address-prefixes Internet \ + --destination-port-ranges '*' +``` + +If you are using a private AKS cluster, allow the private API endpoint and required VNet/private endpoint ranges instead of the public API IP. If your validation needs durable FQDN allow/deny controls instead of resolved IPs, use Azure Firewall application rules plus a route table that sends VM subnet egress through the firewall. + +## Install The Config And Run Preflight + +On the target VM: + +```bash +sudo install -d -m 0755 /etc/aks-flex-node +sudo install -m 0600 /tmp/aks-flex-node-config.json /etc/aks-flex-node/config.json + +sudo aks-flex-node preflight --config /etc/aks-flex-node/config.json +``` + +Preflight should report successful host, API server, rootfs image, and artifact checks. In offline artifact mode, this is also the point where missing host packages are detected before bootstrap mutates the machine. 
+ +For automation, use JSON output: + +```bash +sudo aks-flex-node preflight --config /etc/aks-flex-node/config.json --output json +``` + +## Bootstrap The Node + +Run bootstrap from the target VM: + +```bash +sudo sh -c 'umask 022; aks-flex-node start --config /etc/aks-flex-node/config.json' +``` + +Verify the node from your workstation: + +```bash +kubectl get nodes -o wide +kubectl describe node <node-name> +``` + +## Validate That Offline Artifacts Were Used + +On the target VM, inspect the agent log: + +```bash +sudo grep -E 'pulling OCI image|downloading (kubernetes|cri-tools)|bootstrap-artifacts|dl.k8s.io|kubernetes-sigs' \ + /var/log/aks-flex-node/aks-flex-node.log +``` + +Expected evidence for local registry mode includes URLs like: + +```text +pulling OCI image image=127.0.0.1:5000/aks-flex/rootfs/agent-ubuntu2404:v20260619 +downloading kubernetes binary url=oci://127.0.0.1:5000/aks-flex/bootstrap-artifacts:alpha-0cd4fe2-k8s-v1.35.0#kubernetes/v1.35.0/bin/linux/amd64/kubelet +``` + +Expected evidence for filesystem mode includes log lines and URLs like: + +```text +using local OCI layout image image=oci-layout:///opt/aks-flex-node/offline/images/agent-ubuntu2404:v20260619 layout=/opt/aks-flex-node/offline/images/agent-ubuntu2404 +downloading kubernetes binary url=file:///opt/aks-flex-node/offline/bootstrap-artifacts/v1.35.0/kubernetes/v1.35.0/bin/linux/amd64/kubelet +``` + +There should be no bootstrap downloads from public upstream artifact URLs: + +```bash +sudo grep -E 'https://dl.k8s.io|https://github.com/kubernetes-sigs/cri-tools' \ + /var/log/aks-flex-node/aks-flex-node.log && echo "unexpected public artifact download" +``` + +## Troubleshooting + +### Preflight Fails `host-packages` + +Install the required host packages before isolating the VM from package repositories: + +```bash +sudo apt-get update +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y systemd-container curl nftables util-linux +``` + +### Preflight Fails `oci-image-reachable` + +For 
filesystem mode, check that the local OCI layout exists and has the expected tag: + +```bash +test -f /opt/aks-flex-node/offline/images/agent-ubuntu2404/oci-layout +oras manifest fetch --oci-layout /opt/aks-flex-node/offline/images/agent-ubuntu2404:v20260619 >/dev/null +``` + +For local registry mode, check the rootfs image reference and local registry health: + +```bash +curl -fsS http://127.0.0.1:5000/v2/ >/dev/null +oras manifest fetch --plain-http 127.0.0.1:5000/aks-flex/rootfs/agent-ubuntu2404:v20260619 >/dev/null +``` + +### Preflight Fails Artifact Checks + +For local registry mode, verify the OCI artifact is present: + +```bash +oras manifest fetch --plain-http "127.0.0.1:5000/aks-flex/bootstrap-artifacts:${ARTIFACT_TAG}" >/dev/null +``` + +For filesystem mode, verify `manifest.json` and expected artifact paths exist: + +```bash +sudo test -f "/opt/aks-flex-node/offline/bootstrap-artifacts/${KUBERNETES_VERSION_V}/manifest.json" +sudo find "/opt/aks-flex-node/offline/bootstrap-artifacts/${KUBERNETES_VERSION_V}" -maxdepth 4 -type f | sort | head -50 +``` diff --git a/docs/usages/configuration.md b/docs/usages/configuration.md index bdf03812..26d29c1d 100644 --- a/docs/usages/configuration.md +++ b/docs/usages/configuration.md @@ -6,16 +6,21 @@ AKS Flex Node reads a JSON config file passed with `--config`. aks-flex-node start --config /etc/aks-flex-node/config.json ``` +Before starting bootstrap, validate the same config with the non-mutating preflight command: + +```bash +aks-flex-node preflight --config /etc/aks-flex-node/config.json +``` + ## Top-Level Sections | Name | Type | Description | |------|------|-------------| | `azure` | object | Azure subscription, target AKS cluster, and authentication settings. | | `agent` | object | Local agent logging and runtime behavior. | -| `containerd` | object | Optional containerd version override. | -| `kubernetes` | object | Kubernetes component settings. | -| `cni` | object | Optional CNI plugin version override. 
| -| `runc` | object | Optional runc version override. | +| `components` | object | Kubernetes, container runtime, and sandbox image settings. | +| `bootstrap` | object | Bootstrap settings such as the rootfs OCI image. | +| `networking` | object | Cluster networking settings and optional CNI plugin version override. | | `node` | object | Kubelet, labels, taints, and node registration settings. | | `npd` | object | Optional node-problem-detector version override. | @@ -97,6 +102,14 @@ At least one join or Azure authentication method must be configured. `azure.boot | `components.kubernetes` | string | Kubernetes version for kubelet and related binaries. For AKS joins, use the target cluster version. | `1.34.3` | | `components.containerd` | string | Optional containerd version override. | `2.0.4` | | `components.runc` | string | Optional runc version override. | `1.1.12` | +| `components.sandboxImage` | string | Optional CRI sandbox/pause image used by containerd. When omitted, the shared agent default is used. | `mcr.microsoft.com/oss/kubernetes/pause:3.9` | + +## Bootstrap + +| Name | Type | Description | Sample Value | +|------|------|-------------|--------------| +| `bootstrap.ociImage` | string | Optional nspawn rootfs OCI image used during bootstrap. When omitted, the shared agent default image selection is used. | `ghcr.io/example/aks-flex-node-rootfs:ubuntu-24.04` | +| `bootstrap.offlineArtifacts.source` | string | Optional complete offline binary artifact bundle source. Supports absolute paths, `file://`, and unauthenticated `oci://` artifact references. The value is rendered as a strict Go template with `.KubernetesVersion` and `.KubernetesVersionNoV`. Preflight treats missing host packages as fatal when this is set. | `/opt/aks-flex-node/artifacts/{{ .KubernetesVersion }}` | ## Networking @@ -281,7 +294,8 @@ Add these sections when you need to pin runtime component versions explicitly. 
"components": { "kubernetes": "1.34.3", "containerd": "2.0.4", - "runc": "1.1.12" + "runc": "1.1.12", + "sandboxImage": "mcr.microsoft.com/oss/kubernetes/pause:3.9" }, "networking": { "cniVersion": "v1.6.2" @@ -291,3 +305,18 @@ Add these sections when you need to pin runtime component versions explicitly. } } ``` + +### Bootstrap Image And Offline Artifact Overrides + +Add this section when you need to pin the nspawn rootfs image or use a complete offline binary artifact bundle. `offlineArtifacts.source` follows the Unbounded offline artifact bundle layout and includes `manifest.json` plus Kubernetes, containerd, runc, CNI, crictl, and optional sandbox image archive artifacts. + +```json +{ + "bootstrap": { + "ociImage": "ghcr.io/example/aks-flex-node-rootfs:ubuntu-24.04", + "offlineArtifacts": { + "source": "/opt/aks-flex-node/artifacts/{{ .KubernetesVersion }}" + } + } +} +``` diff --git a/docs/usages/joining-nodes.md b/docs/usages/joining-nodes.md index cf548b80..7e390d5b 100644 --- a/docs/usages/joining-nodes.md +++ b/docs/usages/joining-nodes.md @@ -19,10 +19,11 @@ High-level flow: 1. Run [`scripts/aks-flex-config setup-node-rbac`](../../scripts/aks-flex-config) to setup required node bootstrap RBAC permissions. 2. Run `scripts/aks-flex-config generate-node-config --bootstrap-token` to create a bootstrap token, fetch AKS cluster metadata, and render the host config. 3. Copy the generated config to `/etc/aks-flex-node/config.json` on the target host. -4. Run `aks-flex-node start --config /etc/aks-flex-node/config.json`. -5. Verify with `kubectl get nodes -o wide`. +4. Run `aks-flex-node preflight --config /etc/aks-flex-node/config.json` to validate host, cluster, rootfs, and artifact prerequisites without mutating the node. +5. Run `aks-flex-node start --config /etc/aks-flex-node/config.json`. +6. Verify with `kubectl get nodes -o wide`. 
-See the repository [README](../../README.md#getting-started) for the complete bootstrap token walkthrough, and [AKS Flex Config Helper](aks-flex-config.md) for all helper command options. +See the repository [README](../../README.md#getting-started) for the complete bootstrap token walkthrough, [AKS Flex Config Helper](aks-flex-config.md) for all helper command options, and [Operations](operations.md#preflight) for preflight options and JSON output. ## Managed Identity diff --git a/docs/usages/operations.md b/docs/usages/operations.md index cc26321c..fae735a4 100644 --- a/docs/usages/operations.md +++ b/docs/usages/operations.md @@ -2,6 +2,31 @@ This guide summarizes common host and cluster operations for AKS Flex Node. +## Preflight + +Run preflight before mutating the host. The command validates the config, resolves the nspawn goal state, and checks host prerequisites, API server reachability, rootfs image reachability, and bootstrap artifact sources. + +```bash +aks-flex-node preflight --config /etc/aks-flex-node/config.json +``` + +Preflight exits non-zero when a fatal check fails. Use JSON output for automation: + +```bash +aks-flex-node preflight --config /etc/aks-flex-node/config.json --output json +``` + +Useful options: + +```bash +aks-flex-node preflight \ + --config /etc/aks-flex-node/config.json \ + --ignore-preflight-errors=<check>[,<check>...] \ + --fail-on-warnings +``` + +When `bootstrap.offlineArtifacts.source` is configured, missing host packages are fatal because offline bootstrap cannot rely on package installation during `start`. + ## Start Start installs host components, starts the nspawn-backed worker, installs the systemd unit, and starts the agent daemon. 
diff --git a/go.mod b/go.mod index 98eac1ad..01ce610e 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8 v8.3.0-beta.2 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute v1.2.0 github.com/Azure/kubelogin v0.2.15 - github.com/Azure/unbounded v0.1.19 + github.com/Azure/unbounded v0.1.20-rc.4.0.20260702231143-48eb8f2d362c github.com/google/renameio/v2 v2.0.2 github.com/google/uuid v1.6.0 github.com/spf13/cobra v1.10.2 @@ -53,9 +53,7 @@ require ( github.com/go-openapi/swag v0.23.0 // indirect github.com/golang-jwt/jwt/v4 v4.5.2 // indirect github.com/golang-jwt/jwt/v5 v5.3.1 // indirect - github.com/google/btree v1.1.3 // indirect github.com/google/gnostic-models v0.7.0 // indirect - github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect diff --git a/go.sum b/go.sum index 650118d3..7f1e99e1 100644 --- a/go.sum +++ b/go.sum @@ -1,15 +1,9 @@ github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk= github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= -github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1 h1:jHb/wfvRikGdxMXYV3QG/SzUOPYN9KEUUuC0Yd0/vC0= -github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1/go.mod h1:pzBXCYn05zvYIrwLgtK8Ap8QcjRg+0i76tMQdWN6wOk= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.22.0 h1:aokoqcHvaGjiM3VpjKDfMMnF/8epJ+Q1HLJ7CudztqE= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.22.0/go.mod h1:/WYEx9pcM9Y+Dd/APJaNlSvVSvzl54rrMdZT5+Oi2LM= -github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= -github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1/go.mod 
h1:IYus9qsFobWIc2YVwe/WPjcnyCkPKtnHAqUYeebc8z0= github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.14.0 h1:CU4+EJeJi3TKYWEcYuSdWsjzw0nVsK/H0MSQOiPcymU= github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.14.0/go.mod h1:q0+UTSRvShwUCrR/s5HtyInYphN7Wvxb7snFM3u+SLA= -github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 h1:yz1bePFlP5Vws5+8ez6T3HWXPmwOK7Yvq8QxDBD3SKY= -github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2/go.mod h1:Pa9ZNPuoNu/GztvBSKk9J1cDJW6vk/n0zLtV4mgd8N8= github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.4.0 h1:xFaZZ+IubdftrDHnGGwZ6QvQ3KHTtWl2MCK+GMt2vxs= github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.4.0/go.mod h1:mCBhUhlMjLLJKr5aqw2TNS/VqJOie8MzWq3DAMJeKso= github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 h1:fhqpLE3UEXi9lPaBRpQ6XuRW0nU7hgg4zlmZZa+a9q4= @@ -42,19 +36,12 @@ github.com/Azure/go-autorest/tracing v0.6.0 h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUM github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU= github.com/Azure/kubelogin v0.2.15 h1:oJqD8Dvput3rO/xZgMTU+hBrcgg0BfQGPCNHJ2dEmys= github.com/Azure/kubelogin v0.2.15/go.mod h1:RwJS8TzSHTVQhfIZA4HLS79QGfvIp0ocIVLT5oHS/ls= -github.com/Azure/unbounded v0.1.11 h1:U0tQa/K2F1WVKbMFf7A1KD+RzpjiMvCdY2cMpBtpiJM= -github.com/Azure/unbounded v0.1.11/go.mod h1:8/ekWflNUvo8KTVPSYBX+7w/JpcoaeE7DTsdMfVfI6c= -github.com/Azure/unbounded v0.1.19-0.20260625051614-ffa40c0343ca h1:tLaOIuKHbFtfO/Mh2Le2jiC5K4SNqZrojZbfUKxV1GA= -github.com/Azure/unbounded v0.1.19-0.20260625051614-ffa40c0343ca/go.mod h1:Gidt1+u+bKW5uGvB8d1vMju/remIb38F5LaiPugE53M= -github.com/Azure/unbounded v0.1.19-0.20260625061542-cae5d3e6b84c h1:62XpIjdUeOGdlZLD75rs7nrW3doOBPUNTkad33m4Leo= -github.com/Azure/unbounded v0.1.19-0.20260625061542-cae5d3e6b84c/go.mod h1:Gidt1+u+bKW5uGvB8d1vMju/remIb38F5LaiPugE53M= -github.com/Azure/unbounded v0.1.19 h1:Ly4DDbJN/Ipe1hkacsi/ghmGbAHszYgw83PpSDPGTog= -github.com/Azure/unbounded v0.1.19/go.mod 
h1:Gidt1+u+bKW5uGvB8d1vMju/remIb38F5LaiPugE53M= +github.com/Azure/unbounded v0.1.20-rc.4.0.20260702231143-48eb8f2d362c h1:g6az3ey2a+ZRDgKPoA+OGcw0PbpQvZFUSIcsiBsoNkM= +github.com/Azure/unbounded v0.1.20-rc.4.0.20260702231143-48eb8f2d362c/go.mod h1:WnWLk0Y+Hwnj6MSMGa3YkSXG75y6JS1ek+UyNzAb9d0= github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= github.com/AzureAD/microsoft-authentication-library-for-go v1.7.2 h1:RHK7bS+HQMslb1sZpAokUt+zTVmue0hKSs2C791hhzU= github.com/AzureAD/microsoft-authentication-library-for-go v1.7.2/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= -github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= github.com/Masterminds/semver/v3 v3.5.0 h1:kQceYJfbupGfZOKZQg0kou0DgAKhzDg2NZPAwZ/2OOE= github.com/Masterminds/semver/v3 v3.5.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= @@ -73,18 +60,13 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= -github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A= -github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw= github.com/containerd/platforms v1.0.0-rc.4 h1:M42JrUT4zfZTqtkUwkr0GzmUWbfyO5VO0Q5b3op97T4= github.com/containerd/platforms v1.0.0-rc.4/go.mod h1:lKlMXyLybmBedS/JJm11uDofzI8L2v0J2ZbYvNsbq1A= -github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= 
github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/cyphar/filepath-securejoin v0.5.0 h1:hIAhkRBMQ8nIeuVwcAoymp7MY4oherZdAxD+m0u9zaw= -github.com/cyphar/filepath-securejoin v0.5.0/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/cyphar/filepath-securejoin v0.6.0 h1:BtGB77njd6SVO6VztOHfPxKitJvd/VPT+OFBFMOi1Is= github.com/cyphar/filepath-securejoin v0.6.0/go.mod h1:A8hd4EnAeyujCJRrICiOWqjS1AX0a9kM5XL+NwKoYSc= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -129,8 +111,6 @@ github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArs github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= -github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= @@ -195,19 +175,15 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/onsi/ginkgo v1.6.0 
h1:Ix8l273rp3QzYgXSR+c8d1fTG7UPgYkOSELPhiY/YGw= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= -github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= github.com/onsi/ginkgo/v2 v2.27.4 h1:fcEcQW/A++6aZAZQNUmNjvA9PSOzefMJBerHJ4t8v8Y= +github.com/onsi/ginkgo/v2 v2.27.4/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= github.com/onsi/gomega v1.5.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= -github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= -github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/onsi/gomega v1.39.0 h1:y2ROC3hKFmQZJNFeGAMeHZKkjBL65mIZcvrLQBF9k6Q= +github.com/onsi/gomega v1.39.0/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M= -github.com/opencontainers/runtime-spec v1.2.1 h1:S4k4ryNgEpxW1dzyqffOmhI1BHYcjzU8lpJfSlR0xww= -github.com/opencontainers/runtime-spec v1.2.1/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.3.0 h1:YZupQUdctfhpZy3TM39nN9Ika5CBWT5diQ8ibYCRkxg= github.com/opencontainers/runtime-spec v1.3.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/umoci v0.6.0 h1:Dsm4beJpglN5y2E2EUSZZcNey4Ml4+nKepvwLQwgIec= @@ -224,12 +200,8 @@ github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 
h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= -github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/prometheus/procfs v0.20.1 h1:XwbrGOIplXW/AU3YhIhLODXMJYyC1isLFfYCsTEycfc= github.com/prometheus/procfs v0.20.1/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo= github.com/rogpeppe/fastuuid v1.1.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= @@ -272,8 +244,6 @@ github.com/tj/go-buffer v1.1.0/go.mod h1:iyiJpfFcR2B9sXu7KvjbT9fpM4mOelRSDTbntVj github.com/tj/go-elastic v0.0.0-20171221160941-36157cbbebc2/go.mod h1:WjeM0Oo1eNAjXGDx2yma7uG2XoyRZTq1uv3M/o7imD0= github.com/tj/go-kinesis v0.0.0-20171128231115-08b17f58cb1b/go.mod h1:/yhzCV0xPfx6jb1bBgRFjl5lytqVqZXEaeqWP8lTEao= github.com/tj/go-spin v1.1.0/go.mod h1:Mg1mzmePZm4dva8Qz60H2lHwmJ2loum4VIrLgVnKwh4= -github.com/urfave/cli v1.22.12 h1:igJgVw1JdKH+trcLWLeLwZjU9fEfPesQ+9/e4MQ44S8= -github.com/urfave/cli v1.22.12/go.mod h1:sSBEIC79qR6OvcmsD4U3KABeOTxDqQtdDnaFuUN30b8= github.com/urfave/cli v1.22.16 h1:MH0k6uJxdwdeWQTwhSO42Pwr4YLrNLwBtg1MRgTqPdQ= github.com/urfave/cli v1.22.16/go.mod h1:EeJR6BKodywf4zciqrdw6hpCPk68JO9z5LazXZMn5Po= github.com/vbatts/go-mtree v0.6.1-0.20250911112631-8307d76bc1b9 h1:R6l9BtUe83abUGu1YKGkfa17wMMFLt6mhHVQ8MxpfRE= @@ -283,16 +253,12 @@ github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcY github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= -go.uber.org/mock v0.5.0 h1:KAMbZvZPyBPWgD14IrIQ38QCyjwpvVVV6K/bHl1IwQU= -go.uber.org/mock v0.5.0/go.mod h1:ge71pBPLYDk7QIi1LupWxdAykm7KIEFchiOqd6z7qMM= go.uber.org/mock v0.6.0 h1:hyF9dfmbgIX5EfOdasqLsWD6xqpNZlXblLB/Dbnwv3Y= +go.uber.org/mock v0.6.0/go.mod h1:KiVJ4BqZJaMj4svdfmHM0AUx4NJYO8ZNpPnZn1Z+BBU= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= -go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= -go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= go.uber.org/zap v1.28.0 h1:IZzaP1Fv73/T/pBMLk4VutPl36uNC+OSUh3JLG3FIjo= -go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= -go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.uber.org/zap v1.28.0/go.mod h1:rDLpOi171uODNm/mxFcuYWxDsqWSAVkFdX4XojSKg/Q= go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ= go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= @@ -302,13 +268,11 @@ golang.org/x/crypto v0.0.0-20190426145343-a29dc8fdc734/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.6.0/go.mod h1:OFC/31mSvZgRz0V1QTNCzfAI1aIRzbiufJtkMIlEp58= -golang.org/x/crypto v0.52.0 h1:RMs7fP2rXdep0CftQlK8Uf+kibLm7qkCcradZWYz988= -golang.org/x/crypto v0.52.0/go.mod h1:1QgfPxDqh0T2M/elOJtp9RvuR95kVjir0e6/BvEmGbc= golang.org/x/crypto v0.53.0 h1:QZ4Muo8THX6CizN2vPPd5fBGHyogrdK9fG4wLPFUsto= golang.org/x/crypto v0.53.0/go.mod 
h1:DNLU434OwVakk9PzuwV8w62mAJpRJL3vsgcfp4Qnsio= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.36.0 h1:JJjpVx6myfUsUdAzZuOSTTmRE0PfZeNWzzvKrP7amb4= -golang.org/x/mod v0.36.0/go.mod h1:moc6ELqsWcOw5Ef3xVprK5ul/MvtVvkIXLziUOICjUQ= +golang.org/x/mod v0.37.0 h1:vF1DjpVEshcIqoEaauuHebaLk1O1forxjxBaVn884JQ= +golang.org/x/mod v0.37.0/go.mod h1:m8S8VeM9r4dzDwjrKO0a1sZP3YjeMamRRlD+fmR2Q/0= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -316,8 +280,6 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= -golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= golang.org/x/net v0.56.0 h1:Rw8j/hFzGvJUZwNBXnAtf5sVDVt+65SK2C7IxCxZt5o= golang.org/x/net v0.56.0/go.mod h1:D3Ku6r+V6JROoZK144D2XfMHFcMq/0zSfLelVTCFKec= golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs= @@ -325,8 +287,6 @@ golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 
-golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= -golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sync v0.21.0 h1:HLII4xRRTtCRkxYp4HNFF0Js/Og6q2i++KXbg0gHCwM= golang.org/x/sync v0.21.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -340,15 +300,11 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= -golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw= golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= -golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= golang.org/x/term v0.44.0 h1:0rLvDRCtNj0gZkyIXhCyOb2OAzEhLVqc4B+hrsBhrmc= golang.org/x/term v0.44.0/go.mod h1:7ze4MdzUzLXpSAoFP1H0bOI9aXDqveSvatT5vKcFh2Y= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -357,28 +313,21 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod 
h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= -golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE= golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4= -golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= -golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c= -golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= golang.org/x/tools v0.45.0 h1:18qN3FAooORvApf5XjCXgsuayZOEtXf6JK18I3+ONa8= +golang.org/x/tools v0.45.0/go.mod h1:LuUGqqaXcXMEFEruIVJVm5mgDD8vww/z/SR1gQ4uE/0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.36.11 
h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= -google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af h1:+5/Sw3GsDNlEmu7TfklWKPdQ0Ykja5VEmq2i817+jbI= google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -400,48 +349,28 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.0-20200605160147-a5ece683394c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.35.4 h1:P7nFYKl5vo9AGUp1Z+Pmd3p2tA7bX2wbFWCvDeRv988= -k8s.io/api v0.35.4/go.mod h1:yl4lqySWOgYJJf9RERXKUwE9g2y+CkuwG+xmcOK8wXU= k8s.io/api v0.36.2 h1:TF6YDLIzKfccK7cq9YpTcGX8TJmEkHVRv78DM51fRYY= k8s.io/api v0.36.2/go.mod h1:F4LbMO4brjZYh7yFkXWhynSvtB7YauxV4c+HHkNRGNg= -k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= -k8s.io/apiextensions-apiserver v0.35.0/go.mod h1:E1Ahk9SADaLQ4qtzYFkwUqusXTcaV2uw3l14aqpL2LU= k8s.io/apiextensions-apiserver v0.36.0 h1:Wt7E8J+VBCbj4FjiBfDTK/neXDDjyJVJc7xfuOHImZ0= k8s.io/apiextensions-apiserver v0.36.0/go.mod h1:kGDjH0msuiIB3tgsYRV0kS9GqpMYMUsQ3GHv7TApyug= -k8s.io/apimachinery v0.35.4 h1:xtdom9RG7e+yDp71uoXoJDWEE2eOiHgeO4GdBzwWpds= -k8s.io/apimachinery v0.35.4/go.mod h1:NNi1taPOpep0jOj+oRha3mBJPqvi0hGdaV8TCqGQ+cc= k8s.io/apimachinery v0.36.2 h1:0PE/W/WNy1UX61NLbXY5TMbJ6UwLL6E6lAPkYrKFxbQ= k8s.io/apimachinery v0.36.2/go.mod h1:fvf/HOLXq9RId0rnDIbN1OEBvHXdQbLMM8nu0LcBUf4= -k8s.io/client-go v0.35.4 h1:DN6fyaGuzK64UvnKO5fOA6ymSjvfGAnCAHAR0C66kD8= -k8s.io/client-go v0.35.4/go.mod h1:2Pg9WpsS4NeOpoYTfHHfMxBG8zFMSAUi4O/qoiJC3nY= k8s.io/client-go v0.36.2 
h1:bfgxmFKc9CgqsgX4xKLAAdmTQlWee7Ob/HlDOrJ5TBI= k8s.io/client-go v0.36.2/go.mod h1:1vgO4OAlfPnoLcb+Rze2GF5rAr14w8qjrYMoyXJzQj0= -k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= -k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/klog/v2 v2.140.0 h1:Tf+J3AH7xnUzZyVVXhTgGhEKnFqye14aadWv7bzXdzc= k8s.io/klog/v2 v2.140.0/go.mod h1:o+/RWfJ6PwpnFn7OyAG3QnO47BFsymfEfrz6XyYSSp0= -k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= -k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= k8s.io/kube-openapi v0.0.0-20260319004828-5883c5ee87b9 h1:Sztf7ESG9tAXRW/ACJZjrj5jhdOUqS2KFRQT+CTvu78= k8s.io/kube-openapi v0.0.0-20260319004828-5883c5ee87b9/go.mod h1:uGBT7iTA6c6MvqUvSXIaYZo9ukscABYi2btjhvgKGZ0= -k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= -k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= k8s.io/utils v0.0.0-20260319190234-28399d86e0b5 h1:kBawHLSnx/mYHmRnNUf9d4CpjREbeZuxoSGOX/J+aYM= k8s.io/utils v0.0.0-20260319190234-28399d86e0b5/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= -oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc= -oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o= oras.land/oras-go/v2 v2.6.1 h1:bonOEkjLfp8tt6qXWRRWP6p1F+9octchOf2EqnWB4Zs= oras.land/oras-go/v2 v2.6.1/go.mod h1:dhtFrFOuZuDtAVeZ9FUnaa5zfzplG3ZnFX9/uH1J/Yk= -sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= -sigs.k8s.io/controller-runtime v0.23.3/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= sigs.k8s.io/controller-runtime v0.24.1 h1:miPEwrmirImAvgME1L9qebGHrOnGJoVmVdtOU9fRfo4= sigs.k8s.io/controller-runtime v0.24.1/go.mod h1:vFkfY5fGt5xAC/sKb8IBFKgWPNKG9OUG29dR8Y2wImw= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 
h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 h1:2WOzJpHUBVrrkDjU4KBT8n5LDcj824eX0I5UKcgeRUs= -sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= sigs.k8s.io/structured-merge-diff/v6 v6.3.2 h1:kwVWMx5yS1CrnFWA/2QHyRVJ8jM6dBA80uLmm0wJkk8= sigs.k8s.io/structured-merge-diff/v6 v6.3.2/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= diff --git a/hack/e2e/README.md b/hack/e2e/README.md index acb5a990..67a0e22c 100644 --- a/hack/e2e/README.md +++ b/hack/e2e/README.md @@ -42,14 +42,16 @@ The default `all` command runs: | Command | Description | |---------|-------------| | `all` | Full flow: build, infra, join, validate, unjoin, validate absent, rejoin, validate, repave, logs, cleanup. | -| `infra` | Deploy AKS cluster and three VMs via Bicep. | +| `infra` | Deploy AKS cluster and four VMs via Bicep. | | `join` | Join all Flex Node VMs. | | `join-msi` | Join only the managed-identity node. | | `join-token` | Join only the bootstrap-token node. | +| `join-offline` | Join only the offline-artifacts bootstrap-token node. | | `join-kubeadm` | Join only the kubeadm-style bootstrap-token node. | | `unjoin` | Unjoin all Flex Node VMs. | | `unjoin-msi` | Unjoin only the managed-identity node. | | `unjoin-token` | Unjoin only the bootstrap-token node. | +| `unjoin-offline` | Unjoin only the offline-artifacts node. | | `unjoin-kubeadm` | Unjoin only the kubeadm-style node. | | `validate` | Verify joined nodes, node-problem-detector status, and run smoke tests. 
| | `validate-absent` | Verify Flex Node objects are absent after unjoin. | @@ -91,12 +93,13 @@ Additional environment variables: ## Join Modes -The suite validates three join paths: +The suite validates four join paths: | VM | Auth Mode | Join Path | |----|-----------|-----------| | `vm-e2e-msi-*` | Managed Identity | Generated managed-identity config and `aks-flex-node start` flow. | | `vm-e2e-token-*` | Bootstrap Token | Kubernetes bootstrap token, RBAC, generated config, and `aks-flex-node start` flow. | +| `vm-e2e-offline-*` | Bootstrap Token + Offline Artifacts | Bootstrap token config pinned to `bootstrap.ociImage=ghcr.io/azure/agent-ubuntu2404:v20260619` and `bootstrap.offlineArtifacts.source=oci://ghcr.io/azure/unbounded/bootstrap-artifacts:alpha-0cd4fe2-k8s-{{ .KubernetesVersion }}`. | | `vm-e2e-kubeadm-*` | Bootstrap Token | Kubeadm-style bootstrap resources plus generated config and `aks-flex-node start` flow. | The bootstrap-token VM is provisioned with an uppercase guest OS hostname while diff --git a/hack/e2e/infra/main.bicep b/hack/e2e/infra/main.bicep index 8a7fe72c..d257bbe8 100644 --- a/hack/e2e/infra/main.bicep +++ b/hack/e2e/infra/main.bicep @@ -5,6 +5,7 @@ // - AKS cluster (1-node control plane) // - VM with system-assigned managed identity (MSI auth mode) // - VM without managed identity (bootstrap token auth mode) +// - VM without managed identity (offline bootstrap artifacts mode) // - VM without managed identity (kubeadm apply -f auth mode) // // All flex-node VMs run Ubuntu 22.04 LTS, have public IPs, and allow SSH @@ -39,6 +40,7 @@ param tags object = {} var clusterName = 'aks-e2e-${nameSuffix}' var msiVmName = 'vm-e2e-msi-${nameSuffix}' var tokenVmName = 'vm-e2e-token-${nameSuffix}' +var offlineVmName = 'vm-e2e-offline-${nameSuffix}' var kubeadmVmName = 'vm-e2e-kubeadm-${nameSuffix}' var vnetName = 'vnet-e2e-${nameSuffix}' var nsgName = 'nsg-e2e-${nameSuffix}' @@ -171,6 +173,20 @@ module vmToken 'modules/vm.bicep' = { } } +module 
vmOffline 'modules/vm.bicep' = { + name: 'deploy-vm-offline' + params: { + location: location + vmName: offlineVmName + vmSize: vmSize + adminUsername: adminUsername + sshPublicKey: sshPublicKey + subnetId: vnet.properties.subnets[1].id + assignManagedIdentity: false + tags: tags + } +} + module vmKubeadm 'modules/vm.bicep' = { name: 'deploy-vm-kubeadm' params: { @@ -225,6 +241,10 @@ output tokenVmName string = vmToken.outputs.vmName output tokenVmIp string = vmToken.outputs.publicIpAddress output tokenVmPrivateIp string = vmToken.outputs.privateIpAddress +output offlineVmName string = vmOffline.outputs.vmName +output offlineVmIp string = vmOffline.outputs.publicIpAddress +output offlineVmPrivateIp string = vmOffline.outputs.privateIpAddress + output kubeadmVmName string = vmKubeadm.outputs.vmName output kubeadmVmIp string = vmKubeadm.outputs.publicIpAddress diff --git a/hack/e2e/lib/cleanup.sh b/hack/e2e/lib/cleanup.sh index f4957e60..94cbd216 100755 --- a/hack/e2e/lib/cleanup.sh +++ b/hack/e2e/lib/cleanup.sh @@ -31,6 +31,10 @@ _collect_vm_logs() { "sudo journalctl -u 'aks-flex-node-*' -n 500 --no-pager 2>/dev/null" \ > "${E2E_LOG_DIR}/${prefix}-agent-journal.log" 2>/dev/null || true + remote_exec "${vm_ip}" \ + "sudo cat /tmp/aks-flex-node-preflight.log 2>/dev/null" \ + > "${E2E_LOG_DIR}/${prefix}-preflight.log" 2>/dev/null || true + # Kubelet and containerd run inside the nspawn machine (kube1 or kube2). # Use nsenter via the machine leader PID to run journalctl inside the container. # Fall back to host journal for non-nspawn setups. 
@@ -109,9 +113,10 @@ collect_logs() { mkdir -p "${E2E_LOG_DIR}" - local msi_vm_ip token_vm_ip kubeadm_vm_ip + local msi_vm_ip token_vm_ip offline_vm_ip kubeadm_vm_ip msi_vm_ip="$(state_get msi_vm_ip)" token_vm_ip="$(state_get token_vm_ip)" + offline_vm_ip="$(state_get offline_vm_ip)" kubeadm_vm_ip="$(state_get kubeadm_vm_ip)" if [[ -n "${msi_vm_ip}" ]]; then @@ -122,6 +127,10 @@ collect_logs() { _collect_vm_logs "${token_vm_ip}" "token" || true fi + if [[ -n "${offline_vm_ip}" ]]; then + _collect_vm_logs "${offline_vm_ip}" "offline" || true + fi + if [[ -n "${kubeadm_vm_ip}" ]]; then _collect_vm_logs "${kubeadm_vm_ip}" "kubeadm" || true fi @@ -167,11 +176,12 @@ cleanup() { stop_daemon_csr_approver - local resource_group cluster_name msi_vm_name token_vm_name kubeadm_vm_name + local resource_group cluster_name msi_vm_name token_vm_name offline_vm_name kubeadm_vm_name resource_group="$(state_get resource_group)" cluster_name="$(state_get cluster_name)" msi_vm_name="$(state_get msi_vm_name)" token_vm_name="$(state_get token_vm_name)" + offline_vm_name="$(state_get offline_vm_name)" kubeadm_vm_name="$(state_get kubeadm_vm_name)" local deployment_name deployment_name="$(state_get deployment_name)" @@ -190,12 +200,16 @@ cleanup() { az vm delete --resource-group "${resource_group}" --name "${token_vm_name}" \ --force-deletion yes --yes --no-wait 2>/dev/null || true - log_info "[3/5] Deleting Kubeadm VM: ${kubeadm_vm_name}..." + log_info "[3/6] Deleting Offline VM: ${offline_vm_name}..." + az vm delete --resource-group "${resource_group}" --name "${offline_vm_name}" \ + --force-deletion yes --yes --no-wait 2>/dev/null || true + + log_info "[4/6] Deleting Kubeadm VM: ${kubeadm_vm_name}..." az vm delete --resource-group "${resource_group}" --name "${kubeadm_vm_name}" \ --force-deletion yes --yes --no-wait 2>/dev/null || true # Clean up leftover networking resources tied to our deployment - log_info "[4/5] Cleaning up networking resources..." 
+ log_info "[5/6] Cleaning up networking resources..." local run_id="${GITHUB_RUN_ID:-}" if [[ -n "${run_id}" ]]; then for res_type in networkInterfaces publicIPAddresses networkSecurityGroups disks; do @@ -207,7 +221,7 @@ cleanup() { done fi - log_info "[5/5] Deleting AKS cluster: ${cluster_name}..." + log_info "[6/6] Deleting AKS cluster: ${cluster_name}..." az aks delete --resource-group "${resource_group}" --name "${cluster_name}" \ --yes --no-wait 2>/dev/null || true diff --git a/hack/e2e/lib/common.sh b/hack/e2e/lib/common.sh index 45a74d52..527490bc 100755 --- a/hack/e2e/lib/common.sh +++ b/hack/e2e/lib/common.sh @@ -250,6 +250,31 @@ state_dump() { fi } +# --------------------------------------------------------------------------- +# Cluster command serialization +# --------------------------------------------------------------------------- +# Several join flows run in parallel and some helper commands call +# `az aks get-credentials`, which rewrites the shared E2E kubeconfig. Serialize +# local cluster/kubeconfig operations so concurrent kubectl calls never observe a +# partially-written or context-less kubeconfig and fall back to localhost:8080. +with_cluster_lock() { + local lock_dir="${E2E_WORK_DIR}/cluster.lock" + local rc + + mkdir -p "${E2E_WORK_DIR}" + while ! mkdir "${lock_dir}" 2>/dev/null; do + sleep 1 + done + + set +e + "$@" + rc=$? + set -e + + rmdir "${lock_dir}" 2>/dev/null || true + return "${rc}" +} + # --------------------------------------------------------------------------- # SSH helpers # --------------------------------------------------------------------------- @@ -335,7 +360,7 @@ ensure_daemon_csr_approver() { log_info "Starting e2e daemon CSR approver..." 
local approver_kubeconfig="${E2E_WORK_DIR}/daemon-csr-approver.kubeconfig" - kubectl config view --raw --minify > "${approver_kubeconfig}" + with_cluster_lock kubectl config view --raw --minify > "${approver_kubeconfig}" chmod 600 "${approver_kubeconfig}" pkill -f 'e2ehelper daemon-csr-approver' 2>/dev/null || true diff --git a/hack/e2e/lib/infra.sh b/hack/e2e/lib/infra.sh index d0307984..136bb56c 100755 --- a/hack/e2e/lib/infra.sh +++ b/hack/e2e/lib/infra.sh @@ -105,7 +105,8 @@ infra_deploy() { -o json) local cluster_name cluster_id msi_vm_name msi_vm_ip msi_vm_principal_id - local token_vm_name token_vm_ip token_vm_private_ip kubeadm_vm_name kubeadm_vm_ip admin_username + local token_vm_name token_vm_ip token_vm_private_ip offline_vm_name offline_vm_ip offline_vm_private_ip + local kubeadm_vm_name kubeadm_vm_ip admin_username cluster_name=$(echo "${outputs}" | jq -r '.clusterName.value') cluster_id=$(echo "${outputs}" | jq -r '.clusterId.value') @@ -115,6 +116,9 @@ infra_deploy() { token_vm_name=$(echo "${outputs}" | jq -r '.tokenVmName.value') token_vm_ip=$(echo "${outputs}" | jq -r '.tokenVmIp.value') token_vm_private_ip=$(echo "${outputs}" | jq -r '.tokenVmPrivateIp.value // ""') + offline_vm_name=$(echo "${outputs}" | jq -r '.offlineVmName.value') + offline_vm_ip=$(echo "${outputs}" | jq -r '.offlineVmIp.value') + offline_vm_private_ip=$(echo "${outputs}" | jq -r '.offlineVmPrivateIp.value // ""') kubeadm_vm_name=$(echo "${outputs}" | jq -r '.kubeadmVmName.value') kubeadm_vm_ip=$(echo "${outputs}" | jq -r '.kubeadmVmIp.value') admin_username=$(echo "${outputs}" | jq -r '.adminUsername.value') @@ -123,6 +127,10 @@ infra_deploy() { log_error "Missing or invalid token VM private IP from deployment outputs: '${token_vm_private_ip}'" return 1 fi + if [[ -z "${offline_vm_private_ip}" ]] || ! 
is_valid_ipv4 "${offline_vm_private_ip}"; then + log_error "Missing or invalid offline VM private IP from deployment outputs: '${offline_vm_private_ip}'" + return 1 + fi # Persist to state state_set "cluster_name" "${cluster_name}" @@ -133,6 +141,9 @@ infra_deploy() { state_set "token_vm_name" "${token_vm_name}" state_set "token_vm_ip" "${token_vm_ip}" state_set "token_vm_private_ip" "${token_vm_private_ip}" + state_set "offline_vm_name" "${offline_vm_name}" + state_set "offline_vm_ip" "${offline_vm_ip}" + state_set "offline_vm_private_ip" "${offline_vm_private_ip}" state_set "kubeadm_vm_name" "${kubeadm_vm_name}" state_set "kubeadm_vm_ip" "${kubeadm_vm_ip}" state_set "admin_username" "${admin_username}" @@ -145,6 +156,7 @@ infra_deploy() { log_info "Cluster: ${cluster_name} (${cluster_id})" log_info "MSI VM: ${msi_vm_name} @ ${msi_vm_ip}" log_info "Token VM: ${token_vm_name} @ ${token_vm_ip}" + log_info "Offline VM: ${offline_vm_name} @ ${offline_vm_ip}" log_info "Kubeadm VM: ${kubeadm_vm_name} @ ${kubeadm_vm_ip}" # Get kubeconfig and extract cluster info @@ -156,12 +168,15 @@ infra_deploy() { local pid_msi=$! wait_for_ssh "${token_vm_ip}" & local pid_token=$! + wait_for_ssh "${offline_vm_ip}" & + local pid_offline=$! wait_for_ssh "${kubeadm_vm_ip}" & local pid_kubeadm=$! 
local ssh_failed=0 wait "${pid_msi}" || ssh_failed=1 wait "${pid_token}" || ssh_failed=1 + wait "${pid_offline}" || ssh_failed=1 wait "${pid_kubeadm}" || ssh_failed=1 if [[ "${ssh_failed}" -eq 1 ]]; then diff --git a/hack/e2e/lib/node-join-kubeadm.sh b/hack/e2e/lib/node-join-kubeadm.sh index 1e14c2a1..75192776 100644 --- a/hack/e2e/lib/node-join-kubeadm.sh +++ b/hack/e2e/lib/node-join-kubeadm.sh @@ -299,12 +299,12 @@ node_join_kubeadm() { location="$(state_get location)" # Step 1: Ensure RBAC / ConfigMaps and create a bootstrap token - _kubeadm_ensure_rbac "${server_url}" "${ca_cert_data}" + with_cluster_lock _kubeadm_ensure_rbac "${server_url}" "${ca_cert_data}" ensure_daemon_csr_approver log_info "Creating bootstrap token..." local bootstrap_token - bootstrap_token="$(_kubeadm_create_bootstrap_token)" + bootstrap_token="$(with_cluster_lock _kubeadm_create_bootstrap_token)" state_set "kubeadm_bootstrap_token" "${bootstrap_token}" # Step 2: Generate the config file for aks-flex-node agent diff --git a/hack/e2e/lib/node-join-offline.sh b/hack/e2e/lib/node-join-offline.sh new file mode 100644 index 00000000..13c3a923 --- /dev/null +++ b/hack/e2e/lib/node-join-offline.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +# ============================================================================= +# hack/e2e/lib/node-join-offline.sh - Join / unjoin an AKS flex node using +# bootstrap token auth with offline assets +# ============================================================================= +set -euo pipefail + +[[ -n "${_E2E_NODE_JOIN_OFFLINE_LOADED:-}" ]] && return 0 +readonly _E2E_NODE_JOIN_OFFLINE_LOADED=1 + +# shellcheck disable=SC1091 +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +readonly offlineArtifactsSource='oci://ghcr.io/azure/unbounded/bootstrap-artifacts:alpha-0cd4fe2-k8s-{{ .KubernetesVersion }}' +readonly offlineOCIImage='ghcr.io/azure/agent-ubuntu2404:v20260619' + +node_join_offline() { + log_section "Joining Offline Artifacts Node" + local start + 
start=$(timer_start) + + local vm_ip + vm_ip="$(state_get offline_vm_ip)" + local vm_private_ip + vm_private_ip="$(state_get offline_vm_private_ip)" + local cluster_name + cluster_name="$(state_get cluster_name)" + local resource_group + resource_group="$(state_get resource_group)" + local subscription_id + subscription_id="$(state_get subscription_id)" + + if [[ -z "${vm_private_ip}" ]] || ! is_valid_ipv4 "${vm_private_ip}"; then + log_error "Invalid offline VM private IP in state: '${vm_private_ip}'" + return 1 + fi + + log_info "Setting up bootstrap token RBAC resources..." + with_cluster_lock "${REPO_ROOT}/scripts/aks-flex-config" setup-node-rbac \ + --resource-group "${resource_group}" \ + --cluster-name "${cluster_name}" \ + --subscription "${subscription_id}" + + ensure_daemon_csr_approver + + log_info "Generating offline artifacts config..." + local config_file="${E2E_WORK_DIR}/config-offline.json" + with_cluster_lock "${REPO_ROOT}/scripts/aks-flex-config" generate-node-config \ + --resource-group "${resource_group}" \ + --cluster-name "${cluster_name}" \ + --subscription "${subscription_id}" \ + --agent-pool-name "${E2E_TARGET_AGENT_POOL_NAME}" \ + --bootstrap-token \ + --output "${config_file}" + + jq \ + --arg nodeIP "${vm_private_ip}" \ + --arg kubernetesVersion "${E2E_KUBERNETES_VERSION}" \ + --arg offlineArtifactsSource "${offlineArtifactsSource}" \ + --arg ociImage "${offlineOCIImage}" \ + '.agent.logLevel = "debug" + | .agent.e2eMode = true + | .node.kubelet.nodeIP = $nodeIP + | .components = (.components // {}) + | .components.kubernetes = $kubernetesVersion + | del(.components.containerd, .components.runc, .networking.cniVersion) + | .bootstrap = (.bootstrap // {}) + | .bootstrap.ociImage = $ociImage + | .bootstrap.offlineArtifacts.source = $offlineArtifactsSource + | del(.kubernetes, .containerd, .runc)' \ + "${config_file}" > "${config_file}.tmp" + mv "${config_file}.tmp" "${config_file}" + + jq -e \ + --arg offlineArtifactsSource 
"${offlineArtifactsSource}" \ + --arg ociImage "${offlineOCIImage}" \ + '.bootstrap.ociImage == $ociImage and .bootstrap.offlineArtifacts.source == $offlineArtifactsSource' \ + "${config_file}" >/dev/null + log_info "Offline node artifact source: ${offlineArtifactsSource}" + log_info "Offline node OCI image: ${offlineOCIImage}" + + _deploy_and_start_agent "${vm_ip}" "${config_file}" "aks-flex-node-offline" + + log_success "Offline artifacts node joined in $(timer_elapsed "${start}")s" +} + +node_unjoin_offline() { + log_section "Unjoining Offline Artifacts Node" + local start + start=$(timer_start) + + local vm_ip vm_name + vm_ip="$(state_get offline_vm_ip)" + vm_name="$(state_get offline_vm_name)" + + _rp_delete_unjoin_node "${vm_ip}" "${vm_name}" + + log_success "Offline artifacts node unjoined in $(timer_elapsed "${start}")s" +} diff --git a/hack/e2e/lib/node-join-token.sh b/hack/e2e/lib/node-join-token.sh index e925f9c0..c828c4a8 100644 --- a/hack/e2e/lib/node-join-token.sh +++ b/hack/e2e/lib/node-join-token.sh @@ -40,7 +40,7 @@ node_join_token() { fi log_info "Setting up bootstrap token RBAC resources..." - "${REPO_ROOT}/scripts/aks-flex-config" setup-node-rbac \ + with_cluster_lock "${REPO_ROOT}/scripts/aks-flex-config" setup-node-rbac \ --resource-group "${resource_group}" \ --cluster-name "${cluster_name}" \ --subscription "${subscription_id}" @@ -49,7 +49,7 @@ node_join_token() { log_info "Generating token config..." 
local config_file="${E2E_WORK_DIR}/config-token.json" - "${REPO_ROOT}/scripts/aks-flex-config" generate-node-config \ + with_cluster_lock "${REPO_ROOT}/scripts/aks-flex-config" generate-node-config \ --resource-group "${resource_group}" \ --cluster-name "${cluster_name}" \ --subscription "${subscription_id}" \ diff --git a/hack/e2e/lib/node-join.sh b/hack/e2e/lib/node-join.sh index de526b9c..46b3eee3 100755 --- a/hack/e2e/lib/node-join.sh +++ b/hack/e2e/lib/node-join.sh @@ -5,10 +5,11 @@ # Sources: # node-join-msi.sh - MSI auth node join/unjoin (node_join_msi, node_unjoin_msi) # node-join-token.sh - Bootstrap token join/unjoin (node_join_token, node_unjoin_token) +# node-join-offline.sh - Offline artifact join/unjoin (node_join_offline, node_unjoin_offline) # node-join-kubeadm.sh - Kubeadm apply -f join/unjoin (node_join_kubeadm, node_unjoin_kubeadm) # # Functions: -# node_join_all - Join all nodes (MSI, token, and kubeadm) in parallel +# node_join_all - Join all nodes (MSI, token, offline, and kubeadm) in parallel # node_unjoin_all - Unjoin all nodes in parallel # ============================================================================= set -euo pipefail @@ -55,6 +56,29 @@ sudo tee /run/aks-flex-node/e2e-machine.json >/dev/null </dev/null 2>&1; then + echo "Installing host packages required by preflight..." + sudo DEBIAN_FRONTEND=noninteractive apt-get update + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y systemd-container curl nftables util-linux +fi + +preflight_log="/tmp/aks-flex-node-preflight.log" +echo "Running preflight checks before bootstrap..." +set +e +{ + echo "=== preflight ${UNIT_NAME} $(date -Is) ===" + sudo /usr/local/bin/aks-flex-node preflight --config /etc/aks-flex-node/config.json --output text + preflight_rc=$? 
+ echo "=== preflight ${UNIT_NAME} exit ${preflight_rc} ===" + exit "${preflight_rc}" +} 2>&1 | sudo tee -a "${preflight_log}" +preflight_rc=${PIPESTATUS[0]} +set -e +if (( preflight_rc != 0 )); then + echo "Preflight checks failed with exit code ${preflight_rc}" + exit "${preflight_rc}" +fi + # Clean up any leftover transient unit from a previous run sudo systemctl stop "${UNIT_NAME}" 2>/dev/null || true sudo systemctl reset-failed "${UNIT_NAME}" 2>/dev/null || true @@ -311,6 +335,8 @@ source "$(dirname "${BASH_SOURCE[0]}")/node-join-msi.sh" # shellcheck disable=SC1091 source "$(dirname "${BASH_SOURCE[0]}")/node-join-token.sh" # shellcheck disable=SC1091 +source "$(dirname "${BASH_SOURCE[0]}")/node-join-offline.sh" +# shellcheck disable=SC1091 source "$(dirname "${BASH_SOURCE[0]}")/node-join-kubeadm.sh" # --------------------------------------------------------------------------- @@ -321,8 +347,8 @@ node_join_all() { local start start=$(timer_start) - local msi_pid token_pid kubeadm_pid - local msi_exit=0 token_exit=0 kubeadm_exit=0 + local msi_pid token_pid offline_pid kubeadm_pid + local msi_exit=0 token_exit=0 offline_exit=0 kubeadm_exit=0 ensure_daemon_csr_approver @@ -332,11 +358,15 @@ node_join_all() { node_join_token & token_pid=$! + node_join_offline & + offline_pid=$! + node_join_kubeadm & kubeadm_pid=$! wait "${msi_pid}" || msi_exit=$? wait "${token_pid}" || token_exit=$? + wait "${offline_pid}" || offline_exit=$? wait "${kubeadm_pid}" || kubeadm_exit=$? 
local duration @@ -348,11 +378,14 @@ node_join_all() { if [[ "${token_exit}" -ne 0 ]]; then log_error "Token node join failed (exit ${token_exit})" fi + if [[ "${offline_exit}" -ne 0 ]]; then + log_error "Offline artifacts node join failed (exit ${offline_exit})" + fi if [[ "${kubeadm_exit}" -ne 0 ]]; then log_error "Kubeadm node join failed (exit ${kubeadm_exit})" fi - if [[ "${msi_exit}" -ne 0 || "${token_exit}" -ne 0 || "${kubeadm_exit}" -ne 0 ]]; then + if [[ "${msi_exit}" -ne 0 || "${token_exit}" -ne 0 || "${offline_exit}" -ne 0 || "${kubeadm_exit}" -ne 0 ]]; then log_error "Node joins failed (${duration}s)" return 1 fi @@ -368,8 +401,8 @@ node_unjoin_all() { local start start=$(timer_start) - local msi_pid token_pid kubeadm_pid - local msi_exit=0 token_exit=0 kubeadm_exit=0 + local msi_pid token_pid offline_pid kubeadm_pid + local msi_exit=0 token_exit=0 offline_exit=0 kubeadm_exit=0 node_unjoin_msi & msi_pid=$! @@ -377,11 +410,15 @@ node_unjoin_all() { node_unjoin_token & token_pid=$! + node_unjoin_offline & + offline_pid=$! + node_unjoin_kubeadm & kubeadm_pid=$! wait "${msi_pid}" || msi_exit=$? wait "${token_pid}" || token_exit=$? + wait "${offline_pid}" || offline_exit=$? wait "${kubeadm_pid}" || kubeadm_exit=$? 
local duration @@ -393,11 +430,14 @@ node_unjoin_all() { if [[ "${token_exit}" -ne 0 ]]; then log_error "Token node unjoin failed (exit ${token_exit})" fi + if [[ "${offline_exit}" -ne 0 ]]; then + log_error "Offline artifacts node unjoin failed (exit ${offline_exit})" + fi if [[ "${kubeadm_exit}" -ne 0 ]]; then log_error "Kubeadm node unjoin failed (exit ${kubeadm_exit})" fi - if [[ "${msi_exit}" -ne 0 || "${token_exit}" -ne 0 || "${kubeadm_exit}" -ne 0 ]]; then + if [[ "${msi_exit}" -ne 0 || "${token_exit}" -ne 0 || "${offline_exit}" -ne 0 || "${kubeadm_exit}" -ne 0 ]]; then log_error "Node unjoins failed (${duration}s)" return 1 fi diff --git a/hack/e2e/lib/validate.sh b/hack/e2e/lib/validate.sh index 4b24edda..92eaa900 100755 --- a/hack/e2e/lib/validate.sh +++ b/hack/e2e/lib/validate.sh @@ -4,7 +4,7 @@ # # Functions: # validate_node_joined - Wait for a specific node to appear in kubectl -# validate_all_nodes - Verify MSI, token, and kubeadm nodes joined +# validate_all_nodes - Verify MSI, token, offline, and kubeadm nodes joined # validate_npd_status - Verify node-problem-detector is active # validate_node_absent - Wait for a node to disappear from kubectl # validate_all_nodes_absent - Verify all flex nodes are gone after unjoin @@ -162,7 +162,7 @@ REMOTE } # --------------------------------------------------------------------------- -# validate_all_nodes - Check all MSI, token, and kubeadm VMs joined +# validate_all_nodes - Check all MSI, token, offline, and kubeadm VMs joined # --------------------------------------------------------------------------- validate_all_nodes() { log_section "Validating Node Join" @@ -178,24 +178,30 @@ validate_all_nodes() { --overwrite-existing \ --admin - local msi_vm_name token_vm_name kubeadm_vm_name - local msi_vm_ip token_vm_ip kubeadm_vm_ip - local token_vm_private_ip + local msi_vm_name token_vm_name offline_vm_name kubeadm_vm_name + local msi_vm_ip token_vm_ip offline_vm_ip kubeadm_vm_ip + local token_vm_private_ip 
offline_vm_private_ip msi_vm_name="$(state_get msi_vm_name)" token_vm_name="$(state_get token_vm_name)" + offline_vm_name="$(state_get offline_vm_name)" kubeadm_vm_name="$(state_get kubeadm_vm_name)" msi_vm_ip="$(state_get msi_vm_ip)" token_vm_ip="$(state_get token_vm_ip)" + offline_vm_ip="$(state_get offline_vm_ip)" kubeadm_vm_ip="$(state_get kubeadm_vm_ip)" token_vm_private_ip="$(state_get token_vm_private_ip)" + offline_vm_private_ip="$(state_get offline_vm_private_ip)" local failed=0 validate_node_joined "${msi_vm_name}" || failed=1 validate_node_joined "${token_vm_name}" || failed=1 + validate_node_joined "${offline_vm_name}" || failed=1 validate_node_joined "${kubeadm_vm_name}" || failed=1 validate_node_ip "${token_vm_name}" "${token_vm_private_ip}" || failed=1 + validate_node_ip "${offline_vm_name}" "${offline_vm_private_ip}" || failed=1 validate_npd_status "${msi_vm_name}" "${msi_vm_ip}" || failed=1 validate_npd_status "${token_vm_name}" "${token_vm_ip}" || failed=1 + validate_npd_status "${offline_vm_name}" "${offline_vm_ip}" || failed=1 validate_npd_status "${kubeadm_vm_name}" "${kubeadm_vm_ip}" || failed=1 if [[ "${failed}" -eq 1 ]]; then @@ -241,15 +247,17 @@ validate_node_absent() { validate_all_nodes_absent() { log_section "Validating Nodes Absent After Unjoin" - local msi_vm_name token_vm_name kubeadm_vm_name + local msi_vm_name token_vm_name offline_vm_name kubeadm_vm_name msi_vm_name="$(state_get msi_vm_name)" token_vm_name="$(state_get token_vm_name)" + offline_vm_name="$(state_get offline_vm_name)" kubeadm_vm_name="$(state_get kubeadm_vm_name)" local failed=0 # TODO: MSI validation skipped until credential plugin auth is supported log_info "Skipping MSI node absence validation (credential plugin auth not yet supported)" validate_node_absent "${token_vm_name}" || failed=1 + validate_node_absent "${offline_vm_name}" || failed=1 validate_node_absent "${kubeadm_vm_name}" || failed=1 if [[ "${failed}" -eq 1 ]]; then @@ -319,9 +327,10 @@ EOF 
smoke_test_all() { log_section "Running Smoke Tests" - local msi_vm_name token_vm_name kubeadm_vm_name + local msi_vm_name token_vm_name offline_vm_name kubeadm_vm_name msi_vm_name="$(state_get msi_vm_name)" token_vm_name="$(state_get token_vm_name)" + offline_vm_name="$(state_get offline_vm_name)" kubeadm_vm_name="$(state_get kubeadm_vm_name)" # A default bridge CNI config (99-bridge.conf) is written during bootstrap, @@ -329,6 +338,7 @@ smoke_test_all() { local failed=0 smoke_test "${msi_vm_name}" "msi" || failed=1 smoke_test "${token_vm_name}" "token" || failed=1 + smoke_test "${offline_vm_name}" "offline" || failed=1 smoke_test "${kubeadm_vm_name}" "kubeadm" || failed=1 if [[ "${failed}" -eq 1 ]]; then diff --git a/hack/e2e/run.sh b/hack/e2e/run.sh index 3a6d93f0..9b8fdd63 100755 --- a/hack/e2e/run.sh +++ b/hack/e2e/run.sh @@ -8,14 +8,16 @@ # Commands: # all Run the full E2E flow (default): build, infra, join, validate, # unjoin, validate-absent, rejoin, validate, cleanup -# infra Deploy infrastructure only (Bicep: AKS + 3 VMs) +# infra Deploy infrastructure only (Bicep: AKS + 4 VMs) # join Join all nodes to the cluster (requires prior infra) # join-msi Join only the MSI node # join-token Join only the token node +# join-offline Join only the offline artifacts node # join-kubeadm Join only the kubeadm node (apply -f with KubeadmNodeJoin) # unjoin Unjoin all nodes from the cluster # unjoin-msi Unjoin only the MSI node # unjoin-token Unjoin only the token node +# unjoin-offline Unjoin only the offline artifacts node # unjoin-kubeadm Reset the kubeadm node and remove it from the cluster # validate Verify nodes joined + run smoke tests # validate-absent Verify all flex nodes are gone after unjoin @@ -119,7 +121,7 @@ usage() { parse_args() { while [[ $# -gt 0 ]]; do case "$1" in - all|infra|join|join-msi|join-token|join-kubeadm|unjoin|unjoin-msi|unjoin-token|unjoin-kubeadm|validate|validate-absent|smoke|upgrade-drift|logs|cleanup|runner-cleanup|status) + 
all|infra|join|join-msi|join-token|join-offline|join-kubeadm|unjoin|unjoin-msi|unjoin-token|unjoin-offline|unjoin-kubeadm|validate|validate-absent|smoke|upgrade-drift|logs|cleanup|runner-cleanup|status) COMMAND="$1"; shift ;; -g|--resource-group) export E2E_RESOURCE_GROUP="$2"; shift 2 ;; -l|--location) export E2E_LOCATION="$2"; shift 2 ;; @@ -255,6 +257,10 @@ main() { ensure_binary node_join_token ;; + join-offline) + ensure_binary + node_join_offline + ;; join-kubeadm) ensure_binary node_join_kubeadm @@ -268,6 +274,9 @@ main() { unjoin-token) node_unjoin_token ;; + unjoin-offline) + node_unjoin_offline + ;; unjoin-kubeadm) node_unjoin_kubeadm ;; diff --git a/pkg/cmd/preflight/preflight.go b/pkg/cmd/preflight/preflight.go new file mode 100644 index 00000000..e6f47013 --- /dev/null +++ b/pkg/cmd/preflight/preflight.go @@ -0,0 +1,148 @@ +package preflight + +import ( + "context" + "encoding/json" + "fmt" + "io" + "os" + "strings" + + "github.com/spf13/cobra" + + "github.com/Azure/AKSFlexNode/pkg/config" + "github.com/Azure/AKSFlexNode/pkg/logger" + "github.com/Azure/unbounded/pkg/agent/goalstates" + "github.com/Azure/unbounded/pkg/agent/phases/host" + "github.com/Azure/unbounded/pkg/agent/phases/nodestart" + "github.com/Azure/unbounded/pkg/agent/phases/rootfs" + "github.com/Azure/unbounded/pkg/agent/preflight" +) + +type handler struct { + configPath string + ignorePreflightErrors []string + failOnWarnings bool + output string + writer io.Writer +} + +// NewCommand returns the preflight command. 
+func NewCommand() *cobra.Command { + h := &handler{writer: os.Stdout} + + cmd := &cobra.Command{ + Use: "preflight", + Short: "Run non-mutating preflight checks", + Long: "Run non-mutating preflight checks for the host and AKS Flex Node configuration before node bootstrap.", + RunE: func(cmd *cobra.Command, args []string) error { + return h.execute(cmd.Context()) + }, + } + + cmd.Flags().StringVar(&h.configPath, "config", "", "Path to configuration JSON file (required)") + _ = cmd.MarkFlagRequired("config") + cmd.Flags().StringSliceVar( + &h.ignorePreflightErrors, + "ignore-preflight-errors", + nil, + "Comma-separated preflight check names whose errors should be reported as warnings", + ) + cmd.Flags().BoolVar(&h.failOnWarnings, "fail-on-warnings", false, "Fail when any preflight warning is returned") + cmd.Flags().StringVar(&h.output, "output", "text", "Output format: text or json") + + return cmd +} + +func (h *handler) execute(ctx context.Context) error { + cfg, err := config.LoadConfig(h.configPath) + if err != nil { + return fmt.Errorf("failed to load config from %s: %w", h.configPath, err) + } + log := logger.CreateLogger(cfg.Agent.LogLevel, cfg.Agent.LogDir) + + agentCfg := config.ToAgentConfig(cfg, goalstates.NSpawnMachineKube1) + gs, err := goalstates.ResolveMachine(log, agentCfg, goalstates.NSpawnMachineKube1, nil) + if err != nil { + return fmt.Errorf("preflight failed to resolve goal state: %w", err) + } + + checks := preflight.Flatten( + host.Preflight(log, *agentCfg, gs), + nodestart.Preflight(log, *agentCfg, gs), + rootfs.Preflight(log, *agentCfg, gs), + ) + + report := preflight.Run(ctx, checks, preflight.Options{ + IgnoreErrors: h.ignorePreflightErrors, + FailOnWarnings: h.failOnWarnings, + }) + + switch strings.ToLower(h.output) { + case "", "text": + if err := writeText(h.writer, report); err != nil { + return err + } + case "json": + enc := json.NewEncoder(h.writer) + enc.SetIndent("", " ") + if err := enc.Encode(report); err != nil { + return 
err + } + default: + return fmt.Errorf("unsupported output format %q", h.output) + } + + return report.Err(h.failOnWarnings) +} + +func writeText(w io.Writer, report preflight.Report) error { + if _, err := fmt.Fprintln(w, "[preflight] Running AKS Flex Node preflight checks"); err != nil { + return err + } + + var errors []preflight.Result + for _, result := range report.Checks { + switch result.Severity { + case preflight.SeverityOK: + if err := writeResult(w, "OK", result); err != nil { + return err + } + case preflight.SeverityError: + errors = append(errors, result) + case preflight.SeverityWarning: + if err := writeResult(w, "WARNING", result); err != nil { + return err + } + } + } + + if len(errors) == 0 { + return nil + } + + if _, err := fmt.Fprintln(w, "[preflight] Some fatal errors occurred:"); err != nil { + return err + } + for _, result := range errors { + if err := writeResult(w, "ERROR", result); err != nil { + return err + } + } + + _, err := fmt.Fprintln(w, "[preflight] If you know what you are doing, you can make a check non-fatal with `--ignore-preflight-errors=...`") + return err +} + +func writeResult(w io.Writer, status string, result preflight.Result) error { + if _, err := fmt.Fprintf(w, "\t[%s %s]: %s", status, result.Name, result.Message); err != nil { + return err + } + if result.Target != "" { + if _, err := fmt.Fprintf(w, " (target: %s)", result.Target); err != nil { + return err + } + } + _, err := fmt.Fprintln(w) + return err +} diff --git a/pkg/cmd/preflight/preflight_test.go b/pkg/cmd/preflight/preflight_test.go new file mode 100644 index 00000000..0798439a --- /dev/null +++ b/pkg/cmd/preflight/preflight_test.go @@ -0,0 +1,53 @@ +package preflight + +import ( + "bytes" + "strings" + "testing" + + "github.com/Azure/unbounded/pkg/agent/preflight" +) + +func TestNewCommand(t *testing.T) { + t.Parallel() + + cmd := NewCommand() + if cmd.Use != "preflight" { + t.Fatalf("Use = %q, want preflight", cmd.Use) + } + + for _, flag := range 
[]string{"config", "ignore-preflight-errors", "fail-on-warnings", "output"} { + if cmd.Flags().Lookup(flag) == nil { + t.Fatalf("expected flag %q", flag) + } + } +} + +func TestWriteText(t *testing.T) { + t.Parallel() + + report := preflight.Report{ + Checks: []preflight.Result{ + preflight.OK("ok-check", "ok target", "all good"), + preflight.Warning("warn-check", "warn target", "be careful"), + preflight.Error("error-check", "error target", "bad thing"), + }, + } + + var out bytes.Buffer + if err := writeText(&out, report); err != nil { + t.Fatalf("writeText() error = %v", err) + } + + got := out.String() + for _, want := range []string{ + "[preflight] Running AKS Flex Node preflight checks", + "[OK ok-check]: all good (target: ok target)", + "[WARNING warn-check]: be careful (target: warn target)", + "[ERROR error-check]: bad thing (target: error target)", + } { + if !strings.Contains(got, want) { + t.Fatalf("writeText() output missing %q\n%s", want, got) + } + } +} diff --git a/pkg/config/adapter.go b/pkg/config/adapter.go index 9330a7e3..d3fb1869 100644 --- a/pkg/config/adapter.go +++ b/pkg/config/adapter.go @@ -24,9 +24,7 @@ func ToAgentConfig(cfg *Config, machineName string) *agentconfig.AgentConfig { ac := &agentconfig.AgentConfig{ MachineName: machineName, NodeName: cfg.Agent.NodeName, - // TODO: implement support for overriding rootfs image from flex node config. - // Using empty string here means the agent will detect and use the default image. 
- // OCIImage: "", + OCIImage: cfg.Bootstrap.OCIImage, Cluster: agentconfig.AgentClusterConfig{ CaCertBase64: cfg.Node.Kubelet.CACertData, ClusterDNS: cfg.Networking.DNSServiceIP, @@ -40,7 +38,8 @@ func ToAgentConfig(cfg *Config, machineName string) *agentconfig.AgentConfig { }, CRI: agentconfig.CRIConfig{ Containerd: agentconfig.ContainerdConfig{ - Version: cfg.Components.Containerd, + Version: cfg.Components.Containerd, + SandboxImage: cfg.Components.SandboxImage, }, Runc: agentconfig.RuncConfig{ Version: cfg.Components.Runc, @@ -51,6 +50,12 @@ func ToAgentConfig(cfg *Config, machineName string) *agentconfig.AgentConfig { }, } + if cfg.Bootstrap.OfflineArtifacts.Source != "" { + ac.OfflineArtifacts = &agentconfig.AgentOfflineArtifacts{ + Source: cfg.Bootstrap.OfflineArtifacts.Source, + } + } + switch { case cfg.IsBootstrapTokenConfigured(): ac.Kubelet.Auth.BootstrapToken = cfg.Azure.BootstrapToken.Token diff --git a/pkg/config/adapter_test.go b/pkg/config/adapter_test.go index e48d4213..e7eb7aad 100644 --- a/pkg/config/adapter_test.go +++ b/pkg/config/adapter_test.go @@ -221,9 +221,13 @@ func TestToAgentConfig_CRICNIVersions(t *testing.T) { BootstrapToken: &BootstrapTokenConfig{Token: "tok"}, }, Components: ComponentsConfig{ - Kubernetes: "1.30.0", - Containerd: "2.1.0", - Runc: "1.2.0", + Kubernetes: "1.30.0", + Containerd: "2.1.0", + Runc: "1.2.0", + SandboxImage: "registry.example.test/pause:3.9", + }, + Bootstrap: BootstrapConfig{ + OCIImage: "registry.example.test/flex/rootfs:ubuntu-24.04", }, Networking: NetworkingConfig{ DNSServiceIP: "10.0.0.10", @@ -242,6 +246,12 @@ func TestToAgentConfig_CRICNIVersions(t *testing.T) { if ac.CRI.Containerd.Version != "2.1.0" { t.Fatalf("CRI.Containerd.Version=%q, want %q", ac.CRI.Containerd.Version, "2.1.0") } + if ac.CRI.Containerd.SandboxImage != "registry.example.test/pause:3.9" { + t.Fatalf("CRI.Containerd.SandboxImage=%q, want registry.example.test/pause:3.9", ac.CRI.Containerd.SandboxImage) + } + if ac.OCIImage != 
"registry.example.test/flex/rootfs:ubuntu-24.04" { + t.Fatalf("OCIImage=%q, want registry.example.test/flex/rootfs:ubuntu-24.04", ac.OCIImage) + } if ac.CRI.Runc.Version != "1.2.0" { t.Fatalf("CRI.Runc.Version=%q, want %q", ac.CRI.Runc.Version, "1.2.0") } @@ -250,6 +260,24 @@ func TestToAgentConfig_CRICNIVersions(t *testing.T) { } } +func TestToAgentConfig_OfflineArtifacts(t *testing.T) { + t.Parallel() + + cfg := &Config{ + Bootstrap: BootstrapConfig{ + OfflineArtifacts: OfflineArtifactsConfig{Source: "/opt/artifacts/{{ .KubernetesVersion }}"}, + }, + } + + ac := ToAgentConfig(cfg, "kube1") + if ac.OfflineArtifacts == nil { + t.Fatal("OfflineArtifacts is nil") + } + if ac.OfflineArtifacts.Source != "/opt/artifacts/{{ .KubernetesVersion }}" { + t.Fatalf("OfflineArtifacts.Source=%q", ac.OfflineArtifacts.Source) + } +} + func TestToAgentConfig_CRICNIVersionsEmpty(t *testing.T) { t.Parallel() @@ -274,6 +302,12 @@ func TestToAgentConfig_CRICNIVersionsEmpty(t *testing.T) { if ac.CRI.Containerd.Version != "" { t.Fatalf("CRI.Containerd.Version=%q, want empty", ac.CRI.Containerd.Version) } + if ac.CRI.Containerd.SandboxImage != "" { + t.Fatalf("CRI.Containerd.SandboxImage=%q, want empty", ac.CRI.Containerd.SandboxImage) + } + if ac.OCIImage != "" { + t.Fatalf("OCIImage=%q, want empty", ac.OCIImage) + } if ac.CRI.Runc.Version != "" { t.Fatalf("CRI.Runc.Version=%q, want empty", ac.CRI.Runc.Version) } diff --git a/pkg/config/config.go b/pkg/config/config.go index a64483b0..82b9766f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -45,6 +45,7 @@ type Config struct { Azure AzureConfig `json:"azure"` Agent AgentConfig `json:"agent"` Components ComponentsConfig `json:"components"` + Bootstrap BootstrapConfig `json:"bootstrap"` Networking NetworkingConfig `json:"networking"` Node NodeConfig `json:"node"` Npd NPDConfig `json:"npd"` @@ -161,9 +162,32 @@ type AgentConfig struct { // ComponentsConfig is the AKS RP component version contract used by the agent // at 
runtime. type ComponentsConfig struct { - Kubernetes string `json:"kubernetes,omitempty"` - Containerd string `json:"containerd,omitempty"` - Runc string `json:"runc,omitempty"` + Kubernetes string `json:"kubernetes,omitempty"` + Containerd string `json:"containerd,omitempty"` + Runc string `json:"runc,omitempty"` + SandboxImage string `json:"sandboxImage,omitempty"` +} + +// BootstrapConfig holds bootstrap settings that are not Kubernetes component +// versions. +type BootstrapConfig struct { + // OCIImage is the nspawn rootfs OCI image used by the shared agent library. + // When empty, the agent uses its built-in default image selection. + OCIImage string `json:"ociImage,omitempty"` + + // OfflineArtifacts points at a complete offline binary artifact source. + // When source is set, bootstrap resolves Kubernetes, containerd, runc, CNI, + // crictl, and optional sandbox image archive artifacts from this source. + OfflineArtifacts OfflineArtifactsConfig `json:"offlineArtifacts,omitempty"` +} + +// OfflineArtifactsConfig mirrors Unbounded's OfflineArtifacts bootstrap +// setting in the AKS Flex public config shape. +type OfflineArtifactsConfig struct { + // Source is a Go template string that resolves to an absolute filesystem + // path, file:// URL, or oci:// artifact reference. The template may use + // .KubernetesVersion and .KubernetesVersionNoV. + Source string `json:"source,omitempty"` } // NodeConfig holds configuration settings for the Kubernetes node. @@ -374,7 +398,11 @@ func (c *Config) setNodeDefaults() { } func (c *Config) setRuncDefaults() { - // Set default runc configuration if not provided + // Offline artifact manifests are the source of truth for runtime versions. + // Do not synthesize a runc version that would conflict with the manifest. 
+ if c.Bootstrap.OfflineArtifacts.Source != "" { + return + } if c.Components.Runc == "" { c.Components.Runc = "1.1.12" } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index cff4b8d1..bcfd0ad2 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -96,6 +96,15 @@ func TestSetDefaults(t *testing.T) { c.Node.Kubelet.ImageGCLowThreshold == 80 }, }, + { + name: "offline artifacts skip runc default", + config: &Config{Bootstrap: BootstrapConfig{ + OfflineArtifacts: OfflineArtifactsConfig{Source: "/opt/artifacts/{{ .KubernetesVersion }}"}, + }}, + want: func(c *Config) bool { + return c.Components.Runc == "" + }, + }, { name: "machine operation mode can be disabled", config: &Config{Agent: AgentConfig{MachineOperationMode: "disable"}}, @@ -736,7 +745,11 @@ func TestLoadConfigPoolBootstrapData(t *testing.T) { "components": { "kubernetes": "1.29.0", "containerd": "2.0.5", - "runc": "1.2.3" + "runc": "1.2.3", + "sandboxImage": "registry.example.test/pause:3.9" + }, + "bootstrap": { + "ociImage": "registry.example.test/flex/rootfs:ubuntu-24.04" }, "networking": { "dnsServiceIP": "10.42.0.10", @@ -799,6 +812,12 @@ func TestLoadConfigPoolBootstrapData(t *testing.T) { if agentCfg.CRI.Containerd.Version != "2.0.5" { t.Fatalf("Agent CRI.Containerd.Version = %q, want 2.0.5", agentCfg.CRI.Containerd.Version) } + if agentCfg.CRI.Containerd.SandboxImage != "registry.example.test/pause:3.9" { + t.Fatalf("Agent CRI.Containerd.SandboxImage = %q, want registry.example.test/pause:3.9", agentCfg.CRI.Containerd.SandboxImage) + } + if agentCfg.OCIImage != "registry.example.test/flex/rootfs:ubuntu-24.04" { + t.Fatalf("Agent OCIImage = %q, want registry.example.test/flex/rootfs:ubuntu-24.04", agentCfg.OCIImage) + } if agentCfg.CRI.Runc.Version != "1.2.3" { t.Fatalf("Agent CRI.Runc.Version = %q, want 1.2.3", agentCfg.CRI.Runc.Version) } diff --git a/pkg/daemon/nodeoperator.go b/pkg/daemon/nodeoperator.go index 96f13723..6f1f16d8 100644 --- 
a/pkg/daemon/nodeoperator.go +++ b/pkg/daemon/nodeoperator.go @@ -7,7 +7,6 @@ import ( "github.com/Azure/AKSFlexNode/pkg/aksmachine" "github.com/Azure/AKSFlexNode/pkg/config" - "github.com/Azure/AKSFlexNode/pkg/npd" "github.com/Azure/unbounded/pkg/agent/goalstates" "github.com/Azure/unbounded/pkg/agent/phases" "github.com/Azure/unbounded/pkg/agent/phases/nodestart" @@ -52,7 +51,6 @@ func (o *nspawnNodeOperator) RestartNode(ctx context.Context, log *slog.Logger) nodestop.StopNode(log, active.Name), nodestart.StartNode(log, gs.NodeStart), nodestart.WaitForKubelet(log, active.Name), - npd.Start(log, gs.NodeStart), ).Do(ctx) } diff --git a/pkg/daemon/start.go b/pkg/daemon/start.go index 916fb12a..f40c9fe2 100644 --- a/pkg/daemon/start.go +++ b/pkg/daemon/start.go @@ -7,7 +7,6 @@ import ( "github.com/Azure/AKSFlexNode/pkg/cni" "github.com/Azure/AKSFlexNode/pkg/config" "github.com/Azure/AKSFlexNode/pkg/hostrouting" - "github.com/Azure/AKSFlexNode/pkg/npd" "github.com/Azure/unbounded/pkg/agent/goalstates" "github.com/Azure/unbounded/pkg/agent/phases" "github.com/Azure/unbounded/pkg/agent/phases/host" @@ -41,13 +40,11 @@ func StartNode( return phases.Serial(log, rootfs.Provision(log, gs.RootFS), phases.Parallel(log, - npd.Download(cfg, gs.RootFS.MachineDir), InstallBinary(gs.RootFS.MachineDir), cni.WriteCNIConfig(gs.RootFS.MachineDir), ), nodestart.StartNode(log, gs.NodeStart), nodestart.WaitForKubelet(log, machineName), - npd.Start(log, gs.NodeStart), saveState(store, state), ) }