From c3619656427dab152322057712c21894f68549f6 Mon Sep 17 00:00:00 2001 From: Michael Heller <21163552+mdheller@users.noreply.github.com> Date: Tue, 16 Jun 2026 15:03:58 -0400 Subject: [PATCH] =?UTF-8?q?fix(enroll):=20eighth=20audit=20=E2=80=94=20ato?= =?UTF-8?q?mic=20age-keygen,=20sops=20validation,=20step-12=20depth?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - gen_password: validate output >= 16 chars; tr -d can produce a short result on low-probability urandom runs; guard catches it before the password is written to files - age-keygen atomic write: replace `age-keygen -o AGE_KEY` (truncate then write) with stdout capture to AGE_KEY.tmp then mv; interrupted write previously left a partial key that re-runs treated as present and then failed on age-keygen -y with a misleading "may be corrupt" message - age-keygen stale .tmp cleanup: rm -f AGE_KEY.tmp at step 3 start, consistent with HW_CONFIG.tmp, ENROLL_NIX.tmp, MINISIGN_CACHE_INFO.tmp - sops output non-empty check: [[ -s _SECRETS_TMP ]] before mv; sops exits 0 on some edge cases while producing empty output; empty ciphertext would silently replace secrets.yaml and loop on re-run - Remove redundant chmod 600 after mv: mv preserves permissions from _SECRETS_TMP (created 600 via umask 077); the post-mv chmod was a no-op that implied mv might change permissions - $(seq 1 N) → {1..N} brace expansion in both polling loops: eliminates subshell forks in tight 5s-interval loops (harmonia wait, syncd poll) - sops-nix-activate failed-state check in step 12: oneshot services stay in 'failed' not 'inactive' when they error; distinguishes decryption failure from "still starting", with targeted re-enroll instructions - Katello container liveness recheck in step 12: containers can be OOM-killed during nix build (steps 10-11); sourceos-syncd shows active but fails all Katello calls silently without this check - Quote ${KATELLO_ADMIN_PW_FILE} in banner cat call: unquoted variable is 
inconsistent with quoting elsewhere; safe in practice but fragile --- scripts/enroll.sh | 48 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/scripts/enroll.sh b/scripts/enroll.sh index de40638..024919d 100755 --- a/scripts/enroll.sh +++ b/scripts/enroll.sh @@ -78,7 +78,13 @@ wait_for_url() { ok "${label} is up" } -gen_password() { head -c 32 /dev/urandom | base64 | tr -d '/+=' | head -c 24; } +gen_password() { + local pw + pw=$(head -c 32 /dev/urandom | base64 | tr -d '/+=' | head -c 24) + # Paranoia: tr -d can strip enough chars to produce a short result. + [[ ${#pw} -ge 16 ]] || { echo "gen_password produced a short result (${#pw} chars) — retry" >&2; return 1; } + printf '%s' "${pw}" +} write_enroll_nix() { local signing_pub="$1" harmonia_pub_key="$2" @@ -112,7 +118,7 @@ banner() { printf " %-28s %s\n" "Signing public key:" "${MINISIGN_PUB}" printf " %-28s %s\n" "Harmonia cache key:" "${HARMONIA_PUB}" printf " %-28s %s\n" "Katello UI:" "${KATELLO_URL}" - printf " %-28s %s\n" "Katello password:" "$(cat ${KATELLO_ADMIN_PW_FILE} 2>/dev/null || echo '(see file)')" + printf " %-28s %s\n" "Katello password:" "$(cat "${KATELLO_ADMIN_PW_FILE}" 2>/dev/null || echo '(see file)')" echo printf " ${BOLD}Next steps:${NC}\n" printf " %-28s %s\n" "Daemon status:" "systemctl status sourceos-syncd" @@ -195,12 +201,18 @@ ok "Pass 1 complete ($(elapsed)) — log: ${PASS1_LOG}" step 3 "Device age key" +rm -f "${AGE_KEY}.tmp" # clean up stale temp from a previous interrupted run require_cmd age-keygen if [[ -f "${AGE_KEY}" ]]; then ok "Age key already exists" else - age-keygen -o "${AGE_KEY}" + # Capture stdout so the write is atomic: age-keygen -o FILE creates FILE, then + # writes, leaving an empty/partial file if interrupted. Stdout capture + + # mv means re-runs see no file (and regenerate) rather than a corrupt one.
+ age-keygen 2>/dev/null > "${AGE_KEY}.tmp" + [[ -s "${AGE_KEY}.tmp" ]] || die "age-keygen produced empty output — check age installation" + mv "${AGE_KEY}.tmp" "${AGE_KEY}" chmod 600 "${AGE_KEY}" ok "Generated ${AGE_KEY} ($(elapsed))" fi @@ -340,8 +352,10 @@ if [[ "${_needs_encrypt}" -eq 1 ]]; then chmod 600 "${_SECRETS_TMP}" trap "rm -f ${PLAINTEXT} ${_SECRETS_TMP}" EXIT SOPS_AGE_RECIPIENTS="${AGE_PUBKEY}" sops --encrypt "${PLAINTEXT}" > "${_SECRETS_TMP}" + [[ -s "${_SECRETS_TMP}" ]] || \ + die "sops --encrypt produced empty output — check AGE_PUBKEY format and sops version" mv "${_SECRETS_TMP}" "${SECRETS_YAML}" - chmod 600 "${SECRETS_YAML}" + # mv preserves permissions from _SECRETS_TMP (already 600 via umask 077). rm -f "${PLAINTEXT}"; trap - EXIT ok "Encrypted secrets written to ${SECRETS_YAML} ($(elapsed))" fi @@ -485,7 +499,7 @@ fi # repopulate the cache on its first sync cycle. info "Pushing NixOS closure to local harmonia cache (http://127.0.0.1:8101)..." _HARM_UP=0 -for _i in $(seq 1 12); do +for _i in {1..12}; do if curl -fsSk --max-time 5 "http://127.0.0.1:8101/nix-cache-info" &>/dev/null; then _HARM_UP=1; break fi @@ -543,7 +557,7 @@ step 12 "Verify enrollment" # slow path it can take longer. A fixed sleep either wastes time or races. HEALTHY=1 _SYNCD_UP=0 -for _i in $(seq 1 6); do +for _i in {1..6}; do if systemctl is-active --quiet sourceos-syncd 2>/dev/null; then _SYNCD_UP=1; break fi @@ -556,6 +570,28 @@ else HEALTHY=0 fi +# sops-nix-activate is a oneshot: it stays in 'failed' state (not 'inactive') +# when decryption fails. Distinguish "still starting" from "sops blew up". 
+if systemctl is-failed --quiet sops-nix-activate 2>/dev/null; then + warn "sops-nix-activate FAILED — secrets not decrypted; sourceos-syncd will not start" + warn " Diagnose: journalctl -u sops-nix-activate | tail -30" + warn " Common cause: age key mismatch — re-run: rm -f ${SECRETS_YAML} && sudo bash scripts/enroll.sh" + HEALTHY=0 +else + ok "sops-nix-activate: ok (secrets decrypted)" +fi + +# Verify Katello containers are still running after the long enrollment. +# They can be OOM-killed during nix build (steps 10–11) without the script noticing. +_KATELLO_FINAL=$(docker ps --filter "name=katello" --format "{{.Names}}" 2>/dev/null | wc -l | tr -d ' ') +if [[ "${_KATELLO_FINAL}" -ge 1 ]]; then + ok "Katello containers: ${_KATELLO_FINAL} running" +else + warn "No Katello containers running — they may have crashed during enrollment" + warn " Restart: docker-compose -f ${COMPOSE_FILE} up -d" + HEALTHY=0 +fi + if systemctl is-active --quiet harmonia 2>/dev/null; then ok "harmonia: active" else