From 5921c4abb9d81e2b5ad97dcec1a9010f9a909959 Mon Sep 17 00:00:00 2001 From: Michael Heller <21163552+mdheller@users.noreply.github.com> Date: Tue, 16 Jun 2026 15:09:32 -0400 Subject: [PATCH] =?UTF-8?q?fix(enroll):=20ninth=20audit=20=E2=80=94=20comp?= =?UTF-8?q?ose=20guards,=20sed=20verification,=20sops=20check,=20harmonia?= =?UTF-8?q?=20push,=20cache=20priority?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - COMPOSE_FILE existence check: validate the docker-compose YAML exists before `docker-compose up`; previously a missing file gave a generic "No such file" with no enrollment-specific context - sed substitution verification: grep-check that FOREMAN_ADMIN_PASSWORD and KATELLO_PG_PASSWORD keys exist in .env.example before substituting, then confirm the substitution landed; sed exits 0 silently on no-match, which would leave Foreman running with placeholder passwords - Replace python3 sops check with grep: `grep -q '^sops:'` is more accurate than substring search for 'sops' — a YAML comment containing the word would fool the old check; also drops the python3 dependency - nix build progress hint: print BUILD_LOG path before the build so operators know where to tail -f during the silent 5-15 min build; nixos-rebuild steps stream to terminal but nix build was fully silent - nix build head -1: pipe --print-out-paths through head -1 to guard against multi-output derivations; embedded newline in CLOSURE would cause [[ -e ]] and nix copy to fail with a confusing path error - Push active system to harmonia: when CURRENT_GEN differs from CLOSURE (nixos-rebuild fetched a cached derivation), push CURRENT_GEN first so the actually-running system is always in harmonia; previously only CLOSURE was pushed, leaving the active system absent from the cache - sops-nix-activate ground-truth check: test for /run/secrets/katello-password existence in addition to is-failed state; is-failed exits 1 (not-failed) when a unit doesn't exist, producing a false "ok" if sops-nix isn't configured or the service name changes - nix-cache-info Priority 30: local harmonia cache now has higher nix priority than cache.nixos.org (Priority 40); previously both were 40 and nix raced them, defeating the purpose of a local cache --- scripts/enroll.sh | 58 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/scripts/enroll.sh b/scripts/enroll.sh index 024919d..01b03a7 100755 --- a/scripts/enroll.sh +++ b/scripts/enroll.sh @@ -261,8 +261,18 @@ if [[ ! -f "${COMPOSE_ENV}" ]]; then # syscall. A plain `cp` followed by `chmod 600` leaves a window where the # file (containing plaintext passwords) is world-readable. install -m 600 "${COMPOSE_ENV_EXAMPLE}" "${COMPOSE_ENV}" + # Verify the expected keys exist before substituting — if the template + # changes to use different key names, sed exits 0 silently with no change + # and Foreman boots with the example placeholder passwords. + grep -q '^FOREMAN_ADMIN_PASSWORD=' "${COMPOSE_ENV}" || \ + die "FOREMAN_ADMIN_PASSWORD key missing from ${COMPOSE_ENV_EXAMPLE} — template may have changed" + grep -q '^KATELLO_PG_PASSWORD=' "${COMPOSE_ENV}" || \ + die "KATELLO_PG_PASSWORD key missing from ${COMPOSE_ENV_EXAMPLE} — template may have changed" sed -i "s|^FOREMAN_ADMIN_PASSWORD=.*|FOREMAN_ADMIN_PASSWORD=${FOREMAN_ADMIN_PASSWORD}|" "${COMPOSE_ENV}" sed -i "s|^KATELLO_PG_PASSWORD=.*|KATELLO_PG_PASSWORD=${KATELLO_PG_PASSWORD}|" "${COMPOSE_ENV}" + # Confirm the substitution actually landed — paranoia against sed edge cases. + grep -q "^FOREMAN_ADMIN_PASSWORD=${FOREMAN_ADMIN_PASSWORD}" "${COMPOSE_ENV}" || \ + die "sed failed to set FOREMAN_ADMIN_PASSWORD in ${COMPOSE_ENV}" # Atomic write: if the script is killed mid-write the file stays absent # rather than being empty, so a re-run regenerates it rather than reading @@ -279,6 +289,11 @@ KATELLO_PASSWORD=$(cat "${KATELLO_ADMIN_PW_FILE}") die "Katello password file is empty: ${KATELLO_ADMIN_PW_FILE} Regenerate: rm -f ${KATELLO_ADMIN_PW_FILE} ${COMPOSE_ENV} && sudo bash scripts/enroll.sh" +[[ -f "${COMPOSE_FILE}" ]] || \ + die "docker-compose file not found: ${COMPOSE_FILE} + The prophet-platform repo layout may have changed. + Check: ls $(dirname "${COMPOSE_FILE}")" + docker-compose -f "${COMPOSE_FILE}" --env-file "${COMPOSE_ENV}" up -d # Verify that compose actually started containers — `up -d` exits 0 even if @@ -327,11 +342,9 @@ require_cmd sops _needs_encrypt=0 if [[ -f "${SECRETS_YAML}" ]]; then - if python3 -c " -import json, sys -d = open('${SECRETS_YAML}').read() -if 'sops' not in d: sys.exit(1) -" 2>/dev/null; then + # grep for the sops metadata top-level key rather than a python substring + # search — a YAML comment containing the word 'sops' would fool the old check. + if grep -q '^sops:' "${SECRETS_YAML}" 2>/dev/null; then ok "secrets.yaml already SOPS-encrypted" else warn "secrets.yaml exists but is not encrypted — re-encrypting" @@ -425,8 +438,11 @@ info "Signing public key: ${SIGNING_PUBKEY}" cat > "${MINISIGN_CACHE_INFO}.tmp" <"${BUILD_LOG}") + --no-link --print-out-paths 2>"${BUILD_LOG}" | head -1) # nix build emits errors only to stderr (captured to BUILD_LOG above). # Verify stdout produced a non-empty, existing store path. [[ -n "${CLOSURE}" && -e "${CLOSURE}" ]] || \ @@ -506,12 +525,20 @@ for _i in {1..12}; do sleep 5 done if [[ $_HARM_UP -eq 1 ]]; then + # Push the active system (CURRENT_GEN) first — it may differ from CLOSURE if + # nixos-rebuild fetched a cached derivation with a different hash. Without this, + # the running system would not be in harmonia after enrollment. + if [[ -n "${CURRENT_GEN}" && "${CURRENT_GEN}" != "${CLOSURE}" ]]; then + nix copy --to "http://127.0.0.1:8101?compression=zstd" "${CURRENT_GEN}" && \ + ok "Active system pushed to harmonia cache" || \ + warn "nix copy (active gen) failed — run: nix copy --to 'http://127.0.0.1:8101?compression=zstd' ${CURRENT_GEN}" + fi nix copy --to "http://127.0.0.1:8101?compression=zstd" "${CLOSURE}" && \ - ok "Closure pushed to harmonia cache" || \ + ok "Step-10 closure pushed to harmonia cache" || \ warn "nix copy failed — run manually: nix copy --to 'http://127.0.0.1:8101?compression=zstd' ${CLOSURE}" else warn "harmonia not responding on :8101 after 60s — push manually after it starts" - warn " nix copy --to 'http://127.0.0.1:8101?compression=zstd' ${CLOSURE}" + warn " nix copy --to 'http://127.0.0.1:8101?compression=zstd' ${CURRENT_GEN:-${CLOSURE}}" fi # Promote content view to stable so sourceos-syncd can pick it up @@ -570,15 +597,22 @@ else HEALTHY=0 fi -# sops-nix-activate is a oneshot: it stays in 'failed' state (not 'inactive') -# when decryption fails. Distinguish "still starting" from "sops blew up". +# Check sops-nix activation by testing for the decrypted secret file rather +# than by service state alone. `is-failed` exits 1 (not-failed) if the unit +# doesn't exist, producing a false "ok" even when sops-nix isn't configured. +# The actual secret path is the ground truth. +_SOPS_SECRET="/run/secrets/katello-password" if systemctl is-failed --quiet sops-nix-activate 2>/dev/null; then warn "sops-nix-activate FAILED — secrets not decrypted; sourceos-syncd will not start" warn " Diagnose: journalctl -u sops-nix-activate | tail -30" warn " Common cause: age key mismatch — re-run: rm -f ${SECRETS_YAML} && sudo bash scripts/enroll.sh" HEALTHY=0 +elif [[ -f "${_SOPS_SECRET}" ]]; then + ok "sops-nix-activate: ok — secret present at ${_SOPS_SECRET}" else - ok "sops-nix-activate: ok (secrets decrypted)" + warn "sops-nix-activate: secret not at ${_SOPS_SECRET} — may still be activating" + warn " Diagnose: journalctl -u sops-nix-activate | tail -20" + HEALTHY=0 fi # Verify Katello containers are still running after the long enrollment.