diff --git a/scripts/enroll.sh b/scripts/enroll.sh
index 024919d..01b03a7 100755
--- a/scripts/enroll.sh
+++ b/scripts/enroll.sh
@@ -261,8 +261,18 @@ if [[ ! -f "${COMPOSE_ENV}" ]]; then
   # syscall. A plain `cp` followed by `chmod 600` leaves a window where the
   # file (containing plaintext passwords) is world-readable.
   install -m 600 "${COMPOSE_ENV_EXAMPLE}" "${COMPOSE_ENV}"
+  # Verify the expected keys exist before substituting — if the template
+  # changes to use different key names, sed exits 0 silently with no change
+  # and Foreman boots with the example placeholder passwords.
+  grep -q '^FOREMAN_ADMIN_PASSWORD=' "${COMPOSE_ENV}" || \
+    die "FOREMAN_ADMIN_PASSWORD key missing from ${COMPOSE_ENV_EXAMPLE} — template may have changed"
+  grep -q '^KATELLO_PG_PASSWORD=' "${COMPOSE_ENV}" || \
+    die "KATELLO_PG_PASSWORD key missing from ${COMPOSE_ENV_EXAMPLE} — template may have changed"
   sed -i "s|^FOREMAN_ADMIN_PASSWORD=.*|FOREMAN_ADMIN_PASSWORD=${FOREMAN_ADMIN_PASSWORD}|" "${COMPOSE_ENV}"
   sed -i "s|^KATELLO_PG_PASSWORD=.*|KATELLO_PG_PASSWORD=${KATELLO_PG_PASSWORD}|" "${COMPOSE_ENV}"
+  # Confirm the substitution landed — whole-line fixed-string match, so regex metacharacters in the password cannot skew the check.
+  grep -qxF "FOREMAN_ADMIN_PASSWORD=${FOREMAN_ADMIN_PASSWORD}" "${COMPOSE_ENV}" || \
+    die "sed failed to set FOREMAN_ADMIN_PASSWORD in ${COMPOSE_ENV}"
 
   # Atomic write: if the script is killed mid-write the file stays absent
   # rather than being empty, so a re-run regenerates it rather than reading
@@ -279,6 +289,11 @@ KATELLO_PASSWORD=$(cat "${KATELLO_ADMIN_PW_FILE}")
   die "Katello password file is empty: ${KATELLO_ADMIN_PW_FILE}
   Regenerate: rm -f ${KATELLO_ADMIN_PW_FILE} ${COMPOSE_ENV} && sudo bash scripts/enroll.sh"
 
+[[ -f "${COMPOSE_FILE}" ]] || \
+  die "docker-compose file not found: ${COMPOSE_FILE}
+  The prophet-platform repo layout may have changed.
+  Check: ls $(dirname "${COMPOSE_FILE}")"
+
 docker-compose -f "${COMPOSE_FILE}" --env-file "${COMPOSE_ENV}" up -d
 
 # Verify that compose actually started containers — `up -d` exits 0 even if
@@ -327,11 +342,9 @@ require_cmd sops
 
 _needs_encrypt=0
 if [[ -f "${SECRETS_YAML}" ]]; then
-  if python3 -c "
-import json, sys
-d = open('${SECRETS_YAML}').read()
-if 'sops' not in d: sys.exit(1)
-" 2>/dev/null; then
+  # grep for the sops metadata top-level key rather than a python substring
+  # search — a YAML comment containing the word 'sops' would fool the old check.
+  if grep -q '^sops:' "${SECRETS_YAML}" 2>/dev/null; then
     ok "secrets.yaml already SOPS-encrypted"
   else
     warn "secrets.yaml exists but is not encrypted — re-encrypting"
@@ -425,8 +438,11 @@ info "Signing public key: ${SIGNING_PUBKEY}"
 cat > "${MINISIGN_CACHE_INFO}.tmp" <"${BUILD_LOG}")
+    --no-link --print-out-paths 2>"${BUILD_LOG}" | head -1)
 # nix build emits errors only to stderr (captured to BUILD_LOG above).
 # Verify stdout produced a non-empty, existing store path.
 [[ -n "${CLOSURE}" && -e "${CLOSURE}" ]] || \
@@ -506,12 +525,20 @@ for _i in {1..12}; do
   sleep 5
 done
 if [[ $_HARM_UP -eq 1 ]]; then
+  # Push the active system (CURRENT_GEN) first — it may differ from CLOSURE if
+  # nixos-rebuild fetched a cached derivation with a different hash. Without this,
+  # the running system would not be in harmonia after enrollment.
+  if [[ -n "${CURRENT_GEN:-}" && "${CURRENT_GEN:-}" != "${CLOSURE}" ]]; then
+    nix copy --to "http://127.0.0.1:8101?compression=zstd" "${CURRENT_GEN}" && \
+      ok "Active system pushed to harmonia cache" || \
+      warn "nix copy (active gen) failed — run: nix copy --to 'http://127.0.0.1:8101?compression=zstd' ${CURRENT_GEN}"
+  fi
   nix copy --to "http://127.0.0.1:8101?compression=zstd" "${CLOSURE}" && \
-    ok "Closure pushed to harmonia cache" || \
+    ok "Step-10 closure pushed to harmonia cache" || \
     warn "nix copy failed — run manually: nix copy --to 'http://127.0.0.1:8101?compression=zstd' ${CLOSURE}"
 else
   warn "harmonia not responding on :8101 after 60s — push manually after it starts"
-  warn " nix copy --to 'http://127.0.0.1:8101?compression=zstd' ${CLOSURE}"
+  warn " nix copy --to 'http://127.0.0.1:8101?compression=zstd' ${CURRENT_GEN:-${CLOSURE}}"
 fi
 
 # Promote content view to stable so sourceos-syncd can pick it up
@@ -570,15 +597,22 @@ else
   HEALTHY=0
 fi
 
-# sops-nix-activate is a oneshot: it stays in 'failed' state (not 'inactive')
-# when decryption fails. Distinguish "still starting" from "sops blew up".
+# Check sops-nix activation by testing for the decrypted secret file rather
+# than by service state alone. `is-failed` exits 1 (not-failed) if the unit
+# doesn't exist, producing a false "ok" even when sops-nix isn't configured.
+# The actual secret path is the ground truth.
+_SOPS_SECRET="/run/secrets/katello-password"
 if systemctl is-failed --quiet sops-nix-activate 2>/dev/null; then
   warn "sops-nix-activate FAILED — secrets not decrypted; sourceos-syncd will not start"
   warn " Diagnose: journalctl -u sops-nix-activate | tail -30"
   warn " Common cause: age key mismatch — re-run: rm -f ${SECRETS_YAML} && sudo bash scripts/enroll.sh"
   HEALTHY=0
+elif [[ -f "${_SOPS_SECRET}" ]]; then
+  ok "sops-nix-activate: ok — secret present at ${_SOPS_SECRET}"
 else
-  ok "sops-nix-activate: ok (secrets decrypted)"
+  warn "sops-nix-activate: secret not at ${_SOPS_SECRET} — may still be activating"
+  warn " Diagnose: journalctl -u sops-nix-activate | tail -20"
+  HEALTHY=0
 fi
 
 # Verify Katello containers are still running after the long enrollment.