From efdd683dec91f98d1ef54e0cc8fec0f38e71afb1 Mon Sep 17 00:00:00 2001 From: kmajdoub Date: Fri, 26 Jun 2026 18:53:44 +0200 Subject: [PATCH] =?UTF-8?q?feat(ops):=20forward-progress=20watchdog=20?= =?UTF-8?q?=E2=80=94=20catch=20"busy=20but=20not=20progressing"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Event watchers (PR_OPEN/STALL/DEAD) are blind to a loop that is busy but not making forward progress — e.g. spinning for hours re-reviewing one approved-but-blocked PR while every liveness check stays green (cost ~9h once). The invariant that matters is merges landing, not liveness. scripts/progress-watchdog.sh watches forward progress and exits non-zero on a tripwire so a supervisor/operator/agent can react: - 10 loop entrypoint absent (matches the real `bin/forge-loop`, not the self-matching string "forge-loop run") - 11 stop-file present - 12 two-factor stall: no merge in 90min AND event log quiet 30min (a fresh event log = the loop is legitimately building/repairing a risk-gated PR that parks for human review, which IS progress) - 13 an open PR piling up comments past a cap (busy-but-stuck signature) Repo-agnostic via REPO_PATH/GH_REPO env; polls merges with token-free ls-remote. Co-Authored-By: Claude Opus 4.8 (1M context) --- scripts/progress-watchdog.sh | 94 ++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100755 scripts/progress-watchdog.sh diff --git a/scripts/progress-watchdog.sh b/scripts/progress-watchdog.sh new file mode 100755 index 0000000..cf5d010 --- /dev/null +++ b/scripts/progress-watchdog.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# forge-loop forward-progress watchdog. +# +# WHY: event watchers (PR_OPEN / STALL / DEAD) are blind to "busy but not +# progressing" — a loop can spin for hours re-reviewing one approved-but-blocked +# PR while every liveness check stays green (this cost ~9h once). 
The invariant +# that matters is FORWARD PROGRESS (merges landing), not liveness. This script +# watches that invariant and exits non-zero on a tripwire so a supervisor (cron, +# systemd, an operator, or an agent re-invoked on exit) can react. +# +# USAGE: +# REPO_PATH=/path/to/checkout GH_REPO=owner/name scripts/progress-watchdog.sh +# +# ENV (all optional): +# REPO_PATH local checkout the loop runs in (default: cwd) +# GH_REPO GitHub slug owner/name for merge polling (required for the merge-stall and comment-storm checks; both are skipped when unset) +# STALL_SECS no-merge window before a stall (default: 5400 = 90min) +# QUIET_SECS event-log silence to confirm "stuck" (default: 1800 = 30min) +# POLL_SECS seconds between checks (default: 600 = 10min) +# COMMENT_CAP open-PR comment count = comment-storm (default: 18) +# EVENTLOG path to loop-runner-events.jsonl (default: $REPO_PATH/docs/ops/loop-runner-events.jsonl) +# STOPFILE path to the loop stop-file (default: $REPO_PATH/docs/ops/loop-runner.stop) +# +# EXIT CODES (tripwires): +# 10 loop entrypoint absent (DEAD) +# 11 stop-file present +# 12 no merge in STALL_SECS AND event log quiet QUIET_SECS (STUCK) +# 13 an open PR is piling up comments past COMMENT_CAP (busy-but-stuck) +# 2 misconfiguration +# +# NOTE on liveness detection: match the real installed entrypoint "bin/forge-loop" +# (NOT the string "forge-loop run", which self-matches an interactive shell that +# happens to contain that literal). Retry briefly to tolerate a tick boundary. 
+set -u
+
+REPO_PATH="${REPO_PATH:-$PWD}"
+GH_REPO="${GH_REPO:-}"
+STALL_SECS="${STALL_SECS:-5400}"
+QUIET_SECS="${QUIET_SECS:-1800}"
+POLL_SECS="${POLL_SECS:-600}"
+COMMENT_CAP="${COMMENT_CAP:-18}"
+EVENTLOG="${EVENTLOG:-$REPO_PATH/docs/ops/loop-runner-events.jsonl}"
+STOPFILE="${STOPFILE:-$REPO_PATH/docs/ops/loop-runner.stop}"
+
+# Reject non-numeric tunables up front (exit 2 = misconfiguration). Without this,
+# a typo such as STALL_SECS=90m makes `[ ... -ge ... ]` error out and evaluate
+# false on every poll, which silently disables the tripwire instead of failing.
+for _name in STALL_SECS QUIET_SECS POLL_SECS COMMENT_CAP; do
+  _val="${!_name}"
+  case "$_val" in
+    '' | *[!0-9]*)
+      echo "WATCHDOG: $_name must be a non-negative integer (got '$_val')"
+      exit 2
+      ;;
+  esac
+  # Force base 10 so a zero-padded value such as 0900 is not parsed as octal
+  # by the arithmetic expansions further down.
+  printf -v "$_name" '%d' "$((10#$_val))"
+done
+# A zero poll interval would turn the loop into a busy spin against GitHub.
+[ "$POLL_SECS" -gt 0 ] || { echo "WATCHDOG: POLL_SECS must be greater than 0"; exit 2; }
+
+cd "$REPO_PATH" || { echo "WATCHDOG: cannot cd $REPO_PATH"; exit 2; }
+command -v gh >/dev/null 2>&1 || { echo "WATCHDOG: gh CLI not found"; exit 2; }
+
+# Print the commit SHA at the tip of the remote main branch, or nothing when the
+# lookup fails (offline, unknown repo, auth required).
+#   Globals: GH_REPO (read)
+#   Outputs: one SHA on stdout, or empty
+# gh needs an empty GH_TOKEN to use its own stored auth (a loop-exported GH_TOKEN
+# breaks the gh CLI). Poll merges via ls-remote so no token is needed at all.
+# The ref is spelled out as refs/heads/main because a bare "main" also matches
+# tags and branches whose last path component is "main". GIT_TERMINAL_PROMPT=0
+# keeps git from blocking on a credential prompt.
+remote_main_sha() {
+  GIT_TERMINAL_PROMPT=0 git ls-remote "https://github.com/$GH_REPO.git" refs/heads/main 2>/dev/null \
+    | awk 'NR == 1 { print $1 }'
+}
+
+# Print the modification time of file $1 in epoch seconds, or 0 when it cannot
+# be determined. Tries GNU stat first, then the BSD/macOS spelling.
+file_mtime() {
+  local ts
+  ts="$(stat -c %Y -- "$1" 2>/dev/null)" || ts="$(stat -f %m -- "$1" 2>/dev/null)" || ts=""
+  case "$ts" in
+    '' | *[!0-9]*) ts=0 ;;
+  esac
+  printf '%s\n' "$ts"
+}
+
+base_sha=""
+[ -n "$GH_REPO" ] && base_sha="$(remote_main_sha)"
+last_advance="$(date +%s)"
+
+echo "WATCHDOG: watching $REPO_PATH (repo=${GH_REPO:-}) stall=${STALL_SECS}s quiet=${QUIET_SECS}s poll=${POLL_SECS}s"
+
+while true; do
+  now="$(date +%s)"
+
+  # 1. loop alive? (real entrypoint, brief retry window for tick boundaries)
+  alive=0
+  for attempt in 1 2 3; do
+    if pgrep -af "bin/forge-loop" 2>/dev/null | grep -qv -e pgrep -e watchdog; then alive=1; break; fi
+    # No point sleeping after the final probe; the verdict is already decided.
+    [ "$attempt" -lt 3 ] && sleep 10
+  done
+  [ "$alive" = "0" ] && { echo "TRIPWIRE(10): forge-loop entrypoint absent (DEAD)"; exit 10; }
+
+  # 2. stop-file
+  [ -f "$STOPFILE" ] && { echo "TRIPWIRE(11): stop-file present ($STOPFILE)"; exit 11; }
+
+  # 3. forward progress: merge advanced OR loop still actively ticking
+  if [ -n "$GH_REPO" ]; then
+    cur_sha="$(remote_main_sha)"
+    if [ -n "$cur_sha" ]; then
+      if [ -z "$base_sha" ]; then
+        # Earlier lookups failed, so this is only the baseline. Adopting it must
+        # not count as a merge, otherwise the stall timer is reset by mistake.
+        base_sha="$cur_sha"
+      elif [ "$cur_sha" != "$base_sha" ]; then
+        base_sha="$cur_sha"
+        last_advance="$now"
+      fi
+    fi
+    # A missing event log is treated as maximally stale.
+    ev_age=999999
+    [ -f "$EVENTLOG" ] && ev_age=$(( now - $(file_mtime "$EVENTLOG") ))
+    # Two-factor: a fresh event log means the loop is working (building/repairing a
+    # risk-gated PR that parks for human review IS progress) — only a frozen log
+    # plus no merge is a genuinely stuck loop.
+    if [ $(( now - last_advance )) -ge "$STALL_SECS" ] && [ "$ev_age" -ge "$QUIET_SECS" ]; then
+      echo "TRIPWIRE(12): no merge in $((STALL_SECS/60))min AND event log quiet ${ev_age}s (STUCK)"; exit 12
+    fi
+  fi
+
+  # 4. a PR piling up comments without merging (busy-but-stuck signature)
+  if [ -n "$GH_REPO" ]; then
+    # COMMENT_CAP was validated as an integer above, so interpolating it into the
+    # jq filter is safe. GH_TOKEN is blanked so gh falls back to its stored auth.
+    stuck="$(GH_TOKEN= gh pr list --repo "$GH_REPO" --state open \
+      --json number,comments --jq ".[]|select((.comments|length)>$COMMENT_CAP)|.number" 2>/dev/null | head -1)"
+    [ -n "$stuck" ] && { echo "TRIPWIRE(13): PR #$stuck piling up >$COMMENT_CAP comments without merging"; exit 13; }
+  fi
+
+  sleep "$POLL_SECS"
+done