From 0d2bed7144a5595caf03793a9a5f8bbbd2abe9be Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Fri, 12 Jun 2026 14:25:06 -0700 Subject: [PATCH] =?UTF-8?q?fix:=20SF=20predictor=20bootstrap=20cp=20--remo?= =?UTF-8?q?ve-destination=20=E2=80=94=20survive=20root-owned=20stale=20pre?= =?UTF-8?q?dictor.yaml=20(config#1034)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 2026-06-12 Friday shell run failed at PredictorTraining: an ad-hoc SSM edit on 06-07 (run as root, no sudo -u ec2-user) left alpha-engine-predictor/config/predictor.yaml root-owned on the always-on box, and 'sudo -u ec2-user cp' cannot open a root-owned 644 file for truncation even though ec2-user owns the directory. cp --remove-destination unlinks the target first (allowed via directory ownership) and recreates it owned by ec2-user, so a stale root-owned file self-heals on the next run instead of failing the branch. Applied to both bootstrap cp sites (PredictorTraining + ModelZooRotation). Frozen byte-identical fixture regenerated per its documented procedure (deliberate, reviewed change to a spot state's absent-path command). Box ownership was also fixed directly (chown); this change is the structural guard against recurrence. Co-Authored-By: Claude Fable 5 --- infrastructure/step_function.json | 4 ++-- tests/fixtures/sf_prekeystone_spot_commands.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/infrastructure/step_function.json b/infrastructure/step_function.json index 79650d9..fdfd20d 100644 --- a/infrastructure/step_function.json +++ b/infrastructure/step_function.json @@ -1393,7 +1393,7 @@ "DocumentName": "AWS-RunShellScript", "InstanceIds.$": "$.ec2_instance_id", "Parameters": { - "commands.$": "States.Array('set -eo pipefail','sudo -u ec2-user git -C /home/ec2-user/alpha-engine-predictor pull --ff-only origin main','sudo -u ec2-user git -C /home/ec2-user/alpha-engine-config pull --ff-only origin main','sudo -u ec2-user cp /home/ec2-user/alpha-engine-config/predictor/predictor.yaml /home/ec2-user/alpha-engine-predictor/config/predictor.yaml','cd /home/ec2-user/alpha-engine-predictor','export HOME=/home/ec2-user','set -a && source /home/ec2-user/.alpha-engine.env && set +a',States.Format('/home/ec2-user/alpha-engine-dashboard/.venv/bin/python -m alpha_engine_lib.ssm_log_capture run --slug predictor-training --log /var/log/predictor-training.log -- bash infrastructure/spot_train.sh --full-only{}',$.preflight_args))", + "commands.$": "States.Array('set -eo pipefail','sudo -u ec2-user git -C /home/ec2-user/alpha-engine-predictor pull --ff-only origin main','sudo -u ec2-user git -C /home/ec2-user/alpha-engine-config pull --ff-only origin main','sudo -u ec2-user cp --remove-destination /home/ec2-user/alpha-engine-config/predictor/predictor.yaml /home/ec2-user/alpha-engine-predictor/config/predictor.yaml','cd /home/ec2-user/alpha-engine-predictor','export HOME=/home/ec2-user','set -a && source /home/ec2-user/.alpha-engine.env && set +a',States.Format('/home/ec2-user/alpha-engine-dashboard/.venv/bin/python -m alpha_engine_lib.ssm_log_capture run --slug predictor-training --log /var/log/predictor-training.log -- bash infrastructure/spot_train.sh --full-only{}',$.preflight_args))", "executionTimeout": [ "5400" ] @@ -1521,7 +1521,7 @@ "DocumentName": "AWS-RunShellScript", "InstanceIds.$": "$.ec2_instance_id", "Parameters": { - "commands.$": "States.Array('set -eo pipefail','sudo -u ec2-user git -C /home/ec2-user/alpha-engine-predictor pull --ff-only origin main','sudo -u ec2-user git -C /home/ec2-user/alpha-engine-config pull --ff-only origin main','sudo -u ec2-user cp /home/ec2-user/alpha-engine-config/predictor/predictor.yaml /home/ec2-user/alpha-engine-predictor/config/predictor.yaml','cd /home/ec2-user/alpha-engine-predictor','export HOME=/home/ec2-user','set -a && source /home/ec2-user/.alpha-engine.env && set +a',States.Format('/home/ec2-user/alpha-engine-dashboard/.venv/bin/python -m alpha_engine_lib.ssm_log_capture run --slug predictor-model-zoo --log /var/log/predictor-model-zoo.log -- bash infrastructure/spot_train.sh --model-zoo-weekly{}',$.preflight_args))", + "commands.$": "States.Array('set -eo pipefail','sudo -u ec2-user git -C /home/ec2-user/alpha-engine-predictor pull --ff-only origin main','sudo -u ec2-user git -C /home/ec2-user/alpha-engine-config pull --ff-only origin main','sudo -u ec2-user cp --remove-destination /home/ec2-user/alpha-engine-config/predictor/predictor.yaml /home/ec2-user/alpha-engine-predictor/config/predictor.yaml','cd /home/ec2-user/alpha-engine-predictor','export HOME=/home/ec2-user','set -a && source /home/ec2-user/.alpha-engine.env && set +a',States.Format('/home/ec2-user/alpha-engine-dashboard/.venv/bin/python -m alpha_engine_lib.ssm_log_capture run --slug predictor-model-zoo --log /var/log/predictor-model-zoo.log -- bash infrastructure/spot_train.sh --model-zoo-weekly{}',$.preflight_args))", "executionTimeout": [ "5400" ] diff --git a/tests/fixtures/sf_prekeystone_spot_commands.json b/tests/fixtures/sf_prekeystone_spot_commands.json index 4db9439..65e1dba 100644 --- a/tests/fixtures/sf_prekeystone_spot_commands.json +++ b/tests/fixtures/sf_prekeystone_spot_commands.json @@ -74,7 +74,7 @@ "set -eo pipefail", "sudo -u ec2-user git -C /home/ec2-user/alpha-engine-predictor pull --ff-only origin main", "sudo -u ec2-user git -C /home/ec2-user/alpha-engine-config pull --ff-only origin main", - "sudo -u ec2-user cp /home/ec2-user/alpha-engine-config/predictor/predictor.yaml /home/ec2-user/alpha-engine-predictor/config/predictor.yaml", + "sudo -u ec2-user cp --remove-destination /home/ec2-user/alpha-engine-config/predictor/predictor.yaml /home/ec2-user/alpha-engine-predictor/config/predictor.yaml", "cd /home/ec2-user/alpha-engine-predictor", "export HOME=/home/ec2-user", "set -a && source /home/ec2-user/.alpha-engine.env && set +a",