diff --git a/.github/workflows/e2e-label-dispatch.yml b/.github/workflows/e2e-label-dispatch.yml new file mode 100644 index 000000000..b58ca306a --- /dev/null +++ b/.github/workflows/e2e-label-dispatch.yml @@ -0,0 +1,65 @@ +name: E2E Label Dispatch + +# When a maintainer applies `test:e2e` or `test:e2e-gpu`, dispatch the matching +# self-hosted workflow against the copy-pr-bot mirror branch. Without this, +# the gated workflow only runs on push to `pull-request/`, so a label +# applied after the mirror was created leaves the gate stuck red until someone +# manually re-runs the workflow. +# +# Pushes to `pull-request/` (whether from an automatic copy-pr-bot sync or +# a maintainer-typed `/ok to test`) already re-trigger the gated workflow on +# their own - only the label-application case needs this dispatcher. +# +# Uses `pull_request_target` so forked PRs get a write-capable token. The job +# never checks out PR code; it only calls the GitHub API. + +on: + pull_request_target: + types: [labeled] + +permissions: {} + +jobs: + dispatch: + name: Dispatch E2E workflow for labeled PR + if: github.event.label.name == 'test:e2e' || github.event.label.name == 'test:e2e-gpu' + runs-on: ubuntu-latest + permissions: + actions: write + pull-requests: write + steps: + - name: Dispatch workflow against the copy-pr-bot mirror + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_REPO: ${{ github.repository }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }} + LABEL_NAME: ${{ github.event.label.name }} + shell: bash + run: | + set -euo pipefail + + case "$LABEL_NAME" in + test:e2e) workflow=branch-e2e.yml ;; + test:e2e-gpu) workflow=test-gpu.yml ;; + *) echo "Unrecognized label $LABEL_NAME"; exit 1 ;; + esac + + mirror_ref="pull-request/$PR_NUMBER" + mirror_sha=$(gh api "repos/$GH_REPO/branches/$mirror_ref" --jq '.commit.sha' 2>/dev/null || echo "") + short_pr=${PR_HEAD_SHA:0:7} + + if [ -z "$mirror_sha" ]; then + gh pr comment "$PR_NUMBER" --body "Label \`$LABEL_NAME\` applied, but copy-pr-bot has not mirrored this PR yet. A maintainer needs to comment \`/ok to test $PR_HEAD_SHA\` to start the mirror; re-apply the label once \`$mirror_ref\` exists." + exit 0 + fi + + if [ "$mirror_sha" != "$PR_HEAD_SHA" ]; then + short_mirror=${mirror_sha:0:7} + gh pr comment "$PR_NUMBER" --body "Label \`$LABEL_NAME\` applied, but \`$mirror_ref\` is at \`$short_mirror\` while the PR head is \`$short_pr\`. Comment \`/ok to test $PR_HEAD_SHA\` to refresh the mirror, then re-apply the label." + exit 0 + fi + + echo "Dispatching $workflow against $mirror_ref ($short_pr) for label $LABEL_NAME." + gh workflow run "$workflow" --ref "$mirror_ref" + gh pr comment "$PR_NUMBER" --body "Dispatched \`$workflow\` against \`$mirror_ref\` at \`$short_pr\` (label \`$LABEL_NAME\`). Results will post as checks on this PR." diff --git a/CI.md b/CI.md new file mode 100644 index 000000000..b849919bd --- /dev/null +++ b/CI.md @@ -0,0 +1,111 @@ +# CI + +This document describes how OpenShell's continuous integration works for pull requests, with a focus on what contributors need to do to get their PR tested. + +For local test commands see [TESTING.md](TESTING.md). For PR conventions see [CONTRIBUTING.md](CONTRIBUTING.md). + +## Overview + +E2E tests run on self-hosted runners (`build-arm64`, GPU runners). To keep untrusted PR code off those runners we use NVIDIA's copy-pr-bot, which mirrors trusted PR commits to internal `pull-request/` branches in this repository. The gated workflows trigger on pushes to those branches, not on the original PR. + +Two opt-in labels enable the suites: + +- `test:e2e` runs `Branch E2E Checks` (non-GPU E2E) +- `test:e2e-gpu` runs `GPU Test` + +Both are required to merge once the corresponding `E2E Gate` checks are marked required in branch protection. + +## Commit signing + +copy-pr-bot decides whether to mirror a PR automatically based on whether the author is trusted. For org members and collaborators, "trusted" means **all commits in the PR are cryptographically signed**. Unsigned commits, even from an org member, force the bot to wait for a maintainer's `/ok to test `. + +DCO sign-off (`-s` / `Signed-off-by`) is a separate requirement and does not count as commit signing. + +### One-time setup with an SSH key + +If you already use an SSH key for `git push`, you can reuse it as a signing key. (You can also generate a separate one - GitHub allows the same SSH key as both auth and signing.) + +1. Generate a key (skip if reusing your existing SSH key): + + ```shell + ssh-keygen -t ed25519 -C "you@example.com" -f ~/.ssh/id_ed25519_signing + ``` + +2. Add the **public** key at using **New SSH key**, and set **Key type: Signing Key** (not Authentication). Signing keys are managed separately from authentication keys, even when they reuse the same key material - you have to add the entry once for each role. + +3. Configure git globally: + + ```shell + git config --global gpg.format ssh + git config --global user.signingkey ~/.ssh/id_ed25519_signing.pub + git config --global commit.gpgsign true + git config --global tag.gpgsign true + ``` + +4. Verify on a test commit: + + ```shell + git commit --allow-empty -s -m "test: signing" + ``` + + Push the branch and confirm GitHub shows the commit as **Verified**. + +## Pull request flows + +### Internal contributor PR + +Prerequisites: + +- Org member or collaborator on the repo. +- All commits cryptographically signed (see [Commit signing](#commit-signing)). +- All commits include a DCO sign-off (`git commit -s`). + +Flow: + +1. Open the PR. copy-pr-bot mirrors it to `pull-request/` automatically. +2. A maintainer applies `test:e2e` and/or `test:e2e-gpu`. +3. `E2E Label Dispatch` detects the label and triggers the matching workflow against the mirror. +4. Results post as checks on your PR head SHA. +5. New commits push to the mirror automatically; gated workflows re-run on their own. No re-labeling needed. + +### Forked PR + +Prerequisites: + +- DCO sign-off (`git commit -s`) on every commit. Commit signing is not required for forks - copy-pr-bot trusts forks based on maintainer review, not signing. +- A maintainer must vouch you. See the [Vouch System](AGENTS.md#vouch-system). + +Flow: + +1. Open the PR. The vouch check confirms you are vouched (otherwise the PR is auto-closed). +2. copy-pr-bot does not mirror forks automatically. A maintainer reviews the diff and comments `/ok to test ` with your latest commit SHA. +3. After `/ok to test`, copy-pr-bot mirrors to `pull-request/`. +4. A maintainer applies `test:e2e` / `test:e2e-gpu`. The dispatcher runs the matching workflow against the mirror. +5. Results post as checks on your PR. + +Important: every new commit you push requires another `/ok to test ` from a maintainer before E2E will run on it. If a label is applied while the mirror is stale, `E2E Label Dispatch` will post a comment explaining what's needed. + +## copy-pr-bot + +[copy-pr-bot](https://github.com/apps/copy-pr-bot) is a GitHub App maintained by NVIDIA that solves a specific GitHub Actions security problem: by default, `pull_request`-triggered workflows on a self-hosted runner can run an arbitrary contributor's code on hardware the project owns. For projects that need self-hosted runners (GPU access, ARM hardware, on-prem secrets), GitHub's recommended pattern is to never trigger workflows directly from external `pull_request` events. + +copy-pr-bot enforces that pattern. When a PR is opened against this repository, the bot evaluates whether the change is trusted - by default, only commits authored by org members and signed with a verified key are trusted, and forks always need an explicit per-SHA approval. Once a change passes that check, the bot mirrors the PR head into a branch named `pull-request/` inside this repository. Our self-hosted workflows then trigger on `push` to those mirror branches, never on the original `pull_request` event. + +The user-visible consequences inside this repo: + +- A PR cannot run E2E until copy-pr-bot has mirrored it. For trusted authors this happens within seconds of opening the PR; for forked PRs it requires a maintainer to comment `/ok to test `. +- New commits to a fork need a fresh `/ok to test ` before the mirror updates. +- The `pull-request/` branches are not for humans to push to - they are managed by the bot. + +The bot's full administrator documentation is internal to NVIDIA. The only command contributors may see in PR comments is `/ok to test `, used by maintainers to approve a specific commit on a forked PR for testing. + +## Workflow files + +| File | Role | +|---|---| +| `.github/workflows/branch-e2e.yml` | Non-GPU E2E. Triggers on `push: pull-request/[0-9]+`. | +| `.github/workflows/test-gpu.yml` | GPU E2E. Triggers on `push: pull-request/[0-9]+`. | +| `.github/actions/pr-gate/action.yml` | Composite action that resolves PR metadata and verifies the required label is set. | +| `.github/workflows/e2e-gate.yml` | Posts the required `E2E Gate` check on the PR. Re-evaluates after the gated workflow completes. | +| `.github/workflows/e2e-gate-check.yml` | Reusable gate logic shared by E2E and GPU E2E. | +| `.github/workflows/e2e-label-dispatch.yml` | Triggers gated workflows when a `test:e2e*` label is applied. Posts a comment if the mirror is missing or stale. | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c83d32efb..3a01d75eb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -293,3 +293,9 @@ All contributions must include a `Signed-off-by` line in each commit message. Th ```bash git commit -s -m "feat(sandbox): add new capability" ``` + +DCO sign-off is separate from cryptographic commit signing. CI requires signing for org members so that copy-pr-bot can mirror your PR automatically; see [CI.md](CI.md#commit-signing) for setup. + +## CI + +How E2E runs in CI, the `test:e2e` / `test:e2e-gpu` labels, copy-pr-bot, and commit-signing setup are documented in [CI.md](CI.md). diff --git a/architecture/ci-e2e.md b/architecture/ci-e2e.md new file mode 100644 index 000000000..3015df971 --- /dev/null +++ b/architecture/ci-e2e.md @@ -0,0 +1,178 @@ +# E2E CI Architecture + +This document describes the architecture of the E2E CI flow: every workflow involved, the trigger each one listens on, why those triggers were chosen, and how the pieces fit together. For the contributor-facing how-to (labels, signing, fork flow), see [CI.md](../CI.md). + +## Goals and constraints + +Three independent goals shape the design: + +1. **Self-hosted runner safety.** E2E runs on `build-arm64` and on GPU runners. GitHub's [security hardening guide](https://docs.github.com/en/actions/security-for-github-actions/security-guides/security-hardening-for-github-actions#hardening-for-self-hosted-runners) states bluntly: "Self-hosted runners should almost never be used for public repositories on GitHub, because any user can open pull requests against the repository and compromise the environment." Our workaround is the same one used elsewhere in NVIDIA's GHA infrastructure: copy-pr-bot mirrors trusted PRs into `pull-request/` branches inside this repository, and the self-hosted workflows trigger on `push` to those mirror branches rather than on `pull_request`. +2. **Label as a hard merge gate.** When a PR carries `test:e2e` (or `test:e2e-gpu`), the corresponding suite *must* have actually executed and passed for the PR head SHA. The label has to be enforcing, not advisory: it blocks merge unless the suite ran with the label set. +3. **Per-job least privilege on the GitHub token.** Each workflow declares `permissions: {}` at the top, and each job declares only what it needs. This follows the hardening pattern described at . + +These three goals do not compose cleanly: the safety goal forces `push: pull-request/` triggers (which the PR author can't influence), but `push` triggers don't fire on label changes, so the label gate has to come from a separate workflow on a different trigger. That is the heart of the architecture. + +## Pieces at a glance + +| File | Trigger | Role | +|---|---|---| +| `.github/copy-pr-bot.yaml` | (config) | Tells copy-pr-bot to mirror trusted PRs into `pull-request/` branches. Pre-existed. | +| `.github/workflows/branch-e2e.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Runs non-GPU E2E on `build-arm64`. | +| `.github/workflows/test-gpu.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Runs GPU E2E on self-hosted GPU runners. | +| `.github/actions/pr-gate/action.yml` | (composite) | Resolves PR metadata for a `pull-request/` push and decides whether the run should proceed. Used by the two workflows above. | +| `.github/workflows/e2e-gate.yml` | `pull_request` + `workflow_run` | Posts the required `E2E Gate` check on the PR. Re-evaluates after the gated workflow completes. | +| `.github/workflows/e2e-gate-check.yml` | `workflow_call` | Reusable gate logic shared by E2E and GPU E2E. | +| `.github/workflows/e2e-label-dispatch.yml` | `pull_request_target: [labeled]` | Dispatches the gated workflow when a `test:e2e*` label is applied after the mirror already exists. | +| `.github/workflows/e2e-test.yml`, `e2e-gpu-test.yaml`, `docker-build.yml` | `workflow_call` | Reusable worker workflows. Unchanged by this design - called from the gated workflows and from release workflows. | + +## Trigger taxonomy + +Five GitHub Actions trigger types appear in this flow. Each one was chosen for a specific reason - they are not interchangeable. + +| Trigger | Workflow context | Token scope | Why we use it here | +|---|---|---|---| +| `push: pull-request/[0-9]+` | The pushed commit (mirror branch) | Repo-default | Only fires for branches copy-pr-bot created. Decouples test execution from PR author actions: the author cannot create a `pull-request/` branch themselves. | +| `pull_request` | The PR head SHA, but actions checkout the *base* branch's workflow files | Read-only for forks | Lets us post a status check on the PR's head SHA (so branch protection sees it). Used by the `E2E Gate` evaluation jobs. | +| `pull_request_target` | Base branch | Write-capable, even for forks | Needed for the label dispatcher: a forked PR's `GITHUB_TOKEN` is read-only on `pull_request`, so it cannot dispatch workflows or post comments. The dispatcher never checks out PR code, so the `pull_request_target` foot-gun does not apply. | +| `workflow_run` | Default branch | Repo-default | Fires when the gated workflow finishes. Lets us run a re-evaluation step in a trusted (default-branch) context. | +| `workflow_dispatch` | Caller's ref | Repo-default | Maintainer escape hatch and the mechanism the label dispatcher uses to call the gated workflow (`gh workflow run --ref pull-request/`). | + +The non-obvious move here is that the same logical "did E2E pass for this PR" check has to be posted from at least two of these trigger contexts: a `pull_request`-triggered run (which can attach a check to the PR head SHA) and a `workflow_run`-triggered run (which knows the gated workflow finished but can only attach checks to `main`). The flow stitches them together by re-running the original `pull_request`-triggered run after the gated workflow completes. + +## Happy-path flow (trusted PR with label set up-front) + +```mermaid +sequenceDiagram + autonumber + participant Author as PR Author (org member) + participant GH as GitHub + participant Bot as copy-pr-bot + participant BranchE2E as Branch E2E Checks
(self-hosted) + participant Gate as E2E Gate
(github-hosted) + participant Dispatcher as E2E Label Dispatch
(github-hosted) + participant Maintainer + + Author->>GH: Open PR (signed commits) + GH->>Bot: PR opened + Bot->>GH: push pull-request/N (mirror) + GH->>BranchE2E: push event on pull-request/N + BranchE2E->>BranchE2E: pr_metadata: should_run = false
(no label yet) + BranchE2E-->>GH: workflow concludes success
(only metadata job ran) + + GH->>Gate: pull_request opened + Gate->>Gate: no label, gate passes (no-op) + + Maintainer->>GH: apply test:e2e label + GH->>Gate: pull_request labeled + Gate->>Gate: label set,
upstream only ran metadata
→ FAIL (red) + GH->>Dispatcher: pull_request_target labeled + Dispatcher->>GH: gh workflow run branch-e2e.yml
--ref pull-request/N + GH->>BranchE2E: workflow_dispatch + BranchE2E->>BranchE2E: pr_metadata: should_run = true
(label set, SHA matches) + BranchE2E->>BranchE2E: build + e2e jobs run + + BranchE2E-->>GH: workflow concludes success + GH->>Gate: workflow_run completed + Gate->>GH: rerun original pull_request gate run + GH->>Gate: pull_request rerun (replays event) + Gate->>Gate: label set,
upstream success + non-gate jobs ran
→ PASS (green) +``` + +The key moment is step 11: the dispatcher closes the loop that the `push` trigger doesn't. Without it, applying the label leaves the gate stuck red until a maintainer manually re-runs the gated workflow. + +## Forked PR flow + +The shape is identical but with two extra round trips: the maintainer has to vet each commit before copy-pr-bot will mirror it. + +```mermaid +sequenceDiagram + autonumber + participant Author as PR Author (fork) + participant GH as GitHub + participant Bot as copy-pr-bot + participant Maintainer + + Author->>GH: Open PR from fork + GH->>Bot: PR opened + Bot->>Bot: not trusted, wait + Maintainer->>GH: comment "/ok to test " + Bot->>GH: push pull-request/N + Note over Bot,GH: From here, identical to the trusted flow:
label → dispatcher → gated workflow → rerun gate + Author->>GH: push new commit + Bot->>Bot: still untrusted, wait again + Maintainer->>GH: comment "/ok to test " +``` + +## Why each design choice exists + +### Why `push` on `pull-request/` instead of `pull_request` + +`pull_request` workflows execute the workflow file from the PR's own branch. On a self-hosted runner, that means an attacker can rewrite our workflow YAML and run anything. `push: pull-request/` only fires for branches that copy-pr-bot creates, so the workflow file source is always one that the bot vetted (signed commit + trusted author, or `/ok to test`). + +### Why the gate has to verify a non-gate job actually ran + +The gated workflows always start with a `pr_metadata` job. When the label is missing, `pr_metadata` reports `should_run=false` and the build/E2E jobs are skipped. From GitHub's perspective the workflow concluded `success`. If the gate only checked top-level conclusion, an unlabeled run from earlier would satisfy the gate forever - the label could be added without ever causing E2E to actually execute. The gate's "at least one non-gate job succeeded" check (`e2e-gate-check.yml:106-110`) is what forces a re-run after labeling. + +### Why `workflow_run` is needed for the gate flip + +Once the label dispatcher kicks off `branch-e2e.yml`, the workflow runs and finishes - but the `pull_request`-triggered gate check posted earlier still says "fail". `workflow_run` is the only event that fires when an arbitrary other workflow completes, and it's how we know to re-evaluate the gate. But `workflow_run` runs in the *default branch context*, so a check posted from there lands on `main` instead of the PR. Workaround: instead of posting a new check, look up the most recent `pull_request`-triggered gate run for the same head SHA and call `POST /actions/runs//rerun`. The re-run replays the original `pull_request` event, so the new check posts against the PR's head SHA and branch protection picks it up. + +### Why `pull_request_target` for the label dispatcher + +A `pull_request` workflow on a forked PR receives a read-only `GITHUB_TOKEN`. That's intentional: it prevents PR-supplied workflow code from escalating. But our dispatcher doesn't *run* PR code - it never checks out the PR head, only the workflow file from `main`. It needs `actions: write` to dispatch the gated workflow and `pull-requests: write` to post a status comment. `pull_request_target` provides a write-capable token while still loading the workflow definition from `main`. The standard `pull_request_target` warning ("don't check out PR code with this token") doesn't apply here because we don't check out anything. + +### Why labels and not comment commands + +Labels persist as PR metadata and survive re-runs and force-pushes. Comment-based commands like `/ok to test` don't survive the same way: a comment from yesterday doesn't enable today's commit. Branch protection rules can require a check be present; they cannot require a comment. The label is the merge gate's primary signal because it is the only thing GitHub's branch protection knows how to look at. + +### Why a separate `E2E Label Dispatch` workflow instead of dispatching from the gate + +The gate runs on `pull_request`. If we tried to dispatch the gated workflow from the gate, the dispatch call would fail on forked PRs because the gate's token is read-only. Splitting the dispatcher into its own `pull_request_target` workflow gives it the write-capable token without giving it to the gate evaluation logic. + +## Permission posture + +Every workflow declares `permissions: {}` at the top. Per-job grants are the minimum needed: + +| Workflow | Job | Grants | +|---|---|---| +| `branch-e2e.yml`, `test-gpu.yml` | `pr_metadata` | `contents: read`, `pull-requests: read` | +| | `build-*` | `contents: read`, `packages: write` | +| | `e2e*` | `contents: read`, `packages: read` | +| `e2e-gate.yml` | `e2e`, `gpu` (`workflow_call`) | inherits via the called workflow | +| | `rerun-on-completion` | `actions: write` | +| `e2e-gate-check.yml` | `check` | `contents: read`, `pull-requests: read`, `actions: read` | +| `e2e-label-dispatch.yml` | `dispatch` | `actions: write`, `pull-requests: write` | + +The reusable worker workflows (`e2e-test.yml`, `e2e-gpu-test.yaml`, `docker-build.yml`) declare their own internal permissions; the calling job grants are an upper bound for them. + +Two workflows hold "interesting" tokens: + +- `rerun-on-completion` in `e2e-gate.yml` has `actions: write`. It calls one specific endpoint - `POST /actions/runs//rerun` for an `e2e-gate.yml` run on the same head SHA - and never executes PR code. +- `dispatch` in `e2e-label-dispatch.yml` has `actions: write` + `pull-requests: write`. It calls `gh workflow run` against `branch-e2e.yml` or `test-gpu.yml` (only) and posts a comment on the PR. It never executes PR code. + +Both are small, github-hosted workflows whose source lives only on `main`. + +## Release flow + +`release-tag.yml` and `release-dev.yml` call `e2e-test.yml` directly on `main` / tag pushes. Tags and `main` are inherently trusted refs, so they bypass copy-pr-bot. E2E still blocks the release jobs (`tag-ghcr-release: needs: [..., e2e]`). + +Permissions on the release workflows are not yet scoped per-job. Tracked separately. + +## Edge cases + +| Case | What happens | +|---|---| +| Label applied before copy-pr-bot mirrors the PR | Dispatcher detects no `pull-request/` branch and posts a comment telling the maintainer to wait or run `/ok to test `. | +| Label applied while mirror is stale (new commit pending `/ok to test`) | Dispatcher detects mirror SHA != PR head SHA and posts the corresponding comment with the SHA the maintainer needs to vet. | +| Label removed | No dispatcher reaction. The next PR event (push, label, etc.) re-evaluates the gate, which now sees no label and passes as a no-op. | +| Author force-pushes after label set | copy-pr-bot re-mirrors the new SHA → gated workflow fires on `push` → finishes → `workflow_run` fires the gate re-run → new green check on the new SHA. No re-labeling needed. | +| Maintainer re-runs the gated workflow manually from the Actions UI | Same as above without the force-push. | +| Gate's first evaluation fails (label set, upstream not yet started) | Email-on-failure noise. The check eventually flips to success once upstream finishes and `workflow_run` re-runs the gate. Tracked as a known rough edge; possible fix is posting `neutral` until the upstream completes. | + +## References + +- copy-pr-bot: +- Astral hardening guidance: +- GitHub Actions security pattern for self-hosted runners: +- `pull_request_target` foot-gun: +- Contributor-facing flow doc: [../CI.md](../CI.md)