diff --git a/internal/cli/skillopt.go b/internal/cli/skillopt.go index 6ebdffd9..9b6d5c9c 100644 --- a/internal/cli/skillopt.go +++ b/internal/cli/skillopt.go @@ -78,6 +78,8 @@ func runSkillOpt(args []string, stdout, stderr io.Writer) int { return runSkillOptTrain(args[1:], stdout, stderr) case "ab": return runSkillOptAB(args[1:], stdout, stderr) + case "pairwise": + return runSkillOptPairwise(args[1:], stdout, stderr) default: fmt.Fprintf(stderr, "unknown skillopt command %q\n\n", args[0]) printSkillOptUsage(stderr) @@ -102,6 +104,7 @@ func printSkillOptUsage(w io.Writer) { fmt.Fprintln(w, " gitmoot skillopt feedback github publish --run [--repo owner/repo] [--pr ]") fmt.Fprintln(w, " gitmoot skillopt feedback github sync --run [--repo owner/repo] (--issue |--pr )") fmt.Fprintln(w, " gitmoot skillopt ab \"\" [--challenger ] [--pick a|b] [--seed N] [--judge] [--judge-only] [--home path]") + fmt.Fprintln(w, " gitmoot skillopt pairwise import [--packet path] [--secret-map path] [--picks path] [--reviewer name] [--home path] [--json]") fmt.Fprintln(w, " gitmoot skillopt judge-report [--template id]") fmt.Fprintln(w, " gitmoot skillopt judge promote --template --task-kind --file [--home ] [--yes] [--json]") fmt.Fprintln(w, " gitmoot skillopt train init --name --template --review-repo owner/repo --artifact-kind kind --preview kind (--request text|--request-file path)") diff --git a/internal/cli/skillopt_ab.go b/internal/cli/skillopt_ab.go index 99a42b2d..80fc88d0 100644 --- a/internal/cli/skillopt_ab.go +++ b/internal/cli/skillopt_ab.go @@ -561,10 +561,10 @@ func defaultReadSkillOptABLine() (string, bool) { // (ranking=[winner,loser], source=skillopt-ab) that passes // store.validateRankedFeedbackEventOptions. func recordSkillOptABPick(ctx context.Context, store *db.Store, paths config.Paths, runID, pickSourceURL, templateID string, champion, challenger skillOptABVariant, championDelivery, challengerDelivery skillOptABDelivery, winnerLabel, loserLabel, prompt string) error { - if err := ensureSkillOptABRunRows(ctx, store, paths, runID, templateID, champion, challenger, championDelivery, challengerDelivery, prompt); err != nil { + if err := ensureSkillOptABRunRows(ctx, store, paths, runID, skillOptABItemID, templateID, champion, challenger, championDelivery, challengerDelivery, prompt, skillOptABSource, skillOptABFeedbackSource); err != nil { return err } - return upsertSkillOptABRankedEvent(ctx, store, runID, winnerLabel, loserLabel, "human", skillOptABSource, pickSourceURL) + return upsertSkillOptABRankedEvent(ctx, store, runID, skillOptABItemID, winnerLabel, loserLabel, "human", skillOptABSource, pickSourceURL) } // ensureSkillOptABRunRows idempotently upserts the shared pairwise scaffolding for @@ -575,12 +575,12 @@ func recordSkillOptABPick(ctx context.Context, store *db.Store, paths config.Pat // judge row is self-sufficient when --judge-only skips the human record, and the // rows are byte-identical when both run (every write here is an idempotent upsert // keyed by (run_id,item_id,label)). -func ensureSkillOptABRunRows(ctx context.Context, store *db.Store, paths config.Paths, runID, templateID string, champion, challenger skillOptABVariant, championDelivery, challengerDelivery skillOptABDelivery, prompt string) error { +func ensureSkillOptABRunRows(ctx context.Context, store *db.Store, paths config.Paths, runID, itemID, templateID string, champion, challenger skillOptABVariant, championDelivery, challengerDelivery skillOptABDelivery, prompt, source, feedbackSource string) error { blobStore := artifact.NewStore(paths.ArtifactBlobs) metadataJSON, err := json.Marshal(map[string]any{ - "feedback_source": skillOptABFeedbackSource, - "source": skillOptABSource, + "feedback_source": feedbackSource, + "source": source, "champion": champion.version.ID, "challenger": challenger.version.ID, }) @@ -599,7 +599,7 @@ func ensureSkillOptABRunRows(ctx context.Context, store *db.Store, paths config. if err := store.UpsertEvalRun(ctx, run); err != nil { return err } - if err := store.UpsertEvalReviewItem(ctx, db.EvalReviewItem{RunID: runID, ItemID: skillOptABItemID, Title: prompt}); err != nil { + if err := store.UpsertEvalReviewItem(ctx, db.EvalReviewItem{RunID: runID, ItemID: itemID, Title: prompt}); err != nil { return err } @@ -612,7 +612,7 @@ func ensureSkillOptABRunRows(ctx context.Context, store *db.Store, paths config. {label: skillOptABChampionLabel, delivery: championDelivery}, {label: skillOptABChallengerLabel, delivery: challengerDelivery}, } { - evalArtifact, err := prepareReviewItemContentArtifact(blobStore, runID, skillOptABItemID, opt.label, []byte(answerOrPlaceholder(opt.delivery.answer)), "text/plain", "text") + evalArtifact, err := prepareReviewItemContentArtifact(blobStore, runID, itemID, opt.label, []byte(answerOrPlaceholder(opt.delivery.answer)), "text/plain", "text") if err != nil { return err } @@ -621,7 +621,7 @@ func ensureSkillOptABRunRows(ctx context.Context, store *db.Store, paths config. } if err := store.UpsertEvalReviewOption(ctx, db.EvalReviewOption{ RunID: runID, - ItemID: skillOptABItemID, + ItemID: itemID, Label: opt.label, ArtifactID: evalArtifact.ID, Role: opt.label, @@ -640,7 +640,7 @@ func ensureSkillOptABRunRows(ctx context.Context, store *db.Store, paths config. // (reviewer=skillopt-ab-judge,source=skillopt-ab-judge) as SEPARATE coexisting // rows on the same run/item, and what makes repeated picks each persist via a // distinct per-pick source_url. -func upsertSkillOptABRankedEvent(ctx context.Context, store *db.Store, runID, winnerLabel, loserLabel, reviewer, source, sourceURL string) error { +func upsertSkillOptABRankedEvent(ctx context.Context, store *db.Store, runID, itemID, winnerLabel, loserLabel, reviewer, source, sourceURL string) error { ranking, err := json.Marshal([]string{winnerLabel, loserLabel}) if err != nil { return err @@ -651,7 +651,7 @@ func upsertSkillOptABRankedEvent(ctx context.Context, store *db.Store, runID, wi } return store.UpsertRankedFeedbackEvent(ctx, db.RankedFeedbackEvent{ RunID: runID, - ItemID: skillOptABItemID, + ItemID: itemID, RankingJSON: string(ranking), TieGroupsJSON: string(tieGroups), Winner: winnerLabel, @@ -766,12 +766,12 @@ func runSkillOptABJudge(ctx context.Context, store *db.Store, paths config.Paths // Ensure the shared run/item/options exist (idempotent) so the judge row is // self-sufficient even under --judge-only where the human record path never runs. - if err := ensureSkillOptABRunRows(ctx, store, paths, runID, templateID, champion, challenger, optionAToDelivery(optionA, optionB, champion), optionAToDelivery(optionA, optionB, challenger), options.prompt); err != nil { + if err := ensureSkillOptABRunRows(ctx, store, paths, runID, skillOptABItemID, templateID, champion, challenger, optionAToDelivery(optionA, optionB, champion), optionAToDelivery(optionA, optionB, challenger), options.prompt, skillOptABSource, skillOptABFeedbackSource); err != nil { log.Printf("skillopt ab judge: ensure run rows: %v", err) return } judgeSourceURL := skillOptABJudgeSourceURL(challenger.version.ID) - if err := upsertSkillOptABRankedEvent(ctx, store, runID, winnerLabel, loserLabel, skillOptABJudgeReviewer, skillOptABJudgeSource, judgeSourceURL); err != nil { + if err := upsertSkillOptABRankedEvent(ctx, store, runID, skillOptABItemID, winnerLabel, loserLabel, skillOptABJudgeReviewer, skillOptABJudgeSource, judgeSourceURL); err != nil { log.Printf("skillopt ab judge: record judge pick: %v", err) return } diff --git a/internal/cli/skillopt_pairwise.go b/internal/cli/skillopt_pairwise.go new file mode 100644 index 00000000..22e4b3f1 --- /dev/null +++ b/internal/cli/skillopt_pairwise.go @@ -0,0 +1,329 @@ +package cli + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "github.com/jerryfane/gitmoot/internal/config" + "github.com/jerryfane/gitmoot/internal/db" + "github.com/jerryfane/gitmoot/internal/skillopt" +) + +// Live-pairwise ingestion (#77b) source tags. They are DISTINCT from the +// single-prompt Mode B (#473) tags (skillopt-ab / preference_ab) so validation-set +// live-pairwise feedback is trivially separable from single-prompt A/B in the +// canonical ranked_feedback_events store and in the downstream export/optimizer. +const ( + skillOptPairwiseSource = "live-pairwise" + skillOptPairwiseFeedbackSource = "pairwise_valset" + skillOptPairwiseRunIDPrefix = "skillopt-pairwise:" + skillOptPairwiseDefaultReviewer = "human" + + // Conventional filenames #77a writes into the packet/artifact dir + // (gitmoot_skillopt/pairwise.py:write_pairwise_artifacts). The reviewer's + // picks file is added at review time. + pairwisePacketFileName = "pairwise-review.json" + pairwiseSecretMapFileName = "pairwise-secret-map.json" + pairwisePicksFileName = "pairwise-picks.json" +) + +func runSkillOptPairwise(args []string, stdout, stderr io.Writer) int { + if len(args) == 0 || args[0] == "-h" || args[0] == "--help" { + printSkillOptPairwiseUsage(stdout) + return 0 + } + switch args[0] { + case "import": + return runSkillOptPairwiseImport(args[1:], stdout, stderr) + default: + fmt.Fprintf(stderr, "unknown skillopt pairwise command %q\n\n", args[0]) + printSkillOptPairwiseUsage(stderr) + return 2 + } +} + +func printSkillOptPairwiseUsage(w io.Writer) { + fmt.Fprintln(w, "Usage:") + fmt.Fprintln(w, " gitmoot skillopt pairwise import [--packet path] [--secret-map path] [--picks path] [--reviewer name] [--home path] [--json]") + fmt.Fprintln(w, "") + fmt.Fprintln(w, "Ingest a REVIEWED #77a blinded live-pairwise packet on review close: the") + fmt.Fprintln(w, "per-item anonymized A/B options, the reviewer's per-item pick, and the SEPARATE") + fmt.Fprintln(w, "secret map. Each pick is UNBLINDED back to champion vs challenger via the secret") + fmt.Fprintln(w, "map ONLY, then written as a canonical RankedFeedbackEvent through the Mode B") + fmt.Fprintln(w, "recording path with source=live-pairwise (feedback_source=pairwise_valset).") + fmt.Fprintln(w, "Ingestion writes feedback ONLY: it NEVER promotes or auto-promotes. Re-importing") + fmt.Fprintln(w, "the same reviewed packet is idempotent (stable per-item conflict key). A") + fmt.Fprintln(w, "missing/garbage secret entry or missing pick is reported per item without") + fmt.Fprintln(w, "aborting the rest of the import.") +} + +type skillOptPairwiseImportOptions struct { + home string + packet string + secretMap string + picks string + reviewer string + asJSON bool +} + +func runSkillOptPairwiseImport(args []string, stdout, stderr io.Writer) int { + fs := flag.NewFlagSet("skillopt pairwise import", flag.ContinueOnError) + fs.SetOutput(stderr) + home := fs.String("home", "", "home directory to use instead of the current user's home") + packet := fs.String("packet", "", "blinded review packet JSON (defaults to /"+pairwisePacketFileName+")") + secretMap := fs.String("secret-map", "", "secret unblinding map JSON (defaults to /"+pairwiseSecretMapFileName+")") + picks := fs.String("picks", "", "reviewer picks JSON (defaults to /"+pairwisePicksFileName+")") + reviewer := fs.String("reviewer", skillOptPairwiseDefaultReviewer, "reviewer id recorded on each feedback event") + asJSON := fs.Bool("json", false, "emit a JSON summary instead of human-readable lines") + // Separate the leading packet-dir positional from flags. flag.Parse stops at + // the first non-flag, so collect positionals manually to allow the packet-dir + // to appear before or after flags. + positionals := []string{} + rest := []string{} + for i := 0; i < len(args); i++ { + arg := args[i] + if strings.HasPrefix(arg, "-") { + rest = append(rest, arg) + if !strings.Contains(arg, "=") && (arg == "--home" || arg == "--packet" || arg == "--secret-map" || arg == "--picks" || arg == "--reviewer") && i+1 < len(args) { + i++ + rest = append(rest, args[i]) + } + continue + } + positionals = append(positionals, arg) + } + if err := fs.Parse(rest); err != nil { + if err == flag.ErrHelp { + return 0 + } + return 2 + } + options := skillOptPairwiseImportOptions{ + home: strings.TrimSpace(*home), + packet: strings.TrimSpace(*packet), + secretMap: strings.TrimSpace(*secretMap), + picks: strings.TrimSpace(*picks), + reviewer: strings.TrimSpace(*reviewer), + asJSON: *asJSON, + } + if options.reviewer == "" { + options.reviewer = skillOptPairwiseDefaultReviewer + } + + // Resolve the three artifacts: a positional packet-dir supplies defaults, and + // the explicit flags override any of them. + if len(positionals) > 1 { + fmt.Fprintln(stderr, "skillopt pairwise import accepts at most one packet-dir positional") + return 2 + } + packetDir := "" + if len(positionals) == 1 { + packetDir = strings.TrimSpace(positionals[0]) + } + if options.packet == "" { + if packetDir == "" { + fmt.Fprintln(stderr, "skillopt pairwise import requires a packet-dir or --packet") + return 2 + } + options.packet = filepath.Join(packetDir, pairwisePacketFileName) + } + if options.secretMap == "" { + if packetDir == "" { + fmt.Fprintln(stderr, "skillopt pairwise import requires a packet-dir or --secret-map") + return 2 + } + options.secretMap = filepath.Join(packetDir, pairwiseSecretMapFileName) + } + if options.picks == "" { + if packetDir == "" { + fmt.Fprintln(stderr, "skillopt pairwise import requires a packet-dir or --picks") + return 2 + } + options.picks = filepath.Join(packetDir, pairwisePicksFileName) + } + + packetData, err := os.ReadFile(options.packet) + if err != nil { + fmt.Fprintf(stderr, "skillopt pairwise import: read packet: %v\n", err) + return 1 + } + secretData, err := os.ReadFile(options.secretMap) + if err != nil { + fmt.Fprintf(stderr, "skillopt pairwise import: read secret map: %v\n", err) + return 1 + } + picksData, err := os.ReadFile(options.picks) + if err != nil { + fmt.Fprintf(stderr, "skillopt pairwise import: read picks: %v\n", err) + return 1 + } + + reviewPacket, err := skillopt.ParsePairwiseReviewPacket(packetData) + if err != nil { + fmt.Fprintf(stderr, "skillopt pairwise import: %v\n", err) + return 1 + } + secret, err := skillopt.ParsePairwiseSecretMap(secretData) + if err != nil { + fmt.Fprintf(stderr, "skillopt pairwise import: %v\n", err) + return 1 + } + pairwisePicks, err := skillopt.ParsePairwisePicks(picksData) + if err != nil { + fmt.Fprintf(stderr, "skillopt pairwise import: %v\n", err) + return 1 + } + // The secret map and packet MUST describe the same run; mixing a packet with a + // foreign secret map would unblind against the wrong mapping. + if strings.TrimSpace(secret.RunID) != "" && strings.TrimSpace(secret.RunID) != strings.TrimSpace(reviewPacket.RunID) { + fmt.Fprintf(stderr, "skillopt pairwise import: secret map run_id %q does not match packet run_id %q\n", secret.RunID, reviewPacket.RunID) + return 1 + } + // The picks are the artifact that decides each winner, so they MUST also describe + // the same run as the packet; a foreign picks file whose items happen to share + // generic ids (item-1, …) would otherwise be joined by item_id alone and silently + // unblind the WRONG reviewer preferences for THIS run. Picks supplied as a bare + // map shape carry no run_id (RunID stays empty) and therefore skip this check. + if strings.TrimSpace(pairwisePicks.RunID) != "" && strings.TrimSpace(pairwisePicks.RunID) != strings.TrimSpace(reviewPacket.RunID) { + fmt.Fprintf(stderr, "skillopt pairwise import: picks run_id %q does not match packet run_id %q\n", pairwisePicks.RunID, reviewPacket.RunID) + return 1 + } + + exit := 0 + if err := withStoreAndPaths(options.home, func(paths config.Paths, store *db.Store) error { + exit = importPairwisePacket(context.Background(), store, paths, reviewPacket, secret, pairwisePicks, options, stdout, stderr) + return nil + }); err != nil { + fmt.Fprintf(stderr, "skillopt pairwise import: %v\n", err) + return 1 + } + return exit +} + +type pairwiseImportItemResult struct { + ItemID string `json:"item_id"` + Status string `json:"status"` // imported | skipped + Winner string `json:"winner,omitempty"` + Pick string `json:"pick,omitempty"` + Message string `json:"message,omitempty"` +} + +type pairwiseImportSummary struct { + RunID string `json:"run_id"` + Source string `json:"source"` + Reviewer string `json:"reviewer"` + Imported int `json:"imported"` + Skipped int `json:"skipped"` + Items []pairwiseImportItemResult `json:"items"` +} + +// importPairwisePacket unblinds every item and writes the resolved preference via +// the Mode B recording path with the distinct live-pairwise source. It is +// fail-safe per item: a missing/garbage secret entry, a missing pick, or a write +// failure for one item is reported in that item's result and does NOT abort the +// rest of the import. It NEVER promotes — only canonical feedback rows are +// written. The exit code is 0 when every item imported, 1 when any item was +// skipped (so an automated caller can detect a partial import) while still having +// persisted every good item. +func importPairwisePacket(ctx context.Context, store *db.Store, paths config.Paths, packet skillopt.PairwiseReviewPacket, secret skillopt.PairwiseSecretMap, picks skillopt.PairwisePicks, options skillOptPairwiseImportOptions, stdout, stderr io.Writer) int { + runID := skillOptPairwiseRunIDPrefix + strings.TrimSpace(packet.RunID) + templateID := strings.TrimSpace(packet.TemplateID) + championVersionID := strings.TrimSpace(packet.BaseVersionID) + // The candidate (challenger) is supplied to #77a as content, not necessarily a + // registered version; the secret map's candidate content hash is the stable + // challenger identifier for the run metadata. It is metadata only and is never + // used for the unblind. + challengerVersionID := firstNonEmpty(strings.TrimSpace(secret.CandidateContentHash), strings.TrimSpace(packet.RunID)+":candidate") + + champion := skillOptABVariant{version: db.AgentTemplateVersion{ID: championVersionID}, label: skillOptABChampionLabel} + challenger := skillOptABVariant{version: db.AgentTemplateVersion{ID: challengerVersionID}, label: skillOptABChallengerLabel} + + summary := pairwiseImportSummary{ + RunID: runID, + Source: skillOptPairwiseSource, + Reviewer: options.reviewer, + } + + for _, result := range skillopt.UnblindPairwisePacket(packet, secret, picks) { + itemResult := pairwiseImportItemResult{ItemID: result.ItemID, Pick: result.Pick} + if result.Err != nil { + itemResult.Status = "skipped" + itemResult.Message = result.Err.Error() + summary.Skipped++ + summary.Items = append(summary.Items, itemResult) + continue + } + + championDelivery := skillOptABDelivery{label: skillOptABChampionLabel, answer: result.ChampionResponse} + challengerDelivery := skillOptABDelivery{label: skillOptABChallengerLabel, answer: result.ChallengerResponse} + title := firstNonEmpty(result.Title, result.Prompt, result.ItemID) + + // Reuse the Mode B run-row scaffold (eval_run + per-item eval_review_item + + // champion/challenger eval_review_options backed by the answer blobs), tagged + // with the distinct live-pairwise source/feedback_source. + if err := ensureSkillOptABRunRows(ctx, store, paths, runID, result.ItemID, templateID, champion, challenger, championDelivery, challengerDelivery, title, skillOptPairwiseSource, skillOptPairwiseFeedbackSource); err != nil { + itemResult.Status = "skipped" + itemResult.Message = fmt.Sprintf("ensure run rows: %v", err) + summary.Skipped++ + summary.Items = append(summary.Items, itemResult) + continue + } + + // A STABLE per-item source_url makes re-import idempotent: the + // (run_id,item_id,reviewer,source,source_url) conflict key is identical on a + // re-import of the same reviewed packet, so the ranked event is upserted in + // place and never double-counted. + sourceURL := skillOptPairwiseSourceURL(packet.RunID, result.ItemID) + if err := upsertSkillOptABRankedEvent(ctx, store, runID, result.ItemID, result.WinnerLabel, result.LoserLabel, options.reviewer, skillOptPairwiseSource, sourceURL); err != nil { + itemResult.Status = "skipped" + itemResult.Message = fmt.Sprintf("record pick: %v", err) + summary.Skipped++ + summary.Items = append(summary.Items, itemResult) + continue + } + + itemResult.Status = "imported" + itemResult.Winner = result.WinnerLabel + summary.Imported++ + summary.Items = append(summary.Items, itemResult) + } + + if options.asJSON { + encoded, err := json.MarshalIndent(summary, "", " ") + if err != nil { + fmt.Fprintf(stderr, "skillopt pairwise import: %v\n", err) + return 1 + } + fmt.Fprintln(stdout, string(encoded)) + } else { + for _, item := range summary.Items { + if item.Status == "imported" { + fmt.Fprintf(stdout, "imported %s: %s (pick %s)\n", item.ItemID, item.Winner, item.Pick) + } else { + fmt.Fprintf(stdout, "skipped %s: %s\n", item.ItemID, item.Message) + } + } + fmt.Fprintf(stdout, "pairwise import complete: run %s, %d imported, %d skipped (source=%s)\n", runID, summary.Imported, summary.Skipped, skillOptPairwiseSource) + } + + if summary.Skipped > 0 { + return 1 + } + return 0 +} + +// skillOptPairwiseSourceURL mints the STABLE per-item SourceURL for the +// live-pairwise RankedFeedbackEvent. Unlike Mode B's monotonic per-pick token (a +// single prompt can be A/B'd repeatedly), one reviewed pairwise packet has exactly +// one preference per item, so the source_url is deterministic from (run_id, +// item_id). That determinism is precisely what makes re-importing the same packet +// idempotent: the conflict key resolves to the same row. +func skillOptPairwiseSourceURL(runID, itemID string) string { + return fmt.Sprintf("%s%s:%s", skillOptPairwiseRunIDPrefix, strings.TrimSpace(runID), strings.TrimSpace(itemID)) +} diff --git a/internal/cli/skillopt_pairwise_test.go b/internal/cli/skillopt_pairwise_test.go new file mode 100644 index 00000000..0b75f294 --- /dev/null +++ b/internal/cli/skillopt_pairwise_test.go @@ -0,0 +1,340 @@ +package cli + +import ( + "bytes" + "context" + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/jerryfane/gitmoot/internal/config" + "github.com/jerryfane/gitmoot/internal/db" + "github.com/jerryfane/gitmoot/internal/skillopt" +) + +// skillOptPairwiseHome installs a "planner" template (its current version is the +// promoted champion) so the import has a real template/version to reference, and +// returns the home, the store, and the champion version id. +func skillOptPairwiseHome(t *testing.T) (string, *db.Store, string) { + t.Helper() + home := t.TempDir() + paths := config.PathsForHome(home) + if err := config.Initialize(paths); err != nil { + t.Fatalf("Initialize: %v", err) + } + store, err := db.Open(paths.Database) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { store.Close() }) + ctx := context.Background() + if err := store.UpsertAgentTemplate(ctx, cliSkillOptTemplate("planner", "Champion guidance.")); err != nil { + t.Fatalf("UpsertAgentTemplate: %v", err) + } + tmpl, err := store.GetAgentTemplate(ctx, "planner") + if err != nil { + t.Fatalf("GetAgentTemplate: %v", err) + } + return home, store, tmpl.VersionID +} + +// writePairwisePacketDir writes a one-item blinded packet, the matching secret +// map, and a picks file into a temp dir. championIsA controls the A/B placement +// of the champion. It returns the packet dir. +func writePairwisePacketDir(t *testing.T, baseVersionID string, championIsA bool, pick string) string { + t.Helper() + championLabel, challengerLabel := "A", "B" + if !championIsA { + championLabel, challengerLabel = "B", "A" + } + var sideA, sideB string + if championIsA { + sideA, sideB = "CHAMPION-RESPONSE", "CHALLENGER-RESPONSE" + } else { + sideA, sideB = "CHALLENGER-RESPONSE", "CHAMPION-RESPONSE" + } + packet := skillopt.PairwiseReviewPacket{ + Kind: skillopt.PairwiseReviewPacketKind, + ContractVersion: skillopt.ContractVersion, + Mode: skillopt.PairwiseMode, + TemplateID: "planner", + BaseVersionID: baseVersionID, + RunID: "run-1", + Items: []skillopt.PairwisePacketItem{ + { + ItemID: "item-1", + Title: "Item One", + Prompt: "Plan the migration.", + Outputs: []skillopt.PairwisePacketSide{{Label: "A", Response: sideA}, {Label: "B", Response: sideB}}, + }, + }, + } + secret := skillopt.PairwiseSecretMap{ + Kind: skillopt.PairwiseSecretMapKind, + ContractVersion: skillopt.ContractVersion, + RunID: "run-1", + TemplateID: "planner", + ChampionRole: "promoted", + ChallengerRole: "candidate", + CandidateContentHash: "sha256:candidatehash", + Items: []skillopt.PairwiseSecretItem{ + { + ItemID: "item-1", + ChampionLabel: championLabel, + ChallengerLabel: challengerLabel, + Mapping: map[string]string{championLabel: "promoted", challengerLabel: "candidate"}, + }, + }, + } + picks := skillopt.PairwisePicks{ + Kind: skillopt.PairwisePicksKind, + RunID: "run-1", + Picks: []skillopt.PairwisePick{{ItemID: "item-1", Pick: pick}}, + } + dir := t.TempDir() + writePairwiseJSON(t, filepath.Join(dir, pairwisePacketFileName), packet) + writePairwiseJSON(t, filepath.Join(dir, pairwiseSecretMapFileName), secret) + writePairwiseJSON(t, filepath.Join(dir, pairwisePicksFileName), picks) + return dir +} + +func writePairwiseJSON(t *testing.T, path string, v any) { + t.Helper() + data, err := json.MarshalIndent(v, "", " ") + if err != nil { + t.Fatalf("marshal %s: %v", path, err) + } + if err := os.WriteFile(path, data, 0o644); err != nil { + t.Fatalf("write %s: %v", path, err) + } +} + +// TestRunSkillOptPairwiseImportUnblindOrientations is the consumer-side analogue +// of the unblind test: it runs the FULL CLI import for both A->champion and +// A->challenger orientations and asserts the persisted RankedFeedbackEvent names +// the correct winner. It would FAIL if the unblind were inverted. +func TestRunSkillOptPairwiseImportUnblindOrientations(t *testing.T) { + cases := []struct { + name string + championIsA bool + pick string + wantWinner string + }{ + {name: "A champion, pick A -> champion wins", championIsA: true, pick: "A", wantWinner: skillOptABChampionLabel}, + {name: "A champion, pick B -> challenger wins", championIsA: true, pick: "B", wantWinner: skillOptABChallengerLabel}, + {name: "A challenger, pick A -> challenger wins", championIsA: false, pick: "A", wantWinner: skillOptABChallengerLabel}, + {name: "A challenger, pick B -> champion wins", championIsA: false, pick: "B", wantWinner: skillOptABChampionLabel}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + home, store, championID := skillOptPairwiseHome(t) + dir := writePairwisePacketDir(t, championID, tc.championIsA, tc.pick) + var stdout, stderr bytes.Buffer + code := runSkillOptPairwise([]string{"import", dir, "--home", home}, &stdout, &stderr) + if code != 0 { + t.Fatalf("import exit = %d, stderr: %s", code, stderr.String()) + } + runID := skillOptPairwiseRunIDPrefix + "run-1" + events, err := store.ListRankedFeedbackEvents(context.Background(), runID) + if err != nil { + t.Fatalf("ListRankedFeedbackEvents: %v", err) + } + if len(events) != 1 { + t.Fatalf("events = %d, want 1", len(events)) + } + ev := events[0] + if ev.Source != skillOptPairwiseSource { + t.Fatalf("source = %q, want %q", ev.Source, skillOptPairwiseSource) + } + if ev.Winner != tc.wantWinner { + t.Fatalf("winner = %q, want %q", ev.Winner, tc.wantWinner) + } + if ev.ItemID != "item-1" { + t.Fatalf("item id = %q, want item-1", ev.ItemID) + } + }) + } +} + +// TestRunSkillOptPairwiseImportForeignPicksRunID asserts a picks file whose +// run_id names a DIFFERENT pairwise run is rejected before any unblind, even when +// the packet and secret map agree on the run and the items share generic ids. +// Picks are the artifact that decides each winner, so a silent item_id-only join +// against foreign preferences would unblind the wrong winners; the guard makes the +// import fail (exit 1) and persist nothing. +func TestRunSkillOptPairwiseImportForeignPicksRunID(t *testing.T) { + home, store, championID := skillOptPairwiseHome(t) + dir := writePairwisePacketDir(t, championID, true, "A") + // Overwrite the picks file so its run_id points at a foreign run while keeping + // the same generic item_id the packet/secret map use. + foreignPicks := skillopt.PairwisePicks{ + Kind: skillopt.PairwisePicksKind, + RunID: "run-OTHER", + Picks: []skillopt.PairwisePick{{ItemID: "item-1", Pick: "A"}}, + } + writePairwiseJSON(t, filepath.Join(dir, pairwisePicksFileName), foreignPicks) + + var stdout, stderr bytes.Buffer + code := runSkillOptPairwise([]string{"import", dir, "--home", home}, &stdout, &stderr) + if code != 1 { + t.Fatalf("foreign-picks import exit = %d, want 1 (rejected); stderr: %s", code, stderr.String()) + } + if !bytes.Contains(stderr.Bytes(), []byte("picks run_id")) { + t.Fatalf("stderr = %q, want a picks run_id mismatch error", stderr.String()) + } + events, err := store.ListRankedFeedbackEvents(context.Background(), skillOptPairwiseRunIDPrefix+"run-1") + if err != nil { + t.Fatalf("ListRankedFeedbackEvents: %v", err) + } + if len(events) != 0 { + t.Fatalf("events = %d, want 0: a foreign-picks import must persist nothing", len(events)) + } +} + +// TestRunSkillOptPairwiseImportIdempotent asserts re-importing the same reviewed +// packet does NOT double-count: the stable per-item source_url keeps the conflict +// key identical so the row is upserted in place. +func TestRunSkillOptPairwiseImportIdempotent(t *testing.T) { + home, store, championID := skillOptPairwiseHome(t) + dir := writePairwisePacketDir(t, championID, true, "A") + runID := skillOptPairwiseRunIDPrefix + "run-1" + for i := 0; i < 3; i++ { + var stdout, stderr bytes.Buffer + if code := runSkillOptPairwise([]string{"import", dir, "--home", home}, &stdout, &stderr); code != 0 { + t.Fatalf("import %d exit non-zero: %s", i, stderr.String()) + } + } + events, err := store.ListRankedFeedbackEvents(context.Background(), runID) + if err != nil { + t.Fatalf("ListRankedFeedbackEvents: %v", err) + } + if len(events) != 1 { + t.Fatalf("after 3 imports events = %d, want exactly 1 (no double count)", len(events)) + } +} + +// TestRunSkillOptPairwiseImportNoPromotion asserts ingestion writes feedback only +// and NEVER promotes: the template's current promoted version is unchanged and no +// pending candidate version was created or promoted. +func TestRunSkillOptPairwiseImportNoPromotion(t *testing.T) { + home, store, championID := skillOptPairwiseHome(t) + ctx := context.Background() + dir := writePairwisePacketDir(t, championID, true, "B") // challenger preferred + var stdout, stderr bytes.Buffer + if code := runSkillOptPairwise([]string{"import", dir, "--home", home}, &stdout, &stderr); code != 0 { + t.Fatalf("import exit non-zero: %s", stderr.String()) + } + tmpl, err := store.GetAgentTemplate(ctx, "planner") + if err != nil { + t.Fatalf("GetAgentTemplate: %v", err) + } + if tmpl.VersionID != championID { + t.Fatalf("current version changed to %q (want unchanged %q): import must not promote", tmpl.VersionID, championID) + } + pending, err := store.ListPendingAgentTemplateVersions(ctx, "planner") + if err != nil { + t.Fatalf("ListPendingAgentTemplateVersions: %v", err) + } + if len(pending) != 0 { + t.Fatalf("pending versions = %d, want 0: import must not create/promote versions", len(pending)) + } +} + +// TestRunSkillOptPairwiseImportPartialPacket asserts a packet with a good item +// plus an item lacking a pick and an item lacking a secret entry imports the good +// item and reports the others per item without aborting. The exit code is 1 +// (partial) but the good item's event is persisted. +func TestRunSkillOptPairwiseImportPartialPacket(t *testing.T) { + home, store, championID := skillOptPairwiseHome(t) + ctx := context.Background() + packet := skillopt.PairwiseReviewPacket{ + Kind: skillopt.PairwiseReviewPacketKind, + ContractVersion: skillopt.ContractVersion, + Mode: skillopt.PairwiseMode, + TemplateID: "planner", + BaseVersionID: championID, + RunID: "run-1", + Items: []skillopt.PairwisePacketItem{ + {ItemID: "good", Outputs: []skillopt.PairwisePacketSide{{Label: "A", Response: "champ"}, {Label: "B", Response: "chal"}}}, + {ItemID: "no-pick", Outputs: []skillopt.PairwisePacketSide{{Label: "A", Response: "x"}, {Label: "B", Response: "y"}}}, + {ItemID: "no-secret", Outputs: []skillopt.PairwisePacketSide{{Label: "A", Response: "x"}, {Label: "B", Response: "y"}}}, + }, + } + secret := skillopt.PairwiseSecretMap{ + Kind: skillopt.PairwiseSecretMapKind, + ContractVersion: skillopt.ContractVersion, + RunID: "run-1", + Items: []skillopt.PairwiseSecretItem{ + {ItemID: "good", ChampionLabel: "A", ChallengerLabel: "B", Mapping: map[string]string{"A": "promoted", "B": "candidate"}}, + {ItemID: "no-pick", ChampionLabel: "A", ChallengerLabel: "B", Mapping: map[string]string{"A": "promoted", "B": "candidate"}}, + }, + } + picks := skillopt.PairwisePicks{Picks: []skillopt.PairwisePick{{ItemID: "good", Pick: "A"}, {ItemID: "no-secret", Pick: "A"}}} + dir := t.TempDir() + writePairwiseJSON(t, filepath.Join(dir, pairwisePacketFileName), packet) + writePairwiseJSON(t, filepath.Join(dir, pairwiseSecretMapFileName), secret) + writePairwiseJSON(t, filepath.Join(dir, pairwisePicksFileName), picks) + + var stdout, stderr bytes.Buffer + code := runSkillOptPairwise([]string{"import", dir, "--home", home, "--json"}, &stdout, &stderr) + if code != 1 { + t.Fatalf("partial import exit = %d, want 1", code) + } + var summary pairwiseImportSummary + if err := json.Unmarshal(stdout.Bytes(), &summary); err != nil { + t.Fatalf("decode summary: %v\n%s", err, stdout.String()) + } + if summary.Imported != 1 || summary.Skipped != 2 { + t.Fatalf("summary imported=%d skipped=%d, want 1/2", summary.Imported, summary.Skipped) + } + runID := skillOptPairwiseRunIDPrefix + "run-1" + events, err := store.ListRankedFeedbackEvents(ctx, runID) + if err != nil { + t.Fatalf("ListRankedFeedbackEvents: %v", err) + } + if len(events) != 1 { + t.Fatalf("events = %d, want exactly 1 (only the good item)", len(events)) + } + if events[0].ItemID != "good" { + t.Fatalf("persisted item = %q, want good", events[0].ItemID) + } +} + +// TestRunSkillOptPairwiseImportDistinctSource asserts the recorded source/ +// feedback_source are the distinct live-pairwise tags, not the single-prompt Mode +// B tags, so val-set feedback is separable from single-prompt A/B. +func TestRunSkillOptPairwiseImportDistinctSource(t *testing.T) { + home, store, championID := skillOptPairwiseHome(t) + ctx := context.Background() + dir := writePairwisePacketDir(t, championID, true, "A") + var stdout, stderr bytes.Buffer + if code := runSkillOptPairwise([]string{"import", dir, "--home", home, "--reviewer", "alice"}, &stdout, &stderr); code != 0 { + t.Fatalf("import exit non-zero: %s", stderr.String()) + } + runID := skillOptPairwiseRunIDPrefix + "run-1" + events, err := store.ListRankedFeedbackEvents(ctx, runID) + if err != nil { + t.Fatalf("ListRankedFeedbackEvents: %v", err) + } + if len(events) != 1 { + t.Fatalf("events = %d, want 1", len(events)) + } + if events[0].Source != skillOptPairwiseSource { + t.Fatalf("source = %q, want %q (distinct from %q)", events[0].Source, skillOptPairwiseSource, skillOptABSource) + } + if events[0].Reviewer != "alice" { + t.Fatalf("reviewer = %q, want alice", events[0].Reviewer) + } + run, err := store.GetEvalRun(ctx, runID) + if err != nil { + t.Fatalf("GetEvalRun: %v", err) + } + var meta map[string]any + if err := json.Unmarshal([]byte(run.MetadataJSON), &meta); err != nil { + t.Fatalf("decode run metadata: %v", err) + } + if meta["feedback_source"] != skillOptPairwiseFeedbackSource { + t.Fatalf("run feedback_source = %v, want %q", meta["feedback_source"], skillOptPairwiseFeedbackSource) + } +} diff --git a/internal/skillopt/pairwise.go b/internal/skillopt/pairwise.go new file mode 100644 index 00000000..01931661 --- /dev/null +++ b/internal/skillopt/pairwise.go @@ -0,0 +1,380 @@ +package skillopt + +import ( + "encoding/json" + "errors" + "fmt" + "strings" +) + +// Live-pairwise (#77a/#77b) packet contract. These types decode the blinded +// paired review packet and the SEPARATE secret unblinding map produced by the +// gitmoot-skillopt fork's `live-pairwise` mode (gitmoot_skillopt/pairwise.py: +// build_pairwise_packet / write_pairwise_artifacts). The blinded packet is +// human-visible and carries only anonymized A/B options per item; the secret map +// is the ONLY artifact that records which anonymized side is the promoted +// champion vs the candidate challenger. Ingestion (#77b) is the review-close step +// that consumes BOTH plus the reviewer's per-item pick and unblinds it back to a +// champion/challenger preference. +// +// Everything here is additive to ContractVersion=1: it adds no field to any wire +// package the optimizer reads; it only decodes the fork's review artifacts. +const ( + // PairwiseReviewPacketKind / PairwiseSecretMapKind are the `kind` tags #77a + // stamps on the blinded packet and the secret map respectively. + PairwiseReviewPacketKind = "gitmoot-skillopt-pairwise-review-packet" + PairwiseSecretMapKind = "gitmoot-skillopt-pairwise-secret-map" + // PairwisePicksKind is the `kind` tag on the reviewer's recorded picks file. + PairwisePicksKind = "gitmoot-skillopt-pairwise-picks" + + // PairwiseMode is the #77a mode string. + PairwiseMode = "live-pairwise" + + // PairwiseChampionRole / PairwiseChallengerRole are the canonical role labels + // the unblind resolves to. They MATCH the eval_review_option role/label the + // Mode B recording path registers ("champion"/"challenger"), so an unblinded + // winner label is directly accepted by store.validateRankedFeedbackEventOptions. + PairwiseChampionRole = "champion" + PairwiseChallengerRole = "challenger" + + // pairwiseRolePromoted / pairwiseRoleCandidate are the secret-map mapping + // values (#77a writes mapping[label] -> "promoted"|"candidate"). Promoted is + // the champion; candidate is the challenger. + pairwiseRolePromoted = "promoted" + pairwiseRoleCandidate = "candidate" +) + +// PairwisePacketSide is one anonymized output (A or B) for an item in the blinded +// packet. It deliberately carries NO role-revealing field (the fork strips +// target_trace_path etc.); only the secret map knows which side is the champion. +type PairwisePacketSide struct { + Label string `json:"label"` + Response string `json:"response"` + Failed bool `json:"failed"` + FailReason string `json:"fail_reason,omitempty"` + TokenUsage json.RawMessage `json:"token_usage,omitempty"` +} + +// PairwisePacketItem is one reviewed item: the prompt plus the two anonymized +// outputs. +type PairwisePacketItem struct { + ItemID string `json:"item_id"` + Title string `json:"title,omitempty"` + Prompt string `json:"prompt,omitempty"` + Outputs []PairwisePacketSide `json:"outputs"` +} + +// PairwiseReviewPacket is the blinded, human-visible packet. +type PairwiseReviewPacket struct { + Kind string `json:"kind"` + ContractVersion int `json:"contract_version"` + Mode string `json:"mode"` + TemplateID string `json:"template_id"` + BaseVersionID string `json:"base_version_id"` + RunID string `json:"run_id"` + Items []PairwisePacketItem `json:"items"` +} + +// PairwiseSecretItem is the per-item unblinding record. The mapping is the sole +// source of truth for which anonymized label is the promoted champion. The +// trace-path fields are admin/debug only and are never written into feedback. +type PairwiseSecretItem struct { + ItemID string `json:"item_id"` + ChampionLabel string `json:"champion_label"` + ChallengerLabel string `json:"challenger_label"` + Mapping map[string]string `json:"mapping"` + PromotedTracePath string `json:"promoted_trace_path,omitempty"` + CandidateTracePath string `json:"candidate_trace_path,omitempty"` +} + +// PairwiseSecretMap is the SEPARATE secret unblinding artifact. +type PairwiseSecretMap struct { + Kind string `json:"kind"` + ContractVersion int `json:"contract_version"` + RunID string `json:"run_id"` + TemplateID string `json:"template_id"` + ChampionRole string `json:"champion_role"` + ChallengerRole string `json:"challenger_role"` + PromotedContentHash string `json:"promoted_content_hash,omitempty"` + CandidateContentHash string `json:"candidate_content_hash,omitempty"` + Items []PairwiseSecretItem `json:"items"` +} + +// PairwisePick is one reviewer preference: the anonymized label (A or B) the +// reviewer preferred for an item. The reviewer never sees the secret map, so the +// pick is recorded against the blinded label only. +type PairwisePick struct { + ItemID string `json:"item_id"` + Pick string `json:"pick"` +} + +// PairwisePicks is the reviewer's recorded picks over the blinded packet. +type PairwisePicks struct { + Kind string `json:"kind,omitempty"` + RunID string `json:"run_id,omitempty"` + Reviewer string `json:"reviewer,omitempty"` + Picks []PairwisePick `json:"picks"` +} + +// ParsePairwiseReviewPacket decodes and validates the blinded packet, enforcing +// the kind tag and ContractVersion so a stale or foreign payload is rejected. +func ParsePairwiseReviewPacket(data []byte) (PairwiseReviewPacket, error) { + if len(strings.TrimSpace(string(data))) == 0 { + return PairwiseReviewPacket{}, errors.New("pairwise review packet is empty") + } + var packet PairwiseReviewPacket + if err := json.Unmarshal(data, &packet); err != nil { + return PairwiseReviewPacket{}, fmt.Errorf("decode pairwise review packet: %w", err) + } + if packet.Kind != PairwiseReviewPacketKind { + return PairwiseReviewPacket{}, fmt.Errorf("pairwise review packet kind must be %q, got %q", PairwiseReviewPacketKind, packet.Kind) + } + if packet.ContractVersion != ContractVersion { + return PairwiseReviewPacket{}, fmt.Errorf("pairwise review packet contract_version must be %d, got %d", ContractVersion, packet.ContractVersion) + } + if strings.TrimSpace(packet.RunID) == "" { + return PairwiseReviewPacket{}, errors.New("pairwise review packet run_id is required") + } + if len(packet.Items) == 0 { + return PairwiseReviewPacket{}, errors.New("pairwise review packet has no items") + } + return packet, nil +} + +// ParsePairwiseSecretMap decodes and validates the secret map. +func ParsePairwiseSecretMap(data []byte) (PairwiseSecretMap, error) { + if len(strings.TrimSpace(string(data))) == 0 { + return PairwiseSecretMap{}, errors.New("pairwise secret map is empty") + } + var secret PairwiseSecretMap + if err := json.Unmarshal(data, &secret); err != nil { + return PairwiseSecretMap{}, fmt.Errorf("decode pairwise secret map: %w", err) + } + if secret.Kind != PairwiseSecretMapKind { + return PairwiseSecretMap{}, fmt.Errorf("pairwise secret map kind must be %q, got %q", PairwiseSecretMapKind, secret.Kind) + } + if secret.ContractVersion != ContractVersion { + return PairwiseSecretMap{}, fmt.Errorf("pairwise secret map contract_version must be %d, got %d", ContractVersion, secret.ContractVersion) + } + return secret, nil +} + +// ParsePairwisePicks decodes the reviewer's picks. The picks JSON accepts either +// an array of {item_id, pick} objects under "picks" OR a bare object map of +// item_id -> pick under "picks"; both shapes are tolerated so a reviewer can hand +// back whichever is convenient. +func ParsePairwisePicks(data []byte) (PairwisePicks, error) { + if len(strings.TrimSpace(string(data))) == 0 { + return PairwisePicks{}, errors.New("pairwise picks file is empty") + } + // First decode the envelope without the picks field so we can fall back to a + // map shape for picks. + var envelope struct { + Kind string `json:"kind"` + RunID string `json:"run_id"` + Reviewer string `json:"reviewer"` + Picks json.RawMessage `json:"picks"` + } + if err := json.Unmarshal(data, &envelope); err != nil { + return PairwisePicks{}, fmt.Errorf("decode pairwise picks: %w", err) + } + out := PairwisePicks{Kind: envelope.Kind, RunID: envelope.RunID, Reviewer: envelope.Reviewer} + raw := strings.TrimSpace(string(envelope.Picks)) + if raw == "" { + return PairwisePicks{}, errors.New("pairwise picks file has no picks") + } + switch raw[0] { + case '[': + var arr []PairwisePick + if err := json.Unmarshal(envelope.Picks, &arr); err != nil { + return PairwisePicks{}, fmt.Errorf("decode pairwise picks array: %w", err) + } + out.Picks = arr + case '{': + var byID map[string]string + if err := json.Unmarshal(envelope.Picks, &byID); err != nil { + return PairwisePicks{}, fmt.Errorf("decode pairwise picks map: %w", err) + } + for itemID, pick := range byID { + out.Picks = append(out.Picks, PairwisePick{ItemID: itemID, Pick: pick}) + } + default: + return PairwisePicks{}, errors.New("pairwise picks must be an array or object") + } + if len(out.Picks) == 0 { + return PairwisePicks{}, errors.New("pairwise picks file has no picks") + } + return out, nil +} + +// PairwiseUnblindResult is one item's resolved preference after unblinding. When +// Err is non-nil the item could not be resolved (missing/garbage secret entry, +// missing pick, missing output) and MUST be reported per item WITHOUT aborting +// the rest of the import — the other items remain importable. +type PairwiseUnblindResult struct { + ItemID string + Title string + Prompt string + Pick string // normalized anonymized label (A/B) the reviewer chose + WinnerLabel string // PairwiseChampionRole or PairwiseChallengerRole + LoserLabel string + ChampionResponse string + ChallengerResponse string + Err error +} + +// normalizePairwiseLabel upper-cases and trims an A/B label so picks and +// secret-map labels compare consistently regardless of case. +func normalizePairwiseLabel(label string) string { + return strings.ToUpper(strings.TrimSpace(label)) +} + +// roleForLabel resolves the canonical winner role (champion/challenger) for an +// anonymized label using ONLY the secret-map item. It prefers the explicit +// mapping (label -> promoted|candidate) and cross-checks champion_label / +// challenger_label; an inconsistency is an error rather than a guess so a +// corrupt/inverted secret entry can never silently flip the preference. This is +// the single load-bearing unblind: it must come solely from the secret map. +func roleForLabel(secret PairwiseSecretItem, label string) (string, error) { + label = normalizePairwiseLabel(label) + if label == "" { + return "", errors.New("empty pick label") + } + championLabel := normalizePairwiseLabel(secret.ChampionLabel) + challengerLabel := normalizePairwiseLabel(secret.ChallengerLabel) + + var roleFromMapping string + if len(secret.Mapping) > 0 { + // Mapping keys may be either case; build a normalized view. + for rawLabel, rawRole := range secret.Mapping { + if normalizePairwiseLabel(rawLabel) != label { + continue + } + switch strings.ToLower(strings.TrimSpace(rawRole)) { + case pairwiseRolePromoted: + roleFromMapping = PairwiseChampionRole + case pairwiseRoleCandidate: + roleFromMapping = PairwiseChallengerRole + default: + return "", fmt.Errorf("secret map label %q has unknown role %q", label, rawRole) + } + } + } + + var roleFromLabels string + switch label { + case championLabel: + if championLabel != "" { + roleFromLabels = PairwiseChampionRole + } + case challengerLabel: + if challengerLabel != "" { + roleFromLabels = PairwiseChallengerRole + } + } + // When the same label is both champion and challenger the secret map is + // corrupt and the unblind is undefined. + if championLabel != "" && championLabel == challengerLabel { + return "", fmt.Errorf("secret map champion and challenger share label %q", championLabel) + } + + switch { + case roleFromMapping != "" && roleFromLabels != "" && roleFromMapping != roleFromLabels: + return "", fmt.Errorf("secret map mapping and labels disagree for %q", label) + case roleFromMapping != "": + return roleFromMapping, nil + case roleFromLabels != "": + return roleFromLabels, nil + default: + return "", fmt.Errorf("secret map has no champion/challenger entry for label %q", label) + } +} + +// responseForLabel returns the packet output text for an anonymized label. +func responseForLabel(item PairwisePacketItem, label string) (string, bool) { + label = normalizePairwiseLabel(label) + for _, side := range item.Outputs { + if normalizePairwiseLabel(side.Label) == label { + return side.Response, true + } + } + return "", false +} + +// UnblindPairwisePacket joins the blinded packet, the secret map, and the +// reviewer's picks into one resolved preference per item. It is pure and total: +// per-item failures are captured in the result's Err (never panics, never returns +// a top-level error for a single bad item) so the caller can record the good +// items and report the bad ones individually. A result whose Err is nil carries a +// WinnerLabel of PairwiseChampionRole or PairwiseChallengerRole plus both option +// responses, ready for the Mode B recording path. +// +// The order of results follows the packet item order, so the output is +// deterministic. +func UnblindPairwisePacket(packet PairwiseReviewPacket, secret PairwiseSecretMap, picks PairwisePicks) []PairwiseUnblindResult { + secretByID := make(map[string]PairwiseSecretItem, len(secret.Items)) + for _, item := range secret.Items { + secretByID[strings.TrimSpace(item.ItemID)] = item + } + pickByID := make(map[string]string, len(picks.Picks)) + for _, pick := range picks.Picks { + pickByID[strings.TrimSpace(pick.ItemID)] = pick.Pick + } + + results := make([]PairwiseUnblindResult, 0, len(packet.Items)) + for _, item := range packet.Items { + itemID := strings.TrimSpace(item.ItemID) + result := PairwiseUnblindResult{ItemID: itemID, Title: item.Title, Prompt: item.Prompt} + + secretItem, ok := secretByID[itemID] + if !ok { + result.Err = fmt.Errorf("no secret map entry for item %q", itemID) + results = append(results, result) + continue + } + rawPick, ok := pickByID[itemID] + if !ok || strings.TrimSpace(rawPick) == "" { + result.Err = fmt.Errorf("no reviewer pick for item %q", itemID) + results = append(results, result) + continue + } + pick := normalizePairwiseLabel(rawPick) + result.Pick = pick + + winnerRole, err := roleForLabel(secretItem, pick) + if err != nil { + result.Err = fmt.Errorf("unblind item %q: %w", itemID, err) + results = append(results, result) + continue + } + result.WinnerLabel = winnerRole + if winnerRole == PairwiseChampionRole { + result.LoserLabel = PairwiseChallengerRole + } else { + result.LoserLabel = PairwiseChampionRole + } + + // Resolve both option responses from the blinded packet via the secret + // map's champion/challenger labels. We need both labels present and both + // outputs available so the recorded eval_review_options carry the correct + // answer text under the correct role. + championLabel := normalizePairwiseLabel(secretItem.ChampionLabel) + challengerLabel := normalizePairwiseLabel(secretItem.ChallengerLabel) + if championLabel == "" || challengerLabel == "" { + result.Err = fmt.Errorf("secret map item %q missing champion/challenger label", itemID) + results = append(results, result) + continue + } + championResponse, okChampion := responseForLabel(item, championLabel) + challengerResponse, okChallenger := responseForLabel(item, challengerLabel) + if !okChampion || !okChallenger { + result.Err = fmt.Errorf("item %q is missing an output for the champion or challenger side", itemID) + results = append(results, result) + continue + } + result.ChampionResponse = championResponse + result.ChallengerResponse = challengerResponse + results = append(results, result) + } + return results +} diff --git a/internal/skillopt/pairwise_test.go b/internal/skillopt/pairwise_test.go new file mode 100644 index 00000000..9e375464 --- /dev/null +++ b/internal/skillopt/pairwise_test.go @@ -0,0 +1,233 @@ +package skillopt + +import ( + "encoding/json" + "strings" + "testing" +) + +// pairwisePacketFixture builds a one-item blinded packet, the matching secret +// map, and a picks file with the given orientation. championIsA controls which +// anonymized label the champion (promoted) sits behind, so a single fixture +// exercises both A->champion and A->challenger orientations. +func pairwisePacketFixture(championIsA bool, pick string) (PairwiseReviewPacket, PairwiseSecretMap, PairwisePicks) { + championLabel, challengerLabel := "A", "B" + if !championIsA { + championLabel, challengerLabel = "B", "A" + } + // Outputs are always presented as A then B; the response text is keyed to the + // role so we can assert the unblind routed the correct answer to each role. + championResponse := "CHAMPION-RESPONSE" + challengerResponse := "CHALLENGER-RESPONSE" + var sideAResponse, sideBResponse string + if championIsA { + sideAResponse, sideBResponse = championResponse, challengerResponse + } else { + sideAResponse, sideBResponse = challengerResponse, championResponse + } + + packet := PairwiseReviewPacket{ + Kind: PairwiseReviewPacketKind, + ContractVersion: ContractVersion, + Mode: PairwiseMode, + TemplateID: "planner", + BaseVersionID: "planner@v1", + RunID: "run-1", + Items: []PairwisePacketItem{ + { + ItemID: "item-1", + Title: "Item One", + Prompt: "Plan the migration.", + Outputs: []PairwisePacketSide{ + {Label: "A", Response: sideAResponse}, + {Label: "B", Response: sideBResponse}, + }, + }, + }, + } + secret := PairwiseSecretMap{ + Kind: PairwiseSecretMapKind, + ContractVersion: ContractVersion, + RunID: "run-1", + TemplateID: "planner", + ChampionRole: pairwiseRolePromoted, + ChallengerRole: pairwiseRoleCandidate, + Items: []PairwiseSecretItem{ + { + ItemID: "item-1", + ChampionLabel: championLabel, + ChallengerLabel: challengerLabel, + Mapping: map[string]string{ + championLabel: pairwiseRolePromoted, + challengerLabel: pairwiseRoleCandidate, + }, + }, + }, + } + picks := PairwisePicks{Picks: []PairwisePick{{ItemID: "item-1", Pick: pick}}} + return packet, secret, picks +} + +// TestUnblindPairwisePacketOrientations is the core correctness test. It pins the +// unblind for BOTH orientations and would FAIL if the secret-map mapping were +// inverted: when A is the champion, a pick of A must resolve to champion; when A +// is the challenger, a pick of A must resolve to challenger. The response routing +// is asserted too so a side-swap is caught. +func TestUnblindPairwisePacketOrientations(t *testing.T) { + cases := []struct { + name string + championIsA bool + pick string + wantWinner string + wantLoser string + }{ + {name: "A is champion, pick A -> champion", championIsA: true, pick: "A", wantWinner: PairwiseChampionRole, wantLoser: PairwiseChallengerRole}, + {name: "A is champion, pick B -> challenger", championIsA: true, pick: "B", wantWinner: PairwiseChallengerRole, wantLoser: PairwiseChampionRole}, + {name: "A is challenger, pick A -> challenger", championIsA: false, pick: "A", wantWinner: PairwiseChallengerRole, wantLoser: PairwiseChampionRole}, + {name: "A is challenger, pick B -> champion", championIsA: false, pick: "B", wantWinner: PairwiseChampionRole, wantLoser: PairwiseChallengerRole}, + {name: "lowercase pick is normalized", championIsA: true, pick: "a", wantWinner: PairwiseChampionRole, wantLoser: PairwiseChallengerRole}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + packet, secret, picks := pairwisePacketFixture(tc.championIsA, tc.pick) + results := UnblindPairwisePacket(packet, secret, picks) + if len(results) != 1 { + t.Fatalf("results = %d, want 1", len(results)) + } + r := results[0] + if r.Err != nil { + t.Fatalf("unexpected per-item error: %v", r.Err) + } + if r.WinnerLabel != tc.wantWinner { + t.Fatalf("winner = %q, want %q", r.WinnerLabel, tc.wantWinner) + } + if r.LoserLabel != tc.wantLoser { + t.Fatalf("loser = %q, want %q", r.LoserLabel, tc.wantLoser) + } + // The champion option must always carry the champion response and the + // challenger option the challenger response, regardless of A/B placement. + if r.ChampionResponse != "CHAMPION-RESPONSE" { + t.Fatalf("champion response = %q, want CHAMPION-RESPONSE", r.ChampionResponse) + } + if r.ChallengerResponse != "CHALLENGER-RESPONSE" { + t.Fatalf("challenger response = %q, want CHALLENGER-RESPONSE", r.ChallengerResponse) + } + }) + } +} + +// TestUnblindPairwisePacketDetectsInvertedSecretMap proves the unblind comes +// SOLELY from the secret map: if we feed an INVERTED secret map (mapping says A is +// the candidate while the labels say A is the champion), the unblind must refuse +// rather than silently pick a side — the single scariest bug (a backwards +// preference fed to the optimizer). This test fails if the mapping/label +// cross-check is removed. +func TestUnblindPairwisePacketDetectsInvertedSecretMap(t *testing.T) { + packet, secret, picks := pairwisePacketFixture(true, "A") + // Corrupt only the mapping so it disagrees with champion_label/challenger_label. + secret.Items[0].Mapping = map[string]string{ + "A": pairwiseRoleCandidate, // labels say A is champion; mapping says candidate + "B": pairwiseRolePromoted, + } + results := UnblindPairwisePacket(packet, secret, picks) + if len(results) != 1 { + t.Fatalf("results = %d, want 1", len(results)) + } + if results[0].Err == nil { + t.Fatalf("expected an error for an inverted secret map, got winner=%q", results[0].WinnerLabel) + } + if !strings.Contains(results[0].Err.Error(), "disagree") { + t.Fatalf("error = %v, want a mapping/label disagreement", results[0].Err) + } +} + +// TestUnblindPairwisePacketPerItemFailures asserts a missing secret entry, a +// missing pick, and a missing output are each reported PER ITEM without aborting +// the good items. +func TestUnblindPairwisePacketPerItemFailures(t *testing.T) { + packet := PairwiseReviewPacket{ + Kind: PairwiseReviewPacketKind, + ContractVersion: ContractVersion, + Mode: PairwiseMode, + TemplateID: "planner", + RunID: "run-1", + Items: []PairwisePacketItem{ + {ItemID: "good", Outputs: []PairwisePacketSide{{Label: "A", Response: "champ"}, {Label: "B", Response: "chal"}}}, + {ItemID: "no-secret", Outputs: []PairwisePacketSide{{Label: "A", Response: "x"}, {Label: "B", Response: "y"}}}, + {ItemID: "no-pick", Outputs: []PairwisePacketSide{{Label: "A", Response: "x"}, {Label: "B", Response: "y"}}}, + {ItemID: "missing-output", Outputs: []PairwisePacketSide{{Label: "A", Response: "x"}}}, + }, + } + secret := PairwiseSecretMap{ + Kind: PairwiseSecretMapKind, + ContractVersion: ContractVersion, + RunID: "run-1", + Items: []PairwiseSecretItem{ + {ItemID: "good", ChampionLabel: "A", ChallengerLabel: "B", Mapping: map[string]string{"A": pairwiseRolePromoted, "B": pairwiseRoleCandidate}}, + {ItemID: "no-pick", ChampionLabel: "A", ChallengerLabel: "B", Mapping: map[string]string{"A": pairwiseRolePromoted, "B": pairwiseRoleCandidate}}, + {ItemID: "missing-output", ChampionLabel: "B", ChallengerLabel: "A", Mapping: map[string]string{"B": pairwiseRolePromoted, "A": pairwiseRoleCandidate}}, + }, + } + picks := PairwisePicks{Picks: []PairwisePick{ + {ItemID: "good", Pick: "A"}, + {ItemID: "no-secret", Pick: "A"}, + {ItemID: "missing-output", Pick: "B"}, + }} + + results := UnblindPairwisePacket(packet, secret, picks) + if len(results) != 4 { + t.Fatalf("results = %d, want 4 (one per packet item)", len(results)) + } + byID := map[string]PairwiseUnblindResult{} + for _, r := range results { + byID[r.ItemID] = r + } + if r := byID["good"]; r.Err != nil || r.WinnerLabel != PairwiseChampionRole { + t.Fatalf("good item: err=%v winner=%q", r.Err, r.WinnerLabel) + } + if byID["no-secret"].Err == nil { + t.Fatalf("no-secret item should have errored") + } + if byID["no-pick"].Err == nil { + t.Fatalf("no-pick item should have errored") + } + if byID["missing-output"].Err == nil { + t.Fatalf("missing-output item should have errored") + } +} + +// TestParsePairwisePicksShapes covers both the array and object map picks shapes. +func TestParsePairwisePicksShapes(t *testing.T) { + arr := []byte(`{"kind":"gitmoot-skillopt-pairwise-picks","run_id":"run-1","picks":[{"item_id":"a","pick":"A"},{"item_id":"b","pick":"B"}]}`) + got, err := ParsePairwisePicks(arr) + if err != nil { + t.Fatalf("array picks: %v", err) + } + if len(got.Picks) != 2 { + t.Fatalf("array picks len = %d, want 2", len(got.Picks)) + } + + obj := []byte(`{"picks":{"a":"A","b":"B"}}`) + got, err = ParsePairwisePicks(obj) + if err != nil { + t.Fatalf("map picks: %v", err) + } + if len(got.Picks) != 2 { + t.Fatalf("map picks len = %d, want 2", len(got.Picks)) + } +} + +// TestParsePairwiseReviewPacketRejectsWrongContract guards the additive +// contract: a foreign kind or a non-1 contract_version is rejected. +func TestParsePairwiseReviewPacketRejectsWrongContract(t *testing.T) { + bad := PairwiseReviewPacket{Kind: "something-else", ContractVersion: ContractVersion, RunID: "r", Items: []PairwisePacketItem{{ItemID: "x"}}} + data, _ := json.Marshal(bad) + if _, err := ParsePairwiseReviewPacket(data); err == nil { + t.Fatalf("expected a kind mismatch error") + } + badVersion := PairwiseReviewPacket{Kind: PairwiseReviewPacketKind, ContractVersion: 2, RunID: "r", Items: []PairwisePacketItem{{ItemID: "x"}}} + data, _ = json.Marshal(badVersion) + if _, err := ParsePairwiseReviewPacket(data); err == nil { + t.Fatalf("expected a contract_version mismatch error") + } +}