diff --git a/apps/studio/src/components/RunEvalModal.tsx b/apps/studio/src/components/RunEvalModal.tsx index 9c281a9d..add95c3a 100644 --- a/apps/studio/src/components/RunEvalModal.tsx +++ b/apps/studio/src/components/RunEvalModal.tsx @@ -21,8 +21,14 @@ import { useEvalDiscover, useEvalRunStatus, useEvalTargets, + useStudioConfig, } from '~/lib/api'; import type { RunEvalRequest } from '~/lib/types'; +import { + buildRunEvalRequest, + getDefaultThresholdInputValue, + getThresholdFieldValue, +} from './run-eval-threshold'; // ── Props ──────────────────────────────────────────────────────────────── @@ -49,6 +55,7 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod const [testIds, setTestIds] = useState(prefill?.testIds ?? []); const [target, setTarget] = useState(prefill?.target ?? ''); const [threshold, setThreshold] = useState(''); + const [thresholdEdited, setThresholdEdited] = useState(false); const [workers, setWorkers] = useState(''); const [dryRun, setDryRun] = useState(false); const [showAdvanced, setShowAdvanced] = useState(false); @@ -63,9 +70,18 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod const { data: discoverData } = useEvalDiscover(benchmarkId); const { data: targetsData } = useEvalTargets(benchmarkId); const { data: runStatus } = useEvalRunStatus(activeRunId); + const { data: studioConfig } = useStudioConfig(benchmarkId); const evalFiles = useMemo(() => discoverData?.eval_files ?? [], [discoverData]); const targetNames = useMemo(() => targetsData?.targets ?? 
[], [targetsData]); + const defaultThresholdInput = useMemo( + () => getDefaultThresholdInputValue(threshold, studioConfig?.threshold), + [studioConfig?.threshold, threshold], + ); + const thresholdFieldValue = useMemo( + () => getThresholdFieldValue(threshold, thresholdEdited, studioConfig?.threshold), + [studioConfig?.threshold, threshold, thresholdEdited], + ); // Reset form when opening with new prefill useEffect(() => { @@ -75,6 +91,7 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod setTarget(prefill?.target ?? ''); setTestIdInput(''); setThreshold(''); + setThresholdEdited(false); setWorkers(''); setDryRun(false); setShowAdvanced(false); @@ -95,15 +112,16 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod // Build request body from form state const buildRequest = useCallback((): RunEvalRequest => { - const req: RunEvalRequest = {}; - if (suiteFilter.trim()) req.suite_filter = suiteFilter.trim(); - if (testIds.length > 0) req.test_ids = testIds; - if (target) req.target = target; - if (threshold) req.threshold = Number.parseFloat(threshold); - if (workers) req.workers = Number.parseInt(workers, 10); - if (dryRun) req.dry_run = true; - return req; - }, [suiteFilter, testIds, target, threshold, workers, dryRun]); + return buildRunEvalRequest({ + suiteFilter, + testIds, + target, + thresholdInput: threshold, + studioThreshold: studioConfig?.threshold, + workers, + dryRun, + }); + }, [dryRun, studioConfig?.threshold, suiteFilter, target, testIds, threshold, workers]); // Update CLI preview when form changes useEffect(() => { @@ -297,12 +315,15 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod setThreshold(e.target.value)} + value={thresholdFieldValue} + onChange={(e) => { + setThresholdEdited(true); + setThreshold(e.target.value); + }} min="0" max="1" step="0.1" - placeholder="0.8" + placeholder={defaultThresholdInput} className="w-full rounded-md border 
border-gray-700 bg-gray-800 px-3 py-1.5 text-sm text-white placeholder-gray-500 focus:border-cyan-600 focus:outline-none" /> diff --git a/apps/studio/src/components/run-eval-threshold.test.ts b/apps/studio/src/components/run-eval-threshold.test.ts new file mode 100644 index 00000000..3eab4d8c --- /dev/null +++ b/apps/studio/src/components/run-eval-threshold.test.ts @@ -0,0 +1,67 @@ +import { describe, expect, it } from 'bun:test'; + +import { + buildRunEvalRequest, + getDefaultThresholdInputValue, + getThresholdFieldValue, +} from './run-eval-threshold'; + +describe('getDefaultThresholdInputValue', () => { + it('uses the configured studio threshold when the modal threshold is blank', () => { + expect(getDefaultThresholdInputValue('', 0.75)).toBe('0.75'); + }); + + it('falls back to the CLI default when no studio threshold is configured', () => { + expect(getDefaultThresholdInputValue('', undefined)).toBe('0.8'); + }); + + it('preserves a per-run override when the user edits the threshold', () => { + expect(getDefaultThresholdInputValue('0.9', 0.75)).toBe('0.9'); + }); +}); + +describe('buildRunEvalRequest', () => { + it('submits the studio threshold when the modal threshold input is left blank', () => { + expect( + buildRunEvalRequest({ + suiteFilter: 'evals/**/*.eval.yaml', + testIds: [], + target: '', + thresholdInput: '', + studioThreshold: 0.75, + workers: '', + dryRun: false, + }), + ).toEqual({ + suite_filter: 'evals/**/*.eval.yaml', + threshold: 0.75, + }); + }); + + it('submits a per-run threshold override when the user changes the field', () => { + expect( + buildRunEvalRequest({ + suiteFilter: 'evals/**/*.eval.yaml', + testIds: [], + target: '', + thresholdInput: '0.9', + studioThreshold: 0.75, + workers: '', + dryRun: false, + }), + ).toEqual({ + suite_filter: 'evals/**/*.eval.yaml', + threshold: 0.9, + }); + }); +}); + +describe('getThresholdFieldValue', () => { + it('shows the default threshold before the user edits the field', () => { + 
// Threshold helpers for the Run Eval modal (run-eval-threshold.ts).
// Relies on file-level imports: DEFAULT_PASS_THRESHOLD from '~/lib/api' and
// the RunEvalRequest type from '~/lib/types'.

/** Raw form state consumed by {@link buildRunEvalRequest}. */
interface BuildRunEvalRequestOptions {
  /** Suite filter text; surrounding whitespace is trimmed before use. */
  suiteFilter: string;
  /** Explicit test ids to run; omitted from the request when empty. */
  testIds: string[];
  /** Target name; omitted from the request when empty. */
  target: string;
  /** Raw text of the threshold input; empty when the user typed nothing. */
  thresholdInput: string;
  /** Threshold taken from the loaded studio config, if any. */
  studioThreshold?: number;
  /** Raw text of the workers input; empty when the user typed nothing. */
  workers: string;
  /** Whether the run was requested as a dry run. */
  dryRun: boolean;
}

/**
 * Value to render inside the threshold input.
 *
 * Until the user has touched the field it shows the effective default, so the
 * configured threshold is visible. Once edited, the raw text is returned
 * verbatim so the user can clear the field without it snapping back.
 *
 * @param thresholdInput Raw text currently held in form state.
 * @param thresholdEdited Whether the user has changed the field.
 * @param studioThreshold Threshold from the studio config, if any.
 * @returns The string to bind to the input's `value`.
 */
export function getThresholdFieldValue(
  thresholdInput: string,
  thresholdEdited: boolean,
  studioThreshold?: number,
): string {
  return thresholdEdited
    ? thresholdInput
    : getDefaultThresholdInputValue(thresholdInput, studioThreshold);
}

/**
 * Effective threshold text: the user's input when non-empty, otherwise the
 * studio threshold, otherwise `DEFAULT_PASS_THRESHOLD`.
 *
 * @param thresholdInput Raw text currently held in form state.
 * @param studioThreshold Threshold from the studio config, if any.
 * @returns The threshold as display text (never empty).
 */
export function getDefaultThresholdInputValue(
  thresholdInput: string,
  studioThreshold?: number,
): string {
  if (thresholdInput) {
    return thresholdInput;
  }

  return String(studioThreshold ?? DEFAULT_PASS_THRESHOLD);
}

/** Parses `text` with `Number.parseFloat`, yielding `undefined` for NaN/Infinity. */
function parseFiniteFloat(text: string): number | undefined {
  const parsed = Number.parseFloat(text);
  return Number.isFinite(parsed) ? parsed : undefined;
}

/**
 * Builds the run-eval request body from the modal's form state.
 *
 * Empty fields are omitted. The threshold is always resolved: a parseable
 * per-run override wins, then the studio threshold, then
 * `DEFAULT_PASS_THRESHOLD`. Text that does not parse to a finite number
 * (for example whitespace or letters) is treated as "no override" instead of
 * being sent as NaN, which JSON would serialise as `null`. Likewise `workers`
 * is only sent when it parses to an integer.
 *
 * @returns A request object containing only the fields that carry a value.
 */
export function buildRunEvalRequest({
  suiteFilter,
  testIds,
  target,
  thresholdInput,
  studioThreshold,
  workers,
  dryRun,
}: BuildRunEvalRequestOptions): RunEvalRequest {
  const req: RunEvalRequest = {};

  const trimmedFilter = suiteFilter.trim();
  if (trimmedFilter) req.suite_filter = trimmedFilter;
  if (testIds.length > 0) req.test_ids = testIds;
  if (target) req.target = target;

  // Resolve numerically; no need to round-trip the config value through a string.
  const threshold =
    parseFiniteFloat(thresholdInput) ?? studioThreshold ?? DEFAULT_PASS_THRESHOLD;
  if (Number.isFinite(threshold)) req.threshold = threshold;

  // parseInt yields NaN for empty or non-numeric text; isInteger rejects it.
  const workerCount = Number.parseInt(workers, 10);
  if (Number.isInteger(workerCount)) req.workers = workerCount;

  if (dryRun) req.dry_run = true;

  return req;
}