Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 33 additions & 12 deletions apps/studio/src/components/RunEvalModal.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,14 @@ import {
useEvalDiscover,
useEvalRunStatus,
useEvalTargets,
useStudioConfig,
} from '~/lib/api';
import type { RunEvalRequest } from '~/lib/types';
import {
buildRunEvalRequest,
getDefaultThresholdInputValue,
getThresholdFieldValue,
} from './run-eval-threshold';

// ── Props ────────────────────────────────────────────────────────────────

Expand All @@ -49,6 +55,7 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod
const [testIds, setTestIds] = useState<string[]>(prefill?.testIds ?? []);
const [target, setTarget] = useState(prefill?.target ?? '');
const [threshold, setThreshold] = useState('');
const [thresholdEdited, setThresholdEdited] = useState(false);
const [workers, setWorkers] = useState('');
const [dryRun, setDryRun] = useState(false);
const [showAdvanced, setShowAdvanced] = useState(false);
Expand All @@ -63,9 +70,18 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod
const { data: discoverData } = useEvalDiscover(benchmarkId);
const { data: targetsData } = useEvalTargets(benchmarkId);
const { data: runStatus } = useEvalRunStatus(activeRunId);
const { data: studioConfig } = useStudioConfig(benchmarkId);

const evalFiles = useMemo(() => discoverData?.eval_files ?? [], [discoverData]);
const targetNames = useMemo(() => targetsData?.targets ?? [], [targetsData]);
const defaultThresholdInput = useMemo(
() => getDefaultThresholdInputValue(threshold, studioConfig?.threshold),
[studioConfig?.threshold, threshold],
);
const thresholdFieldValue = useMemo(
() => getThresholdFieldValue(threshold, thresholdEdited, studioConfig?.threshold),
[studioConfig?.threshold, threshold, thresholdEdited],
);

// Reset form when opening with new prefill
useEffect(() => {
Expand All @@ -75,6 +91,7 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod
setTarget(prefill?.target ?? '');
setTestIdInput('');
setThreshold('');
setThresholdEdited(false);
setWorkers('');
setDryRun(false);
setShowAdvanced(false);
Expand All @@ -95,15 +112,16 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod

// Build request body from form state
const buildRequest = useCallback((): RunEvalRequest => {
const req: RunEvalRequest = {};
if (suiteFilter.trim()) req.suite_filter = suiteFilter.trim();
if (testIds.length > 0) req.test_ids = testIds;
if (target) req.target = target;
if (threshold) req.threshold = Number.parseFloat(threshold);
if (workers) req.workers = Number.parseInt(workers, 10);
if (dryRun) req.dry_run = true;
return req;
}, [suiteFilter, testIds, target, threshold, workers, dryRun]);
return buildRunEvalRequest({
suiteFilter,
testIds,
target,
thresholdInput: threshold,
studioThreshold: studioConfig?.threshold,
workers,
dryRun,
});
}, [dryRun, studioConfig?.threshold, suiteFilter, target, testIds, threshold, workers]);

// Update CLI preview when form changes
useEffect(() => {
Expand Down Expand Up @@ -297,12 +315,15 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod
<input
id="threshold-input"
type="number"
value={threshold}
onChange={(e) => setThreshold(e.target.value)}
value={thresholdFieldValue}
onChange={(e) => {
setThresholdEdited(true);
setThreshold(e.target.value);
}}
min="0"
max="1"
step="0.1"
placeholder="0.8"
placeholder={defaultThresholdInput}
className="w-full rounded-md border border-gray-700 bg-gray-800 px-3 py-1.5 text-sm text-white placeholder-gray-500 focus:border-cyan-600 focus:outline-none"
/>
</div>
Expand Down
67 changes: 67 additions & 0 deletions apps/studio/src/components/run-eval-threshold.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import { describe, expect, it } from 'bun:test';

import {
buildRunEvalRequest,
getDefaultThresholdInputValue,
getThresholdFieldValue,
} from './run-eval-threshold';

// Covers the precedence exercised below: non-empty input text is returned as-is,
// a blank input yields the studio threshold, and '0.8' is expected when neither is set.
describe('getDefaultThresholdInputValue', () => {
  it('uses the configured studio threshold when the modal threshold is blank', () => {
    const resolved = getDefaultThresholdInputValue('', 0.75);

    expect(resolved).toBe('0.75');
  });

  it('falls back to the CLI default when no studio threshold is configured', () => {
    const resolved = getDefaultThresholdInputValue('', undefined);

    expect(resolved).toBe('0.8');
  });

  it('preserves a per-run override when the user edits the threshold', () => {
    const resolved = getDefaultThresholdInputValue('0.9', 0.75);

    expect(resolved).toBe('0.9');
  });
});

// Both cases share every option except the threshold input, so the common
// options live in one place and each case only states what it varies.
describe('buildRunEvalRequest', () => {
  const sharedOptions = {
    suiteFilter: 'evals/**/*.eval.yaml',
    testIds: [],
    target: '',
    studioThreshold: 0.75,
    workers: '',
    dryRun: false,
  };

  it('submits the studio threshold when the modal threshold input is left blank', () => {
    const request = buildRunEvalRequest({ ...sharedOptions, thresholdInput: '' });

    expect(request).toEqual({
      suite_filter: 'evals/**/*.eval.yaml',
      threshold: 0.75,
    });
  });

  it('submits a per-run threshold override when the user changes the field', () => {
    const request = buildRunEvalRequest({ ...sharedOptions, thresholdInput: '0.9' });

    expect(request).toEqual({
      suite_filter: 'evals/**/*.eval.yaml',
      threshold: 0.9,
    });
  });
});

// The `thresholdEdited` flag decides whether a blank input shows the default
// (untouched field) or stays blank (user cleared it).
describe('getThresholdFieldValue', () => {
  it('shows the default threshold before the user edits the field', () => {
    const shown = getThresholdFieldValue('', false, 0.75);

    expect(shown).toBe('0.75');
  });

  it('lets the user clear the field while editing', () => {
    const shown = getThresholdFieldValue('', true, 0.75);

    expect(shown).toBe('');
  });
});
59 changes: 59 additions & 0 deletions apps/studio/src/components/run-eval-threshold.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import { DEFAULT_PASS_THRESHOLD } from '~/lib/api';
import type { RunEvalRequest } from '~/lib/types';

/** Form state consumed by `buildRunEvalRequest` to assemble a `RunEvalRequest`. */
interface BuildRunEvalRequestOptions {
/** Suite filter text; trimmed and sent as `suite_filter` only when non-empty after trimming. */
suiteFilter: string;
/** Selected test ids; sent as `test_ids` only when the array is non-empty. */
testIds: string[];
/** Target name; sent as `target` only when non-empty. */
target: string;
/** Raw text of the threshold input; an empty string means "use the default". */
thresholdInput: string;
/** Threshold used when `thresholdInput` is empty; `DEFAULT_PASS_THRESHOLD` applies when this is also unset. */
studioThreshold?: number;
/** Raw text of the workers input; parsed as a base-10 integer when non-empty. */
workers: string;
/** When true, the request carries `dry_run: true`; otherwise the field is omitted. */
dryRun: boolean;
}

/**
 * Resolves the string to render as the value of the threshold input.
 *
 * @param thresholdInput - Current raw text held for the threshold field.
 * @param thresholdEdited - Whether the user has changed the field.
 * @param studioThreshold - Optional configured threshold used as the default.
 * @returns `thresholdInput` verbatim once the field has been edited (so an
 *   empty string stays empty); otherwise the result of
 *   `getDefaultThresholdInputValue(thresholdInput, studioThreshold)`.
 */
export function getThresholdFieldValue(
  thresholdInput: string,
  thresholdEdited: boolean,
  studioThreshold?: number,
): string {
  // After an edit the user's text is authoritative, including a cleared field.
  return thresholdEdited
    ? thresholdInput
    : getDefaultThresholdInputValue(thresholdInput, studioThreshold);
}

/**
 * Resolves the effective threshold text for the run-eval form.
 *
 * @param thresholdInput - Raw text of the threshold field.
 * @param studioThreshold - Optional configured threshold.
 * @returns `thresholdInput` when it is non-empty; otherwise the string form of
 *   `studioThreshold`, or of `DEFAULT_PASS_THRESHOLD` when `studioThreshold`
 *   is `undefined` or `null`.
 */
export function getDefaultThresholdInputValue(
  thresholdInput: string,
  studioThreshold?: number,
): string {
  // `||` is deliberate: an empty string must fall through to the default,
  // while `??` keeps a configured threshold of 0 from being replaced.
  return thresholdInput || String(studioThreshold ?? DEFAULT_PASS_THRESHOLD);
}

/**
 * Builds the `RunEvalRequest` body from the run-eval form state.
 *
 * Optional fields are only set when they carry a usable value:
 * - `suite_filter`: the trimmed `suiteFilter`, when non-empty.
 * - `test_ids`: `testIds`, when the array is non-empty.
 * - `target`: `target`, when non-empty.
 * - `threshold`: the parsed `thresholdInput`, or, when that is empty, the
 *   default resolved by `getDefaultThresholdInputValue`.
 * - `workers`: `workers` parsed as a base-10 integer, when non-empty.
 * - `dry_run`: `true` when `dryRun` is set.
 *
 * Numeric fields that do not parse to a finite number are omitted instead of
 * being sent as `NaN`/`Infinity`, which `JSON.stringify` would turn into `null`.
 *
 * @returns A request object containing only the fields described above.
 */
export function buildRunEvalRequest({
  suiteFilter,
  testIds,
  target,
  thresholdInput,
  studioThreshold,
  workers,
  dryRun,
}: BuildRunEvalRequestOptions): RunEvalRequest {
  const req: RunEvalRequest = {};

  const trimmedSuiteFilter = suiteFilter.trim();
  if (trimmedSuiteFilter) req.suite_filter = trimmedSuiteFilter;
  if (testIds.length > 0) req.test_ids = testIds;
  if (target) req.target = target;

  // The resolved text is never empty (it falls back to a stringified number),
  // so the only failure mode left to guard against is an unparsable value.
  const threshold = Number.parseFloat(
    getDefaultThresholdInputValue(thresholdInput, studioThreshold),
  );
  if (Number.isFinite(threshold)) req.threshold = threshold;

  if (workers) {
    const workerCount = Number.parseInt(workers, 10);
    if (Number.isFinite(workerCount)) req.workers = workerCount;
  }

  if (dryRun) req.dry_run = true;

  return req;
}
4 changes: 2 additions & 2 deletions apps/studio/src/lib/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -222,8 +222,8 @@ export function useCategorySuites(runId: string, category: string) {
return useQuery(categorySuitesOptions(runId, category));
}

export function useStudioConfig() {
return useQuery(studioConfigOptions);
/**
 * Runs `useQuery` with the config query options for the given scope.
 *
 * @param benchmarkId - Optional benchmark id. When truthy, the query options
 *   come from `benchmarkConfigOptions(benchmarkId)`; when omitted (or an empty
 *   string), the shared `studioConfigOptions` are used instead.
 * @returns Whatever `useQuery` returns for the selected options.
 */
export function useStudioConfig(benchmarkId?: string) {
return useQuery(benchmarkId ? benchmarkConfigOptions(benchmarkId) : studioConfigOptions);
}

export function useRemoteStatus(benchmarkId?: string) {
Expand Down
Loading