From 9c88a23eaf2985a4b2febca4391ff85551f11f9c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 29 Apr 2026 03:38:31 +0200 Subject: [PATCH] fix(core): normalize rubric grader name to rubrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The criteria: shorthand path (buildLlmGraderFromCriteria) returned name: 'rubric' while the explicit type: rubrics assertion path (generateAssertionName) returned 'rubrics'. Both code paths were written in the same PR — the singular form was an oversight. Co-Authored-By: Claude Sonnet 4.6 --- .../features/rubric/evals/dataset.eval.baseline.jsonl | 8 ++++---- packages/core/src/evaluation/loaders/grader-parser.ts | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/features/rubric/evals/dataset.eval.baseline.jsonl b/examples/features/rubric/evals/dataset.eval.baseline.jsonl index 7e293be26..7f5c4240c 100644 --- a/examples/features/rubric/evals/dataset.eval.baseline.jsonl +++ b/examples/features/rubric/evals/dataset.eval.baseline.jsonl @@ -1,5 +1,5 @@ -{"timestamp":"2026-02-20T21:40:12.230Z","test_id":"code-quality-multi-eval","suite":"dataset","score":0.75,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":0.5,"weight":1,"verdict":"fail","assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation."},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]},{"name":"python_syntax","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Python syntax is valid","passed":true,"evidence":"Code compiled successfully"}]}],"assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"rubric: The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation. | python_syntax: Code compiled successfully"},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"Python syntax is valid","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]} -{"timestamp":"2026-02-20T21:40:13.903Z","test_id":"code-explanation-simple","suite":"dataset","score":1,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"The candidate answer satisfies all three rubric criteria. It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]}],"assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"rubric: The candidate answer satisfies all three rubric criteria. It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]} +{"timestamp":"2026-02-20T21:40:12.230Z","test_id":"code-quality-multi-eval","suite":"dataset","score":0.75,"target":"default","scores":[{"name":"rubrics","type":"llm-grader","score":0.5,"weight":1,"verdict":"fail","assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation."},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]},{"name":"python_syntax","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Python syntax is valid","passed":true,"evidence":"Code compiled successfully"}]}],"assertions":[{"text":"[rubric-1] Uses regular expressions for email validation: The function uses regular expressions via the re module to validate the email address against a regex pattern.","passed":true,"evidence":"rubric: The candidate answer uses regular expressions effectively and includes a docstring, but lacks type hints and does not handle edge cases like None or empty strings. Thus, it partially meets the criteria but is missing some important requirements, notably robustness and explicit type annotation. | python_syntax: Code compiled successfully"},{"text":"[rubric-3] Has docstring documentation: A docstring is present, briefly describing what the function does.","passed":true},{"text":"Python syntax is valid","passed":true},{"text":"[rubric-2] Includes type hints: The function does not include any type hints (such as 'email: str' or '-> bool') in its definition.","passed":false},{"text":"[rubric-4] Handles edge cases (None, empty string): The function does not explicitly handle edge cases such as None or empty string inputs; passing None would raise an exception.","passed":false}]} +{"timestamp":"2026-02-20T21:40:13.903Z","test_id":"code-explanation-simple","suite":"dataset","score":1,"target":"default","scores":[{"name":"rubrics","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"The candidate answer satisfies all three rubric criteria. It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]}],"assertions":[{"text":"[rubric-1] Mentions divide-and-conquer approach: The candidate clearly states that quicksort uses a 'divide-and-conquer strategy' and repeatedly references recursion and dividing the array, which satisfies the requirement.","passed":true,"evidence":"rubric: The candidate answer satisfies all three rubric criteria. It clearly describes the divide-and-conquer approach, offers a detailed explanation of the partition step, and accurately presents the time complexity, including both average and worst case scenarios."},{"text":"[rubric-2] Explains the partition step: The answer thoroughly explains the partition step, describing how elements less than the pivot are moved before it, those greater after, and how the pivot ends up in its sorted position.","passed":true},{"text":"[rubric-3] States time complexity correctly: The answer states both average and worst case time complexity explicitly as O(n log n) and O(n^2), matching the reference answer and rubric requirements.","passed":true}]} {"timestamp":"2026-02-20T21:40:14.527Z","test_id":"summary-task","suite":"dataset","score":1,"target":"default","assertions":[{"text":"Mentions faster-than-expected climate change","passed":true,"evidence":"The candidate_answer concisely covers all key points: accelerating climate change, Arctic melt, sea rise, extreme weather, and the scientific call to action, matching the reference answer in both content and tone."},{"text":"Notes rapid Arctic ice melt","passed":true},{"text":"Includes rising sea levels and extreme weather","passed":true},{"text":"Calls out urgent need for emissions cuts and renewables","passed":true}]} -{"timestamp":"2026-02-20T21:40:18.010Z","test_id":"summary-multi-criteria-score-ranges-proposed","suite":"dataset","score":0.9666666666666667,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":0.9666666666666667,"weight":1,"verdict":"pass","assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. There are no factual errors or distortions.","passed":true,"evidence":"The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]}],"assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. There are no factual errors or distortions.","passed":true,"evidence":"rubric: The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]} -{"timestamp":"2026-02-20T21:40:18.450Z","test_id":"technical-writing-detailed","suite":"dataset","score":1,"target":"default","scores":[{"name":"rubric","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]}],"assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"rubric: The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]} +{"timestamp":"2026-02-20T21:40:18.010Z","test_id":"summary-multi-criteria-score-ranges-proposed","suite":"dataset","score":0.9666666666666667,"target":"default","scores":[{"name":"rubrics","type":"llm-grader","score":0.9666666666666667,"weight":1,"verdict":"pass","assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. There are no factual errors or distortions.","passed":true,"evidence":"The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]}],"assertions":[{"text":"[factual_accuracy] factual_accuracy - Score: 10/10 (Fully accurate, captures all key points with no distortions.): The candidate summary accurately reflects all major points in the article: accelerating climate change, Arctic ice melting, rising sea levels, more extreme weather, and scientists\u2019 call for urgent emissions reductions and renewable energy transition. There are no factual errors or distortions.","passed":true,"evidence":"rubric: The candidate answer is accurate, complete, and concise, closely matching the reference summary and meeting the requirements for both factual accuracy and brevity."},{"text":"[brevity_and_clarity] brevity_and_clarity - Score: 9/10 (Under 50 words, clear and concise.): The summary is under 50 words and is concise and clear. The phrasing is strong, though not exceptionally elegant; there is slight room for improved conciseness, but overall it is well formulated.","passed":true}]} +{"timestamp":"2026-02-20T21:40:18.450Z","test_id":"technical-writing-detailed","suite":"dataset","score":1,"target":"default","scores":[{"name":"rubrics","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]}],"assertions":[{"text":"[structure] Has clear headings and organization: The answer uses clear headings, sections, and a summary table. The organization is logical and easy to follow.","passed":true,"evidence":"rubric: The candidate's answer presents a thorough and well-structured guide, covering all major status code classes with explanations and example scenarios. It meets all rubric requirements and provides more detail and context than the reference answer."},{"text":"[success-codes] Covers 2xx success codes with examples: The guide covers several 2xx codes (200, 201, 204) and provides explanations and example use cases in the summary table and descriptions.","passed":true},{"text":"[client-errors] Explains 4xx client error codes: Multiple 4xx codes (400, 401, 403, 404, 405) are explained with both basic descriptions and summary of use cases.","passed":true},{"text":"[server-errors] Explains 5xx server error codes: The guide details several 5xx errors (500, 502, 503, 504) with descriptions and includes a table summarizing typical uses.","passed":true},{"text":"[practical-examples] Includes practical use case examples: Practical examples are included both in descriptions (e.g., when to use 404, 301, 401, etc.) and in the summary table mapping codes to scenarios.","passed":true}]} diff --git a/packages/core/src/evaluation/loaders/grader-parser.ts b/packages/core/src/evaluation/loaders/grader-parser.ts index 42db53e4c..7bdd9d01a 100644 --- a/packages/core/src/evaluation/loaders/grader-parser.ts +++ b/packages/core/src/evaluation/loaders/grader-parser.ts @@ -2304,7 +2304,7 @@ export function parseInlineRubrics( } return { - name: 'rubric', + name: 'rubrics', type: 'llm-grader', rubrics: rubricItems, };