From bfe798502a5752afb902bb91e4b0229c29d855ef Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 9 Jun 2026 07:33:55 +0000
Subject: [PATCH 1/2] Initial plan

From 908f270861a509eff6c774a69e309e1fd8b008ca Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 9 Jun 2026 07:37:43 +0000
Subject: [PATCH 2/2] Author eval suite for azure-cost-estimator skill

---
 .github/evals/azure-cost-estimator/eval.yaml  | 48 +++++++++++++++++++
 .../tasks/negative-billing-history.yaml       | 15 ++++++
 .../tasks/negative-off-topic.yaml             | 15 ++++++
 .../tasks/positive-arm-template-estimate.yaml | 48 +++++++++++++++++++
 .../tasks/positive-sku-compare.yaml           | 42 ++++++++++++++++
 .github/evals/manifest.yaml                   |  2 +
 6 files changed, 170 insertions(+)
 create mode 100644 .github/evals/azure-cost-estimator/eval.yaml
 create mode 100644 .github/evals/azure-cost-estimator/tasks/negative-billing-history.yaml
 create mode 100644 .github/evals/azure-cost-estimator/tasks/negative-off-topic.yaml
 create mode 100644 .github/evals/azure-cost-estimator/tasks/positive-arm-template-estimate.yaml
 create mode 100644 .github/evals/azure-cost-estimator/tasks/positive-sku-compare.yaml

diff --git a/.github/evals/azure-cost-estimator/eval.yaml b/.github/evals/azure-cost-estimator/eval.yaml
new file mode 100644
index 0000000..22c217a
--- /dev/null
+++ b/.github/evals/azure-cost-estimator/eval.yaml
@@ -0,0 +1,48 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/eval.schema.json
+
+# Expanded-tier evaluation suite for the azure-cost-estimator skill.
+# Validates trigger precision via the heuristic `trigger` grader plus
+# per-positive-task answer_quality LLM judge.
+#
+# Run: waza run .github/evals/azure-cost-estimator/eval.yaml
+
+name: azure-cost-estimator-eval
+description: Trigger precision + answer quality for azure-cost-estimator (Azure Retail Prices API).
+skill: azure-cost-estimator
+version: "0.1"
+
+config:
+  # Two trials catch obvious LLM nondeterminism flakes (a single trial gives
+  # no flake signal). Pilot tier bumps to 3 via /skill-promote.
+  trials_per_task: 2
+  timeout_seconds: 60
+  parallel: false
+  executor: copilot-sdk
+  model: claude-sonnet-4.6
+
+metrics:
+  - name: trigger_precision
+    weight: 1.0
+    threshold: 0.6
+    description: Skill should activate on Azure cost / pricing estimation prompts and stay quiet on unrelated or billing-history prompts.
+
+graders:
+  # Budget grader: azure-cost-estimator drives a handful of unauthenticated
+  # REST queries; flag any leg that explodes in tool calls or runs long.
+  - type: behavior
+    name: budget
+    config:
+      max_tool_calls: 30
+      max_duration_ms: 240000  # NOTE(review): > timeout_seconds (60 s); confirm this limit is reachable
+
+  # answer_quality (LLM-as-judge) is scoped per-task on positive tasks
+  # only (see tasks/positive-*.yaml). Keeps judge-model errors from
+  # zeroing out the negative-task trigger check in the same leg.
+  #
+  # Do NOT add `skill_invocation` with `required_skills:` here — eval-level
+  # prompt graders fire on EVERY task (including negatives) and produce
+  # deterministic 0.0 noise across all models (removed in commit 2f699c79
+  # from git-ape-onboarding for this reason).
+
+tasks:
+  - "tasks/*.yaml"
diff --git a/.github/evals/azure-cost-estimator/tasks/negative-billing-history.yaml b/.github/evals/azure-cost-estimator/tasks/negative-billing-history.yaml
new file mode 100644
index 0000000..5cc0dfd
--- /dev/null
+++ b/.github/evals/azure-cost-estimator/tasks/negative-billing-history.yaml
@@ -0,0 +1,15 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json
+
+id: negative-billing-history
+name: Negative — Historical billing / Cost Management invoice query
+description: Querying actual billed usage from Azure Cost Management belongs to a billing/cost-analysis workflow, not the retail-prices-based estimator.
+tags: [trigger, negative, mutable-by-skill]
+inputs:
+  prompt: "Pull my actual Azure spend for last month broken down by resource group from Cost Management — I need the real invoiced numbers, not an estimate."
+graders:
+  - name: trigger_relevance_negative
+    type: trigger
+    config:
+      skill_path: .github/skills/azure-cost-estimator/SKILL.md
+      mode: negative
+      threshold: 0.5
diff --git a/.github/evals/azure-cost-estimator/tasks/negative-off-topic.yaml b/.github/evals/azure-cost-estimator/tasks/negative-off-topic.yaml
new file mode 100644
index 0000000..1a9d92c
--- /dev/null
+++ b/.github/evals/azure-cost-estimator/tasks/negative-off-topic.yaml
@@ -0,0 +1,15 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json
+
+id: negative-off-topic
+name: Negative — Off-topic prompt (Linux kernel scheduling)
+description: Off-topic prompt clearly outside Azure cost estimation should not trigger this skill.
+tags: [trigger, negative, off-topic, mutable-by-skill]
+inputs:
+  prompt: "Explain how the Linux Completely Fair Scheduler (CFS) picks the next task to run, and how vruntime is recomputed when a task wakes from sleep."
+graders:
+  - name: trigger_relevance_negative
+    type: trigger
+    config:
+      skill_path: .github/skills/azure-cost-estimator/SKILL.md
+      mode: negative
+      threshold: 0.5
diff --git a/.github/evals/azure-cost-estimator/tasks/positive-arm-template-estimate.yaml b/.github/evals/azure-cost-estimator/tasks/positive-arm-template-estimate.yaml
new file mode 100644
index 0000000..16ba474
--- /dev/null
+++ b/.github/evals/azure-cost-estimator/tasks/positive-arm-template-estimate.yaml
@@ -0,0 +1,48 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json
+
+id: positive-arm-template-estimate
+name: Positive — Monthly cost estimate from an ARM template
+description: Skill should be invoked when the user asks for a monthly cost estimate of resources defined in an ARM template.
+# `mutable-by-skill` — score reflects SKILL.md (trigger + answer_quality
+# graders read from .github/skills/azure-cost-estimator/SKILL.md).
+tags: [trigger, positive, mutable-by-skill]
+inputs:
+  prompt: "I have an ARM template at .azure/deployments/deploy-20260506-001/template.json that provisions a Standard_B1ls Linux VM with a 30 GB Standard_LRS OS disk and a static public IP in southeastasia. Roughly how much will this cost me per month?"
+graders:
+  - name: trigger_relevance_positive
+    type: trigger
+    config:
+      skill_path: .github/skills/azure-cost-estimator/SKILL.md
+      mode: positive
+      threshold: 0.5
+
+  # answer_quality (LLM-as-judge): scoped per-task on positives so a flaky
+  # judge call only zeroes out this task, not the whole leg.
+  # IMPORTANT: `continue_session: true` is mandatory — without it the judge
+  # has zero access to the agent's response and scores oscillate.
+  - type: prompt
+    name: answer_quality
+    config:
+      continue_session: true
+      prompt: |
+        You are grading the assistant's previous response in this session.
+        The user asked for a monthly cost estimate of an ARM template
+        that provisions a Standard_B1ls Linux VM, a 30 GB Standard_LRS
+        OS disk, and a static public IP in southeastasia.
+
+        PASS criteria — the response must contain ALL of:
+        1. Names the **Azure Retail Prices API**
+           (`https://prices.azure.com/api/retail/prices`) as the source
+           of pricing — NOT hardcoded numbers or the Pricing Calculator
+           web UI.
+        2. Mentions converting `1 Hour` unit prices to monthly by
+           multiplying by **730** (hours per month).
+        3. Calls out at least one of the free-tier / no-charge items
+           relevant to this template (e.g., VNet / NSG / NIC are free,
+           or notes bandwidth egress free tier).
+        4. Produces or commits to producing a per-resource cost
+           breakdown with a monthly total (table, list, or JSON) —
+           does NOT just hand-wave a single rough total.
+
+        If ALL four criteria are met, call `set_waza_grade_pass`.
+        Otherwise, call `set_waza_grade_fail` and list which criteria are missing.
diff --git a/.github/evals/azure-cost-estimator/tasks/positive-sku-compare.yaml b/.github/evals/azure-cost-estimator/tasks/positive-sku-compare.yaml
new file mode 100644
index 0000000..171615a
--- /dev/null
+++ b/.github/evals/azure-cost-estimator/tasks/positive-sku-compare.yaml
@@ -0,0 +1,42 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json
+
+id: positive-sku-compare
+name: Positive — Compare monthly cost of two VM SKUs
+description: Skill should be invoked when the user asks to compare retail monthly cost between two Azure SKUs in a specific region.
+# See positive-arm-template-estimate.yaml for `mutable-by-*` tag semantics.
+tags: [trigger, positive, mutable-by-skill]
+inputs:
+  prompt: "How much more per month would a Standard_B2s Linux VM cost me compared to a Standard_B1ls in eastus, at retail pay-as-you-go pricing? Show your work."
+graders:
+  - name: trigger_relevance_positive
+    type: trigger
+    config:
+      skill_path: .github/skills/azure-cost-estimator/SKILL.md
+      mode: positive
+      threshold: 0.5
+
+  - type: prompt
+    name: answer_quality
+    config:
+      continue_session: true
+      prompt: |
+        You are grading the assistant's previous response in this session.
+        The user asked to compare the monthly retail cost of
+        `Standard_B2s` vs `Standard_B1ls` Linux VMs in `eastus`.
+
+        PASS criteria — the response must contain ALL of:
+        1. Names the **Azure Retail Prices API**
+           (`https://prices.azure.com/api/retail/prices`) as the
+           pricing source — NOT hardcoded numbers fabricated from
+           memory and NOT the Pricing Calculator web UI.
+        2. Constructs (or shows it would construct) an OData
+           `$filter` query that targets `serviceName eq 'Virtual Machines'`
+           with `armRegionName eq 'eastus'` and the relevant `armSkuName`
+           values, including `priceType eq 'Consumption'`.
+        3. Multiplies the per-hour unit price by **730** to derive a
+           monthly figure for each SKU.
+        4. Reports a per-SKU monthly cost AND the delta (B2s − B1ls)
+           between them — not just a single number.
+
+        If ALL four criteria are met, call `set_waza_grade_pass`.
+        Otherwise, call `set_waza_grade_fail` and list which criteria are missing.
diff --git a/.github/evals/manifest.yaml b/.github/evals/manifest.yaml
index 86f5b36..9ca57a6 100644
--- a/.github/evals/manifest.yaml
+++ b/.github/evals/manifest.yaml
@@ -34,6 +34,8 @@ skills:
     tier: expanded
   - name: azure-stack-destroy
     tier: expanded
+  - name: azure-cost-estimator
+    tier: expanded
 # Per-tier model fan-out. The matrix runs each selected skill against every
 # model in its tier. To compare additional models, add them here.
 #