SummerOneTwo · SummerOneTwo · Apr 29, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -88,9 +88,9 @@ AutoCode/
 ├── statements/         # 题面
 │   └── README.md
 └── tests/              # 生成的测试数据
-    ├── 01.in
-    ├── 01.ans
-    └── ...
+    ├── 01.in
+    ├── 01.ans / 01.out（由 answer_ext 控制）
+    └── ...
 ```
 
 ## 出题工作流程
@@ -102,7 +102,7 @@ AutoCode/
 5. 构建生成器 (`generator_build`)
 6. 运行压力测试 (`stress_test_run`, completed_rounds == total_rounds)
 7. 按需构建检查器 (`checker_build`, accuracy >= 0.9)
-8. 生成测试数据（`problem_generate_tests`, generated_test_count > 0，且最终 extreme/tle 至少占一半；候选不足时尽量满足）
+8. 生成测试数据（`problem_generate_tests`, generated_test_count > 0，支持 `answer_ext`；最终 extreme/tle 至少占一半；候选不足时尽量满足；长任务中断可 `resume=true` 续跑）
 9. 验证测试数据 (`problem_verify_tests`, passed)
 10. 打包 Polygon (`problem_pack_polygon`)
 

diff --git a/README.md b/README.md
@@ -246,7 +246,7 @@ AutoCode 提供 15 个原子工具，分为 7 组。所有工具返回统一格
 | 工具 | 描述 | 关键参数 |
 |------|------|----------|
 | `problem_create` | 初始化题目目录 | `problem_dir`, `problem_name` |
-| `problem_generate_tests` | 生成最终测试数据（最终数据集中 extreme/tle 至少占一半，候选不足时尽量满足） | `problem_dir`, `test_count` |
+| `problem_generate_tests` | 生成最终测试数据（最终数据集中 extreme/tle 至少占一半，候选不足时尽量满足） | `problem_dir`, `test_count`, `answer_ext`, `resume`, `hard_timeout_seconds` |
 | `problem_verify_tests` | 验证测试数据质量（含 extreme/tle 占比硬校验） | `problem_dir`, `tests_dir`, `verify_types` |
 | `problem_pack_polygon` | 打包为 Polygon 格式 | `problem_dir`, `time_limit`, `memory_limit` |
 
@@ -375,11 +375,13 @@ All 1000 rounds passed
 ```python
 problem_generate_tests(
     problem_dir="problems/ab",
-    test_count=50
+    test_count=50,
+    answer_ext=".out",      # 可选，默认 .ans
+    hard_timeout_seconds=600
 )
 ```
 
-说明：最终写入的测试中，`extreme`（type=3）与 `tle`（type=4）合计不少于一半；若候选里极限类不足，则会在可用候选范围内尽量满足并返回对应统计字段。
+说明：最终写入的测试中，`extreme`（type=3）与 `tle`（type=4）合计不少于一半；若候选里极限类不足，则会在可用候选范围内尽量满足并返回对应统计字段。若长任务被中断，可使用 `resume=true` 从 checkpoint 续跑。
 
 ### 步骤 7：打包为 Polygon 格式
 
@@ -499,7 +501,7 @@ problems/your-problem/
 │   └── README.md       # 题目描述
 ├── tests/
 │   ├── 01.in           # 测试输入
-│   ├── 01.ans          # 期望输出
+│   ├── 01.ans/.out     # 期望输出（由 answer_ext 控制）
 │   └── ...
 └── problem.xml         # Polygon 配置
 ```

diff --git a/agents/autocode-workflow.md b/agents/autocode-workflow.md
@@ -25,6 +25,8 @@ Always work through this sequence unless the task is explicitly outside problem
 
 When the user asks for a later step directly, explain which prerequisite step is missing and complete the missing work first.
 
-When running `problem_generate_tests`, enforce test quality: final test data should contain at least half limit-oriented cases (`type=3` extreme + `type=4` tle) when candidate availability allows.
+When running `problem_generate_tests`, enforce test quality: at least half of the final test data should be limit-oriented cases (`type=3` extreme + `type=4` tle) when candidate availability allows. Also enforce that the generator logic for type=3 and type=4 is semantically different (type=4 should include targeted worst-case patterns, not only max-parameter scaling).
+
+For long-running `problem_generate_tests` calls, warn that new user messages can interrupt MCP execution. If interrupted, prefer resuming from the checkpoint (`resume=true`) rather than restarting from scratch.
 
 Treat hook feedback as authoritative. If a hook denies a tool call, fix the workflow gap instead of retrying the same call.
diff --git a/pyproject.toml b/pyproject.toml
@@ -80,6 +80,7 @@ exclude_lines = [
 
 [dependency-groups]
 dev = [
+    "twine>=6.2.0",
     "types-psutil>=7.2.2.20260402",
     "types-pywin32>=311.0.0.20260402",
     "types-pyyaml>=6.0.12.20250915",

diff --git a/scripts/workflow_guard.py b/scripts/workflow_guard.py
@@ -272,6 +272,8 @@ def session_start() -> int:
         "problem_validate(validation_passed) -> "
         "problem_generate_tests(generated_test_count > 0, and prefer >=50% type3/type4 in final tests when candidates are sufficient) -> "
         "problem_verify_tests(passed) -> problem_pack_polygon. "
+        "When running long problem_generate_tests tasks, avoid sending new chat messages because that can interrupt MCP calls; if interrupted, resume with checkpoint state (resume=true). "
+        "Generator quality gate: ensure type=3 and type=4 branches are semantically different, and type=4 includes targeted worst-case patterns rather than only max parameters. "
         "If a hook blocks a step, complete the missing prerequisite instead of retrying blindly."
     )
     print(

diff --git a/skills/autocode-workflow/SKILL.md b/skills/autocode-workflow/SKILL.md
@@ -233,9 +233,10 @@ CRITICAL: Must pass validation before generating final tests
 Tool: problem_generate_tests
 Required: problem_dir
 Recommended: test_count=50, enable_dedup=true, enable_validator_filter=true
-Output: tests/01.in ~ tests/50.in + corresponding .ans files
+Output: tests/01.in ~ tests/50.in + corresponding answer files (`.ans` by default, or configured `answer_ext` such as `.out`)
 Verify: Check generated_tests count matches test_count
 Quality Gate: In final tests, type 3/4 (extreme + tle) should be >= ceil(test_count/2) when candidates are sufficient
+Long-running note: sending new user messages may interrupt MCP execution; prefer waiting, or resume with `resume=true` if interrupted.
 ```
 
 ### Phase 9: Packaging
@@ -337,6 +338,7 @@ Before considering the problem complete:
 - [ ] Sample files validated (problem_validate passed)
 - [ ] Final test data generated (50+ tests)
 - [ ] Final test data has at least 50% extreme/tle cases when candidate pool allows
+- [ ] type=3/type=4 generation logic is semantically different (not just max-parameter duplication)
 - [ ] Polygon package created
 
 ## Example Complete Workflow

diff --git a/src/autocode_mcp/prompts/__init__.py b/src/autocode_mcp/prompts/__init__.py
@@ -65,6 +65,7 @@
 - 先保证最终测试中至少一半是 extreme/tle（type=3/4，候选不足时尽量满足）
 - 再平衡分布
 - 采样
+- 长任务期间避免发送新消息（可能中断 MCP 调用）；若中断，优先使用 resume/checkpoint 续跑
 
 ## 质量指标
 - Consistency > 90%
@@ -124,6 +125,7 @@
 - type=2 (random): 随机数据
 - type=3 (extreme): 极端数据（溢出、精度、hash碰撞）
 - type=4 (tle): TLE 诱导数据
+- 要求 type=3 与 type=4 分支有实质差异，type=4 应包含针对性卡法，不应仅靠 n_max/t_max 拉满
 
 ### 代码模板
 ```cpp

diff --git a/src/autocode_mcp/server.py b/src/autocode_mcp/server.py
@@ -32,7 +32,12 @@
 from .tools.file_ops import FileReadTool, FileSaveTool
 from .tools.generator import GeneratorBuildTool, GeneratorRunTool
 from .tools.interactor import InteractorBuildTool
-from .tools.problem import ProblemCreateTool, ProblemGenerateTestsTool, ProblemPackPolygonTool
+from .tools.problem import (
+    ProblemCleanupProcessesTool,
+    ProblemCreateTool,
+    ProblemGenerateTestsTool,
+    ProblemPackPolygonTool,
+)
 from .tools.solution import SolutionBuildTool, SolutionRunTool
 from .tools.stress_test import StressTestRunTool
 from .tools.test_verify import ProblemVerifyTestsTool
@@ -68,6 +73,7 @@ def register_all_tools() -> None:
     # Problem 工具组
     register_tool(ProblemCreateTool())
     register_tool(ProblemGenerateTestsTool())
+    register_tool(ProblemCleanupProcessesTool())
     register_tool(ProblemVerifyTestsTool())
     register_tool(ProblemPackPolygonTool())
     register_tool(ProblemValidateTool())
@@ -118,6 +124,18 @@ async def call_tool(name: str, arguments: dict[str, Any]) -> CallToolResult:
             structuredContent=result_dict,
             isError=not result.success,
         )
+    except asyncio.CancelledError:
+        cancel_result = ToolResult.fail(
+            "Tool call interrupted by cancellation",
+            interrupted=True,
+            resume_hint="Retry with resume=true if tool supports checkpoints",
+        )
+        cancel_dict = cancel_result.to_dict()
+        return CallToolResult(
+            content=[TextContent(type="text", text=json.dumps(cancel_dict, ensure_ascii=False))],
+            structuredContent=cancel_dict,
+            isError=True,
+        )
     except Exception as e:
         error_result = ToolResult.fail(str(e))
         error_dict = error_result.to_dict()

diff --git a/src/autocode_mcp/tools/generator.py b/src/autocode_mcp/tools/generator.py
@@ -8,6 +8,7 @@
 
 import hashlib
 import os
+import re
 
 from ..utils.compiler import run_binary, run_binary_with_args
 from ..utils.platform import get_exe_extension
@@ -58,6 +59,16 @@ def input_schema(self) -> dict:
                     "description": "编译器名称",
                     "default": "g++",
                 },
+                "enable_semantic_check": {
+                    "type": "boolean",
+                    "description": "是否启用 type=3/type=4 语义静态检查",
+                    "default": True,
+                },
+                "strict_semantic_check": {
+                    "type": "boolean",
+                    "description": "语义静态检查不通过时是否直接失败",
+                    "default": False,
+                },
             },
             "required": ["problem_dir"],
             "anyOf": [
@@ -72,6 +83,8 @@ async def execute(
         code: str | None = None,
         source_path: str | None = None,
         compiler: str = "g++",
+        enable_semantic_check: bool = True,
+        strict_semantic_check: bool = False,
     ) -> ToolResult:
         """执行 Generator 构建。"""
         resolved, err = resolve_source(problem_dir, code, source_path)
@@ -107,15 +120,65 @@ async def execute(
 
         binary_size = os.path.getsize(binary_path) if os.path.exists(binary_path) else 0
 
+        semantic_check = self._check_type34_semantics(resolved.code) if enable_semantic_check else {"enabled": False}
+        if (
+            enable_semantic_check
+            and strict_semantic_check
+            and not semantic_check.get("passed", True)
+        ):
+            return ToolResult.fail(
+                "Generator semantic check failed: type=3/type=4 lack substantial difference",
+                semantic_check=semantic_check,
+            )
+
         return ToolResult.ok(
             source_path=compile_source,
             canonical_path=canonical_path,
             binary_path=binary_path,
             binary_size=binary_size,
             compile_log=compile_result.stderr,
+            semantic_check=semantic_check,
             message="Generator built successfully",
         )
 
+    def _check_type34_semantics(self, code: str) -> dict:  # heuristic text check that type=3 and type=4 branches differ; returns a report dict
+        type3_blocks = self._extract_type_branch_snippets(code, 3)  # text windows starting at each detected type-3 branch test
+        type4_blocks = self._extract_type_branch_snippets(code, 4)  # same for type 4
+        has_type3 = bool(type3_blocks)
+        has_type4 = bool(type4_blocks)
+        if not has_type3 or not has_type4:  # a branch was not detected: report an advisory pass rather than a failure
+            return {
+                "enabled": True,
+                "passed": True,
+                "advisory": True,
+                "reason": "semantic check could not reliably detect both type=3/type=4 branches",
+                "hint": "请人工确认 type=3/type=4 分支存在且有实质差异",
+            }
+
+        norm3 = " ".join(type3_blocks).replace(" ", "")  # removes only space characters; tabs and newlines are kept
+        norm4 = " ".join(type4_blocks).replace(" ", "")
+        output_lines = [line.strip() for line in code.splitlines() if "cout" in line or "printf" in line]  # scans the whole source, not per branch
+        duplicate_outputs = len(set(output_lines)) <= 1 and len(output_lines) > 0  # True when every output line is textually identical
+        similar = norm3 == norm4 or (norm3 and norm4 and abs(len(norm3) - len(norm4)) < 10) or duplicate_outputs
+        return {  # NOTE(review): snippets are fixed 240-char windows, so the length-gap < 10 test above can flag clearly different branches as similar; confirm intended
+            "enabled": True,
+            "passed": not similar,
+            "reason": "" if not similar else "type=3/type=4 branch snippets are too similar",
+            "hint": "为 type=4 增加针对性卡法，而不仅是 n_max/t_max 取最大值",
+        }
+
+    def _extract_type_branch_snippets(self, code: str, type_value: int) -> list[str]:  # collect text windows of code starting at each branch test for type_value
+        patterns = [
+            rf"type\s*==\s*{type_value}\b",  # `type == N` (no leading \b, so an identifier ending in "type" also matches)
+            rf"\b{type_value}\s*==\s*type\b",  # reversed comparison `N == type`
+            rf"case\s+{type_value}\s*:",  # switch label `case N:`
+        ]
+        snippets: list[str] = []
+        for pattern in patterns:
+            for match in re.finditer(pattern, code):
+                snippets.append(code[match.start(): match.start() + 240])  # up to 240 chars from the match start; the window is not bounded by the branch body
+        return snippets
+
 
 class GeneratorRunTool(Tool):
     """运行多策略数据生成器。"""