fix: defer diff summary and handle parse failures

This commit is contained in:
sladro 2026-04-02 12:28:42 +08:00
parent a96c66eeb2
commit f7ef3d49b3
2 changed files with 79 additions and 5 deletions

View File

@ -57,7 +57,6 @@ def main() -> int:
artifact_manager = ArtifactManager(task)
snapshot = artifact_manager.snapshot()
diff_summary = artifact_manager.diff_summary(snapshot)
run_result = run_command(
task.runner.command,
@ -72,7 +71,7 @@ def main() -> int:
status="crash",
reason=f"command failed with exit code {run_result.exit_code}",
candidate_score=None,
diff_summary=diff_summary,
diff_summary=artifact_manager.diff_summary(snapshot),
)
scorer_result = run_command(
@ -88,7 +87,7 @@ def main() -> int:
status="crash",
reason=f"scorer failed with exit code {scorer_result.exit_code}",
candidate_score=None,
diff_summary=diff_summary,
diff_summary=artifact_manager.diff_summary(snapshot),
)
try:
@ -105,7 +104,7 @@ def main() -> int:
status="crash",
reason=f"score parse failed: {exc}",
candidate_score=None,
diff_summary=diff_summary,
diff_summary=artifact_manager.diff_summary(snapshot),
)
decision = decide_candidate(
@ -124,7 +123,7 @@ def main() -> int:
status=decision.status,
reason=decision.reason,
candidate_score=decision.candidate_score,
diff_summary=diff_summary,
diff_summary=artifact_manager.diff_summary(snapshot),
)

View File

@ -212,6 +212,81 @@ class RunTaskCliTest(unittest.TestCase):
self.assertEqual(record["reason"], "scorer failed with exit code 7")
self.assertIsNone(record["candidate_score"])
def test_run_task_cli_records_score_parse_failure(self) -> None:
    """Check that unparseable scorer output is logged as a crash record.

    The fixture task declares a JSON score format, but its scorer command
    prints the plain text ``not-json``. The CLI is expected to exit with
    code 0 and append exactly one record to ``work/results.jsonl`` whose
    status is ``crash``, whose reason mentions ``score parse failed:``,
    and whose candidate score is ``None``.
    """
    # Task manifest, one YAML line per entry; the trailing empty entry
    # yields a final newline once the lines are joined.
    # NOTE(review): the nested keys carry a single leading space here;
    # confirm this matches the indentation in the real test file.
    manifest_lines = [
        "id: score-parse-failure",
        "description: Score parse failure fixture.",
        "artifacts:",
        " include:",
        " - fixtures/SKILL.md",
        " exclude: []",
        " max_files_per_iteration: 1",
        "mutation:",
        " mode: direct_edit",
        " allowed_file_types:",
        " - .md",
        " max_changed_lines: 20",
        "runner:",
        " command: python -c \"print('runner ok')\"",
        " cwd: tasks/score-parse-failure",
        " timeout_seconds: 30",
        "scorer:",
        " type: command",
        " command: python -c \"print('not-json')\"",
        " parse:",
        " format: json",
        " score_field: score",
        " metrics_field: metrics",
        "objective:",
        " primary_metric: score",
        " direction: maximize",
        "constraints: []",
        "policy:",
        " keep_if: better_primary",
        " tie_breakers: []",
        " on_failure: discard",
        "budget:",
        " max_iterations: 1",
        " max_failures: 1",
        "logging:",
        " results_file: work/results.jsonl",
        " candidate_dir: work/candidates",
        "",
    ]
    cli_command = [
        "uv",
        "run",
        "python",
        "scripts/run_task.py",
        "--task",
        "tasks/score-parse-failure/task.yaml",
    ]

    with tempfile.TemporaryDirectory() as scratch:
        repo_root = Path(scratch)
        self._copy_repo_layout(repo_root)

        # Lay out the task directory with a single Markdown artifact.
        task_root = repo_root / "tasks" / "score-parse-failure"
        artifacts_root = task_root / "fixtures"
        artifacts_root.mkdir(parents=True)
        skill_file = artifacts_root / "SKILL.md"
        skill_file.write_text("# Fixture\n", encoding="utf-8")
        manifest_file = task_root / "task.yaml"
        manifest_file.write_text("\n".join(manifest_lines), encoding="utf-8")

        # check=False: the exit code is asserted explicitly below so that
        # stderr can be attached to the failure message.
        outcome = subprocess.run(
            cli_command,
            cwd=str(repo_root),
            capture_output=True,
            text=True,
            encoding="utf-8",
            check=False,
        )
        self.assertEqual(outcome.returncode, 0, msg=outcome.stderr)

        results_file = repo_root / "work" / "results.jsonl"
        self.assertTrue(results_file.exists())
        logged = results_file.read_text(encoding="utf-8").splitlines()
        self.assertEqual(len(logged), 1)

        entry = json.loads(logged[0])
        self.assertEqual(entry["task_id"], "score-parse-failure")
        self.assertEqual(entry["status"], "crash")
        self.assertIn("score parse failed:", entry["reason"])
        self.assertIsNone(entry["candidate_score"])
# Run the unittest runner when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()