From 1632aefa8513431a371c7beebd10099d1ba8dd68 Mon Sep 17 00:00:00 2001 From: sladro Date: Thu, 2 Apr 2026 14:02:57 +0800 Subject: [PATCH] Fix crash rollback and exit semantics --- scripts/run_task.py | 47 +++++++++++++++--- tests/test_execution_pipeline.py | 84 ++++++++++++++++++++++++++++++-- 2 files changed, 120 insertions(+), 11 deletions(-) diff --git a/scripts/run_task.py b/scripts/run_task.py index 638715b..962897c 100644 --- a/scripts/run_task.py +++ b/scripts/run_task.py @@ -49,6 +49,33 @@ def _emit_record(repo_root: Path, task_id: str, results_file: str, status: str, return 0 +def _finalize_outcome( + *, + repo_root: Path, + task_id: str, + results_file: str, + artifact_manager: ArtifactManager, + snapshot, + status: str, + reason: str, + candidate_score: float | None, +) -> int: + diff_summary = artifact_manager.diff_summary(snapshot) + if status in {"discard", "crash"}: + artifact_manager.restore(snapshot) + exit_code = 1 if status == "crash" else 0 + _emit_record( + repo_root=repo_root, + task_id=task_id, + results_file=results_file, + status=status, + reason=reason, + candidate_score=candidate_score, + diff_summary=diff_summary, + ) + return exit_code + + def main() -> int: args = parse_args() repo_root = ROOT_DIR.resolve() @@ -64,14 +91,15 @@ def main() -> int: task.runner.timeout_seconds, ) if run_result.exit_code != 0: - return _emit_record( + return _finalize_outcome( repo_root=repo_root, task_id=task.id, results_file=task.logging.results_file, + artifact_manager=artifact_manager, + snapshot=snapshot, status="crash", reason=f"command failed with exit code {run_result.exit_code}", candidate_score=None, - diff_summary=artifact_manager.diff_summary(snapshot), ) scorer_result = run_command( @@ -80,14 +108,15 @@ def main() -> int: task.runner.timeout_seconds, ) if scorer_result.exit_code != 0: - return _emit_record( + return _finalize_outcome( repo_root=repo_root, task_id=task.id, results_file=task.logging.results_file, + artifact_manager=artifact_manager, + snapshot=snapshot, status="crash", reason=f"scorer failed with exit code {scorer_result.exit_code}", candidate_score=None, - diff_summary=artifact_manager.diff_summary(snapshot), ) try: @@ -97,14 +126,15 @@ def main() -> int: metrics_field=task.scorer.parse.metrics_field, ) except (KeyError, TypeError, ValueError) as exc: - return _emit_record( + return _finalize_outcome( repo_root=repo_root, task_id=task.id, results_file=task.logging.results_file, + artifact_manager=artifact_manager, + snapshot=snapshot, status="crash", reason=f"score parse failed: {exc}", candidate_score=None, - diff_summary=artifact_manager.diff_summary(snapshot), ) decision = decide_candidate( @@ -116,14 +146,15 @@ def main() -> int: run_result=run_result, ) - return _emit_record( + return _finalize_outcome( repo_root=repo_root, task_id=task.id, results_file=task.logging.results_file, + artifact_manager=artifact_manager, + snapshot=snapshot, status=decision.status, reason=decision.reason, candidate_score=decision.candidate_score, - diff_summary=artifact_manager.diff_summary(snapshot), ) diff --git a/tests/test_execution_pipeline.py b/tests/test_execution_pipeline.py index 9175384..23b0437 100644 --- a/tests/test_execution_pipeline.py +++ b/tests/test_execution_pipeline.py @@ -137,7 +137,85 @@ class RunTaskCliTest(unittest.TestCase): self.assertEqual(record["task_id"], "skill-quality") self.assertEqual(record["status"], "keep") - def test_run_task_cli_records_scorer_failure(self) -> None: + def test_run_task_cli_restores_artifacts_after_crash(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + temp_root = Path(tmp) + self._copy_repo_layout(temp_root) + + task_dir = temp_root / "tasks" / "runner-crash-restores" + fixture_dir = task_dir / "fixtures" + fixture_dir.mkdir(parents=True) + artifact_path = fixture_dir / "SKILL.md" + artifact_path.write_text("# Original\n", encoding="utf-8") + (task_dir / "task.yaml").write_text( + "\n".join( + [ + "id: runner-crash-restores", + "description: Runner crash restore fixture.", + "artifacts:", + " include:", + " - fixtures/SKILL.md", + " exclude: []", + " max_files_per_iteration: 1", + "mutation:", + " mode: direct_edit", + " allowed_file_types:", + " - .md", + " max_changed_lines: 20", + "runner:", + " command: python -c \"from pathlib import Path; Path('fixtures/SKILL.md').write_text('# Mutated\\n', encoding='utf-8'); raise SystemExit(9)\"", + " cwd: tasks/runner-crash-restores", + " timeout_seconds: 30", + "scorer:", + " type: command", + " command: python -c \"print('unused scorer')\"", + " parse:", + " format: json", + " score_field: score", + " metrics_field: metrics", + "objective:", + " primary_metric: score", + " direction: maximize", + "constraints: []", + "policy:", + " keep_if: better_primary", + " tie_breakers: []", + " on_failure: discard", + "budget:", + " max_iterations: 1", + " max_failures: 1", + "logging:", + " results_file: work/results.jsonl", + " candidate_dir: work/candidates", + "", + ] + ), + encoding="utf-8", + ) + + completed = subprocess.run( + ["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/runner-crash-restores/task.yaml"], + cwd=str(temp_root), + capture_output=True, + text=True, + encoding="utf-8", + check=False, + ) + + self.assertNotEqual(completed.returncode, 0, msg=completed.stderr) + self.assertEqual(artifact_path.read_text(encoding="utf-8"), "# Original\n") + results_path = temp_root / "work" / "results.jsonl" + self.assertTrue(results_path.exists()) + lines = results_path.read_text(encoding="utf-8").splitlines() + self.assertEqual(len(lines), 1) + record = json.loads(lines[0]) + self.assertEqual(record["task_id"], "runner-crash-restores") + self.assertEqual(record["status"], "crash") + self.assertEqual(record["reason"], "command failed with exit code 9") + self.assertIn("# Original", record["diff_summary"]) + self.assertIn("# Mutated", record["diff_summary"]) + + def test_run_task_cli_returns_nonzero_on_crash_and_writes_record(self) -> None: with tempfile.TemporaryDirectory() as tmp: temp_root = Path(tmp) self._copy_repo_layout(temp_root) @@ -201,7 +279,7 @@ class RunTaskCliTest(unittest.TestCase): check=False, ) - self.assertEqual(completed.returncode, 0, msg=completed.stderr) + self.assertNotEqual(completed.returncode, 0, msg=completed.stderr) results_path = temp_root / "work" / "results.jsonl" self.assertTrue(results_path.exists()) lines = results_path.read_text(encoding="utf-8").splitlines() @@ -276,7 +354,7 @@ class RunTaskCliTest(unittest.TestCase): check=False, ) - self.assertEqual(completed.returncode, 0, msg=completed.stderr) + self.assertNotEqual(completed.returncode, 0, msg=completed.stderr) results_path = temp_root / "work" / "results.jsonl" self.assertTrue(results_path.exists()) lines = results_path.read_text(encoding="utf-8").splitlines()