diff --git a/scripts/run_task.py b/scripts/run_task.py
index 63db29e..638715b 100644
--- a/scripts/run_task.py
+++ b/scripts/run_task.py
@@ -57,7 +57,6 @@ def main() -> int:
 
     artifact_manager = ArtifactManager(task)
     snapshot = artifact_manager.snapshot()
-    diff_summary = artifact_manager.diff_summary(snapshot)
 
     run_result = run_command(
         task.runner.command,
@@ -72,7 +71,7 @@ def main() -> int:
             status="crash",
             reason=f"command failed with exit code {run_result.exit_code}",
             candidate_score=None,
-            diff_summary=diff_summary,
+            diff_summary=artifact_manager.diff_summary(snapshot),
         )
 
     scorer_result = run_command(
@@ -88,7 +87,7 @@ def main() -> int:
             status="crash",
             reason=f"scorer failed with exit code {scorer_result.exit_code}",
             candidate_score=None,
-            diff_summary=diff_summary,
+            diff_summary=artifact_manager.diff_summary(snapshot),
         )
 
     try:
@@ -105,7 +104,7 @@ def main() -> int:
             status="crash",
             reason=f"score parse failed: {exc}",
             candidate_score=None,
-            diff_summary=diff_summary,
+            diff_summary=artifact_manager.diff_summary(snapshot),
         )
 
     decision = decide_candidate(
@@ -124,7 +123,7 @@ def main() -> int:
         status=decision.status,
         reason=decision.reason,
         candidate_score=decision.candidate_score,
-        diff_summary=diff_summary,
+        diff_summary=artifact_manager.diff_summary(snapshot),
     )
 
 
diff --git a/tests/test_execution_pipeline.py b/tests/test_execution_pipeline.py
index 44592f9..9175384 100644
--- a/tests/test_execution_pipeline.py
+++ b/tests/test_execution_pipeline.py
@@ -212,6 +212,81 @@ class RunTaskCliTest(unittest.TestCase):
         self.assertEqual(record["reason"], "scorer failed with exit code 7")
         self.assertIsNone(record["candidate_score"])
 
+    def test_run_task_cli_records_score_parse_failure(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            temp_root = Path(tmp)
+            self._copy_repo_layout(temp_root)
+
+            task_dir = temp_root / "tasks" / "score-parse-failure"
+            fixture_dir = task_dir / "fixtures"
+            fixture_dir.mkdir(parents=True)
+            (fixture_dir / "SKILL.md").write_text("# Fixture\n", encoding="utf-8")
+            (task_dir / "task.yaml").write_text(
+                "\n".join(
+                    [
+                        "id: score-parse-failure",
+                        "description: Score parse failure fixture.",
+                        "artifacts:",
+                        "  include:",
+                        "    - fixtures/SKILL.md",
+                        "  exclude: []",
+                        "  max_files_per_iteration: 1",
+                        "mutation:",
+                        "  mode: direct_edit",
+                        "  allowed_file_types:",
+                        "    - .md",
+                        "  max_changed_lines: 20",
+                        "runner:",
+                        "  command: python -c \"print('runner ok')\"",
+                        "  cwd: tasks/score-parse-failure",
+                        "  timeout_seconds: 30",
+                        "scorer:",
+                        "  type: command",
+                        "  command: python -c \"print('not-json')\"",
+                        "  parse:",
+                        "    format: json",
+                        "    score_field: score",
+                        "    metrics_field: metrics",
+                        "objective:",
+                        "  primary_metric: score",
+                        "  direction: maximize",
+                        "constraints: []",
+                        "policy:",
+                        "  keep_if: better_primary",
+                        "  tie_breakers: []",
+                        "  on_failure: discard",
+                        "budget:",
+                        "  max_iterations: 1",
+                        "  max_failures: 1",
+                        "logging:",
+                        "  results_file: work/results.jsonl",
+                        "  candidate_dir: work/candidates",
+                        "",
+                    ]
+                ),
+                encoding="utf-8",
+            )
+
+            completed = subprocess.run(
+                ["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/score-parse-failure/task.yaml"],
+                cwd=str(temp_root),
+                capture_output=True,
+                text=True,
+                encoding="utf-8",
+                check=False,
+            )
+
+            self.assertEqual(completed.returncode, 0, msg=completed.stderr)
+            results_path = temp_root / "work" / "results.jsonl"
+            self.assertTrue(results_path.exists())
+            lines = results_path.read_text(encoding="utf-8").splitlines()
+            self.assertEqual(len(lines), 1)
+            record = json.loads(lines[0])
+            self.assertEqual(record["task_id"], "score-parse-failure")
+            self.assertEqual(record["status"], "crash")
+            self.assertIn("score parse failed:", record["reason"])
+            self.assertIsNone(record["candidate_score"])
+
 
 if __name__ == "__main__":
     unittest.main()