fix: defer diff summary and handle parse failures
This commit is contained in:
parent
a96c66eeb2
commit
f7ef3d49b3
@ -57,7 +57,6 @@ def main() -> int:
|
|||||||
|
|
||||||
artifact_manager = ArtifactManager(task)
|
artifact_manager = ArtifactManager(task)
|
||||||
snapshot = artifact_manager.snapshot()
|
snapshot = artifact_manager.snapshot()
|
||||||
diff_summary = artifact_manager.diff_summary(snapshot)
|
|
||||||
|
|
||||||
run_result = run_command(
|
run_result = run_command(
|
||||||
task.runner.command,
|
task.runner.command,
|
||||||
@ -72,7 +71,7 @@ def main() -> int:
|
|||||||
status="crash",
|
status="crash",
|
||||||
reason=f"command failed with exit code {run_result.exit_code}",
|
reason=f"command failed with exit code {run_result.exit_code}",
|
||||||
candidate_score=None,
|
candidate_score=None,
|
||||||
diff_summary=diff_summary,
|
diff_summary=artifact_manager.diff_summary(snapshot),
|
||||||
)
|
)
|
||||||
|
|
||||||
scorer_result = run_command(
|
scorer_result = run_command(
|
||||||
@ -88,7 +87,7 @@ def main() -> int:
|
|||||||
status="crash",
|
status="crash",
|
||||||
reason=f"scorer failed with exit code {scorer_result.exit_code}",
|
reason=f"scorer failed with exit code {scorer_result.exit_code}",
|
||||||
candidate_score=None,
|
candidate_score=None,
|
||||||
diff_summary=diff_summary,
|
diff_summary=artifact_manager.diff_summary(snapshot),
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -105,7 +104,7 @@ def main() -> int:
|
|||||||
status="crash",
|
status="crash",
|
||||||
reason=f"score parse failed: {exc}",
|
reason=f"score parse failed: {exc}",
|
||||||
candidate_score=None,
|
candidate_score=None,
|
||||||
diff_summary=diff_summary,
|
diff_summary=artifact_manager.diff_summary(snapshot),
|
||||||
)
|
)
|
||||||
|
|
||||||
decision = decide_candidate(
|
decision = decide_candidate(
|
||||||
@ -124,7 +123,7 @@ def main() -> int:
|
|||||||
status=decision.status,
|
status=decision.status,
|
||||||
reason=decision.reason,
|
reason=decision.reason,
|
||||||
candidate_score=decision.candidate_score,
|
candidate_score=decision.candidate_score,
|
||||||
diff_summary=diff_summary,
|
diff_summary=artifact_manager.diff_summary(snapshot),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -212,6 +212,81 @@ class RunTaskCliTest(unittest.TestCase):
|
|||||||
self.assertEqual(record["reason"], "scorer failed with exit code 7")
|
self.assertEqual(record["reason"], "scorer failed with exit code 7")
|
||||||
self.assertIsNone(record["candidate_score"])
|
self.assertIsNone(record["candidate_score"])
|
||||||
|
|
||||||
|
def test_run_task_cli_records_score_parse_failure(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
temp_root = Path(tmp)
|
||||||
|
self._copy_repo_layout(temp_root)
|
||||||
|
|
||||||
|
task_dir = temp_root / "tasks" / "score-parse-failure"
|
||||||
|
fixture_dir = task_dir / "fixtures"
|
||||||
|
fixture_dir.mkdir(parents=True)
|
||||||
|
(fixture_dir / "SKILL.md").write_text("# Fixture\n", encoding="utf-8")
|
||||||
|
(task_dir / "task.yaml").write_text(
|
||||||
|
"\n".join(
|
||||||
|
[
|
||||||
|
"id: score-parse-failure",
|
||||||
|
"description: Score parse failure fixture.",
|
||||||
|
"artifacts:",
|
||||||
|
" include:",
|
||||||
|
" - fixtures/SKILL.md",
|
||||||
|
" exclude: []",
|
||||||
|
" max_files_per_iteration: 1",
|
||||||
|
"mutation:",
|
||||||
|
" mode: direct_edit",
|
||||||
|
" allowed_file_types:",
|
||||||
|
" - .md",
|
||||||
|
" max_changed_lines: 20",
|
||||||
|
"runner:",
|
||||||
|
" command: python -c \"print('runner ok')\"",
|
||||||
|
" cwd: tasks/score-parse-failure",
|
||||||
|
" timeout_seconds: 30",
|
||||||
|
"scorer:",
|
||||||
|
" type: command",
|
||||||
|
" command: python -c \"print('not-json')\"",
|
||||||
|
" parse:",
|
||||||
|
" format: json",
|
||||||
|
" score_field: score",
|
||||||
|
" metrics_field: metrics",
|
||||||
|
"objective:",
|
||||||
|
" primary_metric: score",
|
||||||
|
" direction: maximize",
|
||||||
|
"constraints: []",
|
||||||
|
"policy:",
|
||||||
|
" keep_if: better_primary",
|
||||||
|
" tie_breakers: []",
|
||||||
|
" on_failure: discard",
|
||||||
|
"budget:",
|
||||||
|
" max_iterations: 1",
|
||||||
|
" max_failures: 1",
|
||||||
|
"logging:",
|
||||||
|
" results_file: work/results.jsonl",
|
||||||
|
" candidate_dir: work/candidates",
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
completed = subprocess.run(
|
||||||
|
["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/score-parse-failure/task.yaml"],
|
||||||
|
cwd=str(temp_root),
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
encoding="utf-8",
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(completed.returncode, 0, msg=completed.stderr)
|
||||||
|
results_path = temp_root / "work" / "results.jsonl"
|
||||||
|
self.assertTrue(results_path.exists())
|
||||||
|
lines = results_path.read_text(encoding="utf-8").splitlines()
|
||||||
|
self.assertEqual(len(lines), 1)
|
||||||
|
record = json.loads(lines[0])
|
||||||
|
self.assertEqual(record["task_id"], "score-parse-failure")
|
||||||
|
self.assertEqual(record["status"], "crash")
|
||||||
|
self.assertIn("score parse failed:", record["reason"])
|
||||||
|
self.assertIsNone(record["candidate_score"])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user