diff --git a/scripts/run_task.py b/scripts/run_task.py
index 0d4ec88..63db29e 100644
--- a/scripts/run_task.py
+++ b/scripts/run_task.py
@@ -16,36 +16,98 @@ from engine.scorer import parse_score_output
 from engine.task_loader import load_task
 
 
+def _resolve_repo_path(repo_root: Path, raw_path: str) -> Path:
+    path = Path(raw_path)
+    if path.is_absolute():
+        return path.resolve()
+    return (repo_root / path).resolve()
+
+
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser()
     parser.add_argument("--task", required=True)
     return parser.parse_args()
 
 
+def _append_record(repo_root: Path, results_file: str, record: dict[str, object]) -> None:
+    results_path = _resolve_repo_path(repo_root, results_file)
+    results_path.parent.mkdir(parents=True, exist_ok=True)
+    with results_path.open("a", encoding="utf-8", newline="") as handle:
+        handle.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+
+def _emit_record(repo_root: Path, task_id: str, results_file: str, status: str, reason: str, candidate_score: float | None, diff_summary: str) -> int:
+    record = {
+        "task_id": task_id,
+        "status": status,
+        "reason": reason,
+        "candidate_score": candidate_score,
+        "diff_summary": diff_summary,
+    }
+    _append_record(repo_root, results_file, record)
+    print(json.dumps(record, ensure_ascii=False))
+    return 0
+
+
 def main() -> int:
     args = parse_args()
-    root_dir = Path.cwd()
-    task_path = (root_dir / args.task).resolve()
+    repo_root = ROOT_DIR.resolve()
+    task_path = _resolve_repo_path(repo_root, args.task)
     task = load_task(task_path)
 
     artifact_manager = ArtifactManager(task)
     snapshot = artifact_manager.snapshot()
 
     run_result = run_command(
         task.runner.command,
-        (root_dir / task.runner.cwd).resolve(),
+        _resolve_repo_path(repo_root, task.runner.cwd),
         task.runner.timeout_seconds,
     )
+    diff_summary = artifact_manager.diff_summary(snapshot)
+    if run_result.exit_code != 0:
+        return _emit_record(
+            repo_root=repo_root,
+            task_id=task.id,
+            results_file=task.logging.results_file,
+            status="crash",
+            reason=f"command failed with exit code {run_result.exit_code}",
+            candidate_score=None,
+            diff_summary=diff_summary,
+        )
+
     scorer_result = run_command(
         task.scorer.command,
-        root_dir.resolve(),
+        repo_root,
         task.runner.timeout_seconds,
     )
-    score_result = parse_score_output(
-        scorer_result.stdout,
-        score_field=task.scorer.parse.score_field,
-        metrics_field=task.scorer.parse.metrics_field,
-    )
+    if scorer_result.exit_code != 0:
+        return _emit_record(
+            repo_root=repo_root,
+            task_id=task.id,
+            results_file=task.logging.results_file,
+            status="crash",
+            reason=f"scorer failed with exit code {scorer_result.exit_code}",
+            candidate_score=None,
+            diff_summary=diff_summary,
+        )
+
+    try:
+        score_result = parse_score_output(
+            scorer_result.stdout,
+            score_field=task.scorer.parse.score_field,
+            metrics_field=task.scorer.parse.metrics_field,
+        )
+    except (KeyError, TypeError, ValueError) as exc:
+        return _emit_record(
+            repo_root=repo_root,
+            task_id=task.id,
+            results_file=task.logging.results_file,
+            status="crash",
+            reason=f"score parse failed: {exc}",
+            candidate_score=None,
+            diff_summary=diff_summary,
+        )
+
     decision = decide_candidate(
         baseline=None,
         candidate=score_result,
@@ -55,21 +117,15 @@ def main() -> int:
         run_result=run_result,
     )
 
-    record = {
-        "task_id": task.id,
-        "status": decision.status,
-        "reason": decision.reason,
-        "candidate_score": decision.candidate_score,
-        "diff_summary": artifact_manager.diff_summary(snapshot),
-    }
-
-    results_path = (root_dir / task.logging.results_file).resolve()
-    results_path.parent.mkdir(parents=True, exist_ok=True)
-    with results_path.open("a", encoding="utf-8", newline="") as handle:
-        handle.write(json.dumps(record, ensure_ascii=False) + "\n")
-
-    print(json.dumps(record, ensure_ascii=False))
-    return 0
+    return _emit_record(
+        repo_root=repo_root,
+        task_id=task.id,
+        results_file=task.logging.results_file,
+        status=decision.status,
+        reason=decision.reason,
+        candidate_score=decision.candidate_score,
+        diff_summary=diff_summary,
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/test_execution_pipeline.py b/tests/test_execution_pipeline.py
index 2ed9941..44592f9 100644
--- a/tests/test_execution_pipeline.py
+++ b/tests/test_execution_pipeline.py
@@ -62,23 +62,26 @@ class ExecutionPipelineTest(unittest.TestCase):
 
 
 class RunTaskCliTest(unittest.TestCase):
-    def test_run_task_cli_writes_results_jsonl(self) -> None:
+    def _copy_repo_layout(self, destination: Path) -> None:
         source_root = Path(__file__).resolve().parents[1]
+        shutil.copytree(
+            source_root / "engine",
+            destination / "engine",
+            ignore=shutil.ignore_patterns("__pycache__"),
+        )
+        for relative_dir in ("scripts", "tasks"):
+            source_dir = source_root / relative_dir
+            if source_dir.exists():
+                shutil.copytree(
+                    source_dir,
+                    destination / relative_dir,
+                    ignore=shutil.ignore_patterns("__pycache__"),
+                )
+
+    def test_run_task_cli_writes_results_jsonl(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             temp_root = Path(tmp)
-            shutil.copytree(
-                source_root / "engine",
-                temp_root / "engine",
-                ignore=shutil.ignore_patterns("__pycache__"),
-            )
-            for relative_dir in ("scripts", "tasks"):
-                source_dir = source_root / relative_dir
-                if source_dir.exists():
-                    shutil.copytree(
-                        source_dir,
-                        temp_root / relative_dir,
-                        ignore=shutil.ignore_patterns("__pycache__"),
-                    )
+            self._copy_repo_layout(temp_root)
 
             completed = subprocess.run(
                 ["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/skill-quality/task.yaml"],
@@ -102,6 +105,113 @@
             self.assertEqual(record["candidate_score"], 4.0)
             self.assertEqual(record["diff_summary"], "")
 
+    def test_run_task_cli_uses_repo_root_for_absolute_task_path(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            temp_root = Path(tmp)
+            self._copy_repo_layout(temp_root)
+            outside_root = temp_root / "outside"
+            outside_root.mkdir()
+
+            completed = subprocess.run(
+                [
+                    "uv",
+                    "run",
+                    "python",
+                    str(temp_root / "scripts" / "run_task.py"),
+                    "--task",
+                    str(temp_root / "tasks" / "skill-quality" / "task.yaml"),
+                ],
+                cwd=str(outside_root),
+                capture_output=True,
+                text=True,
+                encoding="utf-8",
+                check=False,
+            )
+
+            self.assertEqual(completed.returncode, 0, msg=completed.stderr)
+            results_path = temp_root / "work" / "results.jsonl"
+            self.assertTrue(results_path.exists())
+            lines = results_path.read_text(encoding="utf-8").splitlines()
+            self.assertEqual(len(lines), 1)
+            record = json.loads(lines[0])
+            self.assertEqual(record["task_id"], "skill-quality")
+            self.assertEqual(record["status"], "keep")
+
+    def test_run_task_cli_records_scorer_failure(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            temp_root = Path(tmp)
+            self._copy_repo_layout(temp_root)
+
+            task_dir = temp_root / "tasks" / "scorer-failure"
+            fixture_dir = task_dir / "fixtures"
+            fixture_dir.mkdir(parents=True)
+            (fixture_dir / "SKILL.md").write_text("# Fixture\n", encoding="utf-8")
+            (task_dir / "task.yaml").write_text(
+                "\n".join(
+                    [
+                        "id: scorer-failure",
+                        "description: Scorer failure fixture.",
+                        "artifacts:",
+                        "  include:",
+                        "    - fixtures/SKILL.md",
+                        "  exclude: []",
+                        "  max_files_per_iteration: 1",
+                        "mutation:",
+                        "  mode: direct_edit",
+                        "  allowed_file_types:",
+                        "    - .md",
+                        "  max_changed_lines: 20",
+                        "runner:",
+                        "  command: python -c \"print('runner ok')\"",
+                        "  cwd: tasks/scorer-failure",
+                        "  timeout_seconds: 30",
+                        "scorer:",
+                        "  type: command",
+                        "  command: python -c \"import sys; sys.stderr.write('boom\\n'); raise SystemExit(7)\"",
+                        "  parse:",
+                        "    format: json",
+                        "    score_field: score",
+                        "    metrics_field: metrics",
+                        "objective:",
+                        "  primary_metric: score",
+                        "  direction: maximize",
+                        "constraints: []",
+                        "policy:",
+                        "  keep_if: better_primary",
+                        "  tie_breakers: []",
+                        "  on_failure: discard",
+                        "budget:",
+                        "  max_iterations: 1",
+                        "  max_failures: 1",
+                        "logging:",
+                        "  results_file: work/results.jsonl",
+                        "  candidate_dir: work/candidates",
+                        "",
+                    ]
+                ),
+                encoding="utf-8",
+            )
+
+            completed = subprocess.run(
+                ["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/scorer-failure/task.yaml"],
+                cwd=str(temp_root),
+                capture_output=True,
+                text=True,
+                encoding="utf-8",
+                check=False,
+            )
+
+            self.assertEqual(completed.returncode, 0, msg=completed.stderr)
+            results_path = temp_root / "work" / "results.jsonl"
+            self.assertTrue(results_path.exists())
+            lines = results_path.read_text(encoding="utf-8").splitlines()
+            self.assertEqual(len(lines), 1)
+            record = json.loads(lines[0])
+            self.assertEqual(record["task_id"], "scorer-failure")
+            self.assertEqual(record["status"], "crash")
+            self.assertEqual(record["reason"], "scorer failed with exit code 7")
+            self.assertIsNone(record["candidate_score"])
+
 
 if __name__ == "__main__":
     unittest.main()