fix: harden task cli result handling
This commit is contained in:
parent
dfd668e5d2
commit
a96c66eeb2
@ -16,36 +16,98 @@ from engine.scorer import parse_score_output
|
|||||||
from engine.task_loader import load_task
|
from engine.task_loader import load_task
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_repo_path(repo_root: Path, raw_path: str) -> Path:
|
||||||
|
path = Path(raw_path)
|
||||||
|
if path.is_absolute():
|
||||||
|
return path.resolve()
|
||||||
|
return (repo_root / path).resolve()
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
def parse_args() -> argparse.Namespace:
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--task", required=True)
|
parser.add_argument("--task", required=True)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def _append_record(repo_root: Path, results_file: str, record: dict[str, object]) -> None:
    """Append ``record`` as a single JSON line to the results file.

    Args:
        repo_root: Base directory handed to ``_resolve_repo_path``.
        results_file: Location of the results file, resolved through
            ``_resolve_repo_path``.
        record: Mapping serialised with ``json.dumps`` (non-ASCII kept as-is).
    """
    target = _resolve_repo_path(repo_root, results_file)
    # Create any missing parent directories before opening for append.
    target.parent.mkdir(parents=True, exist_ok=True)
    # newline="" disables newline translation, so the written "\n" stays "\n".
    with target.open("a", encoding="utf-8", newline="") as sink:
        sink.write(f"{json.dumps(record, ensure_ascii=False)}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def _emit_record(
    repo_root: Path,
    task_id: str,
    results_file: str,
    status: str,
    reason: str,
    candidate_score: float | None,
    diff_summary: str,
) -> int:
    """Build a result record, persist it, echo it, and return an exit code.

    The record is appended to the results file via ``_append_record`` and the
    same JSON is printed with ``print``.

    Args:
        repo_root: Base directory passed through to ``_append_record``.
        task_id: Stored under the ``task_id`` key.
        results_file: Results file location passed through to ``_append_record``.
        status: Stored under the ``status`` key.
        reason: Stored under the ``reason`` key.
        candidate_score: Stored under the ``candidate_score`` key; may be ``None``.
        diff_summary: Stored under the ``diff_summary`` key.

    Returns:
        Always ``0``, whatever ``status`` is.
    """
    payload: dict[str, object] = dict(
        task_id=task_id,
        status=status,
        reason=reason,
        candidate_score=candidate_score,
        diff_summary=diff_summary,
    )
    _append_record(repo_root, results_file, payload)
    print(json.dumps(payload, ensure_ascii=False))
    return 0
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
|
def main() -> int:
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
root_dir = Path.cwd()
|
repo_root = ROOT_DIR.resolve()
|
||||||
task_path = (root_dir / args.task).resolve()
|
task_path = _resolve_repo_path(repo_root, args.task)
|
||||||
task = load_task(task_path)
|
task = load_task(task_path)
|
||||||
|
|
||||||
artifact_manager = ArtifactManager(task)
|
artifact_manager = ArtifactManager(task)
|
||||||
snapshot = artifact_manager.snapshot()
|
snapshot = artifact_manager.snapshot()
|
||||||
|
diff_summary = artifact_manager.diff_summary(snapshot)
|
||||||
|
|
||||||
run_result = run_command(
|
run_result = run_command(
|
||||||
task.runner.command,
|
task.runner.command,
|
||||||
(root_dir / task.runner.cwd).resolve(),
|
_resolve_repo_path(repo_root, task.runner.cwd),
|
||||||
task.runner.timeout_seconds,
|
task.runner.timeout_seconds,
|
||||||
)
|
)
|
||||||
|
if run_result.exit_code != 0:
|
||||||
|
return _emit_record(
|
||||||
|
repo_root=repo_root,
|
||||||
|
task_id=task.id,
|
||||||
|
results_file=task.logging.results_file,
|
||||||
|
status="crash",
|
||||||
|
reason=f"command failed with exit code {run_result.exit_code}",
|
||||||
|
candidate_score=None,
|
||||||
|
diff_summary=diff_summary,
|
||||||
|
)
|
||||||
|
|
||||||
scorer_result = run_command(
|
scorer_result = run_command(
|
||||||
task.scorer.command,
|
task.scorer.command,
|
||||||
root_dir.resolve(),
|
repo_root,
|
||||||
task.runner.timeout_seconds,
|
task.runner.timeout_seconds,
|
||||||
)
|
)
|
||||||
score_result = parse_score_output(
|
if scorer_result.exit_code != 0:
|
||||||
scorer_result.stdout,
|
return _emit_record(
|
||||||
score_field=task.scorer.parse.score_field,
|
repo_root=repo_root,
|
||||||
metrics_field=task.scorer.parse.metrics_field,
|
task_id=task.id,
|
||||||
)
|
results_file=task.logging.results_file,
|
||||||
|
status="crash",
|
||||||
|
reason=f"scorer failed with exit code {scorer_result.exit_code}",
|
||||||
|
candidate_score=None,
|
||||||
|
diff_summary=diff_summary,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
score_result = parse_score_output(
|
||||||
|
scorer_result.stdout,
|
||||||
|
score_field=task.scorer.parse.score_field,
|
||||||
|
metrics_field=task.scorer.parse.metrics_field,
|
||||||
|
)
|
||||||
|
except (KeyError, TypeError, ValueError) as exc:
|
||||||
|
return _emit_record(
|
||||||
|
repo_root=repo_root,
|
||||||
|
task_id=task.id,
|
||||||
|
results_file=task.logging.results_file,
|
||||||
|
status="crash",
|
||||||
|
reason=f"score parse failed: {exc}",
|
||||||
|
candidate_score=None,
|
||||||
|
diff_summary=diff_summary,
|
||||||
|
)
|
||||||
|
|
||||||
decision = decide_candidate(
|
decision = decide_candidate(
|
||||||
baseline=None,
|
baseline=None,
|
||||||
candidate=score_result,
|
candidate=score_result,
|
||||||
@ -55,21 +117,15 @@ def main() -> int:
|
|||||||
run_result=run_result,
|
run_result=run_result,
|
||||||
)
|
)
|
||||||
|
|
||||||
record = {
|
return _emit_record(
|
||||||
"task_id": task.id,
|
repo_root=repo_root,
|
||||||
"status": decision.status,
|
task_id=task.id,
|
||||||
"reason": decision.reason,
|
results_file=task.logging.results_file,
|
||||||
"candidate_score": decision.candidate_score,
|
status=decision.status,
|
||||||
"diff_summary": artifact_manager.diff_summary(snapshot),
|
reason=decision.reason,
|
||||||
}
|
candidate_score=decision.candidate_score,
|
||||||
|
diff_summary=diff_summary,
|
||||||
results_path = (root_dir / task.logging.results_file).resolve()
|
)
|
||||||
results_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
with results_path.open("a", encoding="utf-8", newline="") as handle:
|
|
||||||
handle.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
||||||
|
|
||||||
print(json.dumps(record, ensure_ascii=False))
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@ -62,23 +62,26 @@ class ExecutionPipelineTest(unittest.TestCase):
|
|||||||
|
|
||||||
|
|
||||||
class RunTaskCliTest(unittest.TestCase):
|
class RunTaskCliTest(unittest.TestCase):
|
||||||
def test_run_task_cli_writes_results_jsonl(self) -> None:
|
def _copy_repo_layout(self, destination: Path) -> None:
    """Copy the repository directories the CLI needs into ``destination``.

    The source root is ``parents[1]`` of this test file's resolved path.
    ``engine`` is copied unconditionally; ``scripts`` and ``tasks`` are copied
    only when they exist. ``__pycache__`` directories are left out of every copy.
    """
    repo_root = Path(__file__).resolve().parents[1]

    # "engine" is copied without an existence check, so a missing directory raises.
    shutil.copytree(
        repo_root / "engine",
        destination / "engine",
        ignore=shutil.ignore_patterns("__pycache__"),
    )

    for name in ("scripts", "tasks"):
        origin = repo_root / name
        if not origin.exists():
            continue
        shutil.copytree(
            origin,
            destination / name,
            ignore=shutil.ignore_patterns("__pycache__"),
        )
|
||||||
|
|
||||||
|
def test_run_task_cli_writes_results_jsonl(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
temp_root = Path(tmp)
|
temp_root = Path(tmp)
|
||||||
shutil.copytree(
|
self._copy_repo_layout(temp_root)
|
||||||
source_root / "engine",
|
|
||||||
temp_root / "engine",
|
|
||||||
ignore=shutil.ignore_patterns("__pycache__"),
|
|
||||||
)
|
|
||||||
for relative_dir in ("scripts", "tasks"):
|
|
||||||
source_dir = source_root / relative_dir
|
|
||||||
if source_dir.exists():
|
|
||||||
shutil.copytree(
|
|
||||||
source_dir,
|
|
||||||
temp_root / relative_dir,
|
|
||||||
ignore=shutil.ignore_patterns("__pycache__"),
|
|
||||||
)
|
|
||||||
|
|
||||||
completed = subprocess.run(
|
completed = subprocess.run(
|
||||||
["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/skill-quality/task.yaml"],
|
["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/skill-quality/task.yaml"],
|
||||||
@ -102,6 +105,113 @@ class RunTaskCliTest(unittest.TestCase):
|
|||||||
self.assertEqual(record["candidate_score"], 4.0)
|
self.assertEqual(record["candidate_score"], 4.0)
|
||||||
self.assertEqual(record["diff_summary"], "")
|
self.assertEqual(record["diff_summary"], "")
|
||||||
|
|
||||||
|
def test_run_task_cli_uses_repo_root_for_absolute_task_path(self) -> None:
    """Run the CLI by absolute paths from another working directory.

    The script and task file are passed as absolute paths while the process
    runs in an ``outside`` subdirectory; the results file is still expected
    at ``work/results.jsonl`` under the copied repository root.
    """
    with tempfile.TemporaryDirectory() as scratch:
        repo_copy = Path(scratch)
        self._copy_repo_layout(repo_copy)
        foreign_cwd = repo_copy / "outside"
        foreign_cwd.mkdir()

        script_path = repo_copy / "scripts" / "run_task.py"
        task_file = repo_copy / "tasks" / "skill-quality" / "task.yaml"
        command = ["uv", "run", "python", str(script_path), "--task", str(task_file)]

        completed = subprocess.run(
            command,
            cwd=str(foreign_cwd),
            capture_output=True,
            text=True,
            encoding="utf-8",
            check=False,
        )

        self.assertEqual(completed.returncode, 0, msg=completed.stderr)

        # Results must be written under the copied repo, not under the process cwd.
        results_path = repo_copy / "work" / "results.jsonl"
        self.assertTrue(results_path.exists())

        written = results_path.read_text(encoding="utf-8").splitlines()
        self.assertEqual(len(written), 1)

        record = json.loads(written[0])
        self.assertEqual(record["task_id"], "skill-quality")
        self.assertEqual(record["status"], "keep")
|
||||||
|
|
||||||
|
def test_run_task_cli_records_scorer_failure(self) -> None:
    """Check that a failing scorer command is recorded as a crash.

    A throwaway ``scorer-failure`` task is written whose scorer exits with
    code 7. The CLI is expected to exit 0 and append exactly one record with
    status ``crash``, the exit code in ``reason``, and no candidate score.
    """
    with tempfile.TemporaryDirectory() as scratch:
        repo_copy = Path(scratch)
        self._copy_repo_layout(repo_copy)

        task_home = repo_copy / "tasks" / "scorer-failure"
        fixtures = task_home / "fixtures"
        fixtures.mkdir(parents=True)
        (fixtures / "SKILL.md").write_text("# Fixture\n", encoding="utf-8")

        # Task definition: the runner succeeds, the scorer writes to stderr and exits 7.
        task_yaml_lines = [
            "id: scorer-failure",
            "description: Scorer failure fixture.",
            "artifacts:",
            "  include:",
            "    - fixtures/SKILL.md",
            "  exclude: []",
            "  max_files_per_iteration: 1",
            "mutation:",
            "  mode: direct_edit",
            "  allowed_file_types:",
            "    - .md",
            "  max_changed_lines: 20",
            "runner:",
            "  command: python -c \"print('runner ok')\"",
            "  cwd: tasks/scorer-failure",
            "  timeout_seconds: 30",
            "scorer:",
            "  type: command",
            "  command: python -c \"import sys; sys.stderr.write('boom\\n'); raise SystemExit(7)\"",
            "  parse:",
            "    format: json",
            "    score_field: score",
            "    metrics_field: metrics",
            "objective:",
            "  primary_metric: score",
            "  direction: maximize",
            "constraints: []",
            "policy:",
            "  keep_if: better_primary",
            "  tie_breakers: []",
            "  on_failure: discard",
            "budget:",
            "  max_iterations: 1",
            "  max_failures: 1",
            "logging:",
            "  results_file: work/results.jsonl",
            "  candidate_dir: work/candidates",
            "",
        ]
        (task_home / "task.yaml").write_text(
            "\n".join(task_yaml_lines),
            encoding="utf-8",
        )

        command = [
            "uv",
            "run",
            "python",
            "scripts/run_task.py",
            "--task",
            "tasks/scorer-failure/task.yaml",
        ]
        completed = subprocess.run(
            command,
            cwd=str(repo_copy),
            capture_output=True,
            text=True,
            encoding="utf-8",
            check=False,
        )

        # The CLI itself must still exit 0 even though the scorer failed.
        self.assertEqual(completed.returncode, 0, msg=completed.stderr)

        results_path = repo_copy / "work" / "results.jsonl"
        self.assertTrue(results_path.exists())

        written = results_path.read_text(encoding="utf-8").splitlines()
        self.assertEqual(len(written), 1)

        record = json.loads(written[0])
        self.assertEqual(record["task_id"], "scorer-failure")
        self.assertEqual(record["status"], "crash")
        self.assertEqual(record["reason"], "scorer failed with exit code 7")
        self.assertIsNone(record["candidate_score"])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user