diff --git a/scripts/evaluate_skill_task.py b/scripts/evaluate_skill_task.py
new file mode 100644
index 0000000..3e62f01
--- /dev/null
+++ b/scripts/evaluate_skill_task.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from pathlib import Path
+
+
+CHECKS = {
+    "title_line": lambda text: text.lstrip().startswith("# "),
+    "when_to_use_section": lambda text: bool(re.search(r"(?m)^## When to Use\s*$", text)),
+    "steps_section": lambda text: bool(re.search(r"(?m)^## Steps\s*$", text)),
+    "numbered_step": lambda text: bool(re.search(r"(?m)^1\. ", text)),
+}
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--task-dir", required=True)
+    parser.add_argument("--artifact", required=True)
+    parser.add_argument("--output", required=True)
+    return parser.parse_args()
+
+
+def load_rubric_keys(rubric_text: str) -> list[str]:
+    keys: list[str] = []
+    for line in rubric_text.splitlines():
+        if not line.startswith("- "):
+            continue
+        key = line[2:].split(":", 1)[0].strip()
+        if key:
+            keys.append(key)
+    return keys
+
+
+def main() -> int:
+    args = parse_args()
+    task_dir = Path(args.task_dir).resolve()
+    artifact_path = (task_dir / args.artifact).resolve()
+    output_path = Path(args.output).resolve()
+
+    prompt_text = (task_dir / "prompt.md").read_text(encoding="utf-8")
+    rubric_text = (task_dir / "rubric.md").read_text(encoding="utf-8")
+    artifact_text = artifact_path.read_text(encoding="utf-8")
+
+    checks: dict[str, bool] = {}
+    for key in load_rubric_keys(rubric_text):
+        evaluator = CHECKS.get(key)
+        if evaluator is None:
+            raise ValueError(f"unsupported rubric check: {key}")
+        checks[key] = evaluator(artifact_text)
+
+    passed_checks = sum(1 for passed in checks.values() if passed)
+    total_checks = len(checks)
+    result = {
+        "score": float(passed_checks),
+        "metrics": {
+            "passed_checks": passed_checks,
+            "total_checks": total_checks,
+            "violation_count": total_checks - passed_checks,
+        },
+        "details": {
+            "prompt": prompt_text.strip(),
+            "checks": checks,
+        },
+    }
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(json.dumps(result, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/run_task.py b/scripts/run_task.py
new file mode 100644
index 0000000..0d4ec88
--- /dev/null
+++ b/scripts/run_task.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+ROOT_DIR = Path(__file__).resolve().parents[1]
+if str(ROOT_DIR) not in sys.path:
+    sys.path.insert(0, str(ROOT_DIR))
+
+from engine.artifact_manager import ArtifactManager
+from engine.decision_engine import decide_candidate
+from engine.runner import run_command
+from engine.scorer import parse_score_output
+from engine.task_loader import load_task
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--task", required=True)
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    root_dir = Path.cwd()
+    task_path = (root_dir / args.task).resolve()
+    task = load_task(task_path)
+
+    artifact_manager = ArtifactManager(task)
+    snapshot = artifact_manager.snapshot()
+
+    run_result = run_command(
+        task.runner.command,
+        (root_dir / task.runner.cwd).resolve(),
+        task.runner.timeout_seconds,
+    )
+    scorer_result = run_command(
+        task.scorer.command,
+        root_dir.resolve(),
+        task.runner.timeout_seconds,
+    )
+    score_result = parse_score_output(
+        scorer_result.stdout,
+        score_field=task.scorer.parse.score_field,
+        metrics_field=task.scorer.parse.metrics_field,
+    )
+    decision = decide_candidate(
+        baseline=None,
+        candidate=score_result,
+        objective=task.objective,
+        constraints=task.constraints,
+        tie_breakers=task.policy.tie_breakers,
+        run_result=run_result,
+    )
+
+    record = {
+        "task_id": task.id,
+        "status": decision.status,
+        "reason": decision.reason,
+        "candidate_score": decision.candidate_score,
+        "diff_summary": artifact_manager.diff_summary(snapshot),
+    }
+
+    results_path = (root_dir / task.logging.results_file).resolve()
+    results_path.parent.mkdir(parents=True, exist_ok=True)
+    with results_path.open("a", encoding="utf-8", newline="") as handle:
+        handle.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+    print(json.dumps(record, ensure_ascii=False))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/score_skill_task.py b/scripts/score_skill_task.py
new file mode 100644
index 0000000..09655d0
--- /dev/null
+++ b/scripts/score_skill_task.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", required=True)
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    input_path = Path(args.input).resolve()
+    print(input_path.read_text(encoding="utf-8"), end="")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tasks/skill-quality/fixtures/SKILL.md b/tasks/skill-quality/fixtures/SKILL.md
new file mode 100644
index 0000000..f716aeb
--- /dev/null
+++ b/tasks/skill-quality/fixtures/SKILL.md
@@ -0,0 +1,15 @@
+# Deterministic Sample Skill
+
+## Purpose
+
+Provide a stable sample skill document for the execution pipeline.
+
+## When to Use
+
+Use this skill when you need a deterministic artifact for end-to-end testing.
+
+## Steps
+
+1. Read the task instructions.
+2. Compare the skill against the rubric.
+3. Return the computed score.
diff --git a/tasks/skill-quality/prompt.md b/tasks/skill-quality/prompt.md
new file mode 100644
index 0000000..e3e12b7
--- /dev/null
+++ b/tasks/skill-quality/prompt.md
@@ -0,0 +1,3 @@
+# Skill Quality Prompt
+
+Evaluate `fixtures/SKILL.md` against `rubric.md` and write a JSON report with top-level `score` and `metrics` fields.
diff --git a/tasks/skill-quality/rubric.md b/tasks/skill-quality/rubric.md
new file mode 100644
index 0000000..1730df4
--- /dev/null
+++ b/tasks/skill-quality/rubric.md
@@ -0,0 +1,8 @@
+# Skill Quality Rubric
+
+Each satisfied check is worth one point.
+
+- title_line: The document starts with a level-one heading.
+- when_to_use_section: The document contains a `## When to Use` section.
+- steps_section: The document contains a `## Steps` section.
+- numbered_step: The document contains at least one numbered step.
diff --git a/tasks/skill-quality/task.yaml b/tasks/skill-quality/task.yaml
new file mode 100644
index 0000000..a9b1bb0
--- /dev/null
+++ b/tasks/skill-quality/task.yaml
@@ -0,0 +1,40 @@
+id: skill-quality
+description: Deterministic sample task for scoring a skill document.
+artifacts:
+  include:
+    - fixtures/SKILL.md
+  exclude: []
+  max_files_per_iteration: 1
+mutation:
+  mode: direct_edit
+  allowed_file_types:
+    - .md
+  max_changed_lines: 20
+runner:
+  command: python ../../scripts/evaluate_skill_task.py --task-dir . --artifact fixtures/SKILL.md --output ../../work/skill-run.json
+  cwd: tasks/skill-quality
+  timeout_seconds: 30
+scorer:
+  type: command
+  command: python scripts/score_skill_task.py --input work/skill-run.json
+  parse:
+    format: json
+    score_field: score
+    metrics_field: metrics
+objective:
+  primary_metric: score
+  direction: maximize
+constraints:
+  - metric: violation_count
+    op: <=
+    value: 0
+policy:
+  keep_if: better_primary
+  tie_breakers: []
+  on_failure: discard
+budget:
+  max_iterations: 1
+  max_failures: 1
+logging:
+  results_file: work/results.jsonl
+  candidate_dir: work/candidates
diff --git a/tests/test_execution_pipeline.py b/tests/test_execution_pipeline.py
index 5ffbc9d..2ed9941 100644
--- a/tests/test_execution_pipeline.py
+++ b/tests/test_execution_pipeline.py
@@ -1,4 +1,7 @@
+import json
 from pathlib import Path
+import shutil
+import subprocess
 import tempfile
 import unittest
 
@@ -58,5 +61,47 @@ class ExecutionPipelineTest(unittest.TestCase):
         self.assertIn("violation_count", decision.reason)
 
 
+class RunTaskCliTest(unittest.TestCase):
+    def test_run_task_cli_writes_results_jsonl(self) -> None:
+        source_root = Path(__file__).resolve().parents[1]
+        with tempfile.TemporaryDirectory() as tmp:
+            temp_root = Path(tmp)
+            shutil.copytree(
+                source_root / "engine",
+                temp_root / "engine",
+                ignore=shutil.ignore_patterns("__pycache__"),
+            )
+            for relative_dir in ("scripts", "tasks"):
+                source_dir = source_root / relative_dir
+                if source_dir.exists():
+                    shutil.copytree(
+                        source_dir,
+                        temp_root / relative_dir,
+                        ignore=shutil.ignore_patterns("__pycache__"),
+                    )
+
+            completed = subprocess.run(
+                ["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/skill-quality/task.yaml"],
+                cwd=str(temp_root),
+                capture_output=True,
+                text=True,
+                encoding="utf-8",
+                check=False,
+            )
+
+            self.assertEqual(completed.returncode, 0, msg=completed.stderr)
+            results_path = temp_root / "work" / "results.jsonl"
+            self.assertTrue(results_path.exists())
+            lines = results_path.read_text(encoding="utf-8").splitlines()
+            self.assertEqual(len(lines), 1)
+
+            record = json.loads(lines[0])
+            self.assertEqual(record["task_id"], "skill-quality")
+            self.assertEqual(record["status"], "keep")
+            self.assertEqual(record["reason"], "no baseline available")
+            self.assertEqual(record["candidate_score"], 4.0)
+            self.assertEqual(record["diff_summary"], "")
+
+
 if __name__ == "__main__":
     unittest.main()
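
For reference, a minimal smoke run of the new task from the repository root might look like the sketch below. It is not part of the patch: it assumes the `engine/` package imported by `scripts/run_task.py` already exists in the tree and that plain `python` is on PATH (the new test drives the same flow through `uv run`). The expected record values mirror the assertions in `tests/test_execution_pipeline.py`.

```python
# Sketch only: drive the new skill-quality task end to end, assuming the
# engine/ package from the existing tree is importable and `python` is on PATH.
import json
import subprocess
from pathlib import Path

repo_root = Path(".").resolve()  # run from the repository root

# scripts/run_task.py loads tasks/skill-quality/task.yaml, runs the runner and
# scorer commands, and appends one JSON record to work/results.jsonl.
subprocess.run(
    ["python", "scripts/run_task.py", "--task", "tasks/skill-quality/task.yaml"],
    cwd=repo_root,
    check=True,
)

last_line = (repo_root / "work" / "results.jsonl").read_text(encoding="utf-8").splitlines()[-1]
record = json.loads(last_line)

# With the shipped fixture every rubric check passes, so the test expects
# status "keep" and candidate_score 4.0 with an empty diff_summary.
print(record["status"], record["candidate_score"])
```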