feat: add artifact loop cli and sample skill task
parent 5a51d25791
commit dfd668e5d2
scripts/evaluate_skill_task.py (new file, 74 lines)
@@ -0,0 +1,74 @@
from __future__ import annotations

import argparse
import json
import re
from pathlib import Path


# Rubric keys map to deterministic checks against the artifact text.
CHECKS = {
    "title_line": lambda text: text.lstrip().startswith("# "),
    "when_to_use_section": lambda text: bool(re.search(r"(?m)^## When to Use\s*$", text)),
    "steps_section": lambda text: bool(re.search(r"(?m)^## Steps\s*$", text)),
    "numbered_step": lambda text: bool(re.search(r"(?m)^1\. ", text)),
}


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--task-dir", required=True)
    parser.add_argument("--artifact", required=True)
    parser.add_argument("--output", required=True)
    return parser.parse_args()


def load_rubric_keys(rubric_text: str) -> list[str]:
    # Collect check keys from rubric bullets of the form "- key: description".
    keys: list[str] = []
    for line in rubric_text.splitlines():
        if not line.startswith("- "):
            continue
        key = line[2:].split(":", 1)[0].strip()
        if key:
            keys.append(key)
    return keys


def main() -> int:
    args = parse_args()
    task_dir = Path(args.task_dir).resolve()
    artifact_path = (task_dir / args.artifact).resolve()
    output_path = Path(args.output).resolve()

    prompt_text = (task_dir / "prompt.md").read_text(encoding="utf-8")
    rubric_text = (task_dir / "rubric.md").read_text(encoding="utf-8")
    artifact_text = artifact_path.read_text(encoding="utf-8")

    # Run only the checks named in the rubric; an unknown key is a hard error.
    checks: dict[str, bool] = {}
    for key in load_rubric_keys(rubric_text):
        evaluator = CHECKS.get(key)
        if evaluator is None:
            raise ValueError(f"unsupported rubric check: {key}")
        checks[key] = evaluator(artifact_text)

    passed_checks = sum(1 for passed in checks.values() if passed)
    total_checks = len(checks)
    result = {
        "score": float(passed_checks),
        "metrics": {
            "passed_checks": passed_checks,
            "total_checks": total_checks,
            "violation_count": total_checks - passed_checks,
        },
        "details": {
            "prompt": prompt_text.strip(),
            "checks": checks,
        },
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(result, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
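For orientation, a report this evaluator writes for the bundled `tasks/skill-quality/fixtures/SKILL.md` fixture should look roughly like the sketch below; the `prompt` field is truncated here, and with all four rubric checks passing the score is 4.0 and `violation_count` is 0:

```json
{
  "score": 4.0,
  "metrics": {
    "passed_checks": 4,
    "total_checks": 4,
    "violation_count": 0
  },
  "details": {
    "prompt": "# Skill Quality Prompt ...",
    "checks": {
      "title_line": true,
      "when_to_use_section": true,
      "steps_section": true,
      "numbered_step": true
    }
  }
}
```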
scripts/run_task.py (new file, 76 lines)
@@ -0,0 +1,76 @@
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

# Make the repository root importable so the engine package resolves when the
# script is invoked as `python scripts/run_task.py`.
ROOT_DIR = Path(__file__).resolve().parents[1]
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from engine.artifact_manager import ArtifactManager
from engine.decision_engine import decide_candidate
from engine.runner import run_command
from engine.scorer import parse_score_output
from engine.task_loader import load_task


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", required=True)
    return parser.parse_args()


def main() -> int:
    args = parse_args()
    root_dir = Path.cwd()
    task_path = (root_dir / args.task).resolve()
    task = load_task(task_path)

    artifact_manager = ArtifactManager(task)
    snapshot = artifact_manager.snapshot()

    # Run the task's evaluator, then the scorer, then decide whether to keep the candidate.
    run_result = run_command(
        task.runner.command,
        (root_dir / task.runner.cwd).resolve(),
        task.runner.timeout_seconds,
    )
    scorer_result = run_command(
        task.scorer.command,
        root_dir.resolve(),
        task.runner.timeout_seconds,
    )
    score_result = parse_score_output(
        scorer_result.stdout,
        score_field=task.scorer.parse.score_field,
        metrics_field=task.scorer.parse.metrics_field,
    )
    decision = decide_candidate(
        baseline=None,
        candidate=score_result,
        objective=task.objective,
        constraints=task.constraints,
        tie_breakers=task.policy.tie_breakers,
        run_result=run_result,
    )

    record = {
        "task_id": task.id,
        "status": decision.status,
        "reason": decision.reason,
        "candidate_score": decision.candidate_score,
        "diff_summary": artifact_manager.diff_summary(snapshot),
    }

    results_path = (root_dir / task.logging.results_file).resolve()
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with results_path.open("a", encoding="utf-8", newline="") as handle:
        handle.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(json.dumps(record, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
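Running `uv run python scripts/run_task.py --task tasks/skill-quality/task.yaml` from the repository root (the same invocation the CLI test below uses) should append a single JSON line to `work/results.jsonl`, roughly like the following; the values mirror the assertions in that test:

```json
{"task_id": "skill-quality", "status": "keep", "reason": "no baseline available", "candidate_score": 4.0, "diff_summary": ""}
```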
scripts/score_skill_task.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from __future__ import annotations

import argparse
from pathlib import Path


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    return parser.parse_args()


def main() -> int:
    args = parse_args()
    input_path = Path(args.input).resolve()
    # Relay the evaluator report verbatim so the scorer's stdout is the JSON itself.
    print(input_path.read_text(encoding="utf-8"), end="")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
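The scorer is deliberately a pass-through: `run_task.py` invokes it from the repository root and parses its stdout, which is just the evaluator's report. A minimal sketch of that hand-off, assuming `work/skill-run.json` was already written by `scripts/evaluate_skill_task.py` and that `python` resolves to the project interpreter:

```python
# Sketch only: exercise the scorer the same way run_task.py does, assuming
# work/skill-run.json already exists from a prior evaluator run.
import json
import subprocess

stdout = subprocess.run(
    ["python", "scripts/score_skill_task.py", "--input", "work/skill-run.json"],
    capture_output=True,
    text=True,
    check=True,
).stdout

report = json.loads(stdout)  # the same JSON the evaluator wrote
print(report["score"], report["metrics"]["violation_count"])
```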
tasks/skill-quality/fixtures/SKILL.md (new file, 15 lines)
@@ -0,0 +1,15 @@
# Deterministic Sample Skill

## Purpose

Provide a stable sample skill document for the execution pipeline.

## When to Use

Use this skill when you need a deterministic artifact for end-to-end testing.

## Steps

1. Read the task instructions.
2. Compare the skill against the rubric.
3. Return the computed score.
tasks/skill-quality/prompt.md (new file, 3 lines)
@@ -0,0 +1,3 @@
# Skill Quality Prompt

Evaluate `fixtures/SKILL.md` against `rubric.md` and write a JSON report with top-level `score` and `metrics` fields.
tasks/skill-quality/rubric.md (new file, 8 lines)
@@ -0,0 +1,8 @@
# Skill Quality Rubric

Each satisfied check is worth one point.

- title_line: The document starts with a level-one heading.
- when_to_use_section: The document contains a `## When to Use` section.
- steps_section: The document contains a `## Steps` section.
- numbered_step: The document contains at least one numbered step.
tasks/skill-quality/task.yaml (new file, 40 lines)
@@ -0,0 +1,40 @@
id: skill-quality
description: Deterministic sample task for scoring a skill document.
artifacts:
  include:
    - fixtures/SKILL.md
  exclude: []
  max_files_per_iteration: 1
mutation:
  mode: direct_edit
  allowed_file_types:
    - .md
  max_changed_lines: 20
runner:
  command: python ../../scripts/evaluate_skill_task.py --task-dir . --artifact fixtures/SKILL.md --output ../../work/skill-run.json
  cwd: tasks/skill-quality
  timeout_seconds: 30
scorer:
  type: command
  command: python scripts/score_skill_task.py --input work/skill-run.json
  parse:
    format: json
    score_field: score
    metrics_field: metrics
objective:
  primary_metric: score
  direction: maximize
constraints:
  - metric: violation_count
    op: <=
    value: 0
policy:
  keep_if: better_primary
  tie_breakers: []
  on_failure: discard
budget:
  max_iterations: 1
  max_failures: 1
logging:
  results_file: work/results.jsonl
  candidate_dir: work/candidates
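Two things about this wiring are worth calling out. The runner executes with `cwd: tasks/skill-quality`, so its output lands in `../../work/skill-run.json`, which is exactly the `work/skill-run.json` path the scorer (run from the repository root) reads back. And because each rubric check is worth one point while the constraint requires `violation_count <= 0`, a candidate is only kept when all four checks pass. A quick way to sanity-check the wiring, sketched using only the loader and fields that `scripts/run_task.py` already relies on, and assuming it is run from the repository root:

```python
# Sketch only: print the resolved wiring with the same loader and attributes
# that scripts/run_task.py uses; run from the repository root.
import sys
from pathlib import Path

root_dir = Path.cwd()
if str(root_dir) not in sys.path:
    sys.path.insert(0, str(root_dir))

from engine.task_loader import load_task

task = load_task((root_dir / "tasks/skill-quality/task.yaml").resolve())
print(task.id)                    # skill-quality
print(task.runner.command)        # evaluator command, executed with cwd=task.runner.cwd
print(task.scorer.command)        # scorer command, executed from the repository root
print(task.logging.results_file)  # work/results.jsonl
```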
@@ -1,4 +1,7 @@
import json
from pathlib import Path
import shutil
import subprocess
import tempfile
import unittest

@@ -58,5 +61,47 @@ class ExecutionPipelineTest(unittest.TestCase):
        self.assertIn("violation_count", decision.reason)


class RunTaskCliTest(unittest.TestCase):
    def test_run_task_cli_writes_results_jsonl(self) -> None:
        source_root = Path(__file__).resolve().parents[1]
        with tempfile.TemporaryDirectory() as tmp:
            temp_root = Path(tmp)
            # Copy the engine, scripts, and tasks into an isolated workspace so
            # the CLI run cannot pollute the repository checkout.
            shutil.copytree(
                source_root / "engine",
                temp_root / "engine",
                ignore=shutil.ignore_patterns("__pycache__"),
            )
            for relative_dir in ("scripts", "tasks"):
                source_dir = source_root / relative_dir
                if source_dir.exists():
                    shutil.copytree(
                        source_dir,
                        temp_root / relative_dir,
                        ignore=shutil.ignore_patterns("__pycache__"),
                    )

            completed = subprocess.run(
                ["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/skill-quality/task.yaml"],
                cwd=str(temp_root),
                capture_output=True,
                text=True,
                encoding="utf-8",
                check=False,
            )

            self.assertEqual(completed.returncode, 0, msg=completed.stderr)
            results_path = temp_root / "work" / "results.jsonl"
            self.assertTrue(results_path.exists())
            lines = results_path.read_text(encoding="utf-8").splitlines()
            self.assertEqual(len(lines), 1)

            record = json.loads(lines[0])
            self.assertEqual(record["task_id"], "skill-quality")
            self.assertEqual(record["status"], "keep")
            self.assertEqual(record["reason"], "no baseline available")
            self.assertEqual(record["candidate_score"], 4.0)
            self.assertEqual(record["diff_summary"], "")


if __name__ == "__main__":
    unittest.main()