feat: add artifact loop cli and sample skill task

sladro 2026-04-02 12:13:17 +08:00
parent 5a51d25791
commit dfd668e5d2
8 changed files with 282 additions and 0 deletions

74
scripts/evaluate_skill_task.py Normal file
View File

@@ -0,0 +1,74 @@
from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
CHECKS = {
    "title_line": lambda text: text.lstrip().startswith("# "),
    "when_to_use_section": lambda text: bool(re.search(r"(?m)^## When to Use\s*$", text)),
    "steps_section": lambda text: bool(re.search(r"(?m)^## Steps\s*$", text)),
    "numbered_step": lambda text: bool(re.search(r"(?m)^1\. ", text)),
}


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--task-dir", required=True)
    parser.add_argument("--artifact", required=True)
    parser.add_argument("--output", required=True)
    return parser.parse_args()


def load_rubric_keys(rubric_text: str) -> list[str]:
    keys: list[str] = []
    for line in rubric_text.splitlines():
        if not line.startswith("- "):
            continue
        key = line[2:].split(":", 1)[0].strip()
        if key:
            keys.append(key)
    return keys


def main() -> int:
    args = parse_args()
    task_dir = Path(args.task_dir).resolve()
    artifact_path = (task_dir / args.artifact).resolve()
    output_path = Path(args.output).resolve()
    prompt_text = (task_dir / "prompt.md").read_text(encoding="utf-8")
    rubric_text = (task_dir / "rubric.md").read_text(encoding="utf-8")
    artifact_text = artifact_path.read_text(encoding="utf-8")
    checks: dict[str, bool] = {}
    for key in load_rubric_keys(rubric_text):
        evaluator = CHECKS.get(key)
        if evaluator is None:
            raise ValueError(f"unsupported rubric check: {key}")
        checks[key] = evaluator(artifact_text)
    passed_checks = sum(1 for passed in checks.values() if passed)
    total_checks = len(checks)
    result = {
        "score": float(passed_checks),
        "metrics": {
            "passed_checks": passed_checks,
            "total_checks": total_checks,
            "violation_count": total_checks - passed_checks,
        },
        "details": {
            "prompt": prompt_text.strip(),
            "checks": checks,
        },
    }
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(result, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

76
scripts/run_task.py Normal file
View File

@@ -0,0 +1,76 @@
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
# Make the repository root importable when this script is run directly.
ROOT_DIR = Path(__file__).resolve().parents[1]
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from engine.artifact_manager import ArtifactManager
from engine.decision_engine import decide_candidate
from engine.runner import run_command
from engine.scorer import parse_score_output
from engine.task_loader import load_task
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", required=True)
    return parser.parse_args()


def main() -> int:
    args = parse_args()
    root_dir = Path.cwd()
    task_path = (root_dir / args.task).resolve()
    task = load_task(task_path)

    # Snapshot the task artifacts, run the task command, then score its output.
    artifact_manager = ArtifactManager(task)
    snapshot = artifact_manager.snapshot()
    run_result = run_command(
        task.runner.command,
        (root_dir / task.runner.cwd).resolve(),
        task.runner.timeout_seconds,
    )
    scorer_result = run_command(
        task.scorer.command,
        root_dir.resolve(),
        task.runner.timeout_seconds,
    )
    score_result = parse_score_output(
        scorer_result.stdout,
        score_field=task.scorer.parse.score_field,
        metrics_field=task.scorer.parse.metrics_field,
    )

    # Decide whether to keep the candidate; there is no baseline on a first run.
    decision = decide_candidate(
        baseline=None,
        candidate=score_result,
        objective=task.objective,
        constraints=task.constraints,
        tie_breakers=task.policy.tie_breakers,
        run_result=run_result,
    )

    # Append one JSON line per run to the configured results file.
    record = {
        "task_id": task.id,
        "status": decision.status,
        "reason": decision.reason,
        "candidate_score": decision.candidate_score,
        "diff_summary": artifact_manager.diff_summary(snapshot),
    }
    results_path = (root_dir / task.logging.results_file).resolve()
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with results_path.open("a", encoding="utf-8", newline="") as handle:
        handle.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(json.dumps(record, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

21
scripts/score_skill_task.py Normal file
View File

@@ -0,0 +1,21 @@
from __future__ import annotations
import argparse
from pathlib import Path
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    return parser.parse_args()


def main() -> int:
    # Echo the evaluator's JSON report so the scorer's stdout can be parsed.
    args = parse_args()
    input_path = Path(args.input).resolve()
    print(input_path.read_text(encoding="utf-8"), end="")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

15
tasks/skill-quality/fixtures/SKILL.md Normal file
View File

@@ -0,0 +1,15 @@
# Deterministic Sample Skill

## Purpose

Provide a stable sample skill document for the execution pipeline.

## When to Use

Use this skill when you need a deterministic artifact for end-to-end testing.

## Steps

1. Read the task instructions.
2. Compare the skill against the rubric.
3. Return the computed score.

3
tasks/skill-quality/prompt.md Normal file
View File

@@ -0,0 +1,3 @@
# Skill Quality Prompt

Evaluate `fixtures/SKILL.md` against `rubric.md` and write a JSON report with top-level `score` and `metrics` fields.
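For illustration only (this block is not part of prompt.md): the report that `scripts/evaluate_skill_task.py` writes for this prompt has roughly the shape below, assuming the sample fixture satisfies all four rubric checks.

```python
# Sketch of the report written to work/skill-run.json; values are illustrative.
report = {
    "score": 4.0,
    "metrics": {
        "passed_checks": 4,
        "total_checks": 4,
        "violation_count": 0,
    },
    "details": {
        "prompt": "# Skill Quality Prompt ...",  # stripped prompt.md text
        "checks": {
            "title_line": True,
            "when_to_use_section": True,
            "steps_section": True,
            "numbered_step": True,
        },
    },
}
```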

8
tasks/skill-quality/rubric.md Normal file
View File

@@ -0,0 +1,8 @@
# Skill Quality Rubric

Each satisfied check is worth one point.

- title_line: The document starts with a level-one heading.
- when_to_use_section: The document contains a `## When to Use` section.
- steps_section: The document contains a `## Steps` section.
- numbered_step: The document contains at least one numbered step.
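To make the scoring concrete, here is a small self-contained sketch (not part of rubric.md) of how these four checks score an abbreviated copy of the sample fixture; the expressions mirror CHECKS in `scripts/evaluate_skill_task.py`.

```python
import re

# Abbreviated stand-in for tasks/skill-quality/fixtures/SKILL.md.
sample = """# Deterministic Sample Skill

## When to Use

Use this skill when you need a deterministic artifact for end-to-end testing.

## Steps

1. Read the task instructions.
"""

checks = {
    "title_line": sample.lstrip().startswith("# "),
    "when_to_use_section": bool(re.search(r"(?m)^## When to Use\s*$", sample)),
    "steps_section": bool(re.search(r"(?m)^## Steps\s*$", sample)),
    "numbered_step": bool(re.search(r"(?m)^1\. ", sample)),
}

print(float(sum(checks.values())))  # 4 passed checks -> score 4.0, violation_count 0
```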

40
tasks/skill-quality/task.yaml Normal file
View File

@@ -0,0 +1,40 @@
id: skill-quality
description: Deterministic sample task for scoring a skill document.
artifacts:
  include:
    - fixtures/SKILL.md
  exclude: []
  max_files_per_iteration: 1
mutation:
  mode: direct_edit
  allowed_file_types:
    - .md
  max_changed_lines: 20
runner:
  command: python ../../scripts/evaluate_skill_task.py --task-dir . --artifact fixtures/SKILL.md --output ../../work/skill-run.json
  cwd: tasks/skill-quality
  timeout_seconds: 30
scorer:
  type: command
  command: python scripts/score_skill_task.py --input work/skill-run.json
  parse:
    format: json
    score_field: score
    metrics_field: metrics
objective:
  primary_metric: score
  direction: maximize
constraints:
  - metric: violation_count
    op: <=
    value: 0
policy:
  keep_if: better_primary
  tie_breakers: []
  on_failure: discard
budget:
  max_iterations: 1
  max_failures: 1
logging:
  results_file: work/results.jsonl
  candidate_dir: work/candidates
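As a usage note (not part of task.yaml): running `uv run python scripts/run_task.py --task tasks/skill-quality/task.yaml` from the repository root executes the runner and scorer configured above and appends one JSON line to `work/results.jsonl`. A sketch of the expected record for a first run, with values taken from the CLI test below:

```python
# Expected results.jsonl record for a clean first run (no baseline yet).
record = {
    "task_id": "skill-quality",
    "status": "keep",                  # kept because there is no baseline to beat
    "reason": "no baseline available",
    "candidate_score": 4.0,            # all four rubric checks pass
    "diff_summary": "",                # the artifact was not modified
}
```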

View File

@@ -1,4 +1,7 @@
import json
from pathlib import Path
import shutil
import subprocess
import tempfile
import unittest
@@ -58,5 +61,47 @@ class ExecutionPipelineTest(unittest.TestCase):
        self.assertIn("violation_count", decision.reason)
class RunTaskCliTest(unittest.TestCase):
    def test_run_task_cli_writes_results_jsonl(self) -> None:
        source_root = Path(__file__).resolve().parents[1]
        with tempfile.TemporaryDirectory() as tmp:
            temp_root = Path(tmp)
            shutil.copytree(
                source_root / "engine",
                temp_root / "engine",
                ignore=shutil.ignore_patterns("__pycache__"),
            )
            for relative_dir in ("scripts", "tasks"):
                source_dir = source_root / relative_dir
                if source_dir.exists():
                    shutil.copytree(
                        source_dir,
                        temp_root / relative_dir,
                        ignore=shutil.ignore_patterns("__pycache__"),
                    )
            completed = subprocess.run(
                ["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/skill-quality/task.yaml"],
                cwd=str(temp_root),
                capture_output=True,
                text=True,
                encoding="utf-8",
                check=False,
            )
            self.assertEqual(completed.returncode, 0, msg=completed.stderr)
            results_path = temp_root / "work" / "results.jsonl"
            self.assertTrue(results_path.exists())
            lines = results_path.read_text(encoding="utf-8").splitlines()
            self.assertEqual(len(lines), 1)
            record = json.loads(lines[0])
            self.assertEqual(record["task_id"], "skill-quality")
            self.assertEqual(record["status"], "keep")
            self.assertEqual(record["reason"], "no baseline available")
            self.assertEqual(record["candidate_score"], 4.0)
            self.assertEqual(record["diff_summary"], "")


if __name__ == "__main__":
    unittest.main()