feat: add artifact loop cli and sample skill task
parent 5a51d25791
commit dfd668e5d2
scripts/evaluate_skill_task.py (new file, 74 lines)
@@ -0,0 +1,74 @@
from __future__ import annotations

import argparse
import json
import re
from pathlib import Path


# Rubric keys map to deterministic checks against the artifact text.
CHECKS = {
    "title_line": lambda text: text.lstrip().startswith("# "),
    "when_to_use_section": lambda text: bool(re.search(r"(?m)^## When to Use\s*$", text)),
    "steps_section": lambda text: bool(re.search(r"(?m)^## Steps\s*$", text)),
    "numbered_step": lambda text: bool(re.search(r"(?m)^1\. ", text)),
}


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--task-dir", required=True)
    parser.add_argument("--artifact", required=True)
    parser.add_argument("--output", required=True)
    return parser.parse_args()


def load_rubric_keys(rubric_text: str) -> list[str]:
    # Collect check keys from rubric bullets of the form "- key: description".
    keys: list[str] = []
    for line in rubric_text.splitlines():
        if not line.startswith("- "):
            continue
        key = line[2:].split(":", 1)[0].strip()
        if key:
            keys.append(key)
    return keys


def main() -> int:
    args = parse_args()
    task_dir = Path(args.task_dir).resolve()
    artifact_path = (task_dir / args.artifact).resolve()
    output_path = Path(args.output).resolve()

    prompt_text = (task_dir / "prompt.md").read_text(encoding="utf-8")
    rubric_text = (task_dir / "rubric.md").read_text(encoding="utf-8")
    artifact_text = artifact_path.read_text(encoding="utf-8")

    # Run only the checks named in the rubric; an unknown key is a hard error.
    checks: dict[str, bool] = {}
    for key in load_rubric_keys(rubric_text):
        evaluator = CHECKS.get(key)
        if evaluator is None:
            raise ValueError(f"unsupported rubric check: {key}")
        checks[key] = evaluator(artifact_text)

    passed_checks = sum(1 for passed in checks.values() if passed)
    total_checks = len(checks)
    result = {
        "score": float(passed_checks),
        "metrics": {
            "passed_checks": passed_checks,
            "total_checks": total_checks,
            "violation_count": total_checks - passed_checks,
        },
        "details": {
            "prompt": prompt_text.strip(),
            "checks": checks,
        },
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(result, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
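For orientation, a report this evaluator writes for the bundled `tasks/skill-quality/fixtures/SKILL.md` fixture should look roughly like the sketch below; the `prompt` field is truncated here, and with all four rubric checks passing the score is 4.0 and `violation_count` is 0:

```json
{
  "score": 4.0,
  "metrics": {
    "passed_checks": 4,
    "total_checks": 4,
    "violation_count": 0
  },
  "details": {
    "prompt": "# Skill Quality Prompt ...",
    "checks": {
      "title_line": true,
      "when_to_use_section": true,
      "steps_section": true,
      "numbered_step": true
    }
  }
}
```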
scripts/run_task.py (new file, 76 lines)
@@ -0,0 +1,76 @@
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

# Make the repository root importable so the engine package resolves when the
# script is invoked as `python scripts/run_task.py`.
ROOT_DIR = Path(__file__).resolve().parents[1]
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from engine.artifact_manager import ArtifactManager
from engine.decision_engine import decide_candidate
from engine.runner import run_command
from engine.scorer import parse_score_output
from engine.task_loader import load_task


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", required=True)
    return parser.parse_args()


def main() -> int:
    args = parse_args()
    root_dir = Path.cwd()
    task_path = (root_dir / args.task).resolve()
    task = load_task(task_path)

    artifact_manager = ArtifactManager(task)
    snapshot = artifact_manager.snapshot()

    # Run the task's evaluator, then the scorer, then decide whether to keep the candidate.
    run_result = run_command(
        task.runner.command,
        (root_dir / task.runner.cwd).resolve(),
        task.runner.timeout_seconds,
    )
    scorer_result = run_command(
        task.scorer.command,
        root_dir.resolve(),
        task.runner.timeout_seconds,
    )
    score_result = parse_score_output(
        scorer_result.stdout,
        score_field=task.scorer.parse.score_field,
        metrics_field=task.scorer.parse.metrics_field,
    )
    decision = decide_candidate(
        baseline=None,
        candidate=score_result,
        objective=task.objective,
        constraints=task.constraints,
        tie_breakers=task.policy.tie_breakers,
        run_result=run_result,
    )

    record = {
        "task_id": task.id,
        "status": decision.status,
        "reason": decision.reason,
        "candidate_score": decision.candidate_score,
        "diff_summary": artifact_manager.diff_summary(snapshot),
    }

    results_path = (root_dir / task.logging.results_file).resolve()
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with results_path.open("a", encoding="utf-8", newline="") as handle:
        handle.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(json.dumps(record, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
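Running `uv run python scripts/run_task.py --task tasks/skill-quality/task.yaml` from the repository root (the same invocation the CLI test below uses) should append a single JSON line to `work/results.jsonl`, roughly like the following; the values mirror the assertions in that test:

```json
{"task_id": "skill-quality", "status": "keep", "reason": "no baseline available", "candidate_score": 4.0, "diff_summary": ""}
```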
scripts/score_skill_task.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from __future__ import annotations

import argparse
from pathlib import Path


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    return parser.parse_args()


def main() -> int:
    args = parse_args()
    input_path = Path(args.input).resolve()
    # Relay the evaluator report verbatim so the scorer's stdout is the JSON itself.
    print(input_path.read_text(encoding="utf-8"), end="")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
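The scorer is deliberately a pass-through: `run_task.py` invokes it from the repository root and parses its stdout, which is just the evaluator's report. A minimal sketch of that hand-off, assuming `work/skill-run.json` was already written by `scripts/evaluate_skill_task.py` and that `python` resolves to the project interpreter:

```python
# Sketch only: exercise the scorer the same way run_task.py does, assuming
# work/skill-run.json already exists from a prior evaluator run.
import json
import subprocess

stdout = subprocess.run(
    ["python", "scripts/score_skill_task.py", "--input", "work/skill-run.json"],
    capture_output=True,
    text=True,
    check=True,
).stdout

report = json.loads(stdout)  # the same JSON the evaluator wrote
print(report["score"], report["metrics"]["violation_count"])
```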
tasks/skill-quality/fixtures/SKILL.md (new file, 15 lines)
@@ -0,0 +1,15 @@
# Deterministic Sample Skill

## Purpose

Provide a stable sample skill document for the execution pipeline.

## When to Use

Use this skill when you need a deterministic artifact for end-to-end testing.

## Steps

1. Read the task instructions.
2. Compare the skill against the rubric.
3. Return the computed score.
tasks/skill-quality/prompt.md (new file, 3 lines)
@@ -0,0 +1,3 @@
# Skill Quality Prompt

Evaluate `fixtures/SKILL.md` against `rubric.md` and write a JSON report with top-level `score` and `metrics` fields.
tasks/skill-quality/rubric.md (new file, 8 lines)
@@ -0,0 +1,8 @@
# Skill Quality Rubric

Each satisfied check is worth one point.

- title_line: The document starts with a level-one heading.
- when_to_use_section: The document contains a `## When to Use` section.
- steps_section: The document contains a `## Steps` section.
- numbered_step: The document contains at least one numbered step.
tasks/skill-quality/task.yaml (new file, 40 lines)
@@ -0,0 +1,40 @@
id: skill-quality
description: Deterministic sample task for scoring a skill document.
artifacts:
  include:
    - fixtures/SKILL.md
  exclude: []
  max_files_per_iteration: 1
mutation:
  mode: direct_edit
  allowed_file_types:
    - .md
  max_changed_lines: 20
runner:
  command: python ../../scripts/evaluate_skill_task.py --task-dir . --artifact fixtures/SKILL.md --output ../../work/skill-run.json
  cwd: tasks/skill-quality
  timeout_seconds: 30
scorer:
  type: command
  command: python scripts/score_skill_task.py --input work/skill-run.json
  parse:
    format: json
    score_field: score
    metrics_field: metrics
objective:
  primary_metric: score
  direction: maximize
constraints:
  - metric: violation_count
    op: <=
    value: 0
policy:
  keep_if: better_primary
  tie_breakers: []
  on_failure: discard
budget:
  max_iterations: 1
  max_failures: 1
logging:
  results_file: work/results.jsonl
  candidate_dir: work/candidates
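Two things about this wiring are worth calling out. The runner executes with `cwd: tasks/skill-quality`, so its output lands in `../../work/skill-run.json`, which is exactly the `work/skill-run.json` path the scorer (run from the repository root) reads back. And because each rubric check is worth one point while the constraint requires `violation_count <= 0`, a candidate is only kept when all four checks pass. A quick way to sanity-check the wiring, sketched using only the loader and fields that `scripts/run_task.py` already relies on, and assuming it is run from the repository root:

```python
# Sketch only: print the resolved wiring with the same loader and attributes
# that scripts/run_task.py uses; run from the repository root.
import sys
from pathlib import Path

root_dir = Path.cwd()
if str(root_dir) not in sys.path:
    sys.path.insert(0, str(root_dir))

from engine.task_loader import load_task

task = load_task((root_dir / "tasks/skill-quality/task.yaml").resolve())
print(task.id)                    # skill-quality
print(task.runner.command)        # evaluator command, executed with cwd=task.runner.cwd
print(task.scorer.command)        # scorer command, executed from the repository root
print(task.logging.results_file)  # work/results.jsonl
```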
@@ -1,4 +1,7 @@
import json
from pathlib import Path
import shutil
import subprocess
import tempfile
import unittest

@@ -58,5 +61,47 @@ class ExecutionPipelineTest(unittest.TestCase):
        self.assertIn("violation_count", decision.reason)


class RunTaskCliTest(unittest.TestCase):
    def test_run_task_cli_writes_results_jsonl(self) -> None:
        source_root = Path(__file__).resolve().parents[1]
        with tempfile.TemporaryDirectory() as tmp:
            temp_root = Path(tmp)
            # Copy the engine, scripts, and tasks into an isolated workspace so
            # the CLI run cannot pollute the repository checkout.
            shutil.copytree(
                source_root / "engine",
                temp_root / "engine",
                ignore=shutil.ignore_patterns("__pycache__"),
            )
            for relative_dir in ("scripts", "tasks"):
                source_dir = source_root / relative_dir
                if source_dir.exists():
                    shutil.copytree(
                        source_dir,
                        temp_root / relative_dir,
                        ignore=shutil.ignore_patterns("__pycache__"),
                    )

            completed = subprocess.run(
                ["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/skill-quality/task.yaml"],
                cwd=str(temp_root),
                capture_output=True,
                text=True,
                encoding="utf-8",
                check=False,
            )

            self.assertEqual(completed.returncode, 0, msg=completed.stderr)
            results_path = temp_root / "work" / "results.jsonl"
            self.assertTrue(results_path.exists())
            lines = results_path.read_text(encoding="utf-8").splitlines()
            self.assertEqual(len(lines), 1)

            record = json.loads(lines[0])
            self.assertEqual(record["task_id"], "skill-quality")
            self.assertEqual(record["status"], "keep")
            self.assertEqual(record["reason"], "no baseline available")
            self.assertEqual(record["candidate_score"], 4.0)
            self.assertEqual(record["diff_summary"], "")


if __name__ == "__main__":
    unittest.main()