CommonAutoRearsh/scripts/evaluate_skill_task.py

75 lines
2.3 KiB
Python

from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
def _matches_line(pattern: str):
    """Build a check that passes when *pattern* is found anywhere in the text."""
    compiled = re.compile(pattern)

    def check(text: str) -> bool:
        return compiled.search(text) is not None

    return check


def _starts_with_title(text: str) -> bool:
    """Pass when the first non-whitespace characters of the text are ``"# "``."""
    return text.lstrip().startswith("# ")


# Registry of the rubric checks this evaluator supports. Each key is a check
# name as it appears in a rubric; each value takes the artifact text and
# returns True when the check passes.
CHECKS = {
    "title_line": _starts_with_title,
    "when_to_use_section": _matches_line(r"(?m)^## When to Use\s*$"),
    "steps_section": _matches_line(r"(?m)^## Steps\s*$"),
    "numbered_step": _matches_line(r"(?m)^1\. "),
}
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--task-dir", required=True)
parser.add_argument("--artifact", required=True)
parser.add_argument("--output", required=True)
return parser.parse_args()
def load_rubric_keys(rubric_text: str) -> list[str]:
    """Extract check names from the bullet lines of a rubric.

    Only lines that begin with ``"- "`` in the first column are considered.
    The key is the text between that marker and the first ``":"`` (or the
    whole remainder of the line when there is no colon), stripped of
    surrounding whitespace. Bullets whose key is empty are skipped.

    Args:
        rubric_text: Full text of the rubric.

    Returns:
        Keys in order of appearance; duplicates are kept.
    """
    bullet = "- "
    names = (
        line[len(bullet):].partition(":")[0].strip()
        for line in rubric_text.splitlines()
        if line.startswith(bullet)
    )
    return [name for name in names if name]
def main() -> int:
    """Evaluate one artifact against its task rubric and write a JSON report.

    Reads ``prompt.md`` and ``rubric.md`` from ``--task-dir`` and the file
    named by ``--artifact`` (joined onto the task directory), runs every
    check the rubric lists, and writes the result to ``--output``, creating
    the output's parent directories when needed.

    The report holds ``score`` (the number of passed checks as a float, not
    a ratio), ``metrics`` (passed, total and violation counts) and
    ``details`` (the stripped prompt text and the per-check outcomes).

    Returns:
        ``0`` once the report has been written.

    Raises:
        ValueError: If the rubric names a check that is not in ``CHECKS``.
    """
    args = parse_args()
    task_dir = Path(args.task_dir).resolve()
    artifact_path = (task_dir / args.artifact).resolve()
    output_path = Path(args.output).resolve()

    # "utf-8-sig" decodes ordinary UTF-8 and additionally drops a leading
    # byte-order mark. With plain "utf-8" the BOM stays in the text, which
    # makes the title check fail on an otherwise valid artifact and hides the
    # first rubric bullet when the rubric starts with one.
    input_encoding = "utf-8-sig"
    prompt_text = (task_dir / "prompt.md").read_text(encoding=input_encoding)
    rubric_text = (task_dir / "rubric.md").read_text(encoding=input_encoding)
    artifact_text = artifact_path.read_text(encoding=input_encoding)

    checks: dict[str, bool] = {}
    for key in load_rubric_keys(rubric_text):
        evaluator = CHECKS.get(key)
        if evaluator is None:
            # Fail loudly rather than silently skipping a rubric line.
            raise ValueError(f"unsupported rubric check: {key}")
        checks[key] = evaluator(artifact_text)

    total_checks = len(checks)
    passed_checks = sum(1 for passed in checks.values() if passed)
    result = {
        "score": float(passed_checks),
        "metrics": {
            "passed_checks": passed_checks,
            "total_checks": total_checks,
            "violation_count": total_checks - passed_checks,
        },
        "details": {
            "prompt": prompt_text.strip(),
            "checks": checks,
        },
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    report = json.dumps(result, ensure_ascii=False, indent=2) + "\n"
    output_path.write_text(report, encoding="utf-8")
    return 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    raise SystemExit(main())