# CommonAutoRearsh/tasks/skill-quality/task.yaml
#
# Viewer metadata: 47 lines, 1.1 KiB, YAML

# Task identity: a short identifier plus a one-line human-readable summary.
id: 'skill-quality'
description: 'Deterministic sample task for scoring a skill document.'
# Artifact selection: path lists plus a per-iteration file cap.
# NOTE(review): nesting was reconstructed from a source that had lost its
# indentation — confirm `max_files_per_iteration` belongs under `artifacts`.
artifacts:
  include:
    - fixtures/SKILL.md
  # Explicit empty list (a bare `exclude:` would parse as null).
  exclude: []
  max_files_per_iteration: 1
# Mutation limits: edit mode, permitted file extensions, and a changed-line cap.
# NOTE(review): nesting was reconstructed from a source that had lost its
# indentation — confirm against the task schema.
mutation:
  mode: direct_edit
  allowed_file_types:
    - .md
  max_changed_lines: 40
# Mutator: command-type step that invokes scripts/mutate_skill_task.py.
# Its keys are nested here so that `type`, `command`, `cwd` and
# `timeout_seconds` do not collide with the same-named keys of other sections.
# NOTE(review): `mutator` is kept as a top-level section (not a child of
# `mutation`); the flattened source does not show its level — confirm.
mutator:
  type: command
  # Paths in the command are relative to `cwd`: `../../scripts` climbs two
  # levels out of tasks/skill-quality, and `--task-dir .` is the cwd itself.
  command: "python ../../scripts/mutate_skill_task.py --task-dir . --artifact fixtures/SKILL.md"
  cwd: "tasks/skill-quality"
  timeout_seconds: 30
runner:
command: python ../../scripts/evaluate_skill_task.py --task-dir . --artifact fixtures/SKILL.md --output ../../work/skill-run.json
cwd: "tasks/skill-quality"
timeout_seconds: 30
# Scorer: command-type step that invokes scripts/score_skill_task.py on
# work/skill-run.json. Its keys are nested here so that `type`, `command` and
# `timeout_seconds` do not collide with the same-named keys of other sections.
# No `cwd` is set and the paths carry no `../../` prefix — presumably the
# command runs from the repository root; confirm.
scorer:
  type: command
  command: python scripts/score_skill_task.py --input work/skill-run.json
  timeout_seconds: 30
  # NOTE(review): `parse` is nested under `scorer` on the assumption that it
  # describes how to read the scorer command's JSON output; the flattened
  # source does not show its level — confirm against the task schema.
  parse:
    format: json
    score_field: score
    metrics_field: metrics
# Optimization target: maximize the metric named `score`.
objective:
  primary_metric: score
  direction: maximize
# Metric constraints; each entry is a metric name, a comparison operator and
# a bound. The single entry here reads: violation_count <= 0.
# `op` and `value` are indented into the list item so they stay part of the
# same constraint as `metric`.
constraints:
  - metric: violation_count
    op: "<="  # quoted so the operator is unambiguously a string
    value: 0
# Keep/discard policy settings.
# NOTE(review): the meanings of `better_primary` and `discard` are defined by
# the consuming tool, not by this file — verify there.
policy:
  keep_if: better_primary
  # Explicit empty list (a bare `tie_breakers:` would parse as null).
  tie_breakers: []
  on_failure: discard
# Run budget: caps on iterations and on failures.
budget:
  max_iterations: 5
  max_failures: 3
# Output locations: a JSON Lines results file and a candidates directory,
# both under work/.
logging:
  results_file: work/results.jsonl
  candidate_dir: work/candidates