Baseline-Aware Single-Iteration Orchestrator Implementation Plan
For agentic workers: REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
Goal: Add a baseline-aware single-iteration orchestrator that generates one candidate in a sandbox, validates it against mutation budgets, runs and scores it in isolation, then keeps or discards the candidate without corrupting the main workspace.
Architecture: Extend the task schema with a mutator section, then introduce an engine/orchestrator.py layer that owns sandbox lifecycle and candidate sync-back. Keep the existing loader, artifact manager, runner, scorer, and decision engine modules as foundations, but move orchestration decisions into the new layer and have scripts/run_task.py become a thin entrypoint.
Tech Stack: Python 3.10+, standard library, PyYAML, uv, unittest
File Map
New Files
- `engine/orchestrator.py` - baseline-aware single-iteration orchestration
- `scripts/mutate_skill_task.py` - deterministic sample mutator for the `skill-quality` task
- `tests/test_orchestrator.py` - sandbox keep/discard/crash coverage
Modified Files
- `engine/models.py` - add `MutatorSpec` and extend `TaskSpec`
- `engine/task_loader.py` - parse and validate `mutator`
- `scripts/run_task.py` - delegate to the orchestrator instead of the hand-rolled flow
- `tasks/skill-quality/task.yaml` - add a concrete `mutator`
- `README.md` - document the baseline-aware single-iteration behavior
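For orientation, the surface this plan builds toward looks roughly like the sketch below; the signature mirrors the code written in Task 2 and wired to the CLI in Task 4, and is indicative only.

```python
# Sketch only: the concrete implementation lands in engine/orchestrator.py (Task 2)
# and is called from scripts/run_task.py (Task 4).
from engine.models import DecisionResult, ScoreResult, TaskSpec


def run_single_iteration(task: TaskSpec, baseline_score: float | ScoreResult | None) -> DecisionResult:
    """Copy the workspace into a sandbox, mutate, validate, run, score, then keep or discard."""
    ...
```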
Task 1: Extend The Task Schema For Mutators
Files:
- Modify: `engine/models.py`
- Modify: `engine/task_loader.py`
- Modify: `tests/test_task_loader.py`
- Test: `tests/test_task_loader.py`

- Step 1: Add a failing mutator test to the task loader suite
# tests/test_task_loader.py
from pathlib import Path
import tempfile
import unittest
from engine.task_loader import TaskValidationError, load_task
VALID_TASK = """
id: demo
description: Demo task
artifacts:
include:
- tasks/demo/sample.txt
exclude: []
max_files_per_iteration: 1
mutation:
mode: direct_edit
allowed_file_types: [".txt"]
max_changed_lines: 10
mutator:
type: command
command: "python scripts/mutate.py"
cwd: "."
timeout_seconds: 30
runner:
command: "python -c \\\"print('run')\\\""
cwd: "."
timeout_seconds: 10
scorer:
type: command
command: "python -c \\\"import json; print(json.dumps({'score': 1, 'metrics': {'violation_count': 0}}))\\\""
parse:
format: json
score_field: "score"
metrics_field: "metrics"
objective:
primary_metric: score
direction: maximize
constraints:
- metric: violation_count
op: "<="
value: 0
policy:
keep_if: better_primary
tie_breakers: []
on_failure: discard
budget:
max_iterations: 3
max_failures: 1
logging:
results_file: work/results.jsonl
candidate_dir: work/candidates
"""
class TaskLoaderTest(unittest.TestCase):
def write_task(self, content: str) -> Path:
temp_dir = tempfile.TemporaryDirectory()
self.addCleanup(temp_dir.cleanup)
task_path = Path(temp_dir.name) / "task.yaml"
task_path.write_text(content, encoding="utf-8")
return task_path
def test_loads_minimal_task(self) -> None:
task = load_task(self.write_task(VALID_TASK))
self.assertEqual(task.id, "demo")
self.assertEqual(task.artifacts.max_files_per_iteration, 1)
self.assertEqual(task.constraints[0].metric, "violation_count")
def test_loads_mutator_spec(self) -> None:
task = load_task(self.write_task(VALID_TASK))
self.assertEqual(task.mutator.type, "command")
self.assertEqual(task.mutator.command, "python scripts/mutate.py")
self.assertEqual(task.mutator.timeout_seconds, 30)
def test_rejects_missing_required_section(self) -> None:
content = VALID_TASK.replace("objective:\n primary_metric: score\n direction: maximize\n", "")
with self.assertRaises(TaskValidationError) as ctx:
load_task(self.write_task(content))
self.assertIn("objective", str(ctx.exception))
def test_rejects_invalid_direction(self) -> None:
content = VALID_TASK.replace("direction: maximize", "direction: sideways")
with self.assertRaises(TaskValidationError) as ctx:
load_task(self.write_task(content))
self.assertIn("direction", str(ctx.exception))
def test_rejects_invalid_mutator_type(self) -> None:
content = VALID_TASK.replace("type: command", "type: agent", 1)
with self.assertRaises(TaskValidationError) as ctx:
load_task(self.write_task(content))
self.assertIn("mutator.type", str(ctx.exception))
if __name__ == "__main__":
unittest.main()
- Step 2: Run the mutation tests to verify they fail
Run: uv run python -m unittest tests.test_mutation_engine -v
Expected: FAIL because validate_candidate_changes does not accept a candidate root yet
- Step 3: Make validation compare baseline snapshot to candidate workspace
# engine/mutation_engine.py
from __future__ import annotations
from difflib import unified_diff
from pathlib import Path
from engine.models import BaselineSnapshot, TaskSpec
class MutationValidationError(ValueError):
pass
def _count_changed_lines(before: str, after: str, path: Path) -> int:
diff = unified_diff(
before.splitlines(keepends=True),
after.splitlines(keepends=True),
fromfile=f"{path.as_posix()} (before)",
tofile=f"{path.as_posix()} (after)",
)
changed_lines = 0
for line in diff:
if line.startswith(("---", "+++", "@@")):
continue
if line.startswith(("+", "-")):
changed_lines += 1
return changed_lines
def validate_candidate_changes(task: TaskSpec, snapshot: BaselineSnapshot, candidate_root: Path) -> None:
changed_files = 0
changed_lines = 0
allowed_file_types = set(task.mutation.allowed_file_types)
for baseline_path, baseline_text in snapshot.file_contents.items():
relative = baseline_path.relative_to(task.root_dir)
candidate_path = candidate_root / relative
current_text = candidate_path.read_text(encoding="utf-8") if candidate_path.exists() else ""
if current_text == baseline_text:
continue
changed_files += 1
if candidate_path.suffix not in allowed_file_types:
raise MutationValidationError(f"disallowed file type: {candidate_path.suffix}")
changed_lines += _count_changed_lines(baseline_text, current_text, candidate_path)
for path in sorted(candidate_root.rglob("*")):
if not path.is_file():
continue
relative = path.relative_to(candidate_root)
baseline_path = task.root_dir / relative
if baseline_path in snapshot.file_contents:
continue
changed_files += 1
if path.suffix not in allowed_file_types:
raise MutationValidationError(f"disallowed file type: {path.suffix}")
changed_lines += _count_changed_lines("", path.read_text(encoding="utf-8"), path)
if changed_files > task.artifacts.max_files_per_iteration:
raise MutationValidationError(
f"too many changed files: {changed_files} > {task.artifacts.max_files_per_iteration}"
)
if changed_lines > task.mutation.max_changed_lines:
raise MutationValidationError(
f"too many changed lines: {changed_lines} > {task.mutation.max_changed_lines}"
)
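For reference, the counting rule above charges one change per added or removed diff line, so replacing a line costs two. A quick scratch check (not one of the plan's files):

```python
# Scratch illustration of _count_changed_lines from engine/mutation_engine.py above.
from pathlib import Path

from engine.mutation_engine import _count_changed_lines

# One appended line counts as 1 changed line.
assert _count_changed_lines("line 1\nline 2\n", "line 1\nline 2\nline 3\n", Path("note.md")) == 1
# Replacing a line emits one "-" and one "+", so it counts as 2.
assert _count_changed_lines("line 1\nold\n", "line 1\nnew\n", Path("note.md")) == 2
```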
- Step 4: Run the mutation tests to verify they pass
Run: uv run python -m unittest tests.test_mutation_engine -v
Expected: OK
- Step 5: Commit the baseline-aware mutation validation
git add engine/mutation_engine.py tests/test_mutation_engine.py
git commit -m "feat: compare candidate workspace against baseline snapshot"
Task 4: Wire The CLI To The Orchestrator
Files:
- Modify: `scripts/run_task.py`
- Modify: `tests/test_execution_pipeline.py`
- Test: `tests/test_execution_pipeline.py`

- Step 1: Add a failing sandbox orchestration test
# tests/test_execution_pipeline.py
from pathlib import Path
import json
import shutil
import subprocess
import tempfile
import unittest
class RunTaskCliTest(unittest.TestCase):
def test_run_task_cli_keeps_candidate_from_sandbox(self) -> None:
source_root = Path(__file__).resolve().parents[1]
with tempfile.TemporaryDirectory() as tmp:
temp_root = Path(tmp)
shutil.copytree(source_root / "engine", temp_root / "engine")
shutil.copytree(source_root / "scripts", temp_root / "scripts")
shutil.copytree(source_root / "tasks", temp_root / "tasks")
completed = subprocess.run(
["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/skill-quality/task.yaml"],
cwd=str(temp_root),
capture_output=True,
text=True,
encoding="utf-8",
check=False,
)
self.assertEqual(completed.returncode, 0, msg=completed.stderr)
artifact_text = (temp_root / "tasks" / "skill-quality" / "fixtures" / "SKILL.md").read_text(encoding="utf-8")
self.assertIn("## When to Use", artifact_text)
record = json.loads((temp_root / "work" / "results.jsonl").read_text(encoding="utf-8").splitlines()[-1])
self.assertEqual(record["status"], "keep")
if __name__ == "__main__":
unittest.main()
- Step 2: Run the execution pipeline tests to verify they fail
Run: uv run python -m unittest tests.test_execution_pipeline -v
Expected: FAIL because the current CLI has no mutator support or sandbox orchestration
- Step 3: Replace the hand-built CLI flow with orchestrator delegation
# scripts/run_task.py
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
ROOT_DIR = Path(__file__).resolve().parents[1]
if str(ROOT_DIR) not in sys.path:
sys.path.insert(0, str(ROOT_DIR))
from engine.orchestrator import run_single_iteration
from engine.task_loader import load_task
def _resolve_repo_path(repo_root: Path, raw_path: str) -> Path:
path = Path(raw_path)
if path.is_absolute():
return path.resolve()
return (repo_root / path).resolve()
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--task", required=True)
return parser.parse_args()
def _append_record(repo_root: Path, results_file: str, record: dict[str, object]) -> None:
results_path = _resolve_repo_path(repo_root, results_file)
results_path.parent.mkdir(parents=True, exist_ok=True)
with results_path.open("a", encoding="utf-8", newline="") as handle:
handle.write(json.dumps(record, ensure_ascii=False) + "\n")
def main() -> int:
args = parse_args()
repo_root = ROOT_DIR.resolve()
task_path = _resolve_repo_path(repo_root, args.task)
task = load_task(task_path)
decision = run_single_iteration(task, baseline_score=None)
record = {
"task_id": task.id,
"status": decision.status,
"reason": decision.reason,
"candidate_score": decision.candidate_score,
"diff_summary": "",
}
_append_record(repo_root, task.logging.results_file, record)
print(json.dumps(record, ensure_ascii=False))
return 1 if decision.status == "crash" else 0
if __name__ == "__main__":
raise SystemExit(main())
- Step 4: Run the execution pipeline tests to verify they pass
Run: uv run python -m unittest tests.test_execution_pipeline -v
Expected: OK
- Step 5: Commit the CLI orchestration wiring
git add scripts/run_task.py tests/test_execution_pipeline.py
git commit -m "feat: route task runner through sandbox orchestrator"
Task 5: Add A Deterministic Sample Mutator
Files:
- Create: `scripts/mutate_skill_task.py`
- Modify: `tasks/skill-quality/task.yaml`
- Test: `tests/test_execution_pipeline.py`

- Step 1: Add the sample mutator script
# scripts/mutate_skill_task.py
from __future__ import annotations
import argparse
from pathlib import Path
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--task-dir", required=True)
parser.add_argument("--artifact", required=True)
return parser.parse_args()
def main() -> int:
args = parse_args()
task_dir = Path(args.task_dir).resolve()
artifact_path = (task_dir / args.artifact).resolve()
sections = [
"# Planning Skill",
"",
"## When to Use",
"- Use this task when a skill file is missing structure.",
"",
"## Steps",
"1. Add the missing sections.",
"2. Keep the instructions direct.",
"",
"## Constraints",
"Do not add filler content.",
"",
"## Examples",
"- Show concrete commands.",
]
artifact_path.write_text("\n".join(sections) + "\n", encoding="utf-8")
return 0
if __name__ == "__main__":
raise SystemExit(main())
- Step 2: Add `mutator` to the sample task
# tasks/skill-quality/task.yaml
id: skill-quality
description: Score one skill file against a deterministic rubric.
artifacts:
include:
- fixtures/SKILL.md
exclude: []
max_files_per_iteration: 1
mutation:
mode: direct_edit
allowed_file_types: [".md"]
max_changed_lines: 40
mutator:
type: command
command: "python ../../scripts/mutate_skill_task.py --task-dir . --artifact fixtures/SKILL.md"
cwd: "tasks/skill-quality"
timeout_seconds: 30
runner:
command: "python ../../scripts/evaluate_skill_task.py --task-dir . --artifact fixtures/SKILL.md --output ../../work/skill-run.json"
cwd: "tasks/skill-quality"
timeout_seconds: 30
scorer:
type: command
command: "python scripts/score_skill_task.py --input work/skill-run.json"
parse:
format: json
score_field: score
metrics_field: metrics
objective:
primary_metric: score
direction: maximize
constraints:
- metric: violation_count
op: "<="
value: 0
policy:
keep_if: better_primary
tie_breakers: []
on_failure: discard
budget:
max_iterations: 5
max_failures: 3
logging:
results_file: work/results.jsonl
candidate_dir: work/candidates
- Step 3: Run the execution pipeline tests to verify the sample task passes
Run: uv run python -m unittest tests.test_execution_pipeline -v
Expected: OK
- Step 4: Commit the sample mutator
git add scripts/mutate_skill_task.py tasks/skill-quality/task.yaml
git commit -m "feat: add deterministic sample mutator"
Task 6: Update The README For The Baseline-Aware Iteration Model
Files:
- Modify: `README.md`
- Test: none

- Step 1: Update the Artifact Loop Engine section
## Artifact Loop Engine
This repository also includes a generic optimization engine for editable text artifacts such as prompts, skills, config files, and small code paths.
The current CLI now runs a baseline-aware single iteration:
1. Build a baseline view of the allowed artifacts
2. Create a temporary candidate sandbox
3. Run a task-specific mutator in the sandbox
4. Validate the candidate against mutation limits
5. Run and score the candidate in the sandbox
6. Keep or discard the candidate
Run the deterministic sample task:
```bash
uv run python scripts/run_task.py --task tasks/skill-quality/task.yaml
```
The task writes structured iteration results to `work/results.jsonl`.
Engine concepts:
- `artifacts`: files that may be accepted back into the main workspace
- `mutation`: file-count and line-count limits for candidate changes
- `mutator`: command that generates a candidate inside the sandbox
- `runner`: command that evaluates a candidate
- `scorer`: command that returns a structured score payload
- `policy`: keep-or-discard logic based on the objective and constraints
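An illustrative `work/results.jsonl` line follows; the field names match the record written by `scripts/run_task.py`, while the values are made up:

```json
{"task_id": "skill-quality", "status": "keep", "reason": "candidate improved primary metric", "candidate_score": 0.9, "diff_summary": ""}
```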
- [ ] **Step 2: Review the README update for consistency**
Read: `README.md`
Expected: the original training flow remains intact, and the engine section now describes the baseline-aware single iteration accurately
- [ ] **Step 3: Commit the README update**
```bash
git add README.md
git commit -m "docs: document baseline-aware single iteration"
```
Final Verification
- Step 1: Run the targeted test suite
Run: uv run python -m unittest tests.test_task_loader tests.test_artifact_manager tests.test_execution_pipeline tests.test_mutation_engine tests.test_orchestrator -v
Expected: OK
- Step 2: Run the sample task manually
Run: uv run python scripts/run_task.py --task tasks/skill-quality/task.yaml
Expected:
- exit code 0
- a new JSON line in `work/results.jsonl`
- `tasks/skill-quality/fixtures/SKILL.md` updated only if the candidate is kept
- Step 3: Inspect the latest record
Read: work/results.jsonl
Expected latest record fields:
- `task_id`
- `status`
- `reason`
- `candidate_score`
- `diff_summary`
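Optional: pretty-print the latest record with `tail -n 1 work/results.jsonl | python -m json.tool`.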
- Step 4: Commit the final verified state
git add README.md engine scripts tasks tests
git commit -m "feat: ship baseline-aware single-iteration orchestrator"
- Step 2: Run the orchestrator tests to verify they fail
Run: uv run python -m unittest tests.test_orchestrator -v
Expected: FAIL with ModuleNotFoundError: No module named 'engine.orchestrator'
- Step 3: Implement the orchestrator
# engine/orchestrator.py
from __future__ import annotations
import shutil
import tempfile
from pathlib import Path
from engine.artifact_manager import ArtifactManager
from engine.decision_engine import decide_candidate
from engine.models import DecisionResult, ScoreResult, TaskSpec
from engine.mutation_engine import MutationValidationError, validate_candidate_changes
from engine.runner import run_command
from engine.scorer import parse_score_output
def _copy_repo_to_sandbox(repo_root: Path, sandbox_root: Path) -> None:
for child in repo_root.iterdir():
if child.name == ".git":
continue
target = sandbox_root / child.name
if child.is_dir():
shutil.copytree(child, target, dirs_exist_ok=True)
else:
shutil.copy2(child, target)
def _sync_artifacts_back(task: TaskSpec, sandbox_task: TaskSpec) -> None:
source_manager = ArtifactManager(sandbox_task)
target_manager = ArtifactManager(task)
target_snapshot = target_manager.snapshot()
for path in source_manager.resolve_paths():
relative = path.relative_to(sandbox_task.root_dir)
target_path = task.root_dir / relative
target_path.parent.mkdir(parents=True, exist_ok=True)
with path.open("r", encoding="utf-8", newline="") as src:
with target_path.open("w", encoding="utf-8", newline="") as dst:
dst.write(src.read())
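    # Safety net: re-create any baseline file that is missing after the sync so the
    # main workspace never loses an artifact that existed before the iteration.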
for baseline_path, baseline_text in target_snapshot.file_contents.items():
if baseline_path.exists():
continue
baseline_path.parent.mkdir(parents=True, exist_ok=True)
with baseline_path.open("w", encoding="utf-8", newline="") as handle:
handle.write(baseline_text)
def run_single_iteration(task: TaskSpec, baseline_score: float | ScoreResult | None) -> DecisionResult:
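    # For the bundled skill-quality task, sandbox the whole repository (two levels
    # above the task directory); otherwise sandbox only the task root itself.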
repo_root = task.root_dir.parent.parent if task.root_dir.name == "skill-quality" else task.root_dir
with tempfile.TemporaryDirectory(prefix="artifact-loop-") as tmp:
sandbox_root = Path(tmp)
_copy_repo_to_sandbox(repo_root, sandbox_root)
from engine.task_loader import load_task
sandbox_task_path = sandbox_root / task.root_dir.relative_to(repo_root) / "task.yaml"
sandbox_task = load_task(sandbox_task_path)
baseline_snapshot = ArtifactManager(task).snapshot()
mutator_result = run_command(
sandbox_task.mutator.command,
(sandbox_root / sandbox_task.mutator.cwd).resolve(),
sandbox_task.mutator.timeout_seconds,
)
if mutator_result.exit_code != 0:
return DecisionResult(
status="crash",
reason=f"mutator failed with exit code {mutator_result.exit_code}",
baseline_score=None if baseline_score is None or isinstance(baseline_score, ScoreResult) else baseline_score,
candidate_score=None,
)
try:
validate_candidate_changes(
sandbox_task,
baseline_snapshot,
sandbox_task.root_dir,
)
except MutationValidationError as exc:
return DecisionResult(
status="discard",
reason=str(exc),
baseline_score=None if baseline_score is None or isinstance(baseline_score, ScoreResult) else baseline_score,
candidate_score=None,
)
run_result = run_command(
sandbox_task.runner.command,
(sandbox_root / sandbox_task.runner.cwd).resolve(),
sandbox_task.runner.timeout_seconds,
)
if run_result.exit_code != 0:
return DecisionResult(
status="crash",
reason=f"command failed with exit code {run_result.exit_code}",
baseline_score=None if baseline_score is None or isinstance(baseline_score, ScoreResult) else baseline_score,
candidate_score=None,
)
scorer_result = run_command(
sandbox_task.scorer.command,
sandbox_root,
sandbox_task.runner.timeout_seconds,
)
if scorer_result.exit_code != 0:
return DecisionResult(
status="crash",
reason=f"scorer failed with exit code {scorer_result.exit_code}",
baseline_score=None if baseline_score is None or isinstance(baseline_score, ScoreResult) else baseline_score,
candidate_score=None,
)
score_result = parse_score_output(
scorer_result.stdout,
score_field=sandbox_task.scorer.parse.score_field,
metrics_field=sandbox_task.scorer.parse.metrics_field,
)
decision = decide_candidate(
baseline=baseline_score,
candidate=score_result,
objective=sandbox_task.objective,
constraints=sandbox_task.constraints,
tie_breakers=sandbox_task.policy.tie_breakers,
run_result=run_result,
)
if decision.status == "keep":
_sync_artifacts_back(task, sandbox_task)
return decision
- Step 4: Run the orchestrator tests to verify they pass
Run: uv run python -m unittest tests.test_orchestrator -v
Expected: OK
- Step 5: Commit the orchestrator
git add engine/orchestrator.py tests/test_orchestrator.py
git commit -m "feat: add single-iteration orchestrator"
Task 3: Make Mutation Validation Baseline-Aware
Files:
- Modify: `engine/mutation_engine.py`
- Modify: `tests/test_mutation_engine.py`
- Test: `tests/test_mutation_engine.py`

- Step 1: Add a failing cross-workspace validation test
# tests/test_mutation_engine.py
from pathlib import Path
import shutil
import tempfile
import unittest
from engine.artifact_manager import ArtifactManager
from engine.models import (
ArtifactSpec,
BudgetSpec,
ConstraintSpec,
LoggingSpec,
MutationSpec,
MutatorSpec,
ObjectiveSpec,
PolicySpec,
RunnerSpec,
ScorerParseSpec,
ScorerSpec,
TaskSpec,
)
from engine.mutation_engine import MutationValidationError, validate_candidate_changes
def _make_task(root_dir: Path) -> TaskSpec:
return TaskSpec(
id="mutation-test",
description="Mutation validation fixture.",
artifacts=ArtifactSpec(include=["fixtures/*"], exclude=[], max_files_per_iteration=1),
mutation=MutationSpec(mode="direct_edit", allowed_file_types=[".md"], max_changed_lines=1),
mutator=MutatorSpec(type="command", command="python mutate.py", cwd=".", timeout_seconds=30),
runner=RunnerSpec(command="python -c \"print('runner ok')\"", cwd=".", timeout_seconds=30),
scorer=ScorerSpec(
type="command",
command="python -c \"print('{\\\"score\\\": 1.0, \\\"metrics\\\": {}}')\"",
parse=ScorerParseSpec(format="json", score_field="score", metrics_field="metrics"),
),
objective=ObjectiveSpec(primary_metric="score", direction="maximize"),
constraints=[],
policy=PolicySpec(keep_if="better_primary", tie_breakers=[], on_failure="discard"),
budget=BudgetSpec(max_iterations=1, max_failures=1),
logging=LoggingSpec(results_file="work/results.jsonl", candidate_dir="work/candidates"),
root_dir=root_dir,
)
class MutationEngineTest(unittest.TestCase):
def test_rejects_too_many_changed_lines(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
root_dir = Path(tmp)
fixture_dir = root_dir / "fixtures"
fixture_dir.mkdir(parents=True)
target = fixture_dir / "note.md"
target.write_text("line 1\nline 2\n", encoding="utf-8")
task = _make_task(root_dir)
snapshot = ArtifactManager(task).snapshot()
target.write_text("line 1\nline 2\nline 3\n", encoding="utf-8")
with self.assertRaises(MutationValidationError):
validate_candidate_changes(task, snapshot, root_dir)
def test_rejects_disallowed_extension(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
root_dir = Path(tmp)
fixture_dir = root_dir / "fixtures"
fixture_dir.mkdir(parents=True)
target = fixture_dir / "note.md"
target.write_text("line 1\n", encoding="utf-8")
task = _make_task(root_dir)
snapshot = ArtifactManager(task).snapshot()
target.unlink()
(fixture_dir / "note.txt").write_text("line 1 changed\n", encoding="utf-8")
with self.assertRaises(MutationValidationError):
validate_candidate_changes(task, snapshot, root_dir)
def test_rejects_candidate_workspace_changes_against_baseline_snapshot(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
baseline_root = Path(tmp) / "baseline"
candidate_root = Path(tmp) / "candidate"
(baseline_root / "fixtures").mkdir(parents=True)
(baseline_root / "fixtures" / "note.md").write_text("base\n", encoding="utf-8")
shutil.copytree(baseline_root, candidate_root)
task = _make_task(candidate_root)
baseline_task = _make_task(baseline_root)
snapshot = ArtifactManager(baseline_task).snapshot()
(candidate_root / "fixtures" / "note.md").write_text("base\nextra\n", encoding="utf-8")
with self.assertRaises(MutationValidationError):
validate_candidate_changes(task, snapshot, candidate_root)
if __name__ == "__main__":
unittest.main()
- Step 2: Run the task loader tests to verify they fail
Run: uv run python -m unittest tests.test_task_loader -v
Expected: FAIL because TaskSpec has no mutator field and the loader does not parse it yet
- Step 3: Extend the shared models with `MutatorSpec`
# engine/models.py
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
@dataclass(frozen=True)
class ArtifactSpec:
include: list[str]
exclude: list[str]
max_files_per_iteration: int
@dataclass(frozen=True)
class MutationSpec:
mode: str
allowed_file_types: list[str]
max_changed_lines: int
@dataclass(frozen=True)
class MutatorSpec:
type: str
command: str
cwd: str
timeout_seconds: int
@dataclass(frozen=True)
class RunnerSpec:
command: str
cwd: str
timeout_seconds: int
@dataclass(frozen=True)
class ScorerParseSpec:
format: str
score_field: str
metrics_field: str
@dataclass(frozen=True)
class ScorerSpec:
type: str
command: str
parse: ScorerParseSpec
@dataclass(frozen=True)
class ObjectiveSpec:
primary_metric: str
direction: str
@dataclass(frozen=True)
class ConstraintSpec:
metric: str
op: str
value: Any
@dataclass(frozen=True)
class PolicySpec:
keep_if: str
tie_breakers: list[dict[str, str]]
on_failure: str
@dataclass(frozen=True)
class BudgetSpec:
max_iterations: int
max_failures: int
@dataclass(frozen=True)
class LoggingSpec:
results_file: str
candidate_dir: str
@dataclass(frozen=True)
class TaskSpec:
id: str
description: str
artifacts: ArtifactSpec
mutation: MutationSpec
mutator: MutatorSpec
runner: RunnerSpec
scorer: ScorerSpec
objective: ObjectiveSpec
constraints: list[ConstraintSpec]
policy: PolicySpec
budget: BudgetSpec
logging: LoggingSpec
root_dir: Path
@dataclass(frozen=True)
class BaselineSnapshot:
file_contents: dict[Path, str]
file_hashes: dict[Path, str]
@dataclass(frozen=True)
class RunResult:
command: str
cwd: Path
exit_code: int
runtime_seconds: float
stdout: str
stderr: str
@dataclass(frozen=True)
class ScoreResult:
primary_score: float
metrics: dict[str, Any]
raw_output: dict[str, Any]
@dataclass(frozen=True)
class DecisionResult:
status: str
reason: str
baseline_score: float | None
candidate_score: float | None
constraint_failures: list[str] = field(default_factory=list)
- Step 4: Parse and validate the mutator block
# engine/task_loader.py
from __future__ import annotations
from pathlib import Path
from typing import Any
import yaml
from engine.models import (
ArtifactSpec,
BudgetSpec,
ConstraintSpec,
LoggingSpec,
MutationSpec,
MutatorSpec,
ObjectiveSpec,
PolicySpec,
RunnerSpec,
ScorerParseSpec,
ScorerSpec,
TaskSpec,
)
class TaskValidationError(ValueError):
pass
def _require_mapping(value: Any, path: str) -> dict[str, Any]:
if not isinstance(value, dict):
raise TaskValidationError(f"{path} must be a mapping")
return value
def _require_list(value: Any, path: str) -> list[Any]:
if not isinstance(value, list):
raise TaskValidationError(f"{path} must be a list")
return value
def _require_value(mapping: dict[str, Any], key: str) -> Any:
if key not in mapping:
raise TaskValidationError(f"missing required field: {key}")
return mapping[key]
def load_task(task_path: Path) -> TaskSpec:
try:
task_data = yaml.safe_load(task_path.read_text(encoding="utf-8"))
except yaml.YAMLError as exc:
raise TaskValidationError(str(exc)) from exc
def _require_str(mapping: dict[str, Any], key: str, path: str) -> str:
value = _require_value(mapping, key)
if not isinstance(value, str):
raise TaskValidationError(f"{path}.{key} must be a string")
return value
def _require_int(mapping: dict[str, Any], key: str, path: str) -> int:
value = _require_value(mapping, key)
if not isinstance(value, int) or isinstance(value, bool):
raise TaskValidationError(f"{path}.{key} must be an integer")
return value
def _require_str_list(mapping: dict[str, Any], key: str, path: str) -> list[str]:
items = _require_list(_require_value(mapping, key), f"{path}.{key}")
result: list[str] = []
for index, item in enumerate(items):
if not isinstance(item, str):
raise TaskValidationError(f"{path}.{key}[{index}] must be a string")
result.append(item)
return result
def _require_tie_breakers(mapping: dict[str, Any], key: str, path: str) -> list[dict[str, str]]:
items = _require_list(_require_value(mapping, key), f"{path}.{key}")
result: list[dict[str, str]] = []
for index, item in enumerate(items):
entry = _require_mapping(item, f"{path}.{key}[{index}]")
result.append({str(k): str(v) for k, v in entry.items()})
return result
root = _require_mapping(task_data, "task")
artifacts_data = _require_mapping(_require_value(root, "artifacts"), "task.artifacts")
mutation_data = _require_mapping(_require_value(root, "mutation"), "task.mutation")
mutator_data = _require_mapping(_require_value(root, "mutator"), "task.mutator")
runner_data = _require_mapping(_require_value(root, "runner"), "task.runner")
scorer_data = _require_mapping(_require_value(root, "scorer"), "task.scorer")
scorer_parse_data = _require_mapping(_require_value(scorer_data, "parse"), "task.scorer.parse")
objective_data = _require_mapping(_require_value(root, "objective"), "task.objective")
policy_data = _require_mapping(_require_value(root, "policy"), "task.policy")
budget_data = _require_mapping(_require_value(root, "budget"), "task.budget")
logging_data = _require_mapping(_require_value(root, "logging"), "task.logging")
direction = _require_str(objective_data, "direction", "task.objective")
if direction not in {"maximize", "minimize"}:
raise TaskValidationError("task.objective.direction must be maximize or minimize")
mode = _require_str(mutation_data, "mode", "task.mutation")
if mode != "direct_edit":
raise TaskValidationError("task.mutation.mode must be direct_edit")
mutator_type = _require_str(mutator_data, "type", "task.mutator")
if mutator_type != "command":
raise TaskValidationError("task.mutator.type must be command")
scorer_type = _require_str(scorer_data, "type", "task.scorer")
if scorer_type != "command":
raise TaskValidationError("task.scorer.type must be command")
parse_format = _require_str(scorer_parse_data, "format", "task.scorer.parse")
if parse_format != "json":
raise TaskValidationError("task.scorer.parse.format must be json")
constraints_data = _require_list(_require_value(root, "constraints"), "task.constraints")
constraints = []
for index, item in enumerate(constraints_data):
constraint_data = _require_mapping(item, f"task.constraints[{index}]")
op = _require_str(constraint_data, "op", f"task.constraints[{index}]")
if op not in {"<=", ">=", "=="}:
raise TaskValidationError(f"task.constraints[{index}].op must be <=, >=, or ==")
constraints.append(
ConstraintSpec(
metric=_require_str(constraint_data, "metric", f"task.constraints[{index}]"),
op=op,
value=_require_value(constraint_data, "value"),
)
)
keep_if = _require_str(policy_data, "keep_if", "task.policy")
if keep_if != "better_primary":
raise TaskValidationError("task.policy.keep_if must be better_primary")
on_failure = _require_str(policy_data, "on_failure", "task.policy")
if on_failure != "discard":
raise TaskValidationError("task.policy.on_failure must be discard")
return TaskSpec(
id=_require_str(root, "id", "task"),
description=_require_str(root, "description", "task"),
artifacts=ArtifactSpec(
include=_require_str_list(artifacts_data, "include", "task.artifacts"),
exclude=_require_str_list(artifacts_data, "exclude", "task.artifacts"),
max_files_per_iteration=_require_int(artifacts_data, "max_files_per_iteration", "task.artifacts"),
),
mutation=MutationSpec(
mode=mode,
allowed_file_types=_require_str_list(mutation_data, "allowed_file_types", "task.mutation"),
max_changed_lines=_require_int(mutation_data, "max_changed_lines", "task.mutation"),
),
mutator=MutatorSpec(
type=mutator_type,
command=_require_str(mutator_data, "command", "task.mutator"),
cwd=_require_str(mutator_data, "cwd", "task.mutator"),
timeout_seconds=_require_int(mutator_data, "timeout_seconds", "task.mutator"),
),
runner=RunnerSpec(
command=_require_str(runner_data, "command", "task.runner"),
cwd=_require_str(runner_data, "cwd", "task.runner"),
timeout_seconds=_require_int(runner_data, "timeout_seconds", "task.runner"),
),
scorer=ScorerSpec(
type=scorer_type,
command=_require_str(scorer_data, "command", "task.scorer"),
parse=ScorerParseSpec(
format=parse_format,
score_field=_require_str(scorer_parse_data, "score_field", "task.scorer.parse"),
metrics_field=_require_str(scorer_parse_data, "metrics_field", "task.scorer.parse"),
),
),
objective=ObjectiveSpec(
primary_metric=_require_str(objective_data, "primary_metric", "task.objective"),
direction=direction,
),
constraints=constraints,
policy=PolicySpec(
keep_if=keep_if,
tie_breakers=_require_tie_breakers(policy_data, "tie_breakers", "task.policy"),
on_failure=on_failure,
),
budget=BudgetSpec(
max_iterations=_require_int(budget_data, "max_iterations", "task.budget"),
max_failures=_require_int(budget_data, "max_failures", "task.budget"),
),
logging=LoggingSpec(
results_file=_require_str(logging_data, "results_file", "task.logging"),
candidate_dir=_require_str(logging_data, "candidate_dir", "task.logging"),
),
root_dir=task_path.parent,
)
- Step 5: Run the task loader tests to verify they pass
Run: uv run python -m unittest tests.test_task_loader -v
Expected: OK
- Step 6: Commit the schema extension
git add engine/models.py engine/task_loader.py tests/test_task_loader.py
git commit -m "feat: add mutator spec to task schema"
Task 2: Add The Baseline-Aware Orchestrator Core
Files:
- Create: `engine/orchestrator.py`
- Create: `tests/test_orchestrator.py`
- Test: `tests/test_orchestrator.py`

- Step 1: Write failing orchestrator tests
# tests/test_orchestrator.py
from pathlib import Path
import tempfile
import unittest
from engine.orchestrator import run_single_iteration
from engine.models import (
ArtifactSpec,
BudgetSpec,
ConstraintSpec,
LoggingSpec,
MutationSpec,
MutatorSpec,
ObjectiveSpec,
PolicySpec,
RunnerSpec,
ScorerParseSpec,
ScorerSpec,
TaskSpec,
)
def make_task(root_dir: Path) -> TaskSpec:
return TaskSpec(
id="demo",
description="Demo task",
artifacts=ArtifactSpec(include=["task/*.md"], exclude=[], max_files_per_iteration=1),
mutation=MutationSpec(mode="direct_edit", allowed_file_types=[".md"], max_changed_lines=20),
mutator=MutatorSpec(
type="command",
command="python scripts/mutate_demo.py --artifact task/sample.md",
cwd=".",
timeout_seconds=30,
),
runner=RunnerSpec(
command="python scripts/evaluate_demo.py --artifact task/sample.md --output work/run.json",
cwd=".",
timeout_seconds=30,
),
scorer=ScorerSpec(
type="command",
command="python scripts/score_demo.py --input work/run.json",
parse=ScorerParseSpec(format="json", score_field="score", metrics_field="metrics"),
),
objective=ObjectiveSpec(primary_metric="score", direction="maximize"),
constraints=[],
policy=PolicySpec(keep_if="better_primary", tie_breakers=[], on_failure="discard"),
budget=BudgetSpec(max_iterations=1, max_failures=1),
logging=LoggingSpec(results_file="work/results.jsonl", candidate_dir="work/candidates"),
root_dir=root_dir,
)
class OrchestratorTest(unittest.TestCase):
def test_discard_leaves_main_workspace_unchanged(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
root = Path(tmp)
(root / "task").mkdir()
(root / "scripts").mkdir()
(root / "work").mkdir()
(root / "task" / "sample.md").write_text("# Original\n", encoding="utf-8")
(root / "scripts" / "mutate_demo.py").write_text(
"from pathlib import Path\n"
"import argparse\n"
"p=argparse.ArgumentParser(); p.add_argument('--artifact'); args=p.parse_args()\n"
"Path(args.artifact).write_text('# Candidate\\n', encoding='utf-8')\n",
encoding="utf-8",
)
(root / "scripts" / "evaluate_demo.py").write_text(
"from pathlib import Path\n"
"import argparse, json\n"
"p=argparse.ArgumentParser(); p.add_argument('--artifact'); p.add_argument('--output'); args=p.parse_args()\n"
"payload={'score': 0.5, 'metrics': {}}\n"
"Path(args.output).parent.mkdir(parents=True, exist_ok=True)\n"
"Path(args.output).write_text(json.dumps(payload), encoding='utf-8')\n",
encoding="utf-8",
)
(root / "scripts" / "score_demo.py").write_text(
"from pathlib import Path\n"
"import argparse\n"
"p=argparse.ArgumentParser(); p.add_argument('--input'); args=p.parse_args()\n"
"print(Path(args.input).read_text(encoding='utf-8'))\n",
encoding="utf-8",
)
decision = run_single_iteration(make_task(root), baseline_score=1.0)
self.assertEqual(decision.status, "discard")
self.assertEqual((root / "task" / "sample.md").read_text(encoding="utf-8"), "# Original\n")
def test_keep_syncs_candidate_back(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
root = Path(tmp)
(root / "task").mkdir()
(root / "scripts").mkdir()
(root / "work").mkdir()
(root / "task" / "sample.md").write_text("# Original\n", encoding="utf-8")
(root / "scripts" / "mutate_demo.py").write_text(
"from pathlib import Path\n"
"import argparse\n"
"p=argparse.ArgumentParser(); p.add_argument('--artifact'); args=p.parse_args()\n"
"Path(args.artifact).write_text('# Candidate\\n', encoding='utf-8')\n",
encoding="utf-8",
)
(root / "scripts" / "evaluate_demo.py").write_text(
"from pathlib import Path\n"
"import argparse, json\n"
"p=argparse.ArgumentParser(); p.add_argument('--artifact'); p.add_argument('--output'); args=p.parse_args()\n"
"payload={'score': 2.0, 'metrics': {}}\n"
"Path(args.output).parent.mkdir(parents=True, exist_ok=True)\n"
"Path(args.output).write_text(json.dumps(payload), encoding='utf-8')\n",
encoding="utf-8",
)
(root / "scripts" / "score_demo.py").write_text(
"from pathlib import Path\n"
"import argparse\n"
"p=argparse.ArgumentParser(); p.add_argument('--input'); args=p.parse_args()\n"
"print(Path(args.input).read_text(encoding='utf-8'))\n",
encoding="utf-8",
)
decision = run_single_iteration(make_task(root), baseline_score=1.0)
self.assertEqual(decision.status, "keep")
self.assertEqual((root / "task" / "sample.md").read_text(encoding="utf-8"), "# Candidate\n")
if __name__ == "__main__":
unittest.main()