Baseline-Aware Single-Iteration Orchestrator Implementation Plan
For agentic workers: REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
Goal: Add a baseline-aware single-iteration orchestrator that generates one candidate in a sandbox, validates it against mutation budgets, runs and scores it in isolation, then keeps or discards the candidate without corrupting the main workspace.
Architecture: Extend the task schema with a mutator section, then introduce an engine/orchestrator.py layer that owns sandbox lifecycle and candidate sync-back. Keep the existing loader, artifact manager, runner, scorer, and decision engine modules as foundations, but move orchestration decisions into the new layer and have scripts/run_task.py become a thin entrypoint.
Tech Stack: Python 3.10+, standard library, PyYAML, uv, unittest
File Map
New Files
- `engine/orchestrator.py` - baseline-aware single-iteration orchestration
- `scripts/mutate_skill_task.py` - deterministic sample mutator for the `skill-quality` task
- `tests/test_orchestrator.py` - sandbox keep/discard/crash coverage
Modified Files
- `engine/models.py` - add `MutatorSpec` and extend `TaskSpec`
- `engine/task_loader.py` - parse and validate `mutator`
- `scripts/run_task.py` - delegate to the orchestrator instead of the hand-rolled flow
- `tasks/skill-quality/task.yaml` - add a concrete `mutator`
- `README.md` - document the baseline-aware single-iteration behavior
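For orientation, the surface this plan builds toward looks roughly like the sketch below; the signature mirrors the code written in Task 2 and wired to the CLI in Task 4, and is indicative only.

```python
# Sketch only: the concrete implementation lands in engine/orchestrator.py (Task 2)
# and is called from scripts/run_task.py (Task 4).
from engine.models import DecisionResult, ScoreResult, TaskSpec


def run_single_iteration(task: TaskSpec, baseline_score: float | ScoreResult | None) -> DecisionResult:
    """Copy the workspace into a sandbox, mutate, validate, run, score, then keep or discard."""
    ...
```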
Task 1: Extend The Task Schema For Mutators
Files:
- Modify: `engine/models.py`
- Modify: `engine/task_loader.py`
- Modify: `tests/test_task_loader.py`
- Test: `tests/test_task_loader.py`

- Step 1: Add a failing mutator test to the task loader suite
# tests/test_task_loader.py
from pathlib import Path
import tempfile
import unittest
from engine.task_loader import TaskValidationError, load_task
VALID_TASK = """
id: demo
description: Demo task
artifacts:
include:
- tasks/demo/sample.txt
exclude: []
max_files_per_iteration: 1
mutation:
mode: direct_edit
allowed_file_types: [".txt"]
max_changed_lines: 10
mutator:
type: command
command: "python scripts/mutate.py"
cwd: "."
timeout_seconds: 30
runner:
command: "python -c \\\"print('run')\\\""
cwd: "."
timeout_seconds: 10
scorer:
type: command
command: "python -c \\\"import json; print(json.dumps({'score': 1, 'metrics': {'violation_count': 0}}))\\\""
parse:
format: json
score_field: "score"
metrics_field: "metrics"
objective:
primary_metric: score
direction: maximize
constraints:
- metric: violation_count
op: "<="
value: 0
policy:
keep_if: better_primary
tie_breakers: []
on_failure: discard
budget:
max_iterations: 3
max_failures: 1
logging:
results_file: work/results.jsonl
candidate_dir: work/candidates
"""
class TaskLoaderTest(unittest.TestCase):
def write_task(self, content: str) -> Path:
temp_dir = tempfile.TemporaryDirectory()
self.addCleanup(temp_dir.cleanup)
task_path = Path(temp_dir.name) / "task.yaml"
task_path.write_text(content, encoding="utf-8")
return task_path
def test_loads_minimal_task(self) -> None:
task = load_task(self.write_task(VALID_TASK))
self.assertEqual(task.id, "demo")
self.assertEqual(task.artifacts.max_files_per_iteration, 1)
self.assertEqual(task.constraints[0].metric, "violation_count")
def test_loads_mutator_spec(self) -> None:
task = load_task(self.write_task(VALID_TASK))
self.assertEqual(task.mutator.type, "command")
self.assertEqual(task.mutator.command, "python scripts/mutate.py")
self.assertEqual(task.mutator.timeout_seconds, 30)
def test_rejects_missing_required_section(self) -> None:
content = VALID_TASK.replace("objective:\n primary_metric: score\n direction: maximize\n", "")
with self.assertRaises(TaskValidationError) as ctx:
load_task(self.write_task(content))
self.assertIn("objective", str(ctx.exception))
def test_rejects_invalid_direction(self) -> None:
content = VALID_TASK.replace("direction: maximize", "direction: sideways")
with self.assertRaises(TaskValidationError) as ctx:
load_task(self.write_task(content))
self.assertIn("direction", str(ctx.exception))
def test_rejects_invalid_mutator_type(self) -> None:
content = VALID_TASK.replace("type: command", "type: agent", 1)
with self.assertRaises(TaskValidationError) as ctx:
load_task(self.write_task(content))
self.assertIn("mutator.type", str(ctx.exception))
if __name__ == "__main__":
unittest.main()
- Step 2: Run the mutation tests to verify they fail
Run: uv run python -m unittest tests.test_mutation_engine -v
Expected: FAIL because validate_candidate_changes does not accept a candidate root yet
- Step 3: Make validation compare baseline snapshot to candidate workspace
# engine/mutation_engine.py
from __future__ import annotations
from difflib import unified_diff
from pathlib import Path
from engine.models import BaselineSnapshot, TaskSpec
class MutationValidationError(ValueError):
pass
def _count_changed_lines(before: str, after: str, path: Path) -> int:
diff = unified_diff(
before.splitlines(keepends=True),
after.splitlines(keepends=True),
fromfile=f"{path.as_posix()} (before)",
tofile=f"{path.as_posix()} (after)",
)
changed_lines = 0
for line in diff:
if line.startswith(("---", "+++", "@@")):
continue
if line.startswith(("+", "-")):
changed_lines += 1
return changed_lines
def validate_candidate_changes(task: TaskSpec, snapshot: BaselineSnapshot, candidate_root: Path) -> None:
changed_files = 0
changed_lines = 0
allowed_file_types = set(task.mutation.allowed_file_types)
for baseline_path, baseline_text in snapshot.file_contents.items():
relative = baseline_path.relative_to(task.root_dir)
candidate_path = candidate_root / relative
current_text = candidate_path.read_text(encoding="utf-8") if candidate_path.exists() else ""
if current_text == baseline_text:
continue
changed_files += 1
if candidate_path.suffix not in allowed_file_types:
raise MutationValidationError(f"disallowed file type: {candidate_path.suffix}")
changed_lines += _count_changed_lines(baseline_text, current_text, candidate_path)
for path in sorted(candidate_root.rglob("*")):
if not path.is_file():
continue
relative = path.relative_to(candidate_root)
baseline_path = task.root_dir / relative
if baseline_path in snapshot.file_contents:
continue
changed_files += 1
if path.suffix not in allowed_file_types:
raise MutationValidationError(f"disallowed file type: {path.suffix}")
changed_lines += _count_changed_lines("", path.read_text(encoding="utf-8"), path)
if changed_files > task.artifacts.max_files_per_iteration:
raise MutationValidationError(
f"too many changed files: {changed_files} > {task.artifacts.max_files_per_iteration}"
)
if changed_lines > task.mutation.max_changed_lines:
raise MutationValidationError(
f"too many changed lines: {changed_lines} > {task.mutation.max_changed_lines}"
)
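For reference, the counting rule above charges one change per added or removed diff line, so replacing a line costs two. A quick scratch check (not one of the plan's files):

```python
# Scratch illustration of _count_changed_lines from engine/mutation_engine.py above.
from pathlib import Path

from engine.mutation_engine import _count_changed_lines

# One appended line counts as 1 changed line.
assert _count_changed_lines("line 1\nline 2\n", "line 1\nline 2\nline 3\n", Path("note.md")) == 1
# Replacing a line emits one "-" and one "+", so it counts as 2.
assert _count_changed_lines("line 1\nold\n", "line 1\nnew\n", Path("note.md")) == 2
```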
- Step 4: Run the mutation tests to verify they pass
Run: uv run python -m unittest tests.test_mutation_engine -v
Expected: OK
- Step 5: Commit the baseline-aware mutation validation
git add engine/mutation_engine.py tests/test_mutation_engine.py
git commit -m "feat: compare candidate workspace against baseline snapshot"
Task 4: Wire The CLI To The Orchestrator
Files:
- Modify: `scripts/run_task.py`
- Modify: `tests/test_execution_pipeline.py`
- Test: `tests/test_execution_pipeline.py`

- Step 1: Add a failing sandbox orchestration test
# tests/test_execution_pipeline.py
from pathlib import Path
import json
import shutil
import subprocess
import tempfile
import unittest
class RunTaskCliTest(unittest.TestCase):
def test_run_task_cli_keeps_candidate_from_sandbox(self) -> None:
source_root = Path(__file__).resolve().parents[1]
with tempfile.TemporaryDirectory() as tmp:
temp_root = Path(tmp)
shutil.copytree(source_root / "engine", temp_root / "engine")
shutil.copytree(source_root / "scripts", temp_root / "scripts")
shutil.copytree(source_root / "tasks", temp_root / "tasks")
completed = subprocess.run(
["uv", "run", "python", "scripts/run_task.py", "--task", "tasks/skill-quality/task.yaml"],
cwd=str(temp_root),
capture_output=True,
text=True,
encoding="utf-8",
check=False,
)
self.assertEqual(completed.returncode, 0, msg=completed.stderr)
artifact_text = (temp_root / "tasks" / "skill-quality" / "fixtures" / "SKILL.md").read_text(encoding="utf-8")
self.assertIn("## When to Use", artifact_text)
record = json.loads((temp_root / "work" / "results.jsonl").read_text(encoding="utf-8").splitlines()[-1])
self.assertEqual(record["status"], "keep")
if __name__ == "__main__":
unittest.main()
- Step 2: Run the execution pipeline tests to verify they fail
Run: uv run python -m unittest tests.test_execution_pipeline -v
Expected: FAIL because the current CLI has no mutator support or sandbox orchestration
- Step 3: Replace the hand-built CLI flow with orchestrator delegation
# scripts/run_task.py
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
ROOT_DIR = Path(__file__).resolve().parents[1]
if str(ROOT_DIR) not in sys.path:
sys.path.insert(0, str(ROOT_DIR))
from engine.orchestrator import run_single_iteration
from engine.task_loader import load_task
def _resolve_repo_path(repo_root: Path, raw_path: str) -> Path:
path = Path(raw_path)
if path.is_absolute():
return path.resolve()
return (repo_root / path).resolve()
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--task", required=True)
return parser.parse_args()
def _append_record(repo_root: Path, results_file: str, record: dict[str, object]) -> None:
results_path = _resolve_repo_path(repo_root, results_file)
results_path.parent.mkdir(parents=True, exist_ok=True)
with results_path.open("a", encoding="utf-8", newline="") as handle:
handle.write(json.dumps(record, ensure_ascii=False) + "\n")
def main() -> int:
args = parse_args()
repo_root = ROOT_DIR.resolve()
task_path = _resolve_repo_path(repo_root, args.task)
task = load_task(task_path)
decision = run_single_iteration(task, baseline_score=None)
record = {
"task_id": task.id,
"status": decision.status,
"reason": decision.reason,
"candidate_score": decision.candidate_score,
"diff_summary": "",
}
_append_record(repo_root, task.logging.results_file, record)
print(json.dumps(record, ensure_ascii=False))
return 1 if decision.status == "crash" else 0
if __name__ == "__main__":
raise SystemExit(main())
- Step 4: Run the execution pipeline tests to verify they pass
Run: uv run python -m unittest tests.test_execution_pipeline -v
Expected: OK
- Step 5: Commit the CLI orchestration wiring
git add scripts/run_task.py tests/test_execution_pipeline.py
git commit -m "feat: route task runner through sandbox orchestrator"
Task 5: Add A Deterministic Sample Mutator
Files:
- Create: `scripts/mutate_skill_task.py`
- Modify: `tasks/skill-quality/task.yaml`
- Test: `tests/test_execution_pipeline.py`

- Step 1: Add the sample mutator script
# scripts/mutate_skill_task.py
from __future__ import annotations
import argparse
from pathlib import Path
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--task-dir", required=True)
parser.add_argument("--artifact", required=True)
return parser.parse_args()
def main() -> int:
args = parse_args()
task_dir = Path(args.task_dir).resolve()
artifact_path = (task_dir / args.artifact).resolve()
sections = [
"# Planning Skill",
"",
"## When to Use",
"- Use this task when a skill file is missing structure.",
"",
"## Steps",
"1. Add the missing sections.",
"2. Keep the instructions direct.",
"",
"## Constraints",
"Do not add filler content.",
"",
"## Examples",
"- Show concrete commands.",
]
artifact_path.write_text("\n".join(sections) + "\n", encoding="utf-8")
return 0
if __name__ == "__main__":
raise SystemExit(main())
- Step 2: Add `mutator` to the sample task
# tasks/skill-quality/task.yaml
id: skill-quality
description: Score one skill file against a deterministic rubric.
artifacts:
include:
- fixtures/SKILL.md
exclude: []
max_files_per_iteration: 1
mutation:
mode: direct_edit
allowed_file_types: [".md"]
max_changed_lines: 40
mutator:
type: command
command: "python ../../scripts/mutate_skill_task.py --task-dir . --artifact fixtures/SKILL.md"
cwd: "tasks/skill-quality"
timeout_seconds: 30
runner:
command: "python ../../scripts/evaluate_skill_task.py --task-dir . --artifact fixtures/SKILL.md --output ../../work/skill-run.json"
cwd: "tasks/skill-quality"
timeout_seconds: 30
scorer:
type: command
command: "python scripts/score_skill_task.py --input work/skill-run.json"
parse:
format: json
score_field: score
metrics_field: metrics
objective:
primary_metric: score
direction: maximize
constraints:
- metric: violation_count
op: "<="
value: 0
policy:
keep_if: better_primary
tie_breakers: []
on_failure: discard
budget:
max_iterations: 5
max_failures: 3
logging:
results_file: work/results.jsonl
candidate_dir: work/candidates
- Step 3: Run the execution pipeline tests to verify the sample task passes
Run: uv run python -m unittest tests.test_execution_pipeline -v
Expected: OK
- Step 4: Commit the sample mutator
git add scripts/mutate_skill_task.py tasks/skill-quality/task.yaml
git commit -m "feat: add deterministic sample mutator"
Task 6: Update The README For The Baseline-Aware Iteration Model
Files:
- Modify: `README.md`
- Test: none

- Step 1: Update the Artifact Loop Engine section
## Artifact Loop Engine
This repository also includes a generic optimization engine for editable text artifacts such as prompts, skills, config files, and small code paths.
The current CLI now runs a baseline-aware single iteration:
1. Build a baseline view of the allowed artifacts
2. Create a temporary candidate sandbox
3. Run a task-specific mutator in the sandbox
4. Validate the candidate against mutation limits
5. Run and score the candidate in the sandbox
6. Keep or discard the candidate
Run the deterministic sample task:
```bash
uv run python scripts/run_task.py --task tasks/skill-quality/task.yaml
```
The task writes structured iteration results to `work/results.jsonl`.
Engine concepts:
- `artifacts`: files that may be accepted back into the main workspace
- `mutation`: file-count and line-count limits for candidate changes
- `mutator`: command that generates a candidate inside the sandbox
- `runner`: command that evaluates a candidate
- `scorer`: command that returns a structured score payload
- `policy`: keep-or-discard logic based on the objective and constraints
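An illustrative `work/results.jsonl` line follows; the field names match the record written by `scripts/run_task.py`, while the values are made up:

```json
{"task_id": "skill-quality", "status": "keep", "reason": "candidate improved primary metric", "candidate_score": 0.9, "diff_summary": ""}
```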
- [ ] **Step 2: Review the README update for consistency**
Read: `README.md`
Expected: the original training flow remains intact, and the engine section now describes the baseline-aware single iteration accurately
- [ ] **Step 3: Commit the README update**
```bash
git add README.md
git commit -m "docs: document baseline-aware single iteration"
```
Final Verification
- Step 1: Run the targeted test suite
Run: uv run python -m unittest tests.test_task_loader tests.test_artifact_manager tests.test_execution_pipeline tests.test_mutation_engine tests.test_orchestrator -v
Expected: OK
- Step 2: Run the sample task manually
Run: uv run python scripts/run_task.py --task tasks/skill-quality/task.yaml
Expected:
- exit code 0
- a new JSON line in `work/results.jsonl`
- `tasks/skill-quality/fixtures/SKILL.md` updated only if the candidate is kept
- Step 3: Inspect the latest record
Read: work/results.jsonl
Expected latest record fields:
- `task_id`
- `status`
- `reason`
- `candidate_score`
- `diff_summary`
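Optional: pretty-print the latest record with `tail -n 1 work/results.jsonl | python -m json.tool`.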
- Step 4: Commit the final verified state
git add README.md engine scripts tasks tests
git commit -m "feat: ship baseline-aware single-iteration orchestrator"
- Step 2: Run the orchestrator tests to verify they fail
Run: uv run python -m unittest tests.test_orchestrator -v
Expected: FAIL with ModuleNotFoundError: No module named 'engine.orchestrator'
- Step 3: Implement the orchestrator
# engine/orchestrator.py
from __future__ import annotations
import shutil
import tempfile
from pathlib import Path
from engine.artifact_manager import ArtifactManager
from engine.decision_engine import decide_candidate
from engine.models import DecisionResult, ScoreResult, TaskSpec
from engine.mutation_engine import MutationValidationError, validate_candidate_changes
from engine.runner import run_command
from engine.scorer import parse_score_output
def _copy_repo_to_sandbox(repo_root: Path, sandbox_root: Path) -> None:
for child in repo_root.iterdir():
if child.name == ".git":
continue
target = sandbox_root / child.name
if child.is_dir():
shutil.copytree(child, target, dirs_exist_ok=True)
else:
shutil.copy2(child, target)
def _sync_artifacts_back(task: TaskSpec, sandbox_task: TaskSpec) -> None:
source_manager = ArtifactManager(sandbox_task)
target_manager = ArtifactManager(task)
target_snapshot = target_manager.snapshot()
for path in source_manager.resolve_paths():
relative = path.relative_to(sandbox_task.root_dir)
target_path = task.root_dir / relative
target_path.parent.mkdir(parents=True, exist_ok=True)
with path.open("r", encoding="utf-8", newline="") as src:
with target_path.open("w", encoding="utf-8", newline="") as dst:
dst.write(src.read())
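    # Safety net: re-create any baseline file that is missing after the sync so the
    # main workspace never loses an artifact that existed before the iteration.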
for baseline_path, baseline_text in target_snapshot.file_contents.items():
if baseline_path.exists():
continue
baseline_path.parent.mkdir(parents=True, exist_ok=True)
with baseline_path.open("w", encoding="utf-8", newline="") as handle:
handle.write(baseline_text)
def run_single_iteration(task: TaskSpec, baseline_score: float | ScoreResult | None) -> DecisionResult:
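    # For the bundled skill-quality task, sandbox the whole repository (two levels
    # above the task directory); otherwise sandbox only the task root itself.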
repo_root = task.root_dir.parent.parent if task.root_dir.name == "skill-quality" else task.root_dir
with tempfile.TemporaryDirectory(prefix="artifact-loop-") as tmp:
sandbox_root = Path(tmp)
_copy_repo_to_sandbox(repo_root, sandbox_root)
from engine.task_loader import load_task
sandbox_task_path = sandbox_root / task.root_dir.relative_to(repo_root) / "task.yaml"
sandbox_task = load_task(sandbox_task_path)
baseline_snapshot = ArtifactManager(task).snapshot()
mutator_result = run_command(
sandbox_task.mutator.command,
(sandbox_root / sandbox_task.mutator.cwd).resolve(),
sandbox_task.mutator.timeout_seconds,
)
if mutator_result.exit_code != 0:
return DecisionResult(
status="crash",
reason=f"mutator failed with exit code {mutator_result.exit_code}",
baseline_score=None if baseline_score is None or isinstance(baseline_score, ScoreResult) else baseline_score,
candidate_score=None,
)
try:
validate_candidate_changes(
sandbox_task,
baseline_snapshot,
sandbox_task.root_dir,
)
except MutationValidationError as exc:
return DecisionResult(
status="discard",
reason=str(exc),
baseline_score=None if baseline_score is None or isinstance(baseline_score, ScoreResult) else baseline_score,
candidate_score=None,
)
run_result = run_command(
sandbox_task.runner.command,
(sandbox_root / sandbox_task.runner.cwd).resolve(),
sandbox_task.runner.timeout_seconds,
)
if run_result.exit_code != 0:
return DecisionResult(
status="crash",
reason=f"command failed with exit code {run_result.exit_code}",
baseline_score=None if baseline_score is None or isinstance(baseline_score, ScoreResult) else baseline_score,
candidate_score=None,
)
scorer_result = run_command(
sandbox_task.scorer.command,
sandbox_root,
sandbox_task.runner.timeout_seconds,
)
if scorer_result.exit_code != 0:
return DecisionResult(
status="crash",
reason=f"scorer failed with exit code {scorer_result.exit_code}",
baseline_score=None if baseline_score is None or isinstance(baseline_score, ScoreResult) else baseline_score,
candidate_score=None,
)
score_result = parse_score_output(
scorer_result.stdout,
score_field=sandbox_task.scorer.parse.score_field,
metrics_field=sandbox_task.scorer.parse.metrics_field,
)
decision = decide_candidate(
baseline=baseline_score,
candidate=score_result,
objective=sandbox_task.objective,
constraints=sandbox_task.constraints,
tie_breakers=sandbox_task.policy.tie_breakers,
run_result=run_result,
)
if decision.status == "keep":
_sync_artifacts_back(task, sandbox_task)
return decision
- Step 4: Run the orchestrator tests to verify they pass
Run: uv run python -m unittest tests.test_orchestrator -v
Expected: OK
- Step 5: Commit the orchestrator
git add engine/orchestrator.py tests/test_orchestrator.py
git commit -m "feat: add single-iteration orchestrator"
Task 3: Make Mutation Validation Baseline-Aware
Files:
- Modify: `engine/mutation_engine.py`
- Modify: `tests/test_mutation_engine.py`
- Test: `tests/test_mutation_engine.py`

- Step 1: Add a failing cross-workspace validation test
# tests/test_mutation_engine.py
from pathlib import Path
import shutil
import tempfile
import unittest
from engine.artifact_manager import ArtifactManager
from engine.models import (
ArtifactSpec,
BudgetSpec,
ConstraintSpec,
LoggingSpec,
MutationSpec,
MutatorSpec,
ObjectiveSpec,
PolicySpec,
RunnerSpec,
ScorerParseSpec,
ScorerSpec,
TaskSpec,
)
from engine.mutation_engine import MutationValidationError, validate_candidate_changes
def _make_task(root_dir: Path) -> TaskSpec:
return TaskSpec(
id="mutation-test",
description="Mutation validation fixture.",
artifacts=ArtifactSpec(include=["fixtures/*"], exclude=[], max_files_per_iteration=1),
mutation=MutationSpec(mode="direct_edit", allowed_file_types=[".md"], max_changed_lines=1),
mutator=MutatorSpec(type="command", command="python mutate.py", cwd=".", timeout_seconds=30),
runner=RunnerSpec(command="python -c \"print('runner ok')\"", cwd=".", timeout_seconds=30),
scorer=ScorerSpec(
type="command",
command="python -c \"print('{\\\"score\\\": 1.0, \\\"metrics\\\": {}}')\"",
parse=ScorerParseSpec(format="json", score_field="score", metrics_field="metrics"),
),
objective=ObjectiveSpec(primary_metric="score", direction="maximize"),
constraints=[],
policy=PolicySpec(keep_if="better_primary", tie_breakers=[], on_failure="discard"),
budget=BudgetSpec(max_iterations=1, max_failures=1),
logging=LoggingSpec(results_file="work/results.jsonl", candidate_dir="work/candidates"),
root_dir=root_dir,
)
class MutationEngineTest(unittest.TestCase):
def test_rejects_too_many_changed_lines(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
root_dir = Path(tmp)
fixture_dir = root_dir / "fixtures"
fixture_dir.mkdir(parents=True)
target = fixture_dir / "note.md"
target.write_text("line 1\nline 2\n", encoding="utf-8")
task = _make_task(root_dir)
snapshot = ArtifactManager(task).snapshot()
target.write_text("line 1\nline 2\nline 3\n", encoding="utf-8")
with self.assertRaises(MutationValidationError):
validate_candidate_changes(task, snapshot, root_dir)
def test_rejects_disallowed_extension(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
root_dir = Path(tmp)
fixture_dir = root_dir / "fixtures"
fixture_dir.mkdir(parents=True)
target = fixture_dir / "note.md"
target.write_text("line 1\n", encoding="utf-8")
task = _make_task(root_dir)
snapshot = ArtifactManager(task).snapshot()
target.unlink()
(fixture_dir / "note.txt").write_text("line 1 changed\n", encoding="utf-8")
with self.assertRaises(MutationValidationError):
validate_candidate_changes(task, snapshot, root_dir)
def test_rejects_candidate_workspace_changes_against_baseline_snapshot(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
baseline_root = Path(tmp) / "baseline"
candidate_root = Path(tmp) / "candidate"
(baseline_root / "fixtures").mkdir(parents=True)
(baseline_root / "fixtures" / "note.md").write_text("base\n", encoding="utf-8")
shutil.copytree(baseline_root, candidate_root)
task = _make_task(candidate_root)
baseline_task = _make_task(baseline_root)
snapshot = ArtifactManager(baseline_task).snapshot()
(candidate_root / "fixtures" / "note.md").write_text("base\nextra\n", encoding="utf-8")
with self.assertRaises(MutationValidationError):
validate_candidate_changes(task, snapshot, candidate_root)
if __name__ == "__main__":
unittest.main()
- Step 2: Run the task loader tests to verify they fail
Run: uv run python -m unittest tests.test_task_loader -v
Expected: FAIL because TaskSpec has no mutator field and the loader does not parse it yet
- Step 3: Extend the shared models with `MutatorSpec`
# engine/models.py
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
@dataclass(frozen=True)
class ArtifactSpec:
include: list[str]
exclude: list[str]
max_files_per_iteration: int
@dataclass(frozen=True)
class MutationSpec:
mode: str
allowed_file_types: list[str]
max_changed_lines: int
@dataclass(frozen=True)
class MutatorSpec:
type: str
command: str
cwd: str
timeout_seconds: int
@dataclass(frozen=True)
class RunnerSpec:
command: str
cwd: str
timeout_seconds: int
@dataclass(frozen=True)
class ScorerParseSpec:
format: str
score_field: str
metrics_field: str
@dataclass(frozen=True)
class ScorerSpec:
type: str
command: str
parse: ScorerParseSpec
@dataclass(frozen=True)
class ObjectiveSpec:
primary_metric: str
direction: str
@dataclass(frozen=True)
class ConstraintSpec:
metric: str
op: str
value: Any
@dataclass(frozen=True)
class PolicySpec:
keep_if: str
tie_breakers: list[dict[str, str]]
on_failure: str
@dataclass(frozen=True)
class BudgetSpec:
max_iterations: int
max_failures: int
@dataclass(frozen=True)
class LoggingSpec:
results_file: str
candidate_dir: str
@dataclass(frozen=True)
class TaskSpec:
id: str
description: str
artifacts: ArtifactSpec
mutation: MutationSpec
mutator: MutatorSpec
runner: RunnerSpec
scorer: ScorerSpec
objective: ObjectiveSpec
constraints: list[ConstraintSpec]
policy: PolicySpec
budget: BudgetSpec
logging: LoggingSpec
root_dir: Path
@dataclass(frozen=True)
class BaselineSnapshot:
file_contents: dict[Path, str]
file_hashes: dict[Path, str]
@dataclass(frozen=True)
class RunResult:
command: str
cwd: Path
exit_code: int
runtime_seconds: float
stdout: str
stderr: str
@dataclass(frozen=True)
class ScoreResult:
primary_score: float
metrics: dict[str, Any]
raw_output: dict[str, Any]
@dataclass(frozen=True)
class DecisionResult:
status: str
reason: str
baseline_score: float | None
candidate_score: float | None
constraint_failures: list[str] = field(default_factory=list)
- Step 4: Parse and validate the mutator block
# engine/task_loader.py
from __future__ import annotations
from pathlib import Path
from typing import Any
import yaml
from engine.models import (
ArtifactSpec,
BudgetSpec,
ConstraintSpec,
LoggingSpec,
MutationSpec,
MutatorSpec,
ObjectiveSpec,
PolicySpec,
RunnerSpec,
ScorerParseSpec,
ScorerSpec,
TaskSpec,
)
class TaskValidationError(ValueError):
pass
def _require_mapping(value: Any, path: str) -> dict[str, Any]:
if not isinstance(value, dict):
raise TaskValidationError(f"{path} must be a mapping")
return value
def _require_list(value: Any, path: str) -> list[Any]:
if not isinstance(value, list):
raise TaskValidationError(f"{path} must be a list")
return value
def _require_value(mapping: dict[str, Any], key: str) -> Any:
if key not in mapping:
raise TaskValidationError(f"missing required field: {key}")
return mapping[key]
def load_task(task_path: Path) -> TaskSpec:
try:
task_data = yaml.safe_load(task_path.read_text(encoding="utf-8"))
except yaml.YAMLError as exc:
raise TaskValidationError(str(exc)) from exc
def _require_str(mapping: dict[str, Any], key: str, path: str) -> str:
value = _require_value(mapping, key)
if not isinstance(value, str):
raise TaskValidationError(f"{path}.{key} must be a string")
return value
def _require_int(mapping: dict[str, Any], key: str, path: str) -> int:
value = _require_value(mapping, key)
if not isinstance(value, int) or isinstance(value, bool):
raise TaskValidationError(f"{path}.{key} must be an integer")
return value
def _require_str_list(mapping: dict[str, Any], key: str, path: str) -> list[str]:
items = _require_list(_require_value(mapping, key), f"{path}.{key}")
result: list[str] = []
for index, item in enumerate(items):
if not isinstance(item, str):
raise TaskValidationError(f"{path}.{key}[{index}] must be a string")
result.append(item)
return result
def _require_tie_breakers(mapping: dict[str, Any], key: str, path: str) -> list[dict[str, str]]:
items = _require_list(_require_value(mapping, key), f"{path}.{key}")
result: list[dict[str, str]] = []
for index, item in enumerate(items):
entry = _require_mapping(item, f"{path}.{key}[{index}]")
result.append({str(k): str(v) for k, v in entry.items()})
return result
root = _require_mapping(task_data, "task")
artifacts_data = _require_mapping(_require_value(root, "artifacts"), "task.artifacts")
mutation_data = _require_mapping(_require_value(root, "mutation"), "task.mutation")
mutator_data = _require_mapping(_require_value(root, "mutator"), "task.mutator")
runner_data = _require_mapping(_require_value(root, "runner"), "task.runner")
scorer_data = _require_mapping(_require_value(root, "scorer"), "task.scorer")
scorer_parse_data = _require_mapping(_require_value(scorer_data, "parse"), "task.scorer.parse")
objective_data = _require_mapping(_require_value(root, "objective"), "task.objective")
policy_data = _require_mapping(_require_value(root, "policy"), "task.policy")
budget_data = _require_mapping(_require_value(root, "budget"), "task.budget")
logging_data = _require_mapping(_require_value(root, "logging"), "task.logging")
direction = _require_str(objective_data, "direction", "task.objective")
if direction not in {"maximize", "minimize"}:
raise TaskValidationError("task.objective.direction must be maximize or minimize")
mode = _require_str(mutation_data, "mode", "task.mutation")
if mode != "direct_edit":
raise TaskValidationError("task.mutation.mode must be direct_edit")
mutator_type = _require_str(mutator_data, "type", "task.mutator")
if mutator_type != "command":
raise TaskValidationError("task.mutator.type must be command")
scorer_type = _require_str(scorer_data, "type", "task.scorer")
if scorer_type != "command":
raise TaskValidationError("task.scorer.type must be command")
parse_format = _require_str(scorer_parse_data, "format", "task.scorer.parse")
if parse_format != "json":
raise TaskValidationError("task.scorer.parse.format must be json")
constraints_data = _require_list(_require_value(root, "constraints"), "task.constraints")
constraints = []
for index, item in enumerate(constraints_data):
constraint_data = _require_mapping(item, f"task.constraints[{index}]")
op = _require_str(constraint_data, "op", f"task.constraints[{index}]")
if op not in {"<=", ">=", "=="}:
raise TaskValidationError(f"task.constraints[{index}].op must be <=, >=, or ==")
constraints.append(
ConstraintSpec(
metric=_require_str(constraint_data, "metric", f"task.constraints[{index}]"),
op=op,
value=_require_value(constraint_data, "value"),
)
)
keep_if = _require_str(policy_data, "keep_if", "task.policy")
if keep_if != "better_primary":
raise TaskValidationError("task.policy.keep_if must be better_primary")
on_failure = _require_str(policy_data, "on_failure", "task.policy")
if on_failure != "discard":
raise TaskValidationError("task.policy.on_failure must be discard")
return TaskSpec(
id=_require_str(root, "id", "task"),
description=_require_str(root, "description", "task"),
artifacts=ArtifactSpec(
include=_require_str_list(artifacts_data, "include", "task.artifacts"),
exclude=_require_str_list(artifacts_data, "exclude", "task.artifacts"),
max_files_per_iteration=_require_int(artifacts_data, "max_files_per_iteration", "task.artifacts"),
),
mutation=MutationSpec(
mode=mode,
allowed_file_types=_require_str_list(mutation_data, "allowed_file_types", "task.mutation"),
max_changed_lines=_require_int(mutation_data, "max_changed_lines", "task.mutation"),
),
mutator=MutatorSpec(
type=mutator_type,
command=_require_str(mutator_data, "command", "task.mutator"),
cwd=_require_str(mutator_data, "cwd", "task.mutator"),
timeout_seconds=_require_int(mutator_data, "timeout_seconds", "task.mutator"),
),
runner=RunnerSpec(
command=_require_str(runner_data, "command", "task.runner"),
cwd=_require_str(runner_data, "cwd", "task.runner"),
timeout_seconds=_require_int(runner_data, "timeout_seconds", "task.runner"),
),
scorer=ScorerSpec(
type=scorer_type,
command=_require_str(scorer_data, "command", "task.scorer"),
parse=ScorerParseSpec(
format=parse_format,
score_field=_require_str(scorer_parse_data, "score_field", "task.scorer.parse"),
metrics_field=_require_str(scorer_parse_data, "metrics_field", "task.scorer.parse"),
),
),
objective=ObjectiveSpec(
primary_metric=_require_str(objective_data, "primary_metric", "task.objective"),
direction=direction,
),
constraints=constraints,
policy=PolicySpec(
keep_if=keep_if,
tie_breakers=_require_tie_breakers(policy_data, "tie_breakers", "task.policy"),
on_failure=on_failure,
),
budget=BudgetSpec(
max_iterations=_require_int(budget_data, "max_iterations", "task.budget"),
max_failures=_require_int(budget_data, "max_failures", "task.budget"),
),
logging=LoggingSpec(
results_file=_require_str(logging_data, "results_file", "task.logging"),
candidate_dir=_require_str(logging_data, "candidate_dir", "task.logging"),
),
root_dir=task_path.parent,
)
- Step 5: Run the task loader tests to verify they pass
Run: uv run python -m unittest tests.test_task_loader -v
Expected: OK
- Step 6: Commit the schema extension
git add engine/models.py engine/task_loader.py tests/test_task_loader.py
git commit -m "feat: add mutator spec to task schema"
Task 2: Add The Baseline-Aware Orchestrator Core
Files:
- Create: `engine/orchestrator.py`
- Create: `tests/test_orchestrator.py`
- Test: `tests/test_orchestrator.py`

- Step 1: Write failing orchestrator tests
# tests/test_orchestrator.py
from pathlib import Path
import tempfile
import unittest
from engine.orchestrator import run_single_iteration
from engine.models import (
ArtifactSpec,
BudgetSpec,
ConstraintSpec,
LoggingSpec,
MutationSpec,
MutatorSpec,
ObjectiveSpec,
PolicySpec,
RunnerSpec,
ScorerParseSpec,
ScorerSpec,
TaskSpec,
)
def make_task(root_dir: Path) -> TaskSpec:
return TaskSpec(
id="demo",
description="Demo task",
artifacts=ArtifactSpec(include=["task/*.md"], exclude=[], max_files_per_iteration=1),
mutation=MutationSpec(mode="direct_edit", allowed_file_types=[".md"], max_changed_lines=20),
mutator=MutatorSpec(
type="command",
command="python scripts/mutate_demo.py --artifact task/sample.md",
cwd=".",
timeout_seconds=30,
),
runner=RunnerSpec(
command="python scripts/evaluate_demo.py --artifact task/sample.md --output work/run.json",
cwd=".",
timeout_seconds=30,
),
scorer=ScorerSpec(
type="command",
command="python scripts/score_demo.py --input work/run.json",
parse=ScorerParseSpec(format="json", score_field="score", metrics_field="metrics"),
),
objective=ObjectiveSpec(primary_metric="score", direction="maximize"),
constraints=[],
policy=PolicySpec(keep_if="better_primary", tie_breakers=[], on_failure="discard"),
budget=BudgetSpec(max_iterations=1, max_failures=1),
logging=LoggingSpec(results_file="work/results.jsonl", candidate_dir="work/candidates"),
root_dir=root_dir,
)
class OrchestratorTest(unittest.TestCase):
def test_discard_leaves_main_workspace_unchanged(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
root = Path(tmp)
(root / "task").mkdir()
(root / "scripts").mkdir()
(root / "work").mkdir()
(root / "task" / "sample.md").write_text("# Original\n", encoding="utf-8")
(root / "scripts" / "mutate_demo.py").write_text(
"from pathlib import Path\n"
"import argparse\n"
"p=argparse.ArgumentParser(); p.add_argument('--artifact'); args=p.parse_args()\n"
"Path(args.artifact).write_text('# Candidate\\n', encoding='utf-8')\n",
encoding="utf-8",
)
(root / "scripts" / "evaluate_demo.py").write_text(
"from pathlib import Path\n"
"import argparse, json\n"
"p=argparse.ArgumentParser(); p.add_argument('--artifact'); p.add_argument('--output'); args=p.parse_args()\n"
"payload={'score': 0.5, 'metrics': {}}\n"
"Path(args.output).parent.mkdir(parents=True, exist_ok=True)\n"
"Path(args.output).write_text(json.dumps(payload), encoding='utf-8')\n",
encoding="utf-8",
)
(root / "scripts" / "score_demo.py").write_text(
"from pathlib import Path\n"
"import argparse\n"
"p=argparse.ArgumentParser(); p.add_argument('--input'); args=p.parse_args()\n"
"print(Path(args.input).read_text(encoding='utf-8'))\n",
encoding="utf-8",
)
decision = run_single_iteration(make_task(root), baseline_score=1.0)
self.assertEqual(decision.status, "discard")
self.assertEqual((root / "task" / "sample.md").read_text(encoding="utf-8"), "# Original\n")
def test_keep_syncs_candidate_back(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
root = Path(tmp)
(root / "task").mkdir()
(root / "scripts").mkdir()
(root / "work").mkdir()
(root / "task" / "sample.md").write_text("# Original\n", encoding="utf-8")
(root / "scripts" / "mutate_demo.py").write_text(
"from pathlib import Path\n"
"import argparse\n"
"p=argparse.ArgumentParser(); p.add_argument('--artifact'); args=p.parse_args()\n"
"Path(args.artifact).write_text('# Candidate\\n', encoding='utf-8')\n",
encoding="utf-8",
)
(root / "scripts" / "evaluate_demo.py").write_text(
"from pathlib import Path\n"
"import argparse, json\n"
"p=argparse.ArgumentParser(); p.add_argument('--artifact'); p.add_argument('--output'); args=p.parse_args()\n"
"payload={'score': 2.0, 'metrics': {}}\n"
"Path(args.output).parent.mkdir(parents=True, exist_ok=True)\n"
"Path(args.output).write_text(json.dumps(payload), encoding='utf-8')\n",
encoding="utf-8",
)
(root / "scripts" / "score_demo.py").write_text(
"from pathlib import Path\n"
"import argparse\n"
"p=argparse.ArgumentParser(); p.add_argument('--input'); args=p.parse_args()\n"
"print(Path(args.input).read_text(encoding='utf-8'))\n",
encoding="utf-8",
)
decision = run_single_iteration(make_task(root), baseline_score=1.0)
self.assertEqual(decision.status, "keep")
self.assertEqual((root / "task" / "sample.md").read_text(encoding="utf-8"), "# Candidate\n")
if __name__ == "__main__":
unittest.main()