diff --git a/engine/decision_engine.py b/engine/decision_engine.py
index 791ed4e..b00f8c3 100644
--- a/engine/decision_engine.py
+++ b/engine/decision_engine.py
@@ -17,19 +17,61 @@ def _constraint_failed(candidate: ScoreResult, constraint: ConstraintSpec) -> bo
     raise ValueError(f"unsupported constraint operator: {constraint.op}")
 
 
+def _baseline_primary_score(baseline: float | ScoreResult | None) -> float | None:
+    if baseline is None:
+        return None
+    if isinstance(baseline, ScoreResult):
+        return baseline.primary_score
+    return baseline
+
+
+def _apply_tie_breakers(
+    baseline: ScoreResult,
+    candidate: ScoreResult,
+    tie_breakers: list[dict[str, str]],
+) -> tuple[str, str] | None:
+    for tie_breaker in tie_breakers:
+        if "lower" in tie_breaker:
+            metric = tie_breaker["lower"]
+            baseline_value = baseline.metrics.get(metric)
+            candidate_value = candidate.metrics.get(metric)
+            if baseline_value is None or candidate_value is None:
+                continue
+            if candidate_value < baseline_value:
+                return ("keep", f"candidate won tie-breaker: lower {metric}")
+            if candidate_value > baseline_value:
+                return ("discard", f"candidate lost tie-breaker: lower {metric}")
+            continue
+
+        if "higher" in tie_breaker:
+            metric = tie_breaker["higher"]
+            baseline_value = baseline.metrics.get(metric)
+            candidate_value = candidate.metrics.get(metric)
+            if baseline_value is None or candidate_value is None:
+                continue
+            if candidate_value > baseline_value:
+                return ("keep", f"candidate won tie-breaker: higher {metric}")
+            if candidate_value < baseline_value:
+                return ("discard", f"candidate lost tie-breaker: higher {metric}")
+
+    return None
+
+
 def decide_candidate(
-    baseline: float | None,
+    baseline: float | ScoreResult | None,
     candidate: ScoreResult,
     objective: ObjectiveSpec,
     constraints: list[ConstraintSpec],
     tie_breakers: list[dict[str, str]],
     run_result: RunResult,
 ) -> DecisionResult:
+    baseline_score = _baseline_primary_score(baseline)
+
     if run_result.exit_code != 0:
         return DecisionResult(
             status="crash",
             reason=f"command failed with exit code {run_result.exit_code}",
-            baseline_score=baseline,
+            baseline_score=baseline_score,
             candidate_score=None,
         )
 
@@ -42,12 +84,12 @@ def decide_candidate(
         return DecisionResult(
             status="discard",
             reason=f"constraint failure: {', '.join(failed_constraints)}",
-            baseline_score=baseline,
+            baseline_score=baseline_score,
             candidate_score=candidate.primary_score,
             constraint_failures=failed_constraints,
         )
 
-    if baseline is None:
+    if baseline_score is None:
         return DecisionResult(
             status="keep",
             reason="no baseline available",
@@ -56,21 +98,42 @@
 
     if objective.direction == "maximize":
-        better = candidate.primary_score > baseline
+        better = candidate.primary_score > baseline_score
+        worse = candidate.primary_score < baseline_score
     else:
-        better = candidate.primary_score < baseline
+        better = candidate.primary_score < baseline_score
+        worse = candidate.primary_score > baseline_score
 
     if better:
         return DecisionResult(
             status="keep",
             reason="candidate improved primary score",
-            baseline_score=baseline,
+            baseline_score=baseline_score,
             candidate_score=candidate.primary_score,
         )
 
+    if worse:
+        return DecisionResult(
+            status="discard",
+            reason="candidate did not improve primary score",
+            baseline_score=baseline_score,
+            candidate_score=candidate.primary_score,
+        )
+
+    if isinstance(baseline, ScoreResult):
+        tie_breaker_result = _apply_tie_breakers(baseline, candidate, tie_breakers)
+        if tie_breaker_result is not None:
+            status, reason = tie_breaker_result
+            return DecisionResult(
+                status=status,
+                reason=reason,
+                baseline_score=baseline_score,
+                candidate_score=candidate.primary_score,
+            )
+
     return DecisionResult(
         status="discard",
-        reason="candidate did not improve primary score",
-        baseline_score=baseline,
+        reason="candidate tied primary score and did not improve tie-breakers",
+        baseline_score=baseline_score,
         candidate_score=candidate.primary_score,
     )
 
diff --git a/tests/test_execution_pipeline.py b/tests/test_execution_pipeline.py
index 23b0437..d21d53b 100644
--- a/tests/test_execution_pipeline.py
+++ b/tests/test_execution_pipeline.py
@@ -60,6 +60,60 @@ class ExecutionPipelineTest(unittest.TestCase):
         self.assertEqual(decision.status, "discard")
         self.assertIn("violation_count", decision.reason)
 
+    def test_decide_candidate_keeps_equal_primary_when_lower_tie_breaker_improves(self) -> None:
+        decision = decide_candidate(
+            baseline=ScoreResult(
+                primary_score=4.0,
+                metrics={"latency_ms": 120},
+                raw_output={"score": 4.0, "metrics": {"latency_ms": 120}},
+            ),
+            candidate=ScoreResult(
+                primary_score=4.0,
+                metrics={"latency_ms": 100},
+                raw_output={"score": 4.0, "metrics": {"latency_ms": 100}},
+            ),
+            objective=ObjectiveSpec(primary_metric="score", direction="maximize"),
+            constraints=[],
+            tie_breakers=[{"lower": "latency_ms"}],
+            run_result=RunResult(
+                command="python -c \"print('ok')\"",
+                cwd=Path("."),
+                exit_code=0,
+                runtime_seconds=0.1,
+                stdout="ok\n",
+                stderr="",
+            ),
+        )
+        self.assertEqual(decision.status, "keep")
+        self.assertEqual(decision.reason, "candidate won tie-breaker: lower latency_ms")
+
+    def test_decide_candidate_discards_equal_primary_when_tie_breaker_does_not_improve(self) -> None:
+        decision = decide_candidate(
+            baseline=ScoreResult(
+                primary_score=4.0,
+                metrics={"latency_ms": 100},
+                raw_output={"score": 4.0, "metrics": {"latency_ms": 100}},
+            ),
+            candidate=ScoreResult(
+                primary_score=4.0,
+                metrics={"latency_ms": 120},
+                raw_output={"score": 4.0, "metrics": {"latency_ms": 120}},
+            ),
+            objective=ObjectiveSpec(primary_metric="score", direction="maximize"),
+            constraints=[],
+            tie_breakers=[{"lower": "latency_ms"}],
+            run_result=RunResult(
+                command="python -c \"print('ok')\"",
+                cwd=Path("."),
+                exit_code=0,
+                runtime_seconds=0.1,
+                stdout="ok\n",
+                stderr="",
+            ),
+        )
+        self.assertEqual(decision.status, "discard")
+        self.assertEqual(decision.reason, "candidate lost tie-breaker: lower latency_ms")
+
 class RunTaskCliTest(unittest.TestCase):
     def _copy_repo_layout(self, destination: Path) -> None: