Honor tie breakers in candidate decisions

This commit is contained in:
sladro 2026-04-02 14:13:17 +08:00
parent c2ddb08135
commit 6560cdde97
2 changed files with 126 additions and 9 deletions

View File

@ -17,19 +17,61 @@ def _constraint_failed(candidate: ScoreResult, constraint: ConstraintSpec) -> bo
raise ValueError(f"unsupported constraint operator: {constraint.op}")
def _baseline_primary_score(baseline: float | ScoreResult | None) -> float | None:
if baseline is None:
return None
if isinstance(baseline, ScoreResult):
return baseline.primary_score
return baseline
def _apply_tie_breakers(
baseline: ScoreResult,
candidate: ScoreResult,
tie_breakers: list[dict[str, str]],
) -> tuple[str, str] | None:
for tie_breaker in tie_breakers:
if "lower" in tie_breaker:
metric = tie_breaker["lower"]
baseline_value = baseline.metrics.get(metric)
candidate_value = candidate.metrics.get(metric)
if baseline_value is None or candidate_value is None:
continue
if candidate_value < baseline_value:
return ("keep", f"candidate won tie-breaker: lower {metric}")
if candidate_value > baseline_value:
return ("discard", f"candidate lost tie-breaker: lower {metric}")
continue
if "higher" in tie_breaker:
metric = tie_breaker["higher"]
baseline_value = baseline.metrics.get(metric)
candidate_value = candidate.metrics.get(metric)
if baseline_value is None or candidate_value is None:
continue
if candidate_value > baseline_value:
return ("keep", f"candidate won tie-breaker: higher {metric}")
if candidate_value < baseline_value:
return ("discard", f"candidate lost tie-breaker: higher {metric}")
return None
def decide_candidate(
baseline: float | None,
baseline: float | ScoreResult | None,
candidate: ScoreResult,
objective: ObjectiveSpec,
constraints: list[ConstraintSpec],
tie_breakers: list[dict[str, str]],
run_result: RunResult,
) -> DecisionResult:
baseline_score = _baseline_primary_score(baseline)
if run_result.exit_code != 0:
return DecisionResult(
status="crash",
reason=f"command failed with exit code {run_result.exit_code}",
baseline_score=baseline,
baseline_score=baseline_score,
candidate_score=None,
)
@ -42,12 +84,12 @@ def decide_candidate(
return DecisionResult(
status="discard",
reason=f"constraint failure: {', '.join(failed_constraints)}",
baseline_score=baseline,
baseline_score=baseline_score,
candidate_score=candidate.primary_score,
constraint_failures=failed_constraints,
)
if baseline is None:
if baseline_score is None:
return DecisionResult(
status="keep",
reason="no baseline available",
@ -56,21 +98,42 @@ def decide_candidate(
)
if objective.direction == "maximize":
better = candidate.primary_score > baseline
better = candidate.primary_score > baseline_score
worse = candidate.primary_score < baseline_score
else:
better = candidate.primary_score < baseline
better = candidate.primary_score < baseline_score
worse = candidate.primary_score > baseline_score
if better:
return DecisionResult(
status="keep",
reason="candidate improved primary score",
baseline_score=baseline,
baseline_score=baseline_score,
candidate_score=candidate.primary_score,
)
if worse:
return DecisionResult(
status="discard",
reason="candidate did not improve primary score",
baseline_score=baseline_score,
candidate_score=candidate.primary_score,
)
if isinstance(baseline, ScoreResult):
tie_breaker_result = _apply_tie_breakers(baseline, candidate, tie_breakers)
if tie_breaker_result is not None:
status, reason = tie_breaker_result
return DecisionResult(
status=status,
reason=reason,
baseline_score=baseline_score,
candidate_score=candidate.primary_score,
)
return DecisionResult(
status="discard",
reason="candidate did not improve primary score",
baseline_score=baseline,
reason="candidate tied primary score and did not improve tie-breakers",
baseline_score=baseline_score,
candidate_score=candidate.primary_score,
)

View File

@ -60,6 +60,60 @@ class ExecutionPipelineTest(unittest.TestCase):
self.assertEqual(decision.status, "discard")
self.assertIn("violation_count", decision.reason)
def test_decide_candidate_keeps_equal_primary_when_lower_tie_breaker_improves(self) -> None:
decision = decide_candidate(
baseline=ScoreResult(
primary_score=4.0,
metrics={"latency_ms": 120},
raw_output={"score": 4.0, "metrics": {"latency_ms": 120}},
),
candidate=ScoreResult(
primary_score=4.0,
metrics={"latency_ms": 100},
raw_output={"score": 4.0, "metrics": {"latency_ms": 100}},
),
objective=ObjectiveSpec(primary_metric="score", direction="maximize"),
constraints=[],
tie_breakers=[{"lower": "latency_ms"}],
run_result=RunResult(
command="python -c \"print('ok')\"",
cwd=Path("."),
exit_code=0,
runtime_seconds=0.1,
stdout="ok\n",
stderr="",
),
)
self.assertEqual(decision.status, "keep")
self.assertEqual(decision.reason, "candidate won tie-breaker: lower latency_ms")
def test_decide_candidate_discards_equal_primary_when_tie_breaker_does_not_improve(self) -> None:
decision = decide_candidate(
baseline=ScoreResult(
primary_score=4.0,
metrics={"latency_ms": 100},
raw_output={"score": 4.0, "metrics": {"latency_ms": 100}},
),
candidate=ScoreResult(
primary_score=4.0,
metrics={"latency_ms": 120},
raw_output={"score": 4.0, "metrics": {"latency_ms": 120}},
),
objective=ObjectiveSpec(primary_metric="score", direction="maximize"),
constraints=[],
tie_breakers=[{"lower": "latency_ms"}],
run_result=RunResult(
command="python -c \"print('ok')\"",
cwd=Path("."),
exit_code=0,
runtime_seconds=0.1,
stdout="ok\n",
stderr="",
),
)
self.assertEqual(decision.status, "discard")
self.assertEqual(decision.reason, "candidate lost tie-breaker: lower latency_ms")
class RunTaskCliTest(unittest.TestCase):
def _copy_repo_layout(self, destination: Path) -> None: