Honor tie breakers in candidate decisions
This commit is contained in:
parent
c2ddb08135
commit
6560cdde97
@ -17,19 +17,61 @@ def _constraint_failed(candidate: ScoreResult, constraint: ConstraintSpec) -> bo
|
||||
raise ValueError(f"unsupported constraint operator: {constraint.op}")
|
||||
|
||||
|
||||
def _baseline_primary_score(baseline: float | ScoreResult | None) -> float | None:
|
||||
if baseline is None:
|
||||
return None
|
||||
if isinstance(baseline, ScoreResult):
|
||||
return baseline.primary_score
|
||||
return baseline
|
||||
|
||||
|
||||
def _apply_tie_breakers(
|
||||
baseline: ScoreResult,
|
||||
candidate: ScoreResult,
|
||||
tie_breakers: list[dict[str, str]],
|
||||
) -> tuple[str, str] | None:
|
||||
for tie_breaker in tie_breakers:
|
||||
if "lower" in tie_breaker:
|
||||
metric = tie_breaker["lower"]
|
||||
baseline_value = baseline.metrics.get(metric)
|
||||
candidate_value = candidate.metrics.get(metric)
|
||||
if baseline_value is None or candidate_value is None:
|
||||
continue
|
||||
if candidate_value < baseline_value:
|
||||
return ("keep", f"candidate won tie-breaker: lower {metric}")
|
||||
if candidate_value > baseline_value:
|
||||
return ("discard", f"candidate lost tie-breaker: lower {metric}")
|
||||
continue
|
||||
|
||||
if "higher" in tie_breaker:
|
||||
metric = tie_breaker["higher"]
|
||||
baseline_value = baseline.metrics.get(metric)
|
||||
candidate_value = candidate.metrics.get(metric)
|
||||
if baseline_value is None or candidate_value is None:
|
||||
continue
|
||||
if candidate_value > baseline_value:
|
||||
return ("keep", f"candidate won tie-breaker: higher {metric}")
|
||||
if candidate_value < baseline_value:
|
||||
return ("discard", f"candidate lost tie-breaker: higher {metric}")
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def decide_candidate(
|
||||
baseline: float | None,
|
||||
baseline: float | ScoreResult | None,
|
||||
candidate: ScoreResult,
|
||||
objective: ObjectiveSpec,
|
||||
constraints: list[ConstraintSpec],
|
||||
tie_breakers: list[dict[str, str]],
|
||||
run_result: RunResult,
|
||||
) -> DecisionResult:
|
||||
baseline_score = _baseline_primary_score(baseline)
|
||||
|
||||
if run_result.exit_code != 0:
|
||||
return DecisionResult(
|
||||
status="crash",
|
||||
reason=f"command failed with exit code {run_result.exit_code}",
|
||||
baseline_score=baseline,
|
||||
baseline_score=baseline_score,
|
||||
candidate_score=None,
|
||||
)
|
||||
|
||||
@ -42,12 +84,12 @@ def decide_candidate(
|
||||
return DecisionResult(
|
||||
status="discard",
|
||||
reason=f"constraint failure: {', '.join(failed_constraints)}",
|
||||
baseline_score=baseline,
|
||||
baseline_score=baseline_score,
|
||||
candidate_score=candidate.primary_score,
|
||||
constraint_failures=failed_constraints,
|
||||
)
|
||||
|
||||
if baseline is None:
|
||||
if baseline_score is None:
|
||||
return DecisionResult(
|
||||
status="keep",
|
||||
reason="no baseline available",
|
||||
@ -56,21 +98,42 @@ def decide_candidate(
|
||||
)
|
||||
|
||||
if objective.direction == "maximize":
|
||||
better = candidate.primary_score > baseline
|
||||
better = candidate.primary_score > baseline_score
|
||||
worse = candidate.primary_score < baseline_score
|
||||
else:
|
||||
better = candidate.primary_score < baseline
|
||||
better = candidate.primary_score < baseline_score
|
||||
worse = candidate.primary_score > baseline_score
|
||||
|
||||
if better:
|
||||
return DecisionResult(
|
||||
status="keep",
|
||||
reason="candidate improved primary score",
|
||||
baseline_score=baseline,
|
||||
baseline_score=baseline_score,
|
||||
candidate_score=candidate.primary_score,
|
||||
)
|
||||
|
||||
if worse:
|
||||
return DecisionResult(
|
||||
status="discard",
|
||||
reason="candidate did not improve primary score",
|
||||
baseline_score=baseline_score,
|
||||
candidate_score=candidate.primary_score,
|
||||
)
|
||||
|
||||
if isinstance(baseline, ScoreResult):
|
||||
tie_breaker_result = _apply_tie_breakers(baseline, candidate, tie_breakers)
|
||||
if tie_breaker_result is not None:
|
||||
status, reason = tie_breaker_result
|
||||
return DecisionResult(
|
||||
status=status,
|
||||
reason=reason,
|
||||
baseline_score=baseline_score,
|
||||
candidate_score=candidate.primary_score,
|
||||
)
|
||||
|
||||
return DecisionResult(
|
||||
status="discard",
|
||||
reason="candidate did not improve primary score",
|
||||
baseline_score=baseline,
|
||||
reason="candidate tied primary score and did not improve tie-breakers",
|
||||
baseline_score=baseline_score,
|
||||
candidate_score=candidate.primary_score,
|
||||
)
|
||||
|
||||
@ -60,6 +60,60 @@ class ExecutionPipelineTest(unittest.TestCase):
|
||||
self.assertEqual(decision.status, "discard")
|
||||
self.assertIn("violation_count", decision.reason)
|
||||
|
||||
def test_decide_candidate_keeps_equal_primary_when_lower_tie_breaker_improves(self) -> None:
|
||||
decision = decide_candidate(
|
||||
baseline=ScoreResult(
|
||||
primary_score=4.0,
|
||||
metrics={"latency_ms": 120},
|
||||
raw_output={"score": 4.0, "metrics": {"latency_ms": 120}},
|
||||
),
|
||||
candidate=ScoreResult(
|
||||
primary_score=4.0,
|
||||
metrics={"latency_ms": 100},
|
||||
raw_output={"score": 4.0, "metrics": {"latency_ms": 100}},
|
||||
),
|
||||
objective=ObjectiveSpec(primary_metric="score", direction="maximize"),
|
||||
constraints=[],
|
||||
tie_breakers=[{"lower": "latency_ms"}],
|
||||
run_result=RunResult(
|
||||
command="python -c \"print('ok')\"",
|
||||
cwd=Path("."),
|
||||
exit_code=0,
|
||||
runtime_seconds=0.1,
|
||||
stdout="ok\n",
|
||||
stderr="",
|
||||
),
|
||||
)
|
||||
self.assertEqual(decision.status, "keep")
|
||||
self.assertEqual(decision.reason, "candidate won tie-breaker: lower latency_ms")
|
||||
|
||||
def test_decide_candidate_discards_equal_primary_when_tie_breaker_does_not_improve(self) -> None:
|
||||
decision = decide_candidate(
|
||||
baseline=ScoreResult(
|
||||
primary_score=4.0,
|
||||
metrics={"latency_ms": 100},
|
||||
raw_output={"score": 4.0, "metrics": {"latency_ms": 100}},
|
||||
),
|
||||
candidate=ScoreResult(
|
||||
primary_score=4.0,
|
||||
metrics={"latency_ms": 120},
|
||||
raw_output={"score": 4.0, "metrics": {"latency_ms": 120}},
|
||||
),
|
||||
objective=ObjectiveSpec(primary_metric="score", direction="maximize"),
|
||||
constraints=[],
|
||||
tie_breakers=[{"lower": "latency_ms"}],
|
||||
run_result=RunResult(
|
||||
command="python -c \"print('ok')\"",
|
||||
cwd=Path("."),
|
||||
exit_code=0,
|
||||
runtime_seconds=0.1,
|
||||
stdout="ok\n",
|
||||
stderr="",
|
||||
),
|
||||
)
|
||||
self.assertEqual(decision.status, "discard")
|
||||
self.assertEqual(decision.reason, "candidate lost tie-breaker: lower latency_ms")
|
||||
|
||||
|
||||
class RunTaskCliTest(unittest.TestCase):
|
||||
def _copy_repo_layout(self, destination: Path) -> None:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user