From b5ba8ac00d64d38fd5b69714fe398b61c56d8160 Mon Sep 17 00:00:00 2001 From: haosenwang1018 <1293965075@qq.com> Date: Mon, 9 Mar 2026 23:51:02 +0800 Subject: [PATCH 1/2] fix NaN loss not caught by fast-fail check `train_loss_f > 100` silently passes on NaN because every IEEE 754 ordered comparison (`<`, `<=`, `>`, `>=`) involving NaN evaluates to False. When an agent experiment produces NaN (e.g. from an aggressive LR change), the run wastes the full 5-minute budget instead of failing fast. `not (x <= 100)` catches both >100 and NaN with no added complexity. Co-Authored-By: Claude Opus 4.6 --- train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index 6994fb9..1378bab 100644 --- a/train.py +++ b/train.py @@ -565,8 +565,8 @@ while True: train_loss_f = train_loss.item() - # Fast fail: abort if loss is exploding - if train_loss_f > 100: + # Fast fail: abort if loss is exploding or NaN + if not train_loss_f <= 100: print("FAIL") exit(1) From ebf357841b7358ae106b2f7ac378f9ec97b975f6 Mon Sep 17 00:00:00 2001 From: Contributor Date: Wed, 11 Mar 2026 04:28:08 +0000 Subject: [PATCH 2/2] fix(train): make NaN fast-fail check explicit --- train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index 1378bab..2e74397 100644 --- a/train.py +++ b/train.py @@ -9,6 +9,7 @@ os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1" import gc +import math import time from dataclasses import dataclass, asdict @@ -566,7 +567,7 @@ while True: train_loss_f = train_loss.item() # Fast fail: abort if loss is exploding or NaN - if not train_loss_f <= 100: + if math.isnan(train_loss_f) or train_loss_f > 100: print("FAIL") exit(1)