From 09ebea439db47bf08f925fc1654dc30415a58c64 Mon Sep 17 00:00:00 2001
From: Hugh Brown
Date: Tue, 10 Mar 2026 21:34:40 -0600
Subject: [PATCH] Guard against infinite loop when no training shards exist,
 fix README typo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add assertion after filtering val_path from parquet_paths for the
"train" split so an empty list fails fast instead of spinning in a
silent infinite loop. Also remove stray article "a" in README
("a three files" → "three files").

Co-Authored-By: Claude Opus 4.6
---
 README.md  | 2 +-
 prepare.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6f21194..2bc3051 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ The idea: give an AI agent a small but real LLM training setup and let it experi
 
 ## How it works
 
-The repo is deliberately kept small and only really has a three files that matter:
+The repo is deliberately kept small and only really has three files that matter:
 
 - **`prepare.py`** — fixed constants, one-time data prep (downloads training data, trains a BPE tokenizer), and runtime utilities (dataloader, evaluation). Not modified.
 - **`train.py`** — the single file the agent edits. Contains the full GPT model, optimizer (Muon + AdamW), and training loop. Everything is fair game: architecture, hyperparameters, optimizer, batch size, etc. **This file is edited and iterated on by the agent**.
diff --git a/prepare.py b/prepare.py
index 62607b9..06bea91 100644
--- a/prepare.py
+++ b/prepare.py
@@ -258,6 +258,7 @@ def _document_batches(split, tokenizer_batch_size=128):
     val_path = os.path.join(DATA_DIR, VAL_FILENAME)
     if split == "train":
         parquet_paths = [p for p in parquet_paths if p != val_path]
+        assert len(parquet_paths) > 0, "No training shards found."
     else:
         parquet_paths = [val_path]
     epoch = 1
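
A note on the failure mode: the hunk above only shows the shard-selection
block, not the loop that consumes parquet_paths, so the sketch below
assumes a typical epoch-loop shape (the `epoch = 1` context line suggests
one). It is illustrative, not the actual prepare.py code.

    # Hypothetical sketch of the consumption loop inside _document_batches
    # (the real body is outside this hunk). With an empty parquet_paths,
    # the inner for-loop runs zero iterations, nothing is ever yielded,
    # and the while-loop spins silently forever.
    def batches_sketch(parquet_paths):
        epoch = 1
        while True:
            for path in parquet_paths:   # zero iterations if list is empty
                yield path, epoch        # never reached with no shards
            epoch += 1                   # busy loop: straight back to while

    # The added assertion turns this hang into an immediate, explicit error:
    #   AssertionError: No training shards found.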