from __future__ import annotations

import math
import re
from typing import List

# Collapses any run of whitespace (spaces, tabs, newlines) to one space.
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_text_for_dedup(text: str) -> str:
    """Normalize *text* for deduplication comparisons.

    Collapses all whitespace runs to a single space, strips leading/trailing
    whitespace, and lowercases. Returns "" for empty/None-ish input.
    """
    if not text:
        return ""
    return _WHITESPACE_RE.sub(" ", text).strip().lower()


def estimate_tokens(text: str) -> int:
    """Estimate token usage without extra dependencies.

    Heuristic:
    - CJK characters: ~1 token per char
    - Non-CJK: ~1 token per 4 characters
    """
    if not text:
        return 0
    cjk = 0
    for ch in text:
        # CJK Unified Ideographs block (U+4E00..U+9FFF).
        if "\u4e00" <= ch <= "\u9fff":
            cjk += 1
    non_cjk_len = len(text) - cjk
    # ceil so any non-CJK remainder still costs a token.
    return cjk + int(math.ceil(non_cjk_len / 4))


def truncate_to_token_budget(text: str, token_budget: int) -> str:
    """Return the longest prefix of *text* within *token_budget* tokens.

    Uses binary search over prefix length against :func:`estimate_tokens`.
    Returns "" when the text is empty or the budget is non-positive; returns
    the full text unchanged when it already fits. The result is right-stripped
    so a truncation never ends in dangling whitespace.
    """
    if not text or token_budget <= 0:
        return ""
    if estimate_tokens(text) <= token_budget:
        return text
    # Binary search for the largest prefix length whose estimate fits.
    lo, hi = 0, len(text)
    while lo < hi:
        mid = (lo + hi + 1) // 2  # bias upward so lo = mid always progresses
        if estimate_tokens(text[:mid]) <= token_budget:
            lo = mid
        else:
            hi = mid - 1
    return text[:lo].rstrip()


def fit_texts_to_token_budget(
    texts: List[str],
    token_budget: int,
    *,
    separator: str = "\n\n",
) -> List[str]:
    """Greedily select texts (in order) that fit within *token_budget*.

    Blank/whitespace-only entries are skipped. The cost of *separator* is
    charged between every pair of selected texts. The first text that does
    not fully fit is truncated to the remaining budget (if any room is left)
    and selection stops there.

    With a non-positive budget, returns all non-blank texts unmodified
    (budgeting disabled).
    """
    if token_budget <= 0:
        # No budget to enforce: pass through every non-blank text.
        return [text for text in texts if (text or "").strip()]
    selected: List[str] = []
    used = 0
    sep_tokens = estimate_tokens(separator)
    for text in texts:
        if not (text or "").strip():
            continue  # skip blanks; they would only waste separator budget
        add_sep = sep_tokens if selected else 0  # no separator before first
        text_tokens = estimate_tokens(text)
        if used + add_sep + text_tokens <= token_budget:
            # Fits entirely: charge separator (if any) plus the text itself.
            if selected:
                used += sep_tokens
            selected.append(text)
            used += text_tokens
            continue
        remaining = token_budget - used - add_sep
        if remaining <= 0:
            break  # not even room for a separator + one token
        truncated = truncate_to_token_budget(text, remaining)
        if truncated:
            if selected:
                used += sep_tokens
            selected.append(truncated)
        # Greedy fill ends at the first text that required truncation.
        break
    return selected