# Python source (originally listed as 92 lines, 2.0 KiB).
from __future__ import annotations
|
|
|
|
import math
|
|
import re
|
|
from typing import List
|
|
|
|
|
|
# Matches any run of one or more whitespace characters; compiled once at
# import time so normalize_text_for_dedup can collapse each run to a single
# space without re-looking-up the pattern on every call.
_WHITESPACE_RE = re.compile(r"\s+")
|
|
|
|
|
|
def normalize_text_for_dedup(text: str) -> str:
    """Return a canonical form of *text* suitable for duplicate detection.

    Every run of whitespace is collapsed to a single space, surrounding
    whitespace is removed, and the result is lower-cased, so strings that
    differ only in spacing or letter case normalize to the same value.

    Args:
        text: The raw string. Any falsy value (e.g. ``""``) yields ``""``.

    Returns:
        The normalized string.
    """
    if text:
        collapsed = _WHITESPACE_RE.sub(" ", text)
        return collapsed.strip().lower()
    return ""
|
|
|
|
|
|
def estimate_tokens(text: str) -> int:
    """Estimate token usage without extra dependencies.

    Heuristic:
    - CJK characters: ~1 token per char
    - Non-CJK: ~1 token per 4 characters (rounded up)

    "CJK" covers Han ideographs (the basic block, Extension A, the
    compatibility block and the supplementary ideographic planes), Japanese
    Hiragana/Katakana and Korean Hangul syllables. Counting only the basic
    Han block would bill kana and Hangul at a quarter token each and
    under-estimate Japanese/Korean text roughly fourfold.

    Args:
        text: The string to measure. Falsy input counts as 0 tokens.

    Returns:
        The estimated token count as a non-negative integer.
    """
    if not text:
        return 0

    # Inclusive code-point ranges billed at one token per character.
    cjk_ranges = (
        (0x3040, 0x30FF),    # Hiragana and Katakana
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0xAC00, 0xD7AF),    # Hangul Syllables
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x20000, 0x3FFFF),  # Supplementary and Tertiary Ideographic Planes
    )
    lowest_cjk = cjk_ranges[0][0]

    cjk = 0
    for ch in text:
        code_point = ord(ch)
        # Cheap rejection for ASCII/Latin text, which is the common case.
        if code_point < lowest_cjk:
            continue
        if any(start <= code_point <= end for start, end in cjk_ranges):
            cjk += 1

    non_cjk_len = len(text) - cjk
    # math.ceil already returns an int on Python 3.
    return cjk + math.ceil(non_cjk_len / 4)
|
|
|
|
|
|
def truncate_to_token_budget(text: str, token_budget: int) -> str:
    """Return the longest prefix of *text* whose estimate fits the budget.

    The cost of a prefix is measured with ``estimate_tokens``. When the whole
    text already fits it is returned unchanged; otherwise the prefix is cut
    and trailing whitespace is stripped from the cut result.

    Args:
        text: The string to shorten.
        token_budget: Maximum estimated tokens allowed for the result.

    Returns:
        ``""`` when *text* is falsy or *token_budget* is not positive, the
        original *text* when it fits, otherwise the right-stripped prefix
        (which may itself be empty if the fitting prefix was all whitespace).
    """
    if not text or token_budget <= 0:
        return ""

    whole_cost = estimate_tokens(text)
    if whole_cost <= token_budget:
        return text

    # Binary search over prefix lengths. `fits` is always a length known to
    # be within budget (the empty prefix costs 0); `limit` is the largest
    # length still worth trying. This relies on the estimate never
    # decreasing as the prefix grows.
    fits, limit = 0, len(text)
    while fits < limit:
        # Round up so the loop always makes progress when limit == fits + 1.
        probe = (fits + limit + 1) // 2
        if estimate_tokens(text[:probe]) > token_budget:
            limit = probe - 1
        else:
            fits = probe

    prefix = text[:fits]
    return prefix.rstrip()
|
|
|
|
|
|
def fit_texts_to_token_budget(
    texts: List[str],
    token_budget: int,
    *,
    separator: str = "\n\n",
) -> List[str]:
    """Pick texts, in order, until the estimated token budget is exhausted.

    Blank entries (falsy or whitespace-only) are always skipped. Each kept
    text after the first is charged the estimated cost of *separator* in
    addition to its own cost. The first text that does not fit is truncated
    to whatever budget remains (if any) and ends the selection; later texts
    are never considered, even if they would fit.

    Args:
        texts: Candidate strings in priority order.
        token_budget: Maximum estimated tokens for the selection. A value
            ``<= 0`` disables the limit: every non-blank text is returned.
        separator: String assumed to join the selected texts; only its
            estimated token cost is used here.

    Returns:
        A new list holding the selected (and possibly one truncated) texts.
    """
    if token_budget <= 0:
        return [text for text in texts if (text or "").strip()]

    separator_cost = estimate_tokens(separator)
    kept: List[str] = []
    spent = 0

    for candidate in texts:
        if not (candidate or "").strip():
            continue

        # The separator is only paid for between two kept texts.
        joiner_cost = separator_cost if kept else 0
        available = token_budget - spent - joiner_cost
        candidate_cost = estimate_tokens(candidate)

        if candidate_cost <= available:
            kept.append(candidate)
            spent += joiner_cost + candidate_cost
            continue

        # First text that does not fit: keep as much of it as the remaining
        # budget allows, then stop looking at further texts.
        if available > 0:
            clipped = truncate_to_token_budget(candidate, available)
            if clipped:
                kept.append(clipped)
        break

    return kept
|