bidmaster-cli/src/bidmaster/utils/rag_context.py

92 lines
2.0 KiB
Python

from __future__ import annotations
import math
import re
from typing import List
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_text_for_dedup(text: str) -> str:
    """Return a canonical form of *text* for duplicate detection.

    Every run of whitespace is collapsed to a single space, surrounding
    whitespace is removed and the result is lower-cased. Falsy input
    (an empty string or ``None``) yields ``""``.
    """
    if text:
        collapsed = _WHITESPACE_RE.sub(" ", text)
        return collapsed.strip().lower()
    return ""
# Inclusive code-point ranges billed at one token per character. Counting
# only the basic ideograph block would bill kana, Hangul, CJK punctuation and
# rarer ideographs at 1/4 token each, which underestimates and lets callers
# overshoot their budget.
_CJK_RANGES = (
    ("\u3000", "\u303f"),  # CJK symbols and punctuation
    ("\u3040", "\u30ff"),  # Hiragana and Katakana
    ("\u3400", "\u4dbf"),  # CJK Unified Ideographs Extension A
    ("\u4e00", "\u9fff"),  # CJK Unified Ideographs
    ("\uac00", "\ud7af"),  # Hangul syllables
    ("\uf900", "\ufaff"),  # CJK compatibility ideographs
    ("\uff00", "\uffef"),  # Halfwidth and fullwidth forms
    ("\U00020000", "\U0003ffff"),  # Supplementary/tertiary ideographic planes
)


def _is_cjk(ch: str) -> bool:
    """Return True if the single character *ch* lies in a CJK range."""
    if ch < "\u3000":
        # Fast path: everything below the first range (ASCII, Latin, ...).
        return False
    return any(low <= ch <= high for low, high in _CJK_RANGES)


def estimate_tokens(text: str) -> int:
    """Estimate token usage without extra dependencies.

    Heuristic:
    - CJK characters (ideographs, kana, Hangul, CJK punctuation and
      fullwidth forms): ~1 token per char
    - Non-CJK: ~1 token per 4 characters, rounded up

    Args:
        text: The text to measure. Falsy input counts as zero tokens.

    Returns:
        The estimated number of tokens as a non-negative int.
    """
    if not text:
        return 0
    cjk = sum(1 for ch in text if _is_cjk(ch))
    non_cjk_len = len(text) - cjk
    # math.ceil already returns an int for float input in Python 3.
    return cjk + math.ceil(non_cjk_len / 4)
def truncate_to_token_budget(text: str, token_budget: int) -> str:
    """Return the longest prefix of *text* that fits in *token_budget*.

    Size is measured with ``estimate_tokens``. Empty text or a
    non-positive budget yields ``""``; text that already fits is returned
    unchanged. A shortened prefix has its trailing whitespace stripped.
    """
    if not text:
        return ""
    if token_budget <= 0:
        return ""
    if estimate_tokens(text) <= token_budget:
        return text

    # Binary search for the largest prefix length whose estimate still fits.
    # ``keep`` always fits (the empty prefix costs 0); ``ceiling`` is the
    # largest length not yet ruled out.
    keep = 0
    ceiling = len(text)
    while keep < ceiling:
        # Round the midpoint up so that ``keep = probe`` always makes progress.
        probe = keep + (ceiling - keep + 1) // 2
        fits = estimate_tokens(text[:probe]) <= token_budget
        if fits:
            keep = probe
        else:
            ceiling = probe - 1
    prefix = text[:keep]
    return prefix.rstrip()
def fit_texts_to_token_budget(
    texts: List[str],
    token_budget: int,
    *,
    separator: str = "\n\n",
) -> List[str]:
    """Select leading texts whose estimated size fits a token budget.

    Texts are taken in order and blank or falsy entries are skipped. The
    estimated cost of ``separator`` is charged between consecutive selected
    texts. The first text that does not fit whole is truncated to the
    remaining budget (and kept if anything is left), then selection stops.

    Args:
        texts: Candidate texts, in the order they should be considered.
        token_budget: Maximum estimated tokens. A value <= 0 disables the
            limit: every non-blank text is returned unchanged.
        separator: Only its estimated token cost is used here; presumably
            the string the caller joins the result with.

    Returns:
        The selected texts; the last one may be a truncated prefix.
    """
    if token_budget <= 0:
        return [text for text in texts if (text or "").strip()]

    selected: List[str] = []
    used = 0
    sep_tokens = estimate_tokens(separator)
    for text in texts:
        if not (text or "").strip():
            continue
        # A separator is only paid for once something precedes this text.
        sep_cost = sep_tokens if selected else 0
        text_tokens = estimate_tokens(text)
        if used + sep_cost + text_tokens <= token_budget:
            selected.append(text)
            used += sep_cost + text_tokens
            continue
        # Does not fit whole: spend what is left on a truncated prefix, then
        # stop. No further bookkeeping is needed because the loop ends here.
        remaining = token_budget - used - sep_cost
        if remaining > 0:
            truncated = truncate_to_token_budget(text, remaining)
            if truncated:
                selected.append(truncated)
        break
    return selected