# bidmaster-cli/src/bidmaster/utils/rag_context.py

from __future__ import annotations

import math
import re
from typing import List


# Matches any run of whitespace so it can be collapsed to a single space.
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_text_for_dedup(text: str) -> str:
    """Return a canonical form of *text* for duplicate comparison.

    Every run of whitespace is collapsed to one space, leading and
    trailing whitespace is removed, and the result is lower-cased.
    Falsy input (such as ``""`` or ``None``) yields ``""``.
    """
    if text:
        collapsed = _WHITESPACE_RE.sub(" ", text)
        return collapsed.strip().lower()
    return ""


# Characters that estimate_tokens counts as one token each. Punctuation
# blocks (CJK Symbols, fullwidth forms) are intentionally left out so they
# keep being costed like other non-CJK characters.
_CJK_CHAR_RE = re.compile(
    "["
    "\u3040-\u30ff"  # Hiragana and Katakana
    "\u3400-\u4dbf"  # CJK Unified Ideographs Extension A
    "\u4e00-\u9fff"  # CJK Unified Ideographs
    "\uac00-\ud7af"  # Hangul Syllables
    "\uf900-\ufaff"  # CJK Compatibility Ideographs
    "\U00020000-\U0003ffff"  # Supplementary / Tertiary Ideographic Planes
    "]"
)


def estimate_tokens(text: str) -> int:
    """Estimate token usage without extra dependencies.

    Heuristic:
    - CJK characters (Han ideographs including extension blocks, Japanese
      kana, Hangul syllables): ~1 token per char
    - Everything else: ~1 token per 4 characters, rounded up

    Args:
        text: The text to measure. Falsy input counts as zero tokens.

    Returns:
        The estimated number of tokens as a non-negative ``int``. The
        estimate never decreases when characters are appended to *text*.
    """
    if not text:
        return 0

    cjk_count = len(_CJK_CHAR_RE.findall(text))
    other_count = len(text) - cjk_count
    # math.ceil already returns an int in Python 3.
    return cjk_count + math.ceil(other_count / 4)


def truncate_to_token_budget(text: str, token_budget: int) -> str:
    """Cut *text* down so its estimated token count fits *token_budget*.

    Args:
        text: The text to shorten.
        token_budget: Maximum number of estimated tokens to keep.

    Returns:
        ``""`` when *text* is falsy or the budget is not positive; *text*
        unchanged when it already fits; otherwise the longest prefix whose
        ``estimate_tokens`` value is within the budget, with trailing
        whitespace stripped.
    """
    if not text or token_budget <= 0:
        return ""
    if estimate_tokens(text) <= token_budget:
        return text

    # Binary search over prefix lengths: `fits` is always a length known to
    # be within budget, `ceiling` the largest length still worth probing.
    fits = 0
    ceiling = len(text)
    while fits < ceiling:
        # Round the midpoint up so the loop always makes progress.
        probe = (fits + ceiling + 1) // 2
        if estimate_tokens(text[:probe]) > token_budget:
            ceiling = probe - 1
        else:
            fits = probe

    prefix = text[:fits]
    return prefix.rstrip()


def fit_texts_to_token_budget(
    texts: List[str],
    token_budget: int,
    *,
    separator: str = "\n\n",
) -> List[str]:
    """Pick texts, in order, until the estimated token budget is used up.

    Blank entries (falsy or whitespace-only) are always skipped. Each kept
    entry after the first is also charged the estimated cost of
    *separator*. The first entry that does not fit whole is truncated to
    the remaining budget (if any room is left) and ends the selection;
    later entries are not considered.

    Args:
        texts: Candidate texts in priority order.
        token_budget: Maximum estimated tokens for the result. A value
            ``<= 0`` disables the limit, so every non-blank text is kept.
        separator: String assumed to sit between consecutive kept texts;
            only its estimated token cost is used here.

    Returns:
        A new list with the kept texts; the last one may be truncated.
    """
    if token_budget <= 0:
        # No limit requested: just drop the blank entries.
        return [item for item in texts if (item or "").strip()]

    separator_cost = estimate_tokens(separator)
    kept: List[str] = []
    spent = 0

    for item in texts:
        if not (item or "").strip():
            continue

        # The separator is only paid for between two kept entries.
        joiner_cost = separator_cost if kept else 0
        needed = joiner_cost + estimate_tokens(item)

        if spent + needed <= token_budget:
            kept.append(item)
            spent += needed
            continue

        # The entry does not fit whole: keep whatever prefix still fits,
        # then stop, because the budget is exhausted either way.
        room = token_budget - spent - joiner_cost
        if room > 0:
            clipped = truncate_to_token_budget(item, room)
            if clipped:
                kept.append(clipped)
        break

    return kept