28 lines
777 B
Python
28 lines
777 B
Python
from __future__ import annotations
|
|
|
|
from bidmaster.utils.rag_context import (
|
|
estimate_tokens,
|
|
fit_texts_to_token_budget,
|
|
truncate_to_token_budget,
|
|
)
|
|
|
|
|
|
def test_estimate_tokens_cjk_vs_ascii():
    """Check lower bounds on the token estimate for a CJK and an ASCII sample.

    The two-character CJK string must be estimated at two tokens or more,
    and the four-letter ASCII string at one token or more.
    """
    cjk_estimate = estimate_tokens("中文")
    assert cjk_estimate >= 2

    ascii_estimate = estimate_tokens("abcd")
    assert ascii_estimate >= 1
|
|
|
|
|
|
def test_truncate_to_token_budget_truncates():
    """Check that truncation yields non-empty text within the token budget.

    A six-character CJK string is truncated to a budget of three; the result
    must be truthy and its token estimate must not exceed that budget.
    """
    token_limit = 3
    source_text = "中文中文中文"  # six CJK characters

    shortened = truncate_to_token_budget(source_text, token_limit)

    # Truncation must keep some content rather than return nothing.
    assert shortened
    assert estimate_tokens(shortened) <= token_limit
|
|
|
|
|
|
def test_fit_texts_to_token_budget_drops_or_truncates_tail():
    """Check that only the first text is kept when the budget just covers it.

    The budget is set to the token estimate of the first text, so the
    fitted result is expected to be a list holding that first text alone.
    """
    head = "中文中文"  # four CJK characters
    tail = "中文中文中文"  # six CJK characters

    # Budget sized to the first text only, leaving no room for the second.
    token_limit = estimate_tokens(head)
    candidates = [head, tail]

    result = fit_texts_to_token_budget(candidates, token_limit)

    assert result == [head]
|