# Tests for DocumentContextBuilder and DocumentContextSearcher
# (bidmaster.utils.document_context).
from pathlib import Path
|
|
|
|
from docx import Document
|
|
|
|
from bidmaster.utils.document_context import (
|
|
DocumentContextBuilder,
|
|
DocumentContextSearcher,
|
|
)
|
|
|
|
|
|
def _create_sample_doc(doc_path: Path) -> None:
    """Write a small .docx fixture to ``doc_path``.

    The document contains two level-1 headings, each followed by a single
    paragraph, and then a 2x2 table (one header row plus one data row).

    Args:
        doc_path: Destination path; handed unchanged to ``Document.save``.
    """
    doc = Document()

    # (heading, paragraph) pairs, added in document order.
    sections = (
        ("第一章 项目总体概述", "本项目聚焦城市智慧照明系统建设,强调云边协同与多维感知能力。"),
        ("第二章 建设目标", "目标包括统一管控平台、智能终端、数据中台三大部分。"),
    )
    for heading, paragraph in sections:
        doc.add_heading(heading, level=1)
        doc.add_paragraph(paragraph)

    # Table contents, row-major; the table is sized from this data so the
    # dimensions and the cell texts cannot drift apart.
    table_rows = (
        ("指标", "要求"),
        ("系统稳定性", "7x24小时无故障运行"),
    )
    table = doc.add_table(rows=len(table_rows), cols=len(table_rows[0]))
    for row, values in zip(table.rows, table_rows):
        for cell, value in zip(row.cells, values):
            cell.text = value

    doc.save(doc_path)
|
|
|
|
|
|
def _dummy_embedding(texts):
|
|
return [[float(len(text))] for text in texts]
|
|
|
|
|
|
def test_document_context_builder_creates_chunks(tmp_path):
    """Building a context from the sample document yields usable chunks.

    Checks that the built context is not empty, that every chunk carries a
    truthy embedding, and that at least one chunk's ``section`` contains the
    text of the first heading.
    """
    doc_path = tmp_path / "context.docx"
    _create_sample_doc(doc_path)

    builder = DocumentContextBuilder(
        chunk_size=120,
        chunk_overlap=10,
        embedding_fn=_dummy_embedding,
    )

    context = builder.build(str(doc_path))

    assert not context.is_empty()
    assert all(chunk.embedding for chunk in context.chunks)
    assert any("项目总体概述" in chunk.section for chunk in context.chunks)
|
|
|
|
|
|
def test_document_context_searcher_returns_matches(tmp_path):
    """Searching a built context returns scored matches and a theme summary.

    Builds a context from the sample document, queries it through a searcher
    configured with ``top_k=2``, and checks that the result is non-empty,
    that the first match has a positive ``score``, and that the theme summary
    contains the first chapter marker.
    """
    doc_path = tmp_path / "search.docx"
    _create_sample_doc(doc_path)

    builder = DocumentContextBuilder(
        chunk_size=80,
        chunk_overlap=10,
        embedding_fn=_dummy_embedding,
    )
    context = builder.build(str(doc_path))

    # The searcher gets the same embedding function the builder used.
    searcher = DocumentContextSearcher(
        context,
        embedding_fn=_dummy_embedding,
        top_k=2,
    )

    matches = searcher.search("智慧照明平台")
    assert matches
    assert matches[0].score > 0

    themes = searcher.summarize_themes()
    assert "第一章" in themes
|