Skill-BidCreater/scripts/inspect_docx_text.py
2026-03-09 22:20:38 +08:00

60 lines
1.7 KiB
Python

from __future__ import annotations
import argparse
from pathlib import Path
from docx import Document
from common import write_json, write_text
def extract_docx_text(docx_path: Path) -> str:
    """Return the non-empty text of a DOCX file, one chunk per line.

    Body paragraphs are collected first, followed by the cells of every
    top-level table in row-major order, so the result does not interleave
    paragraphs and tables the way they appear in the document. Each chunk
    is stripped of surrounding whitespace and blank chunks are dropped.

    python-docx reports a merged cell once for every grid position it
    spans, so each underlying cell is emitted only once per table; this
    keeps merged cells from duplicating text and inflating the statistics
    derived from the result.

    Args:
        docx_path: Path of the ``.docx`` file to read.

    Returns:
        The collected chunks joined with ``"\\n"`` (an empty string when
        the document contains no text).
    """
    document = Document(docx_path)
    chunks: list[str] = []

    for paragraph in document.paragraphs:
        paragraph_text = paragraph.text.strip()
        if paragraph_text:
            chunks.append(paragraph_text)

    for table in document.tables:
        # Holding the XML elements themselves (not their ids) keeps them
        # alive, so identity-based membership stays reliable for the table.
        seen_cells = set()
        for row in table.rows:
            for cell in row.cells:
                # Spanned grid positions share one <w:tc> element; fall back
                # to the cell object if the private handle is unavailable.
                cell_key = getattr(cell, "_tc", cell)
                if cell_key in seen_cells:
                    continue
                seen_cells.add(cell_key)
                cell_text = cell.text.strip()
                if cell_text:
                    chunks.append(cell_text)

    return "\n".join(chunks)
def main() -> None:
    """Command-line entry point: dump a DOCX file's text and basic counts.

    Parses the required ``--docx`` (input file) and ``--out`` (output
    directory, created if missing) options, extracts the document text and
    hands three outputs to the ``write_json`` / ``write_text`` helpers:

    * ``docx_text_inspection.json`` - resolved input path, character count
      and number of non-blank lines;
    * ``docx_text_dump.txt`` - the extracted text itself;
    * ``docx_text_inspection.md`` - a short Markdown summary of the counts.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--docx", "--out"):
        cli.add_argument(flag, required=True)
    options = cli.parse_args()

    source = Path(options.docx).resolve()
    target_dir = Path(options.out).resolve()
    # The output directory is created before extraction is attempted.
    target_dir.mkdir(parents=True, exist_ok=True)

    text = extract_docx_text(source)
    non_blank_lines = sum(1 for line in text.splitlines() if line.strip())
    report = {
        "docx": str(source),
        "character_count": len(text),
        "line_count": non_blank_lines,
    }

    # User-facing summary; the runtime text is intentionally kept in Chinese.
    summary_lines = [
        f"# {source.name} 机械文本检查",
        "",
        f"- 文本字符数:{report['character_count']}",
        f"- 非空行数:{report['line_count']}",
        "",
        "说明:本脚本只做 DOCX 文本提取与基础统计,不负责一致性、合规性或投标判断。",
    ]

    write_json(target_dir / "docx_text_inspection.json", report)
    write_text(target_dir / "docx_text_dump.txt", text)
    write_text(target_dir / "docx_text_inspection.md", "\n".join(summary_lines))
# Run the CLI only when this file is executed as a script, so importing the
# module (e.g. to reuse extract_docx_text) has no side effects.
if __name__ == "__main__":
    main()