60 lines
1.7 KiB
Python
60 lines
1.7 KiB
Python
from __future__ import annotations
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
from docx import Document
|
|
|
|
from common import write_json, write_text
|
|
|
|
|
|
def extract_docx_text(docx_path: Path) -> str:
    """Extract the non-empty text of a DOCX file as newline-joined chunks.

    The paragraphs exposed by ``document.paragraphs`` are emitted first,
    followed by the table cell text of ``document.tables`` in row-major
    order, so the result is not in interleaved document order.  Every chunk
    is stripped of surrounding whitespace and blank chunks are dropped.

    A merged table cell is emitted only once.  python-docx reports a merged
    cell at every grid position it covers, so walking ``row.cells`` naively
    repeats its text and inflates any statistics derived from the result.

    Args:
        docx_path: Path of the ``.docx`` file to read.

    Returns:
        The collected chunks joined with newline characters, or an empty
        string when the document has no non-blank paragraph or cell text.
    """
    document = Document(docx_path)
    chunks: list[str] = []

    for paragraph in document.paragraphs:
        text = paragraph.text.strip()
        if text:
            chunks.append(text)

    # XML cell elements already emitted.  Keeping the elements themselves in
    # the set (rather than their ids) keeps them alive, so identity-based
    # membership stays reliable for the whole traversal.
    seen_cells = set()
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                # `_tc` is the underlying <w:tc> element; horizontally and
                # vertically merged grid positions all share the same one.
                # NOTE(review): private python-docx attribute -- re-check on
                # major library upgrades.
                element = cell._tc
                if element in seen_cells:
                    continue
                seen_cells.add(element)
                text = cell.text.strip()
                if text:
                    chunks.append(text)

    return "\n".join(chunks)
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point for the mechanical DOCX text inspection.

    Reads the required ``--docx`` and ``--out`` options, creates the output
    directory if needed, extracts the document text, and writes three files
    into the output directory:

    * ``docx_text_inspection.json`` -- the statistics report,
    * ``docx_text_dump.txt`` -- the extracted text,
    * ``docx_text_inspection.md`` -- a short Markdown summary of the report.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--docx", "--out"):
        cli.add_argument(flag, required=True)
    options = cli.parse_args()

    source = Path(options.docx).resolve()
    target_dir = Path(options.out).resolve()
    target_dir.mkdir(parents=True, exist_ok=True)

    text = extract_docx_text(source)

    # Basic statistics; only lines with non-whitespace content are counted.
    char_total = len(text)
    nonblank_total = sum(1 for row in text.splitlines() if row.strip())
    report = {
        "docx": str(source),
        "character_count": char_total,
        "line_count": nonblank_total,
    }

    write_json(target_dir / "docx_text_inspection.json", report)
    write_text(target_dir / "docx_text_dump.txt", text)

    summary_lines = [
        f"# {source.name} 机械文本检查",
        "",
        f"- 文本字符数:{char_total}",
        f"- 非空行数:{nonblank_total}",
        "",
        "说明:本脚本只做 DOCX 文本提取与基础统计,不负责一致性、合规性或投标判断。",
    ]
    write_text(target_dir / "docx_text_inspection.md", "\n".join(summary_lines))
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the CLI only when this file is executed as a script, so importing
    # the module elsewhere has no side effects.
    main()
|