Skill-BidCreater/scripts/parse_docx.py
2026-03-09 22:20:38 +08:00

172 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
from typing import Any
from docx import Document
from docx.document import Document as DocxDocument
from docx.table import Table
from docx.text.paragraph import Paragraph
from common import write_json
# Matches Chinese chapter-style headings: "第", one or more Chinese numerals,
# one of 章/节/篇/部, then optional whitespace and an optional title (group 1).
CHAPTER_RE = re.compile(r"^第[一二三四五六七八九十百零]+[章节篇部]\s*(.+)?$")
# Matches dotted decimal numbering ("1", "1.2", ... up to nine components) in
# the named group "number", followed by any run of separator characters and a
# non-empty "title" group.
# NOTE(review): the separator class lists "." twice; one of them may originally
# have been a look-alike (e.g. full-width) character - confirm against the
# repository copy of this file.
NUMBERED_RE = re.compile(r"^(?P<number>\d+(?:\.\d+){0,8})[.、.\s)]*(?P<title>.+)$")
def normalize_text(value: str) -> str:
    """Collapse each run of whitespace into one space and trim both ends.

    A falsy ``value`` (``None`` or an empty string) yields an empty string.
    """
    raw = value if value else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()
def table_to_rows(table: Table) -> list[list[str]]:
    """Return the table's cell texts row by row, leaving out empty rows.

    Every cell's text goes through ``normalize_text``; a row is kept only when
    at least one of its normalized cells is non-empty.
    """
    normalized_rows = (
        [normalize_text(cell.text) for cell in row.cells]
        for row in table.rows
    )
    return [cells for cells in normalized_rows if any(cells)]
def is_heading_style(style_name: str) -> bool:
    """Tell whether a paragraph style name denotes a heading style.

    The name is whitespace-normalized, stripped of spaces and lower-cased, then
    checked for a "heading" or "标题" prefix.
    """
    compact = normalize_text(style_name).replace(" ", "").lower()
    return compact.startswith(("heading", "标题"))
def parse_heading(text: str, style_name: str) -> dict[str, Any] | None:
    """Classify a paragraph as a heading and describe it, or return ``None``.

    Rules, tried in order:

    1. Text matching ``CHAPTER_RE`` is a level-1 ``"chapter"`` heading,
       whatever its style.
    2. Text matching ``NUMBERED_RE`` is a ``"numbered"`` heading whose level is
       the count of dotted components plus one ("1" -> 2, "1.2" -> 3). For a
       paragraph without a heading style this applies only when the text is
       at most 120 characters long.
    3. Any other paragraph with a heading style is a level-2 ``"styled"``
       heading with an empty number.

    Args:
        text: Paragraph text; it is whitespace-normalized before matching.
        style_name: Name of the paragraph's style.

    Returns:
        A dict with the keys ``level``, ``number``, ``title`` and ``kind``, or
        ``None`` when the text is empty or none of the rules apply.
    """
    source = normalize_text(text)
    if not source:
        return None

    chapter_match = CHAPTER_RE.match(source)
    if chapter_match:
        # Cut the chapter label at the point where the title group begins, so
        # "第一章总则" (no separating space) yields "第一章" instead of the
        # whole line. start(1) is -1 when the heading has no title part.
        title_start = chapter_match.start(1)
        label = source if title_start < 0 else source[:title_start].strip()
        return {
            "level": 1,
            "number": label,
            "title": source,
            "kind": "chapter",
        }

    has_heading_style = is_heading_style(style_name)
    numbered_match = NUMBERED_RE.match(source)
    # Unstyled paragraphs must also pass the length limit (presumably to keep
    # long body sentences that start with a number from becoming headings).
    if numbered_match and (has_heading_style or len(source) <= 120):
        number = numbered_match.group("number")
        return {
            "level": len(number.split(".")) + 1,
            "number": number,
            "title": normalize_text(numbered_match.group("title")),
            "kind": "numbered",
        }

    if has_heading_style:
        return {
            "level": 2,
            "number": "",
            "title": source,
            "kind": "styled",
        }
    return None
def iter_blocks(document: DocxDocument) -> list[dict[str, Any]]:
    """Collect the body's paragraphs and tables as block dicts, in body order.

    Only direct children of the document body are visited. A paragraph whose
    normalized text is empty and a table without any non-empty row are skipped
    and do not consume an id. Paragraph ids are ``p-<n>`` and table ids are
    ``t-<n>``, each numbered from 0 independently.

    Paragraph blocks carry ``text``, ``style`` and ``heading`` (the result of
    ``parse_heading``); table blocks carry ``rows`` plus a ``text`` rendering
    with cells joined by " | " and rows joined by newlines.
    """
    collected: list[dict[str, Any]] = []
    paragraphs_seen = 0
    tables_seen = 0

    for element in document.element.body.iterchildren():
        tag = element.tag

        if tag.endswith("}p"):
            para = Paragraph(element, document)
            content = normalize_text(para.text)
            if content:
                # Paragraphs without a style are reported as "Normal".
                style_label = para.style.name if para.style else "Normal"
                collected.append(
                    {
                        "id": f"p-{paragraphs_seen}",
                        "kind": "paragraph",
                        "text": content,
                        "style": style_label,
                        "heading": parse_heading(content, style_label),
                    }
                )
                paragraphs_seen += 1
            continue

        if tag.endswith("}tbl"):
            table_rows = table_to_rows(Table(element, document))
            if table_rows:
                rendered = "\n".join(" | ".join(cells) for cells in table_rows)
                collected.append(
                    {
                        "id": f"t-{tables_seen}",
                        "kind": "table",
                        "text": rendered,
                        "rows": table_rows,
                    }
                )
                tables_seen += 1

    return collected
def extract_images(document: DocxDocument) -> list[dict[str, Any]]:
    """List the image relationships of the document's main part.

    A relationship counts as an image when its relationship type ends with
    "/image". Relationship objects that expose no ``reltype`` fall back to the
    looser check of whether ``target_ref`` contains "image". Checking the type
    first keeps hyperlinks whose URL merely contains "image" out of the result
    and includes image parts whose file name does not contain "image".

    Args:
        document: A loaded document exposing ``part.rels``.

    Returns:
        One dict per image relationship, in relationship order, with the keys
        ``id`` (``img-<n>``, numbered from 0) and ``target_ref``.
    """
    images: list[dict[str, Any]] = []
    for rel in document.part.rels.values():
        # "or ''" guards against attributes that are present but None.
        target_ref = getattr(rel, "target_ref", "") or ""
        reltype = getattr(rel, "reltype", "") or ""
        if reltype:
            is_image = reltype.endswith("/image")
        else:
            is_image = "image" in target_ref
        if not is_image:
            continue
        images.append(
            {
                "id": f"img-{len(images)}",
                "target_ref": target_ref,
            }
        )
    return images
def build_document_graph(docx_path: Path) -> dict[str, Any]:
    """Load a .docx file and return a dict describing its content.

    Args:
        docx_path: Path of the document to load.

    Returns:
        A dict with the keys:

        * ``source_docx`` - ``docx_path`` as a string.
        * ``blocks`` - the result of ``iter_blocks``.
        * ``headings`` - the ``heading`` dict of every block that has one,
          extended with the owning block's id under ``block_id``.
        * ``tables`` - id, rows, row count and widest-row column count of
          every table block.
        * ``images`` - the result of ``extract_images``.
        * ``summary`` - block, paragraph, table and image counts.
    """
    document = Document(docx_path)
    blocks = iter_blocks(document)
    # Extract once and reuse for both the list and the summary count, instead
    # of walking the relationships twice.
    images = extract_images(document)

    headings = [
        block["heading"] | {"block_id": block["id"]}
        for block in blocks
        if block.get("heading")
    ]
    tables = [
        {
            "id": block["id"],
            "rows": block["rows"],
            "row_count": len(block["rows"]),
            "column_count": max((len(row) for row in block["rows"]), default=0),
        }
        for block in blocks
        if block["kind"] == "table"
    ]
    paragraph_count = sum(1 for block in blocks if block["kind"] == "paragraph")

    return {
        "source_docx": str(docx_path),
        "blocks": blocks,
        "headings": headings,
        "tables": tables,
        "images": images,
        "summary": {
            "block_count": len(blocks),
            "paragraph_count": paragraph_count,
            "table_count": len(tables),
            "image_count": len(images),
        },
    }
def main() -> None:
    """Command-line entry point.

    Reads the required ``--docx`` and ``--out`` options, builds the document
    graph for the resolved ``--docx`` path and hands it to ``write_json``
    together with the resolved ``--out`` path.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--docx", "--out"):
        cli.add_argument(flag, required=True)
    options = cli.parse_args()

    document_graph = build_document_graph(Path(options.docx).resolve())
    output_path = Path(options.out).resolve()
    write_json(output_path, document_graph)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()