172 lines
5.2 KiB
Python
172 lines
5.2 KiB
Python
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import re
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
from docx import Document
|
||
from docx.document import Document as DocxDocument
|
||
from docx.table import Table
|
||
from docx.text.paragraph import Paragraph
|
||
|
||
from common import write_json
|
||
|
||
|
||
# Chapter-style headings such as "第一章 绪论" or "第3节 方法".
# The ordinal accepts Chinese numerals (including 千 / 〇 / 两) as well as
# Arabic digits, because "第1章" is as common as "第一章" in real documents.
# Group 1 is the optional title text that follows the chapter marker.
CHAPTER_RE = re.compile(r"^第[一二三四五六七八九十百千零〇两\d]+[章节篇部]\s*(.+)?$")

# Dotted numeric headings such as "1.2 Background", "3、目标" or "2）方法".
# `number` is the dotted numeral (up to nine components); `title` is the rest.
# The separator class covers both ASCII and full-width punctuation; the
# full-width full stop (U+FF0E) and right parenthesis (U+FF09) are written as
# escapes so they survive tools that normalize full-width characters.
NUMBERED_RE = re.compile(
    r"^(?P<number>\d+(?:\.\d+){0,8})[.\uFF0E、\s)\uFF09]*(?P<title>.+)$"
)
|
||
|
||
|
||
def normalize_text(value: str) -> str:
    """Collapse every whitespace run in *value* to one space and trim the ends.

    A falsy *value* (for example ``None`` or ``""``) yields an empty string.
    """
    raw = value if value else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()
|
||
|
||
|
||
def table_to_rows(table: Table) -> list[list[str]]:
    """Return the normalized cell texts of *table*, one list per row.

    Each cell's text is passed through ``normalize_text``. Rows in which every
    cell is empty after normalization are left out.
    """
    normalized_rows = (
        [normalize_text(cell.text) for cell in row.cells]
        for row in table.rows
    )
    return [cells for cells in normalized_rows if any(cells)]
|
||
|
||
|
||
def is_heading_style(style_name: str) -> bool:
    """Tell whether *style_name* looks like a heading style.

    The name is normalized, stripped of spaces and lower-cased; it counts as a
    heading style when the result begins with ``heading`` or ``标题``.
    """
    collapsed = normalize_text(style_name)
    key = collapsed.replace(" ", "").lower()
    return key.startswith(("heading", "标题"))
|
||
|
||
|
||
# Unstyled paragraphs longer than this are treated as body text even when they
# start with a number.
_MAX_UNSTYLED_HEADING_LENGTH = 120


def _numbered_heading(match: re.Match[str]) -> dict[str, Any]:
    """Build the heading record for a successful ``NUMBERED_RE`` match."""
    number = match.group("number")
    return {
        # "1" -> level 2, "1.1" -> level 3, ...; level 1 is kept for chapters.
        "level": len(number.split(".")) + 1,
        "number": number,
        "title": normalize_text(match.group("title")),
        "kind": "numbered",
    }


def parse_heading(text: str, style_name: str) -> dict[str, Any] | None:
    """Classify a paragraph as a heading and describe it.

    Args:
        text: Raw paragraph text; it is normalized before matching.
        style_name: Name of the paragraph style, used to detect heading styles.

    Returns:
        ``None`` when the paragraph is empty or not recognized as a heading;
        otherwise a dict with the keys ``level``, ``number``, ``title`` and
        ``kind``, where ``kind`` is one of:

        * ``"chapter"`` - the text matches ``CHAPTER_RE`` (level 1; ``title``
          is the full normalized text, ``number`` is the chapter marker).
        * ``"numbered"`` - the text matches ``NUMBERED_RE`` and either the
          style is a heading style or the text is at most 120 characters.
        * ``"styled"`` - the style is a heading style but the text carries no
          number (level 2, empty ``number``).
    """
    source = normalize_text(text)
    if not source:
        return None

    chapter_match = CHAPTER_RE.match(source)
    if chapter_match:
        # Cut the marker at the start of the title group rather than at the
        # first space, so "第一章绪论" (no separator) still yields "第一章".
        # start(1) is -1 when the optional title group did not take part.
        title_start = chapter_match.start(1)
        number = source[:title_start].strip() if title_start > 0 else source
        return {
            "level": 1,
            "number": number,
            "title": source,
            "kind": "chapter",
        }

    numbered_match = NUMBERED_RE.match(source)

    if is_heading_style(style_name):
        if numbered_match:
            return _numbered_heading(numbered_match)
        return {
            "level": 2,
            "number": "",
            "title": source,
            "kind": "styled",
        }

    # Without a heading style, only short numbered lines count as headings.
    if numbered_match and len(source) <= _MAX_UNSTYLED_HEADING_LENGTH:
        return _numbered_heading(numbered_match)
    return None
|
||
|
||
|
||
def iter_blocks(document: DocxDocument) -> list[dict[str, Any]]:
    """Walk the document body in order and describe its paragraphs and tables.

    Paragraphs whose normalized text is empty and tables with no non-empty row
    are skipped and do not consume an id. Paragraph records carry ``id``
    (``p-<n>``), ``kind``, ``text``, ``style`` and ``heading``; table records
    carry ``id`` (``t-<n>``), ``kind``, ``text`` and ``rows``.
    """
    collected: list[dict[str, Any]] = []
    next_paragraph = 0
    next_table = 0

    for element in document.element.body.iterchildren():
        tag = element.tag

        if tag.endswith("}p"):
            para = Paragraph(element, document)
            content = normalize_text(para.text)
            if content:
                style = para.style
                # Paragraphs without a style object are reported as "Normal".
                style_name = style.name if style else "Normal"
                record: dict[str, Any] = {
                    "id": f"p-{next_paragraph}",
                    "kind": "paragraph",
                    "text": content,
                    "style": style_name,
                    "heading": parse_heading(content, style_name),
                }
                collected.append(record)
                next_paragraph += 1

        elif tag.endswith("}tbl"):
            table_rows = table_to_rows(Table(element, document))
            if table_rows:
                # Flat text form: cells joined by " | ", rows by newlines.
                flat_lines = [" | ".join(cells) for cells in table_rows]
                collected.append(
                    {
                        "id": f"t-{next_table}",
                        "kind": "table",
                        "text": "\n".join(flat_lines),
                        "rows": table_rows,
                    }
                )
                next_table += 1

    return collected
|
||
|
||
|
||
def extract_images(document: DocxDocument) -> list[dict[str, Any]]:
    """List the image relationships of the document's main part.

    A relationship is classified by its relationship type when the object
    exposes a non-empty ``reltype``: it is an image when that type ends with
    ``/image``. Looking only for the substring ``"image"`` in ``target_ref``
    would also count external hyperlinks whose URL contains "image" and would
    miss image parts stored under other names. Relationship objects without a
    ``reltype`` fall back to that substring check.

    Args:
        document: Object whose ``part.rels`` mapping holds the relationships
            (a python-docx document in normal use).

    Returns:
        One dict per image, in relationship order, with the keys ``id``
        (``img-<n>``) and ``target_ref``.
    """
    images: list[dict[str, Any]] = []
    for rel in document.part.rels.values():
        target_ref = getattr(rel, "target_ref", "")
        reltype = getattr(rel, "reltype", "") or ""
        if reltype:
            is_image = reltype.endswith("/image")
        else:
            # No type information available: keep the name-based heuristic.
            is_image = "image" in target_ref
        if not is_image:
            continue
        images.append(
            {
                "id": f"img-{len(images)}",
                "target_ref": target_ref,
            }
        )
    return images
|
||
|
||
|
||
def build_document_graph(docx_path: Path) -> dict[str, Any]:
    """Load a .docx file and summarize its structure as a plain dict.

    Args:
        docx_path: Location of the document to open.

    Returns:
        A dict with the keys ``source_docx`` (the path as a string),
        ``blocks`` (output of ``iter_blocks``), ``headings`` (each block's
        heading record extended with ``block_id``), ``tables`` (id, rows and
        dimensions of each table block), ``images`` (output of
        ``extract_images``) and ``summary`` (counts of the above).
    """
    document = Document(docx_path)
    blocks = iter_blocks(document)
    # Extract once and reuse for both the list and the count, instead of
    # walking the relationships twice.
    images = extract_images(document)

    headings = [
        block["heading"] | {"block_id": block["id"]}
        for block in blocks
        if block.get("heading")
    ]
    tables = [
        {
            "id": block["id"],
            "rows": block["rows"],
            "row_count": len(block["rows"]),
            # The widest row defines the column count; 0 for an empty table.
            "column_count": max((len(row) for row in block["rows"]), default=0),
        }
        for block in blocks
        if block["kind"] == "table"
    ]
    paragraph_count = sum(1 for block in blocks if block["kind"] == "paragraph")

    return {
        "source_docx": str(docx_path),
        "blocks": blocks,
        "headings": headings,
        "tables": tables,
        "images": images,
        "summary": {
            "block_count": len(blocks),
            "paragraph_count": paragraph_count,
            "table_count": len(tables),
            "image_count": len(images),
        },
    }
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point.

    Reads the required ``--docx`` and ``--out`` options, builds the document
    graph for the resolved input path and hands it to ``write_json`` together
    with the resolved output path.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--docx", "--out"):
        cli.add_argument(flag, required=True)
    options = cli.parse_args()

    graph = build_document_graph(Path(options.docx).resolve())
    destination = Path(options.out).resolve()
    write_json(destination, graph)


# Run the CLI only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|