# Skill-BidCreater/scripts/parse_docx.py

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path
from typing import Any

from docx import Document
from docx.document import Document as DocxDocument
from docx.table import Table
from docx.text.paragraph import Paragraph

from common import write_json


# Chapter-style heading: "第", one or more Chinese numeral characters, one of
# 章/节/篇/部, then optional whitespace and an optional title (group 1).
CHAPTER_RE = re.compile(r"^第[一二三四五六七八九十百零]+[章节篇部]\s*(.+)?$")
# Decimal outline number: an integer with up to eight further dotted components
# ("number"), a possibly empty run of separator characters, and a non-empty
# remainder ("title"). Because the separator run may be empty, text that simply
# starts with a digit (e.g. "2024年报告") also matches.
NUMBERED_RE = re.compile(r"^(?P<number>\d+(?:\.\d+){0,8})[.、．\s）)]*(?P<title>.+)$")


def normalize_text(value: str) -> str:
    """Collapse each whitespace run to one space and trim both ends.

    A falsy ``value`` (empty string or ``None``) yields an empty string.
    """
    pieces = (value or "").split()
    return " ".join(pieces)

def table_to_rows(table: Table) -> list[list[str]]:
    """Return the table's cell texts row by row, skipping blank rows.

    Every cell text is passed through ``normalize_text``; a row is kept only
    when at least one of its normalized cells is non-empty.
    """
    normalized_rows = (
        [normalize_text(cell.text) for cell in row.cells] for row in table.rows
    )
    return [cells for cells in normalized_rows if any(cells)]


def is_heading_style(style_name: str) -> bool:
    """Tell whether a paragraph style name denotes a heading style.

    The name is normalized, stripped of spaces and lower-cased, then tested
    for a leading "heading" or "标题".
    """
    heading_prefixes = ("heading", "标题")
    compact = normalize_text(style_name).replace(" ", "").lower()
    return compact.startswith(heading_prefixes)


def parse_heading(text: str, style_name: str) -> dict[str, Any] | None:
    """Classify a paragraph as a heading and describe it.

    Checks are applied in this order:

    * ``"chapter"``: the text matches ``CHAPTER_RE``. Level is always 1,
      ``title`` is the full normalized text and ``number`` is the chapter
      marker in front of the title.
    * ``"numbered"``: the text matches ``NUMBERED_RE`` and either the
      paragraph uses a heading style or the text is at most 120 characters
      long. Level is the count of dotted number components plus one.
    * ``"styled"``: the paragraph uses a heading style but carries no
      number. Level is 2 and ``number`` is empty.

    Args:
        text: Raw paragraph text; it is normalized before matching.
        style_name: Name of the paragraph style.

    Returns:
        A dict with ``level``, ``number``, ``title`` and ``kind`` keys, or
        ``None`` when the text is empty or is not recognised as a heading.
    """
    source = normalize_text(text)
    if not source:
        return None

    chapter_match = CHAPTER_RE.match(source)
    if chapter_match:
        # Cut the marker where the captured title begins, so it is isolated
        # even when no space separates it from the title
        # ("第一章总则" -> "第一章"). Splitting on the first space returned the
        # whole line in that case. start(1) is -1 when there is no title.
        title_start = chapter_match.start(1)
        number = source if title_start < 0 else source[:title_start].rstrip()
        return {
            "level": 1,
            "number": number,
            "title": source,
            "kind": "chapter",
        }

    styled = is_heading_style(style_name)
    numbered_match = NUMBERED_RE.match(source)
    # Unstyled text only counts as a numbered heading when it is short;
    # a heading style lifts that length limit.
    if numbered_match and (styled or len(source) <= 120):
        number = numbered_match.group("number")
        return {
            "level": len(number.split(".")) + 1,
            "number": number,
            "title": normalize_text(numbered_match.group("title")),
            "kind": "numbered",
        }
    if styled:
        return {
            "level": 2,
            "number": "",
            "title": source,
            "kind": "styled",
        }
    return None


def iter_blocks(document: DocxDocument) -> list[dict[str, Any]]:
    """Collect the non-empty paragraphs and tables of the body, in order.

    Only direct children of the document body are visited. Paragraphs whose
    normalized text is empty and tables without any non-blank row are
    skipped and do not consume an id.

    Returns:
        A list of block dicts. Paragraph blocks carry ``id`` ("p-N"),
        ``kind``, ``text``, ``style`` and ``heading`` (the result of
        ``parse_heading``); table blocks carry ``id`` ("t-N"), ``kind``,
        ``text`` (rows joined with " | " and newlines) and ``rows``.
    """
    collected: list[dict[str, Any]] = []
    paragraphs_seen = 0
    tables_seen = 0
    for element in document.element.body.iterchildren():
        tag = element.tag
        if tag.endswith("}p"):
            para = Paragraph(element, document)
            content = normalize_text(para.text)
            if content:
                # Paragraphs without a resolvable style are labelled "Normal".
                style_label = para.style.name if para.style else "Normal"
                entry = {
                    "id": f"p-{paragraphs_seen}",
                    "kind": "paragraph",
                    "text": content,
                    "style": style_label,
                    "heading": parse_heading(content, style_label),
                }
                collected.append(entry)
                paragraphs_seen += 1
        elif tag.endswith("}tbl"):
            grid = table_to_rows(Table(element, document))
            if grid:
                rendered = "\n".join(" | ".join(cells) for cells in grid)
                entry = {
                    "id": f"t-{tables_seen}",
                    "kind": "table",
                    "text": rendered,
                    "rows": grid,
                }
                collected.append(entry)
                tables_seen += 1
    return collected


def extract_images(document: DocxDocument) -> list[dict[str, Any]]:
    """List the image relationships of the main document part.

    A relationship counts as an image when its relationship type ends in
    ``/image``. Matching on the type rather than on the target text means a
    hyperlink whose URL merely contains "image" is not reported, and an
    image whose target name lacks that word is still found. Relationships
    that expose no type fall back to the looser check for "image" in the
    target reference.

    Args:
        document: Opened document; only ``document.part.rels`` is read.

    Returns:
        One ``{"id": "img-N", "target_ref": ...}`` dict per image
        relationship, numbered from zero in iteration order.
    """
    images: list[dict[str, Any]] = []
    for rel in document.part.rels.values():
        target_ref = getattr(rel, "target_ref", "") or ""
        reltype = getattr(rel, "reltype", "") or ""
        if reltype:
            is_image = reltype.endswith("/image")
        else:
            is_image = "image" in target_ref
        if not is_image:
            continue
        images.append(
            {
                "id": f"img-{len(images)}",
                "target_ref": target_ref,
            }
        )
    return images


def build_document_graph(docx_path: Path) -> dict[str, Any]:
    """Parse a DOCX file into a JSON-friendly document graph.

    Args:
        docx_path: Location of the ``.docx`` file to open.

    Returns:
        A dict with these keys:

        * ``source_docx``: ``docx_path`` as a string.
        * ``blocks``: the output of ``iter_blocks``.
        * ``headings``: the heading dict of every block that has one,
          extended with the owning ``block_id``.
        * ``tables``: id, rows, row count and widest-row column count of
          every table block.
        * ``images``: the output of ``extract_images``.
        * ``summary``: block, paragraph, table and image counts.
    """
    document = Document(docx_path)
    blocks = iter_blocks(document)
    # Extract once and reuse for both the listing and the summary count.
    images = extract_images(document)

    headings = [
        block["heading"] | {"block_id": block["id"]}
        for block in blocks
        if block.get("heading")
    ]
    tables = [
        {
            "id": block["id"],
            "rows": block["rows"],
            "row_count": len(block["rows"]),
            "column_count": max(len(row) for row in block["rows"]) if block["rows"] else 0,
        }
        for block in blocks
        if block["kind"] == "table"
    ]
    paragraph_count = sum(1 for block in blocks if block["kind"] == "paragraph")

    return {
        "source_docx": str(docx_path),
        "blocks": blocks,
        "headings": headings,
        "tables": tables,
        "images": images,
        "summary": {
            "block_count": len(blocks),
            "paragraph_count": paragraph_count,
            "table_count": len(tables),
            "image_count": len(images),
        },
    }


def main() -> None:
    """Command-line entry point.

    Reads the required ``--docx`` and ``--out`` options, builds the document
    graph for the resolved DOCX path and hands it to ``write_json`` together
    with the resolved output path.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--docx", "--out"):
        cli.add_argument(flag, required=True)
    options = cli.parse_args()

    source_path = Path(options.docx).resolve()
    graph = build_document_graph(source_path)
    destination = Path(options.out).resolve()
    write_json(destination, graph)


# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()