# docx toolkit: declarative document creation, structural indexing/querying,
# anchor-based patching, and PDF/PNG rendering.
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
from dataclasses import dataclass
|
|
from hashlib import sha1
|
|
from pathlib import Path
|
|
from typing import Any, Iterator
|
|
|
|
from docx import Document
|
|
from docx.document import Document as DocxDocument
|
|
from docx.oxml import OxmlElement
|
|
from docx.table import Table, _Cell
|
|
from docx.text.paragraph import Paragraph
|
|
|
|
try:
|
|
from pdf2image import convert_from_path
|
|
except ImportError: # pragma: no cover
|
|
convert_from_path = None
|
|
|
|
try:
|
|
from docx.oxml.ns import qn
|
|
except ImportError: # pragma: no cover
|
|
qn = None
|
|
|
|
NAMESPACES = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
|
|
TEXT_WINDOW_DEFAULT = 40
|
|
|
|
|
|
@dataclass
class NodeRecord:
    """One indexed node of a .docx document, as produced by _index_document_core.

    Node types include headings, paragraphs, list items, tables, table rows,
    table cells, and image placeholders.
    """

    node_id: str  # sequential id in indexing order, e.g. "n-00001"
    node_type: str  # heading / paragraph / list_item / table / table_row / table_cell / image_placeholder
    text: str  # whitespace-normalized text content
    style_name: str | None  # Word style name, when known
    heading_level: int | None  # outline level for headings; None otherwise
    path: list[str]  # enclosing heading titles, root-first
    ordinal: int  # 1-based position in indexing order
    parent_id: str | None  # node_id of the enclosing heading/table/row, if any
    anchor: str  # stable human-readable locator (see build_anchor)
    container: str  # "document" or "table"
    table_index: int | None = None  # body block index of the owning table
    row_index: int | None = None  # row position within the owning table
    cell_index: int | None = None  # cell position within the owning row
    block_index: int | None = None  # top-level block position in the document body
    xml_path: str | None = None  # coarse structural path, e.g. "table[3]/row[0]"
    has_image: bool = False  # True when the paragraph contains a w:drawing
    object_ref: Any = None  # live python-docx object; deliberately excluded from to_dict

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict (omits the live object_ref)."""
        return {
            "node_id": self.node_id,
            "node_type": self.node_type,
            "text": self.text,
            "style_name": self.style_name,
            "heading_level": self.heading_level,
            "path": self.path,
            "ordinal": self.ordinal,
            "parent_id": self.parent_id,
            "anchor": self.anchor,
            "container": self.container,
            "table_index": self.table_index,
            "row_index": self.row_index,
            "cell_index": self.cell_index,
            "block_index": self.block_index,
            "xml_path": self.xml_path,
            "has_image": self.has_image,
        }
|
|
|
|
|
|
class QueryError(RuntimeError):
    """Raised when a node query, document spec, or patch operation is invalid
    or cannot be satisfied (no match, ambiguous match, unsupported mode/op)."""

    pass
|
|
|
|
|
|
def read_json(path: Path) -> Any:
    """Load JSON from *path*, tolerating a leading UTF-8 BOM."""
    raw = path.read_text(encoding="utf-8-sig")
    return json.loads(raw)
|
|
|
|
|
|
def write_json(path: Path, payload: Any) -> None:
    """Write *payload* as pretty-printed UTF-8 JSON with a trailing newline,
    creating parent directories as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    text = json.dumps(payload, ensure_ascii=False, indent=2) + "\n"
    with path.open("w", encoding="utf-8", newline="\n") as handle:
        handle.write(text)
|
|
|
|
|
|
def heading_level_for_style(style_name: str | None) -> int | None:
    """Extract the outline level from an English ("Heading 2") or Chinese
    ("标题2") heading style name; return None for non-heading styles."""
    if not style_name:
        return None
    # Collapse whitespace first (same contract as normalize_text).
    compact = re.sub(r"\s+", " ", style_name).strip()
    candidates = (
        (r"Heading\s+(\d+)$", re.IGNORECASE),
        (r"标题\s*(\d+)$", 0),
    )
    for pattern, flags in candidates:
        found = re.match(pattern, compact, flags=flags)
        if found:
            return int(found.group(1))
    return None
|
|
|
|
|
|
def normalize_text(value: str) -> str:
    """Collapse runs of whitespace into single spaces and trim both ends.

    None and empty input yield the empty string.
    """
    if not value:
        return ""
    return re.sub(r"\s+", " ", value).strip()
|
|
|
|
|
|
def create_docx_document(spec_data: dict[str, Any]) -> dict[str, Any]:
    """Build a .docx file from a declarative block spec and return a build report.

    spec_data keys: "output_docx" (required path), optional "title", and
    "blocks" — a list of dicts whose "type" is one of heading / paragraph /
    list / table / page_break. Raises QueryError for malformed specs.
    """
    output_docx = Path(spec_data["output_docx"]).resolve()
    blocks = spec_data.get("blocks", [])
    if not isinstance(blocks, list):
        raise QueryError("blocks must be a list")

    output_docx.parent.mkdir(parents=True, exist_ok=True)
    document = Document()
    title = spec_data.get("title")
    if title:
        document.core_properties.title = str(title)

    block_reports: list[dict[str, Any]] = []

    def render_block(block: dict[str, Any], index_path: list[int]) -> None:
        # index_path is the block's position in the (possibly nested) spec,
        # e.g. [2, 0] = first child of the third top-level block.
        if not isinstance(block, dict):
            raise QueryError(f"block {'.'.join(str(part) for part in index_path)} must be an object")
        block_type = block.get("type", "paragraph")
        if block_type == "heading":
            level = int(block.get("level", 1))
            if level < 1 or level > 9:
                raise QueryError(f"block {'.'.join(str(part) for part in index_path)} heading level must be between 1 and 9")
            text = str(block.get("text", ""))
            paragraph = document.add_paragraph(style=f"Heading {level}")
            paragraph.add_run(text)
            block_reports.append({"index": ".".join(str(part) for part in index_path), "type": block_type, "text": summarize_text(text), "level": level})
            children = block.get("children", [])
            if children and not isinstance(children, list):
                raise QueryError(f"block {'.'.join(str(part) for part in index_path)} children must be a list")
            if isinstance(children, list):
                # Only headings may carry children; they render sequentially
                # after the heading (flat document model, no nesting in OOXML).
                for child_index, child in enumerate(children):
                    render_block(child, index_path + [child_index])
            return
        if block_type == "paragraph":
            text = str(block.get("text", ""))
            paragraph = document.add_paragraph()
            style_name = block.get("style")
            if style_name:
                try:
                    paragraph.style = str(style_name)
                except KeyError:
                    # Unknown style name — keep the document default rather than fail.
                    pass
            paragraph.add_run(text)
            block_reports.append({"index": ".".join(str(part) for part in index_path), "type": block_type, "text": summarize_text(text)})
            return
        if block_type == "list":
            items = block.get("items", [])
            if not isinstance(items, list):
                raise QueryError(f"block {'.'.join(str(part) for part in index_path)} items must be a list")
            style_name = str(block.get("style", "List Bullet"))
            # Each list item is its own paragraph with the list style applied.
            for item in items:
                paragraph = document.add_paragraph()
                try:
                    paragraph.style = style_name
                except KeyError:
                    pass
                paragraph.add_run(str(item))
            block_reports.append({"index": ".".join(str(part) for part in index_path), "type": block_type, "item_count": len(items)})
            return
        if block_type == "table":
            rows = block.get("rows", [])
            if not isinstance(rows, list) or not rows or not isinstance(rows[0], list) or not rows[0]:
                raise QueryError(f"block {'.'.join(str(part) for part in index_path)} rows must be a non-empty 2D list")
            # Start with zero rows; every spec row (including the first) is
            # appended below so the column count comes from rows[0].
            table = document.add_table(rows=0, cols=len(rows[0]))
            style_name = block.get("style")
            if style_name:
                try:
                    table.style = str(style_name)
                except KeyError:
                    pass
            for row_values in rows:
                if not isinstance(row_values, list) or len(row_values) != len(rows[0]):
                    raise QueryError(f"block {'.'.join(str(part) for part in index_path)} table rows must have equal column counts")
                row = table.add_row()
                for cell_index, value in enumerate(row_values):
                    row.cells[cell_index].text = str(value)
            block_reports.append({"index": ".".join(str(part) for part in index_path), "type": block_type, "row_count": len(rows), "column_count": len(rows[0])})
            return
        if block_type == "page_break":
            document.add_page_break()
            block_reports.append({"index": ".".join(str(part) for part in index_path), "type": block_type})
            return
        raise QueryError(f"unsupported block type: {block_type}")

    for index, block in enumerate(blocks):
        render_block(block, [index])
    document.save(str(output_docx))
    # Re-open and index the saved file so the report reflects what actually
    # landed on disk, not just what we attempted to write.
    final_index = index_document(output_docx)
    return {
        "status": "ok",
        "output_docx": str(output_docx),
        "block_count": len(blocks),
        "blocks": block_reports,
        "final_summary": final_index["summary"],
    }
|
|
|
|
|
|
def export_outline_artifacts(payload: dict[str, Any]) -> dict[str, Any]:
    """Validate the two outline specs, persist them as JSON, and render each
    to a .docx file, returning all output paths plus the per-file build reports."""
    technical_outline = payload.get("technical_outline")
    business_outline = payload.get("business_outline")
    technical_json = Path(payload["technical_outline_json"]).resolve()
    business_json = Path(payload["business_outline_json"]).resolve()
    technical_docx = Path(payload["technical_docx"]).resolve()
    business_docx = Path(payload["business_docx"]).resolve()

    # Fail fast on malformed outlines before touching the filesystem.
    outline_pairs = (
        ("technical_outline", technical_outline),
        ("business_outline", business_outline),
    )
    for outline_name, outline in outline_pairs:
        if not isinstance(outline, dict):
            raise QueryError(f"{outline_name} must be an object")
        if not isinstance(outline.get("blocks"), list):
            raise QueryError(f"{outline_name}.blocks must be a list")

    write_json(technical_json, technical_outline)
    write_json(business_json, business_outline)

    technical_report = create_docx_document({
        "output_docx": str(technical_docx),
        "title": str(technical_outline.get("title", "技术标目录")),
        "blocks": technical_outline["blocks"],
    })
    business_report = create_docx_document({
        "output_docx": str(business_docx),
        "title": str(business_outline.get("title", "商务及其他目录")),
        "blocks": business_outline["blocks"],
    })

    return {
        "status": "ok",
        "technical_outline_json": str(technical_json),
        "business_outline_json": str(business_json),
        "technical_docx": str(technical_docx),
        "business_docx": str(business_docx),
        "technical_report": technical_report,
        "business_report": business_report,
    }
|
|
|
|
|
|
def slugify_text(value: str, *, limit: int = 32) -> str:
    """Build a lowercase, hyphen-separated slug of at most *limit* characters.

    Word characters and CJK ideographs are kept; everything else becomes a
    hyphen. Returns "empty" when nothing usable remains.
    """
    # Inline whitespace normalization (same contract as normalize_text).
    compact = re.sub(r"\s+", " ", value or "").strip()
    if not compact:
        return "empty"
    hyphenated = re.sub(r"[^\w\u4e00-\u9fff-]+", "-", compact, flags=re.UNICODE)
    collapsed = re.sub(r"-+", "-", hyphenated).strip("-").lower()
    return collapsed[:limit] or "empty"
|
|
|
|
|
|
def summarize_text(value: str, *, limit: int = 80) -> str:
    """Return a whitespace-normalized prefix of *value*, at most *limit* chars."""
    # Inline whitespace normalization (same contract as normalize_text).
    compact = re.sub(r"\s+", " ", value or "").strip()
    return compact[:limit]
|
|
|
|
|
|
def iter_block_items(parent: DocxDocument | _Cell) -> Iterator[Paragraph | Table]:
    """Yield the top-level paragraphs and tables of *parent* in document order.

    Works for both a whole document body and a single table cell; other child
    element types are skipped.
    """
    if isinstance(parent, DocxDocument):
        container = parent.element.body
    else:
        container = parent._tc
    for element in container.iterchildren():
        tag = element.tag
        if tag.endswith("}p"):
            yield Paragraph(element, parent)
        elif tag.endswith("}tbl"):
            yield Table(element, parent)
|
|
|
|
|
|
def paragraph_has_image(paragraph: Paragraph) -> bool:
    """True when the paragraph contains at least one w:drawing (image/shape)."""
    drawings = paragraph._element.xpath(".//w:drawing")
    return len(drawings) > 0
|
|
|
|
|
|
def paragraph_is_list_item(paragraph: Paragraph) -> bool:
    """Heuristically detect list items: a "List ..." style name, or explicit
    numbering properties (w:numPr) on the paragraph."""
    style = paragraph.style
    name = style.name if style else ""
    if name.lower().startswith("list"):
        return True
    properties = paragraph._element.pPr
    if properties is None:
        return False
    return properties.numPr is not None
|
|
|
|
|
|
def build_anchor(path: list[str], node_type: str, text: str, ordinal: int) -> str:
    """Create a stable, human-readable anchor for a node.

    Layout: "<path-slug>:<node-type>:<text-slug>:<ordinal>:<digest>", where the
    digest is a 10-char SHA-1 prefix over path/type/text/ordinal.
    """
    joined_path = "/".join(path)
    seed = f"{joined_path}|{node_type}|{summarize_text(text, limit=32)}|{ordinal}"
    digest = sha1(seed.encode("utf-8")).hexdigest()[:10]
    text_slug = slugify_text(text, limit=24)
    path_slug = slugify_text("-".join(path), limit=24)
    return f"{path_slug}:{node_type}:{text_slug}:{ordinal}:{digest}"
|
|
|
|
|
|
def _index_document_core(document: Document) -> list[NodeRecord]:
    """Walk *document* top to bottom and produce a flat list of NodeRecords.

    Maintains a heading stack so every node carries its enclosing heading path
    and the node_id of its nearest heading as parent_id. Tables expand into
    table -> table_row -> table_cell records; paragraphs with drawings get an
    extra image_placeholder child record.
    """
    nodes: list[NodeRecord] = []
    heading_stack: list[str] = []  # heading titles, root-first
    heading_ids: dict[int, str] = {}  # heading level -> node_id of the open heading
    ordinal = 0

    def current_parent_id() -> str | None:
        # Deepest currently-open heading, if any.
        if not heading_ids:
            return None
        return heading_ids[max(heading_ids)]

    def add_record(
        *,
        node_type: str,
        text: str,
        style_name: str | None,
        heading_level: int | None,
        path: list[str],
        parent_id: str | None,
        container: str,
        object_ref: Any,
        table_index: int | None = None,
        row_index: int | None = None,
        cell_index: int | None = None,
        block_index: int | None = None,
        xml_path: str | None = None,
        has_image: bool = False,
    ) -> NodeRecord:
        # Centralized constructor: assigns the sequential node_id and anchor.
        nonlocal ordinal
        ordinal += 1
        node_id = f"n-{ordinal:05d}"
        record = NodeRecord(
            node_id=node_id,
            node_type=node_type,
            text=normalize_text(text),
            style_name=style_name,
            heading_level=heading_level,
            path=path,
            ordinal=ordinal,
            parent_id=parent_id,
            anchor=build_anchor(path, node_type, text, ordinal),
            container=container,
            table_index=table_index,
            row_index=row_index,
            cell_index=cell_index,
            block_index=block_index,
            xml_path=xml_path,
            has_image=has_image,
            object_ref=object_ref,
        )
        nodes.append(record)
        return record

    for block_index, block in enumerate(iter_block_items(document)):
        if isinstance(block, Paragraph):
            text = normalize_text(block.text)
            style_name = block.style.name if block.style else None
            level = heading_level_for_style(style_name)
            if level is not None:
                # New heading: unwind the stack to this level, then push it.
                while len(heading_stack) >= level:
                    heading_stack.pop()
                heading_stack.append(text or f"Heading {level}")
                # Drop ids of headings at or below this level — they are closed.
                heading_ids = {key: value for key, value in heading_ids.items() if key < level}
                record = add_record(
                    node_type="heading",
                    text=text,
                    style_name=style_name,
                    heading_level=level,
                    path=list(heading_stack),
                    parent_id=heading_ids.get(level - 1),
                    container="document",
                    object_ref=block,
                    block_index=block_index,
                    has_image=paragraph_has_image(block),
                )
                heading_ids[level] = record.node_id
                if record.has_image:
                    # Companion record so images are addressable as nodes.
                    add_record(
                        node_type="image_placeholder",
                        text=text or "[image]",
                        style_name=style_name,
                        heading_level=level,
                        path=list(heading_stack),
                        parent_id=record.node_id,
                        container="document",
                        object_ref=block,
                        block_index=block_index,
                        has_image=True,
                    )
                continue
            record = add_record(
                node_type="list_item" if paragraph_is_list_item(block) else "paragraph",
                text=text,
                style_name=style_name,
                heading_level=None,
                path=list(heading_stack),
                parent_id=current_parent_id(),
                container="document",
                object_ref=block,
                block_index=block_index,
                has_image=paragraph_has_image(block),
            )
            if record.has_image:
                add_record(
                    node_type="image_placeholder",
                    text=text or "[image]",
                    style_name=style_name,
                    heading_level=None,
                    path=list(heading_stack),
                    parent_id=record.node_id,
                    container="document",
                    object_ref=block,
                    block_index=block_index,
                    has_image=True,
                )
        else:
            # Table block: one record for the table, then one per row and cell.
            table_text = "\n".join(
                " | ".join(normalize_text(cell.text) for cell in row.cells)
                for row in block.rows
            )
            table_record = add_record(
                node_type="table",
                text=table_text,
                style_name=block.style.name if block.style else None,
                heading_level=None,
                path=list(heading_stack),
                parent_id=current_parent_id(),
                container="document",
                object_ref=block,
                block_index=block_index,
                xml_path=f"table[{block_index}]",
            )
            for row_index, row in enumerate(block.rows):
                row_text = " | ".join(normalize_text(cell.text) for cell in row.cells)
                row_record = add_record(
                    node_type="table_row",
                    text=row_text,
                    style_name=table_record.style_name,
                    heading_level=None,
                    path=list(heading_stack),
                    parent_id=table_record.node_id,
                    container="table",
                    object_ref=row,
                    table_index=block_index,
                    row_index=row_index,
                    xml_path=f"table[{block_index}]/row[{row_index}]",
                )
                for cell_index, cell in enumerate(row.cells):
                    add_record(
                        node_type="table_cell",
                        text="\n".join(
                            normalize_text(paragraph.text)
                            for paragraph in cell.paragraphs
                            if normalize_text(paragraph.text)
                        ),
                        style_name=None,
                        heading_level=None,
                        path=list(heading_stack),
                        parent_id=row_record.node_id,
                        container="table",
                        object_ref=cell,
                        table_index=block_index,
                        row_index=row_index,
                        cell_index=cell_index,
                        xml_path=f"table[{block_index}]/row[{row_index}]/cell[{cell_index}]",
                    )
    return nodes
|
|
|
|
|
|
def index_document(docx_path: Path) -> dict[str, Any]:
    """Open *docx_path* and return a JSON-serializable node index with a
    per-node-type summary."""
    document = Document(str(docx_path))
    records = _index_document_core(document)
    # Single counting pass instead of one scan per node type.
    type_counts: dict[str, int] = {}
    for record in records:
        type_counts[record.node_type] = type_counts.get(record.node_type, 0) + 1
    return {
        "status": "ok",
        "docx": str(docx_path),
        "summary": {
            "node_count": len(records),
            "heading_count": type_counts.get("heading", 0),
            "paragraph_count": type_counts.get("paragraph", 0),
            "list_item_count": type_counts.get("list_item", 0),
            "table_count": type_counts.get("table", 0),
            "image_placeholder_count": type_counts.get("image_placeholder", 0),
        },
        "nodes": [record.to_dict() for record in records],
    }
|
|
|
|
|
|
def query_nodes(index_data: dict[str, Any], query: dict[str, Any]) -> dict[str, Any]:
    """Select nodes from an index produced by index_document / build_live_index.

    query keys: "match_mode" (default "contains_text") plus "value", optional
    structural filters ("node_type", "style_name", "heading_level"), and
    selection controls ("allow_multiple", "occurrence", "context_window").

    Returns a report with matches, ambiguity flag, and best_match. Raises
    QueryError for an unknown match_mode or a missing value.
    """
    known_modes = {
        "exact_text",
        "contains_text",
        "regex",
        "heading_path",
        "heading_text",
        "table_title",
        "style_name",
        "node_type",
        "anchor",
        "node_id",
    }
    nodes = index_data.get("nodes", [])
    mode = query.get("match_mode", "contains_text")
    # Validate the mode up-front. Previously an unknown mode only raised when
    # at least one node survived the structural filters, so a bad query against
    # an empty (or fully filtered) index silently returned zero matches.
    if mode not in known_modes:
        raise QueryError(f"unsupported match_mode: {mode}")
    value = query.get("value")
    if value is None and mode not in {"node_type"}:
        raise QueryError("query.value is required")
    node_type_filter = query.get("node_type")
    style_name_filter = query.get("style_name")
    heading_level = query.get("heading_level")
    allow_multiple = bool(query.get("allow_multiple", False))
    occurrence = query.get("occurrence")
    raw_window = query.get("context_window")
    window = TEXT_WINDOW_DEFAULT if raw_window is None else int(raw_window)

    def node_matches(node: dict[str, Any]) -> bool:
        # Structural filters apply in every mode.
        if node_type_filter and node.get("node_type") != node_type_filter:
            return False
        if style_name_filter and node.get("style_name") != style_name_filter:
            return False
        if heading_level is not None and node.get("heading_level") != heading_level:
            return False
        node_text = node.get("text", "")
        if mode == "exact_text":
            return node_text == value
        if mode == "contains_text":
            return value in node_text
        if mode == "regex":
            return re.search(value, node_text) is not None
        if mode == "heading_path":
            return node.get("node_type") == "heading" and " > ".join(node.get("path", [])) == value
        if mode == "heading_text":
            return node.get("node_type") == "heading" and node_text == value
        if mode == "table_title":
            path_parts = node.get("path", [])
            return node.get("node_type") == "table" and bool(path_parts) and path_parts[-1] == value
        if mode == "style_name":
            return node.get("style_name") == value
        if mode == "node_type":
            return node.get("node_type") == value
        if mode == "anchor":
            return node.get("anchor") == value
        # Only "node_id" remains — mode membership was validated above.
        return node.get("node_id") == value

    matches = [node for node in nodes if node_matches(node)]
    if occurrence is not None:
        # Pick one match by position; out-of-range occurrence yields no matches.
        matches = [matches[occurrence]] if 0 <= occurrence < len(matches) else []
    ambiguous = len(matches) > 1 and not allow_multiple
    best_match = matches[0] if len(matches) == 1 or (allow_multiple and matches) else None

    def with_context(node: dict[str, Any]) -> dict[str, Any]:
        # NOTE(review): "before"/"after" are the head and tail of the node's own
        # text (not surrounding nodes) — presumably intended as a preview.
        text = node.get("text", "")
        return {
            **node,
            "context": {
                "before": text[:window],
                "after": text[-window:] if text else "",
            },
        }

    return {
        "status": "ok",
        "query": query,
        "match_count": len(matches),
        "ambiguous": ambiguous,
        "best_match": with_context(best_match) if best_match else None,
        "candidate_anchors": [match["anchor"] for match in matches],
        "matches": [with_context(match) for match in matches],
        "errors": ["query matched multiple nodes"] if ambiguous else [],
        "warnings": [],
    }
|
|
|
|
|
|
def find_records(index_data: dict[str, Any], query: dict[str, Any]) -> list[dict[str, Any]]:
    """Run query_nodes and enforce the query's ambiguity/missing policies.

    Raises QueryError when the result is ambiguous (unless on_ambiguous is
    relaxed) or empty (unless on_missing is relaxed).
    """
    result = query_nodes(index_data, query)
    ambiguity_policy = query.get("on_ambiguous", "error")
    if result["ambiguous"] and ambiguity_policy == "error":
        raise QueryError("query matched multiple nodes")
    missing_policy = query.get("on_missing", "error")
    if result["match_count"] == 0 and missing_policy == "error":
        raise QueryError("query matched no nodes")
    return result["matches"]
|
|
|
|
|
|
def clone_run_format(source_run: Any, target_run: Any) -> None:
    """Copy character-level formatting from *source_run* onto *target_run*.

    Covers bold/italic/underline, font name and size, explicit RGB color, and
    the East Asian font (w:eastAsia), which python-docx does not expose via the
    high-level Font API.
    """
    target_run.bold = source_run.bold
    target_run.italic = source_run.italic
    target_run.underline = source_run.underline
    target_run.font.name = source_run.font.name
    target_run.font.size = source_run.font.size
    if source_run.font.color and source_run.font.color.rgb:
        target_run.font.color.rgb = source_run.font.color.rgb
    if qn and source_run._element.rPr is not None and source_run._element.rPr.rFonts is not None:
        east_asia = source_run._element.rPr.rFonts.get(qn("w:eastAsia"))
        if east_asia:
            # A freshly created rPr has no rFonts child, so `.rFonts` would be
            # None and the previous `.rFonts.set(...)` raised AttributeError on
            # brand-new runs; get_or_add_rFonts() creates it on demand.
            target_run._element.get_or_add_rPr().get_or_add_rFonts().set(qn("w:eastAsia"), east_asia)
|
|
|
|
|
|
def clear_paragraph(paragraph: Paragraph) -> None:
    """Remove all run and hyperlink children of *paragraph*, preserving its
    paragraph properties (pPr) and therefore its style/formatting."""
    element = paragraph._element
    removable = [
        child
        for child in element
        if child.tag.endswith("}r") or child.tag.endswith("}hyperlink")
    ]
    for child in removable:
        element.remove(child)
|
|
|
|
|
|
def replace_text_in_paragraph(paragraph: Paragraph, old_text: str, new_text: str) -> bool:
    """Replace the first occurrence of *old_text* in *paragraph* with *new_text*.

    Returns True when a replacement was made. Prefers an in-place edit of a
    single run (preserving all run formatting). When the match spans run
    boundaries, rebuilds the paragraph as one run carrying the first run's
    formatting.
    """
    full_text = paragraph.text
    if old_text not in full_text:
        return False
    for run in paragraph.runs:
        if old_text in run.text:
            run.text = run.text.replace(old_text, new_text, 1)
            return True
    # The match spans multiple runs. Rebuild from the paragraph's full text
    # with the replacement applied, so surrounding text survives (the previous
    # version dropped everything except new_text), and clone the first run's
    # formatting onto the single rebuilt run.
    existing_runs = list(paragraph.runs)
    first_run = existing_runs[0] if existing_runs else None
    clear_paragraph(paragraph)
    new_run = paragraph.add_run(full_text.replace(old_text, new_text, 1))
    if first_run is not None:
        clone_run_format(first_run, new_run)
    return True
|
|
|
|
|
|
def delete_block(block: Paragraph | Table) -> None:
    """Detach *block*'s XML element from its parent; no-op if already detached."""
    element = block._element
    container = element.getparent()
    if container is None:
        return
    container.remove(element)
|
|
|
|
|
|
def insert_paragraph_relative(target: Paragraph | Table, *, after: bool, style_name: str | None = None) -> Paragraph:
    """Insert a new empty paragraph immediately before or after *target* and
    return it, optionally applying *style_name* (ignored when unknown)."""
    element = OxmlElement("w:p")
    anchor_element = target._element
    if after:
        anchor_element.addnext(element)
    else:
        anchor_element.addprevious(element)
    paragraph = Paragraph(element, target._parent)
    if style_name:
        try:
            paragraph.style = style_name
        except KeyError:
            # Style not in this document's style table — keep the default.
            pass
    return paragraph
|
|
|
|
|
|
def append_paragraph_contents(paragraph: Paragraph, text: str, source: Paragraph | None = None) -> None:
    """Append *text* as a run on *paragraph*, mirroring *source*'s paragraph
    formatting and first-run character formatting when a source is given."""
    if source is not None and source.style is not None:
        paragraph.style = source.style
        target_format = paragraph.paragraph_format
        source_format = source.paragraph_format
        for attribute in (
            "left_indent",
            "right_indent",
            "first_line_indent",
            "space_before",
            "space_after",
            "line_spacing",
        ):
            setattr(target_format, attribute, getattr(source_format, attribute))
        paragraph.alignment = source.alignment
    run = paragraph.add_run(text)
    if source is not None and source.runs:
        clone_run_format(source.runs[0], run)
|
|
|
|
|
|
def create_table_after(target: Paragraph | Table, rows: list[list[str]], style_name: str | None = None) -> Table:
    """Create a table filled from *rows* and move it to sit directly after *target*.

    The table is first appended at the end of target's parent story, then its
    XML element is relocated with addnext(). NOTE(review): when the parent is a
    table cell, _Cell.add_table also requires a width argument — presumably
    targets here are body-level blocks; confirm against callers.
    """
    parent = target._parent
    # Column count comes from the first row; defaults to 1 for empty input.
    cols = len(rows[0]) if rows else 1
    table = parent.add_table(rows=0, cols=cols)
    if style_name:
        try:
            table.style = style_name
        except KeyError:
            # Unknown style name — keep the document's default table style.
            pass
    for row_values in rows:
        row = table.add_row()
        for index, value in enumerate(row_values):
            row.cells[index].text = value
    # Relocate the finished table from the end of the story to just after target.
    target._element.addnext(table._element)
    return table
|
|
|
|
|
|
def build_live_index(document: Document) -> tuple[dict[str, Any], dict[str, NodeRecord]]:
    """Index *document* in memory and return (query payload, anchor -> record map).

    The payload is queryable via query_nodes; the map resolves a matched
    anchor back to the live NodeRecord (with its python-docx object_ref).
    """
    records = _index_document_core(document)
    payload = {
        "status": "ok",
        "summary": {"node_count": len(records)},
        "nodes": [record.to_dict() for record in records],
    }
    anchor_map = {record.anchor: record for record in records}
    return payload, anchor_map
|
|
|
|
|
|
def insert_blocks(record: NodeRecord, operation: dict[str, Any], *, after: bool) -> None:
    """Insert new content adjacent to *record*'s block.

    operation["content_type"] selects the renderer: "paragraphs" (default),
    "heading", "list", or "table"; operation["content"] carries the payload.
    *after* controls the side of the first inserted block; subsequent blocks
    always chain after the previous insertion to preserve order.
    """
    content_type = operation.get("content_type", "paragraphs")
    content = operation.get("content")
    if record.node_type not in {"paragraph", "list_item", "heading", "table"}:
        raise QueryError("insert operations only support block nodes")
    target = record.object_ref
    if content_type == "paragraphs":
        paragraphs = content if isinstance(content, list) else [str(content)]
        previous: Paragraph | Table = target
        for index, paragraph_text in enumerate(paragraphs):
            new_paragraph = insert_paragraph_relative(
                previous,
                # Only the first paragraph honors *after*; the rest follow it.
                after=after if index == 0 else True,
                style_name=record.style_name if record.node_type in {"paragraph", "list_item"} else "Normal",
            )
            # Copy formatting from the target only when it is paragraph-like.
            source_paragraph = target if isinstance(target, Paragraph) and record.node_type in {"paragraph", "list_item"} else None
            append_paragraph_contents(new_paragraph, str(paragraph_text), source=source_paragraph)
            previous = new_paragraph
        return
    if content_type == "heading":
        payload = content if isinstance(content, dict) else {"text": str(content)}
        level = int(payload.get("level", record.heading_level or 1))
        new_paragraph = insert_paragraph_relative(target, after=after, style_name=f"Heading {level}")
        append_paragraph_contents(new_paragraph, str(payload.get("text", "")), source=target if isinstance(target, Paragraph) else None)
        try:
            # Re-assert the heading style: append_paragraph_contents may have
            # overwritten it with the source paragraph's style.
            new_paragraph.style = f"Heading {level}"
        except KeyError:
            pass
        return
    if content_type == "list":
        items = content if isinstance(content, list) else []
        previous: Paragraph | Table = target
        for index, item in enumerate(items):
            new_paragraph = insert_paragraph_relative(
                previous,
                after=after if index == 0 else True,
                style_name="List Bullet",
            )
            source_paragraph = target if isinstance(target, Paragraph) and record.node_type == "list_item" else None
            append_paragraph_contents(new_paragraph, str(item), source=source_paragraph)
            try:
                # Re-assert the list style for the same reason as headings above.
                new_paragraph.style = "List Bullet"
            except KeyError:
                pass
            previous = new_paragraph
        return
    if content_type == "table":
        rows = content.get("rows") if isinstance(content, dict) else content
        if not isinstance(rows, list) or not rows:
            raise QueryError("table content must provide rows")
        style_name = None
        if isinstance(target, Table) and target.style is not None:
            style_name = target.style.name
        # NOTE: tables are always placed after the target regardless of *after*.
        create_table_after(target, rows, style_name=style_name)
        return
    raise QueryError(f"unsupported content_type: {content_type}")
|
|
|
|
|
|
def replace_block(record: NodeRecord, operation: dict[str, Any]) -> None:
    """Swap *record*'s block for freshly rendered content: insert the
    replacement just before the original, then delete the original."""
    original = record.object_ref
    insert_blocks(record, operation, after=False)
    delete_block(original)
|
|
|
|
|
|
def apply_patch_document(patch_data: dict[str, Any]) -> dict[str, Any]:
    """Apply a sequence of targeted edit operations to a .docx file.

    patch_data keys: "source_docx", optional "output_docx" (defaults to the
    source), "in_place" flag, and "operations" — each with a "target" query,
    an "op" (replace_text / delete_node / insert_before / insert_after /
    replace_node) plus op-specific fields. Returns a per-operation report and
    the final document summary. Raises QueryError on the first failed op.
    """
    source_docx = Path(patch_data["source_docx"]).resolve()
    output_docx = Path(patch_data.get("output_docx", source_docx)).resolve()
    in_place = bool(patch_data.get("in_place", False))
    if not in_place and output_docx == source_docx:
        raise QueryError("output_docx must differ from source_docx unless in_place is true")
    output_docx.parent.mkdir(parents=True, exist_ok=True)
    if not in_place:
        # Work on a copy so the source file is never modified.
        shutil.copy2(source_docx, output_docx)
    document = Document(str(output_docx))
    operations = patch_data.get("operations", [])
    operation_reports: list[dict[str, Any]] = []

    for index, operation in enumerate(operations):
        # Re-index before every operation: earlier edits shift node ids,
        # ordinals, and anchors.
        live_index, record_map = build_live_index(document)
        matches = find_records(live_index, operation.get("target", {}))
        if len(matches) > 1 and operation.get("on_ambiguous", "error") == "error":
            raise QueryError(f"operation {index} matched multiple nodes")
        selected = matches if operation.get("allow_multiple") else matches[:1]
        if not selected and operation.get("on_missing", "error") == "error":
            raise QueryError(f"operation {index} matched no nodes")
        affected: list[dict[str, Any]] = []
        for match in selected:
            # Resolve the query hit back to the live record (with object_ref).
            record = record_map[match["anchor"]]
            before_summary = summarize_text(record.text)
            op_name = operation["op"]
            if op_name == "replace_text":
                old_text = operation["old_text"]
                new_text = operation["new_text"]
                if record.node_type not in {"paragraph", "list_item", "heading"}:
                    raise QueryError("replace_text only supports paragraph-like nodes")
                if not replace_text_in_paragraph(record.object_ref, old_text, new_text):
                    raise QueryError(f"text not found in node {record.anchor}")
            elif op_name == "delete_node":
                if record.node_type not in {"paragraph", "list_item", "heading", "table"}:
                    raise QueryError("delete_node only supports block nodes")
                delete_block(record.object_ref)
            elif op_name == "insert_before":
                insert_blocks(record, operation, after=False)
            elif op_name == "insert_after":
                insert_blocks(record, operation, after=True)
            elif op_name == "replace_node":
                replace_block(record, operation)
            else:
                raise QueryError(f"unsupported op: {op_name}")
            affected.append(
                {
                    "anchor": record.anchor,
                    "node_type": record.node_type,
                    "before": before_summary,
                    "op": op_name,
                }
            )
        # Persist after each operation so a later failure leaves the completed
        # edits on disk.
        document.save(str(output_docx))
        operation_reports.append(
            {
                "index": index,
                "op": operation["op"],
                "match_count": len(selected),
                "affected": affected,
            }
        )

    document.save(str(output_docx))
    # Re-open and index the result to report what is actually on disk.
    final_index = index_document(output_docx)
    return {
        "status": "ok",
        "source_docx": str(source_docx),
        "output_docx": str(output_docx),
        "in_place": in_place,
        "operation_count": len(operations),
        "operations": operation_reports,
        "errors": [],
        "warnings": [],
        "final_summary": final_index["summary"],
    }
|
|
|
|
|
|
def render_docx(docx_path: Path, out_dir: Path) -> dict[str, Any]:
    """Render *docx_path* to a PDF (via LibreOffice) and per-page PNGs (via pdf2image).

    Degrades gracefully: status "render_skipped" when soffice is not on PATH,
    status "error" when the PDF conversion fails, and a warning (not an error)
    when pdf2image is missing or PNG rendering raises.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    # LibreOffice names the output <stem>.pdf inside --outdir.
    pdf_path = out_dir / f"{docx_path.stem}.pdf"
    png_dir = out_dir / "pages"
    png_dir.mkdir(parents=True, exist_ok=True)
    soffice = shutil.which("soffice")
    if not soffice:
        return {
            "status": "render_skipped",
            "docx": str(docx_path),
            "pdf": None,
            "page_count": 0,
            "images": [],
            "errors": [],
            "warnings": ["LibreOffice/soffice not found"],
        }
    process = subprocess.run(
        [soffice, "--headless", "--convert-to", "pdf", "--outdir", str(out_dir), str(docx_path)],
        capture_output=True,
        text=True,
        encoding="utf-8",
    )
    # soffice can exit 0 yet produce nothing, so also check for the file.
    if process.returncode != 0 or not pdf_path.exists():
        return {
            "status": "error",
            "docx": str(docx_path),
            "pdf": str(pdf_path),
            "page_count": 0,
            "images": [],
            "errors": [process.stderr.strip() or "failed to convert docx to pdf"],
            "warnings": [],
        }

    images: list[str] = []
    warnings: list[str] = []
    if convert_from_path is None:
        warnings.append("pdf2image not installed")
    else:
        try:
            for page_number, image in enumerate(convert_from_path(str(pdf_path)), start=1):
                image_path = png_dir / f"page-{page_number:03d}.png"
                image.save(str(image_path), "PNG")
                images.append(str(image_path))
        except Exception as exc:  # pragma: no cover
            # pdf2image needs poppler at runtime; treat any failure as a soft skip.
            warnings.append(f"PNG render skipped: {exc}")

    return {
        "status": "ok",
        "docx": str(docx_path),
        "pdf": str(pdf_path),
        "page_count": len(images),
        "images": images,
        "errors": [],
        "warnings": warnings,
    }
|