Word 填充后新增 Markdown 清理节点,修复漏掉标题的 bug

This commit is contained in:
sladro 2025-12-29 18:21:57 +08:00
parent 91a2050b9f
commit 4a86081ade
2 changed files with 22 additions and 1 deletions

8
check_headings.py Normal file
View File

@ -0,0 +1,8 @@
from docx import Document

# Diagnostic script: print every non-empty paragraph whose style name starts
# with 'Heading', together with its paragraph index and style name.
document = Document('mianyang/mianyang.docx')
for i, paragraph in enumerate(document.paragraphs):
    # Fall back to '' when the style exposes no usable name.
    style_name = getattr(paragraph.style, 'name', '') or ''
    if not style_name.startswith('Heading'):
        continue
    text = paragraph.text.strip()
    if not text:
        # Heading-styled but blank paragraphs are not reported.
        continue
    # Only the first 80 characters of the heading are shown, repr-quoted.
    print(f'[{i}] {style_name}: {repr(text[:80])}')

View File

@ -116,6 +116,7 @@ class InitConfigNode(BaseNode):
chapters: List[Dict[str, Any]] = []
parent_stack: List[Dict[str, Any]] = []
level_counters: defaultdict[int, int] = defaultdict(int)
seen_ids: Dict[str, int] = {} # 记录已使用的ID及其出现次数
for para in doc.paragraphs:
style_name = getattr(para.style, 'name', '') or ''
@ -153,7 +154,19 @@ class InitConfigNode(BaseNode):
segment = self._format_unlabeled_segment(level_counters[level])
id_path = [*parent_path, segment] if parent_path else [segment]
chapter_id = f"chapter_{'_'.join(id_path)}"
base_chapter_id = f"chapter_{'_'.join(id_path)}"
# 检测重复ID并添加后缀
if base_chapter_id in seen_ids:
seen_ids[base_chapter_id] += 1
chapter_id = f"{base_chapter_id}_{seen_ids[base_chapter_id]}"
logger.warning(
"检测到重复章节ID '%s',自动重命名为 '%s' (标题: %s)",
base_chapter_id, chapter_id, title
)
else:
seen_ids[base_chapter_id] = 1
chapter_id = base_chapter_id
chapter_info = {
"id": chapter_id,