Skill-BidCreater/scripts/search_docx_json.py
2026-03-09 22:20:38 +08:00

77 lines
2.4 KiB
Python

from __future__ import annotations

import argparse
import json
import re
from itertools import islice
from pathlib import Path
from typing import Any

from common import read_json, write_json
def compile_patterns(contains: list[str], regexes: list[str]) -> tuple[list[str], list[re.Pattern[str]]]:
    """Compile the regex filters for a search, case-insensitively.

    Args:
        contains: Substring terms. Returned unchanged (the same list object)
            so the caller can unpack both filter sets in one call.
        regexes: Regular-expression source strings; each one is compiled with
            ``re.IGNORECASE``.

    Returns:
        A ``(contains, compiled)`` tuple where ``compiled`` holds one compiled
        pattern per entry of ``regexes``, in the same order.

    Raises:
        re.error: If any entry of ``regexes`` is not a valid pattern. The
            message names the offending pattern.
    """
    compiled: list[re.Pattern[str]] = []
    for pattern in regexes:
        try:
            compiled.append(re.compile(pattern, re.IGNORECASE))
        except re.error as exc:
            # Keep the exception type callers may already catch, but name the
            # pattern: the stock message omits it, which is unhelpful when
            # several patterns were supplied.
            raise re.error(
                f"invalid regex {pattern!r}: {exc.msg}", exc.pattern, exc.pos
            ) from exc
    return contains, compiled
def match_block(
    block: dict[str, Any],
    contains: list[str],
    regexes: list[re.Pattern[str]],
    kinds: set[str],
    heading_only: bool,
    block_ids: set[str],
) -> bool:
    """Return whether ``block`` passes every active filter.

    A filter that is empty (or ``False`` for ``heading_only``) is inactive and
    never rejects a block; all active filters must pass.

    Args:
        block: Block mapping; the keys read are ``"kind"``, ``"id"``,
            ``"heading"`` and ``"text"``.
        contains: Substring terms that must all occur in the block text,
            compared after lower-casing both sides.
        regexes: Compiled patterns that must all find a match in the text.
        kinds: Allowed values for ``block["kind"]``.
        heading_only: When true, the block's ``"heading"`` value must be truthy.
        block_ids: Allowed values for ``block["id"]``.

    Returns:
        ``True`` if the block satisfies all active filters, else ``False``.
    """
    if kinds and block.get("kind") not in kinds:
        return False
    if block_ids and block.get("id") not in block_ids:
        return False
    if heading_only and not block.get("heading"):
        return False

    # A present-but-null (or otherwise non-string) "text" value is treated as
    # empty text so the text filters reject the block instead of raising.
    raw_text = block.get("text")
    text = raw_text if isinstance(raw_text, str) else ""

    if contains:
        # Lower-case the text once rather than once per term.
        lowered = text.lower()
        if not all(term.lower() in lowered for term in contains):
            return False
    if regexes and not all(regex.search(text) for regex in regexes):
        return False
    return True
def main() -> None:
    """Search the blocks of a JSON graph file from the command line.

    Options:
        --graph: Path to the JSON file to search (required); its ``"blocks"``
            list supplies the candidate blocks.
        --contains / --regex / --kind / --block-id: Repeatable filters handed
            to ``match_block``.
        --heading-only: Keep only blocks with a truthy ``"heading"`` value.
        --limit: Maximum number of matches to return (default 20); must not be
            negative.
        --out: If given, the result is written there with ``write_json``;
            otherwise it is printed to stdout as indented JSON.

    The result holds the echoed query, the matching blocks and their count.
    Invalid ``--limit`` or ``--regex`` values end the program through
    ``parser.error``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--graph", required=True)
    parser.add_argument("--contains", action="append", default=[])
    parser.add_argument("--regex", action="append", default=[])
    parser.add_argument("--kind", action="append", default=[])
    parser.add_argument("--heading-only", action="store_true")
    parser.add_argument("--block-id", action="append", default=[])
    parser.add_argument("--limit", type=int, default=20)
    parser.add_argument("--out")
    args = parser.parse_args()

    # A negative limit would otherwise slice from the end of the match list
    # (e.g. -1 silently drops the last match) instead of capping the result.
    if args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    # Validate the patterns before touching the filesystem so a typo in a
    # regex is reported as a usage error rather than a traceback.
    try:
        contains, regexes = compile_patterns(args.contains, args.regex)
    except re.error as exc:
        parser.error(f"--regex: {exc}")

    graph = read_json(Path(args.graph).resolve())
    kinds = set(args.kind)
    block_ids = set(args.block_id)

    # Lazily filter and stop as soon as the limit is reached, rather than
    # matching every block and truncating afterwards.
    candidates = (
        block
        for block in graph.get("blocks", [])
        if match_block(block, contains, regexes, kinds, args.heading_only, block_ids)
    )
    matches = list(islice(candidates, args.limit))

    result = {
        "query": {
            "contains": contains,
            "regex": args.regex,
            "kind": args.kind,
            "heading_only": args.heading_only,
            "block_ids": args.block_id,
            "limit": args.limit,
        },
        "matches": matches,
        "count": len(matches),
    }
    if args.out:
        write_json(Path(args.out).resolve(), result)
    else:
        print(json.dumps(result, ensure_ascii=False, indent=2))
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()