# Command-line filter over the "blocks" list of a JSON graph document.
#
# Blocks can be selected by case-insensitive substring (--contains),
# case-insensitive regular expression (--regex), "kind", "id", and by whether
# they carry a truthy "heading" value. Every supplied filter must pass (AND).
from __future__ import annotations

import argparse
import json
import re
from pathlib import Path
from typing import Any

from common import read_json, write_json


def compile_patterns(
    contains: list[str], regexes: list[str]
) -> tuple[list[str], list[re.Pattern[str]]]:
    """Compile the regex filters and pass the substring filters through.

    Args:
        contains: Substring terms; returned unchanged.
        regexes: Regular-expression sources, compiled with ``re.IGNORECASE``.

    Returns:
        A ``(contains, compiled_regexes)`` tuple.

    Raises:
        re.error: If any entry of ``regexes`` is not a valid pattern.
    """
    compiled = [re.compile(pattern, re.IGNORECASE) for pattern in regexes]
    return contains, compiled


def match_block(
    block: dict[str, Any],
    contains: list[str],
    regexes: list[re.Pattern[str]],
    kinds: set[str],
    heading_only: bool,
    block_ids: set[str],
) -> bool:
    """Return True when ``block`` satisfies every active filter.

    An empty ``contains``/``regexes``/``kinds``/``block_ids`` collection (or a
    false ``heading_only``) disables that particular filter.

    Args:
        block: Mapping describing one block; the keys ``"kind"``, ``"id"``,
            ``"heading"`` and ``"text"`` are consulted.
        contains: Terms that must all occur in the block text, ignoring case.
        regexes: Compiled patterns that must all match somewhere in the text.
        kinds: Allowed values for ``block["kind"]``.
        heading_only: Require ``block["heading"]`` to be truthy.
        block_ids: Allowed values for ``block["id"]``.

    Returns:
        True if all active filters pass, False otherwise.
    """
    if kinds and block.get("kind") not in kinds:
        return False
    if block_ids and block.get("id") not in block_ids:
        return False
    if heading_only and not block.get("heading"):
        return False

    # A present-but-null (or otherwise non-string) "text" would make
    # ``.lower()`` / ``regex.search`` raise; treat it as empty text so such a
    # block simply fails any text filter instead of aborting the whole query.
    raw_text = block.get("text")
    text = raw_text if isinstance(raw_text, str) else ""

    if contains:
        # Lowercase the block text once rather than once per term.
        lowered = text.lower()
        if not all(term.lower() in lowered for term in contains):
            return False

    if regexes and not all(regex.search(text) for regex in regexes):
        return False

    return True


def main() -> None:
    """Parse CLI arguments, filter the graph's blocks, and emit the result.

    The result is a JSON object with the echoed ``query``, the ``matches``
    (truncated to ``--limit``) and their ``count``. It is handed to
    ``write_json`` when ``--out`` is given, otherwise printed to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--graph", required=True)
    parser.add_argument("--contains", action="append", default=[])
    parser.add_argument("--regex", action="append", default=[])
    parser.add_argument("--kind", action="append", default=[])
    parser.add_argument("--heading-only", action="store_true")
    parser.add_argument("--block-id", action="append", default=[])
    parser.add_argument("--limit", type=int, default=20)
    parser.add_argument("--out")
    args = parser.parse_args()

    # Validate the patterns before touching the graph file so that a typo in
    # --regex yields a normal usage error instead of a traceback.
    try:
        contains, regexes = compile_patterns(args.contains, args.regex)
    except re.error as exc:
        parser.error(f"invalid --regex pattern {exc.pattern!r}: {exc}")

    graph = read_json(Path(args.graph).resolve())
    kinds = set(args.kind)
    block_ids = set(args.block_id)

    matches = [
        block
        for block in graph.get("blocks", [])
        if match_block(block, contains, regexes, kinds, args.heading_only, block_ids)
    ][: args.limit]

    result = {
        "query": {
            "contains": contains,
            "regex": args.regex,
            "kind": args.kind,
            "heading_only": args.heading_only,
            "block_ids": args.block_id,
            "limit": args.limit,
        },
        "matches": matches,
        # Count reflects the returned (post-limit) matches.
        "count": len(matches),
    }

    if args.out:
        write_json(Path(args.out).resolve(), result)
    else:
        print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()