Skill-BidCreater/scripts/common.py

from __future__ import annotations

import json
import os
import re
import tempfile
from pathlib import Path
from typing import Any

import yaml

# Repository root: index 2 of this file's resolved parents, i.e. two levels
# above the directory that contains this module.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Top-level "input" and "output" directories located directly under the root.
INPUT_ROOT = REPO_ROOT / "input"
OUTPUT_ROOT = REPO_ROOT / "output"

# Canonical bundle names; these are the values listed in the error message
# raised by normalize_bundle() for an unknown bundle.
VALID_BUNDLES = ("technical", "business-other")
# Accepted spellings -> canonical bundle name. The underscore spelling of
# "business-other" is accepted in addition to the canonical names themselves.
BUNDLE_ALIASES = {
    "technical": "technical",
    "business-other": "business-other",
    "business_other": "business-other",
}
# Per-bundle default file names and document titles, keyed by canonical name.
# The get_bundle_*_path helpers in this module join the "*_json" names onto the
# layout's "work" directory and the "*_docx" names onto its "final" directory.
# The "*_title" entries are not read in the visible part of this module.
BUNDLE_DEFAULTS: dict[str, dict[str, str]] = {
    "technical": {
        "outline_json": "final_outline_technical.json",
        "content_json": "final_bid_content_technical.json",
        "outline_docx": "技术标_目录版.docx",  # roughly "technical bid, outline version"
        "bid_docx": "技术标.docx",  # roughly "technical bid"
        "outline_doc_title": "技术标（目录版）",
        "outline_toc_title": "目录",  # "table of contents"
        "bid_doc_title": "技术标",
        "bid_toc_title": "目录",
    },
    "business-other": {
        "outline_json": "final_outline_business_other.json",
        "content_json": "final_bid_content_business_other.json",
        "outline_docx": "商务及其他_目录版.docx",  # roughly "business and other, outline version"
        "bid_docx": "商务及其他.docx",  # roughly "business and other"
        "outline_doc_title": "商务及其他（目录版）",
        "outline_toc_title": "目录",  # "table of contents"
        "bid_doc_title": "商务及其他",
        "bid_toc_title": "目录",
    },
}

# Expressions of uncertainty or AI attribution (roughly: "maybe", "approximately",
# "should", "I think", "AI suggests", "to be confirmed").
# NOTE(review): not referenced in the visible part of this module; presumably
# callers screen generated bid text against this list. Confirm with the consumers.
BANNED_WORDS = ["可能", "大概", "应该", "我觉得", "AI建议", "待确认"]

# Weak filename hints only. These hints may help AI label discovered files,
# but they must never be treated as workflow routing, directory semantics,
# or mandatory material categories.
#
# Each entry carries three fields:
#   key      - ASCII identifier of the material type
#   label    - Chinese display label
#   keywords - substrings (Chinese and English) associated with that type
MATERIAL_CATALOG = [
    {"key": "business_license", "label": "营业执照副本", "keywords": ["营业执照", "license"]},
    {"key": "qualification_certificate", "label": "资质证书", "keywords": ["资质", "证书", "许可", "qualification"]},
    {"key": "legal_representative_id", "label": "法定代表人身份证明", "keywords": ["法人", "法定代表人", "身份证明"]},
    {"key": "authorization_letter", "label": "授权委托书", "keywords": ["授权", "委托书", "authorization"]},
    {"key": "project_manager_certificate", "label": "项目经理证书", "keywords": ["项目经理", "pmp", "建造师"]},
    {"key": "similar_project_case", "label": "类似项目业绩证明", "keywords": ["业绩", "案例", "合同", "验收", "case"]},
    {"key": "quotation_basis", "label": "报价依据说明", "keywords": ["报价", "清单", "预算", "quote", "price"]},
]

# Entry names (all lower-case) that is_reserved_project_entry() reports as
# reserved after lower-casing the candidate's name; iter_material_entries()
# skips such entries. The set mixes the project's own working directories
# ("rfp", "work", "reports", "final") with common VCS/IDE/tooling directories.
RESERVED_PROJECT_DIRS = {
    "rfp",
    "work",
    "reports",
    "final",
    "__pycache__",
    ".git",
    ".hg",
    ".svn",
    ".idea",
    ".vscode",
    ".venv",
    "venv",
    "node_modules",
}


def ensure_dir(path: Path) -> Path:
    """Create ``path`` as a directory, including missing parents, and return it.

    An already existing directory is accepted, so repeated calls are harmless.
    """
    Path.mkdir(path, exist_ok=True, parents=True)
    return path


def write_text(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, creating the parent directory first.

    ``newline="\\n"`` is passed through, so ``"\\n"`` characters are written
    unchanged instead of being translated to the platform line separator.
    """
    parent_dir = path.parent
    ensure_dir(parent_dir)
    path.write_text(text, newline="\n", encoding="utf-8")


def write_json(path: Path, data: Any) -> None:
    """Serialize ``data`` as indented JSON and write it to ``path`` as UTF-8.

    The parent directory is created when missing. Non-ASCII characters are
    written verbatim (``ensure_ascii=False``) with a two-space indent.

    Args:
        path: Destination file.
        data: Any JSON-serializable value.

    Raises:
        TypeError: If ``data`` contains a value ``json.dumps`` cannot encode.
    """
    ensure_dir(path.parent)
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    # newline="\n" keeps line endings identical on every platform, matching
    # write_text() and write_json_atomic(); without it the indented output
    # is written with CRLF on Windows.
    path.write_text(payload, encoding="utf-8", newline="\n")


def write_json_atomic(path: Path, data: Any, *, indent: int = 2, ensure_ascii: bool = False) -> None:
    """Write ``data`` as JSON to ``path`` through a temporary file and a rename.

    The JSON is streamed into a ``<stem>.*.tmp`` file created next to ``path``,
    flushed and fsynced, and only then moved over ``path`` with
    ``Path.replace``. A failed write therefore never leaves a half-written
    ``path`` behind.

    Args:
        path: Destination file; its parent directory is created when missing.
        data: Any JSON-serializable value.
        indent: Indentation passed to ``json.JSONEncoder``.
        ensure_ascii: Whether non-ASCII characters are escaped.

    Raises:
        Whatever the encoder or the file system raises; the error is re-raised
        unchanged after the temporary file has been removed.
    """
    ensure_dir(path.parent)
    temp_path: Path | None = None
    encoder = json.JSONEncoder(ensure_ascii=ensure_ascii, indent=indent)
    try:
        with tempfile.NamedTemporaryFile(
            mode="w",
            encoding="utf-8",
            newline="\n",
            dir=str(path.parent),
            prefix=f"{path.stem}.",
            suffix=".tmp",
            delete=False,
        ) as temp_file:
            temp_path = Path(temp_file.name)
            # Stream the encoder output instead of building one large string.
            temp_file.writelines(encoder.iterencode(data))
            temp_file.flush()
            os.fsync(temp_file.fileno())
        # The file is closed here, so the rename also works on Windows.
        temp_path.replace(path)
    except BaseException:
        # BaseException (not just Exception) so that KeyboardInterrupt or
        # SystemExit during the write does not leave a stray .tmp file behind.
        if temp_path is not None:
            try:
                temp_path.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup: never mask the original error.
                pass
        raise


def read_json(path: Path) -> Any:
    """Parse and return the JSON document stored at ``path``.

    The file is decoded as ``utf-8-sig``, so a leading UTF-8 BOM is ignored.
    """
    with path.open(encoding="utf-8-sig") as handle:
        return json.load(handle)


def load_yaml(path: Path) -> dict[str, Any]:
    """Load a YAML mapping from ``path``.

    Returns an empty dict when the file does not exist or when its top-level
    value is not a mapping. The file is decoded as ``utf-8-sig`` and parsed
    with ``yaml.safe_load``.
    """
    if path.exists():
        parsed = yaml.safe_load(path.read_text(encoding="utf-8-sig"))
        if isinstance(parsed, dict):
            return parsed
    return {}


def normalize_text(text: str) -> str:
    """Collapse each whitespace run in ``text`` to one space and trim both ends.

    A falsy argument (``None`` or ``""``) yields an empty string.
    """
    source = text if text else ""
    collapsed = re.sub(r"\s+", " ", source)
    return collapsed.strip()


def normalize_bundle(bundle: str | None) -> str | None:
    if bundle is None:
        return None
    normalized = BUNDLE_ALIASES.get(bundle.strip())
    if normalized:
        return normalized
    raise ValueError(f"不支持的 bundle: {bundle}。允许值：{', '.join(VALID_BUNDLES)}")


def ensure_output_layout(project_dir: Path) -> dict[str, Path]:
    """Create the project's output directories and return them by role.

    The "artifacts", "tables" and "work" roles all point at the same
    ``work`` directory; "root" is ``project_dir`` itself.
    """
    work_dir = project_dir / "work"
    layout = {
        "root": project_dir,
        "final": project_dir / "final",
        "artifacts": work_dir,
        "tables": work_dir,
        "reports": project_dir / "reports",
        "work": work_dir,
    }
    for directory in layout.values():
        ensure_dir(directory)
    return layout


def get_bundle_defaults(bundle: str) -> dict[str, str]:
    """Return the ``BUNDLE_DEFAULTS`` entry for ``bundle`` (aliases accepted).

    Raises:
        ValueError: If ``bundle`` is ``None`` or not a recognized bundle name.
    """
    canonical = normalize_bundle(bundle)
    if canonical is not None:
        return BUNDLE_DEFAULTS[canonical]
    raise ValueError("bundle 不能为空。")


def get_bundle_outline_path(output_layout: dict[str, Path], bundle: str) -> Path:
    """Return the bundle's outline JSON path inside the layout's "work" directory."""
    work_dir = output_layout["work"]
    file_name = get_bundle_defaults(bundle)["outline_json"]
    return work_dir / file_name


def get_bundle_content_path(output_layout: dict[str, Path], bundle: str) -> Path:
    """Return the bundle's content JSON path inside the layout's "work" directory."""
    work_dir = output_layout["work"]
    file_name = get_bundle_defaults(bundle)["content_json"]
    return work_dir / file_name


def get_bundle_outline_docx_path(output_layout: dict[str, Path], bundle: str) -> Path:
    """Return the bundle's outline DOCX path inside the layout's "final" directory."""
    final_dir = output_layout["final"]
    file_name = get_bundle_defaults(bundle)["outline_docx"]
    return final_dir / file_name


def get_bundle_bid_docx_path(output_layout: dict[str, Path], bundle: str) -> Path:
    """Return the bundle's bid DOCX path inside the layout's "final" directory."""
    final_dir = output_layout["final"]
    file_name = get_bundle_defaults(bundle)["bid_docx"]
    return final_dir / file_name


def find_rfp_docx(project_dir: Path) -> Path:
    """Return the first DOCX tender document in ``project_dir / "rfp"``.

    Candidates are the ``*.docx`` entries directly inside the ``rfp``
    directory, sorted by path; the first one is returned.

    Two kinds of entries are skipped:

    * Word owner/lock files (names starting with ``~$``), which Word creates
      next to a document while it is open. ``~`` sorts before CJK characters,
      so such a file could otherwise be picked ahead of the real document.
    * Anything that is not a regular file, such as a directory named
      ``something.docx``.

    Args:
        project_dir: Project directory that contains the ``rfp`` folder.

    Raises:
        FileNotFoundError: If the ``rfp`` directory does not exist or holds no
            usable DOCX file.
    """
    rfp_dir = project_dir / "rfp"
    if not rfp_dir.exists():
        raise FileNotFoundError(f"未找到招标文件目录: {rfp_dir}")
    docx_files = sorted(
        candidate
        for candidate in rfp_dir.glob("*.docx")
        if candidate.is_file() and not candidate.name.startswith("~$")
    )
    if not docx_files:
        raise FileNotFoundError(f"未找到 DOCX 招标文件: {rfp_dir}")
    return docx_files[0]


def get_project_config(project_dir: Path) -> dict[str, Any]:
    """Load ``config/project.yaml`` from ``project_dir`` via ``load_yaml``."""
    config_file = project_dir.joinpath("config", "project.yaml")
    return load_yaml(config_file)


def is_reserved_project_entry(path: Path) -> bool:
    """Tell whether the lower-cased final path component is in ``RESERVED_PROJECT_DIRS``."""
    lowered_name = path.name.lower()
    return lowered_name in RESERVED_PROJECT_DIRS


def is_hidden_project_entry(path: Path) -> bool:
    """Tell whether the final path component begins with a dot."""
    return path.name[:1] == "."


def iter_material_entries(project_dir: Path) -> list[Path]:
    """List the direct children of ``project_dir`` that may hold materials.

    Children are returned in sorted order; reserved and hidden entries are
    left out. A non-existent ``project_dir`` yields an empty list.
    """
    if not project_dir.exists():
        return []
    return [
        candidate
        for candidate in sorted(project_dir.iterdir())
        if not (is_reserved_project_entry(candidate) or is_hidden_project_entry(candidate))
    ]


def safe_filename(name: str) -> str:
    """Turn ``name`` into a string usable as a file name.

    Each run of the characters ``< > : " / | ? *`` or backslash becomes a single
    underscore, then surrounding spaces and dots are stripped. If nothing is
    left, ``"untitled"`` is returned.
    """
    sanitized = re.sub(r'[<>:"/\\\\|?*]+', "_", name)
    trimmed = sanitized.strip(" .")
    return trimmed if trimmed else "untitled"


def markdown_table(headers: list[str], rows: list[list[str]]) -> str:
    """Render ``headers`` and ``rows`` as a pipe-delimited Markdown table.

    The output is a header line, a ``---`` separator line with one cell per
    header, and one line per row, joined with ``"\\n"`` (no trailing newline).
    Cell text is inserted as-is.
    """

    def render(cells: list[str]) -> str:
        # One table line: cells separated by " | " and wrapped in outer pipes.
        return f"| {' | '.join(cells)} |"

    separator = render(["---"] * len(headers))
    return "\n".join([render(headers), separator, *(render(row) for row in rows)])


def get_font_candidates() -> list[Path]:
    """Return the font files to probe under ``C:/Windows/Fonts``.

    The list order is the order in which ``find_font_path`` tries the files.
    """
    fonts_dir = Path("C:/Windows/Fonts")
    file_names = ("msyh.ttc", "msyhbd.ttc", "simhei.ttf", "simsun.ttc")
    return [fonts_dir / file_name for file_name in file_names]


def find_font_path() -> Path | None:
    """Return the first candidate font file that exists, or ``None`` if none does."""
    existing = (candidate for candidate in get_font_candidates() if candidate.exists())
    return next(existing, None)


def list_files(path: Path) -> list[Path]:
    """Return every regular file below ``path`` (recursively), sorted by path.

    A non-existent ``path`` yields an empty list.
    """
    if path.exists():
        return sorted(filter(Path.is_file, path.rglob("*")))
    return []