ZhangQiPro/parse_reports.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
解析Word分析报告，提取评分数据
"""

import win32com.client
import os
import re
import json

def extract_scores_from_text(text):
    """Extract score fields from the plain text of one analysis report.

    Every field is optional: a key is only added to the result when its
    pattern is found in ``text``.

    Args:
        text: The report text to search.

    Returns:
        A dict that may contain:
        ``studentId`` (str, digits only), ``name`` (str),
        ``totalScore`` (int), ``<ability>_学校`` and ``<ability>_企业``
        (int) for each of the five abilities, and ``学生自评`` /
        ``学生互评`` (int). An empty dict is returned when nothing matches.
    """
    scores = {}

    # Student ID. Accept both the full-width and the ASCII colon (and
    # optional whitespace before the digits), like the total-score pattern.
    student_id_match = re.search(r'学号[：:]\s*(\d+)', text)
    if student_id_match:
        scores['studentId'] = student_id_match.group(1)

    # Name: the run of non-whitespace characters right after the colon.
    # Whitespace is deliberately NOT skipped here, so an empty name field
    # followed by a space does not swallow the next token as the name.
    name_match = re.search(r'姓名[：:]([^\s]+)', text)
    if name_match:
        scores['name'] = name_match.group(1)

    # Total score (integer part only; the pattern captures digits).
    total_match = re.search(r'总分[：:]\s*(\d+)', text)
    if total_match:
        scores['totalScore'] = int(total_match.group(1))

    # Per-ability scores, one from the school and one from the company.
    abilities = ['数据采集', '数据清洗', '数据分析', '结果解读', '工具实操']
    raters = ['学校', '企业']

    for ability in abilities:
        for rater in raters:
            # "[能力]*" and "[评分为]*" are character classes: they skip any
            # run of those characters rather than a fixed word.
            # NOTE(review): the lazy ".*?" is not bounded to this ability's
            # section, so if this ability has no score from `rater`, the
            # match can run on and pick up a later ability's score.
            pattern = rf'{ability}[能力]*.*?{rater}[评分为]*[：:]*\s*(\d+)\s*分'
            match = re.search(pattern, text)
            if match:
                scores[f'{ability}_{rater}'] = int(match.group(1))

    # Student self-evaluation and peer-evaluation scores.
    self_eval_match = re.search(r'学生自评[：:]*\s*(\d+)\s*分', text)
    if self_eval_match:
        scores['学生自评'] = int(self_eval_match.group(1))

    peer_eval_match = re.search(r'学生互评[：:]*\s*(\d+)\s*分', text)
    if peer_eval_match:
        scores['学生互评'] = int(peer_eval_match.group(1))

    return scores

def _read_document_text(word, doc_path):
    """Open ``doc_path`` in the given Word instance and return its text."""
    # Read-only: the document is only read, never modified.
    doc = word.Documents.Open(doc_path, ReadOnly=True)
    try:
        return doc.Content.Text
    finally:
        # Always close (without saving), even if reading the text failed,
        # so no document is left open in the hidden Word instance.
        doc.Close(False)


def parse_all_reports(base_dir='分析报告'):
    """Parse every ``.doc`` report found under ``base_dir``.

    ``base_dir`` is expected to contain one sub-directory per course, each
    holding the report files for that course. Every report is opened
    through a hidden Word instance (COM automation), its text is passed to
    ``extract_scores_from_text``, and the result is grouped by student ID.
    Reports without a detectable student ID are ignored. A failure on one
    file is printed and does not stop the run.

    Args:
        base_dir: Directory containing the per-course sub-directories.

    Returns:
        A dict mapping student ID to
        ``{'name': <str>, 'courses': {<course dir name>: <scores dict>}}``.
        ``name`` is taken from the first report seen for that student.

    Raises:
        OSError: If ``base_dir`` or a course directory cannot be listed.
    """
    word = win32com.client.Dispatch('Word.Application')
    all_scores = {}

    try:
        word.Visible = False

        # Walk every course directory.
        for course_dir in os.listdir(base_dir):
            course_path = os.path.join(base_dir, course_dir)
            if not os.path.isdir(course_path):
                continue

            print(f'\n处理课程: {course_dir}')

            for filename in os.listdir(course_path):
                # "~$..." files are the lock files Word creates next to an
                # open document; they are not reports and cannot be parsed.
                if filename.startswith('~$'):
                    continue
                # Case-insensitive so that e.g. "REPORT.DOC" is not skipped.
                if not filename.lower().endswith('.doc'):
                    continue

                # Word needs an absolute path.
                doc_path = os.path.abspath(os.path.join(course_path, filename))

                try:
                    text = _read_document_text(word, doc_path)

                    # Extract the scores and file them under the student.
                    scores = extract_scores_from_text(text)
                    if scores.get('studentId'):
                        student_id = scores['studentId']
                        if student_id not in all_scores:
                            all_scores[student_id] = {
                                'name': scores.get('name', ''),
                                'courses': {}
                            }

                        all_scores[student_id]['courses'][course_dir] = scores
                        print(f'  ✓ {filename}: {scores.get("name", "")} - {scores.get("totalScore", 0)}分')

                except Exception as e:
                    # Best effort: report the failing file and keep going.
                    print(f'  ✗ 错误 {filename}: {e}')
    finally:
        # Shut Word down even when an error escapes the loops; otherwise an
        # invisible Word process would be left running.
        word.Quit()

    return all_scores

if __name__ == '__main__':
    # Script entry point: parse all reports, write them to a JSON file,
    # then print a short summary.
    print('开始解析Word文档...\n')
    parsed = parse_all_reports()

    # Persist the result as UTF-8 JSON, keeping non-ASCII text readable.
    output_file = '分析报告/extracted_scores.json'
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(parsed, handle, ensure_ascii=False, indent=2)

    summary_lines = (
        '\n\n解析完成！',
        f'共处理 {len(parsed)} 名学生的数据',
        f'结果已保存至: {output_file}',
    )
    for line in summary_lines:
        print(line)