ZhangQiPro/parse_reports.py
sladro 01ad9c18ea feat: 完成Mock数据替换-基于Word文档真实数据
## 数据来源
- 239个Word文档(.doc格式)
- 6门金融课程真实评分数据
- 40名学生(233061301101-140)

## 完成工作

### 第一批次:基础数据
- 学生名单:40名真实学生
- 课程项目:6门真实课程
- 年级/班级:2023级金融工程1班

### 第二批次:评价数据
- 企业评价:40条(百分制→5分制)
- 教师评价:40条(基于Word学校评分)
- 专家评价:40条(综合评分)

### 第三批次:画像数据
- 能力维度:5个真实维度(数据采集、数据清洗、数据分析、结果解读、工具实操)
- abilityRadar:40名学生的5维能力分数(基于Word文档计算平均值)
- gradeDistribution:40名学生的6门课程真实总分

## 核心原则
- 所有mock数据完全基于Word文档真实数据
- 可以计算组合现有数据,但禁止随意编造
- Word文档不包含的内容,保持现有或标记为空

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-01 16:25:56 +08:00

117 lines
3.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
解析Word分析报告提取评分数据
"""
import win32com.client
import os
import re
import json
def extract_scores_from_text(text):
    """Extract score fields from the plain text of one analysis report.

    Every label may be followed by either an ASCII colon (``:``) or a
    full-width colon (U+FF1A); Chinese documents commonly use the latter.

    Args:
        text: Plain text of a report. ``None`` or an empty string yields an
            empty result.

    Returns:
        A dict holding only the fields that were found, in this order:
        ``studentId`` (str), ``name`` (str), ``totalScore`` (int),
        ``<ability>_学校`` and ``<ability>_企业`` (int) for each of the five
        ability dimensions, then ``学生自评`` and ``学生互评`` (int).
    """
    scores = {}
    if not text:
        return scores

    # Either colon form. The full-width one is spelled as an escape so the
    # two look-alike characters cannot be confused when reading the source.
    colon = r'[:\uff1a]'

    # Student ID: the digits directly after the label.
    student_id_match = re.search(rf'学号{colon}(\d+)', text)
    if student_id_match:
        scores['studentId'] = student_id_match.group(1)

    # Name: the run of non-whitespace characters directly after the label.
    name_match = re.search(rf'姓名{colon}([^\s]+)', text)
    if name_match:
        scores['name'] = name_match.group(1)

    # Total score (integer part only).
    total_match = re.search(rf'总分{colon}\s*(\d+)', text)
    if total_match:
        scores['totalScore'] = int(total_match.group(1))

    # Per-ability scores, once from the school and once from the company.
    # The lazy ``.*?`` picks the first "<rater>...<digits>分" that follows
    # the ability name.
    abilities = ['数据采集', '数据清洗', '数据分析', '结果解读', '工具实操']
    raters = ('学校', '企业')
    for ability in abilities:
        for rater in raters:
            pattern = (
                rf'{ability}[能力]*.*?{rater}[评分为]*{colon}*\s*(\d+)\s*分'
            )
            match = re.search(pattern, text)
            if match:
                scores[f'{ability}_{rater}'] = int(match.group(1))

    # Student self-evaluation and peer evaluation (colon optional here).
    for label in ('学生自评', '学生互评'):
        match = re.search(rf'{label}{colon}*\s*(\d+)\s*分', text)
        if match:
            scores[label] = int(match.group(1))

    return scores
def parse_all_reports(base_dir='分析报告'):
    """Parse every ``.doc`` report below *base_dir* through Word automation.

    Each sub-directory of *base_dir* is treated as one course. Every ``.doc``
    file inside it is opened in a hidden Word instance, its text is handed to
    ``extract_scores_from_text`` and the result is filed under the student ID
    found in the text. A failure on one file is printed and does not stop the
    run.

    Args:
        base_dir: Directory that contains the per-course folders.

    Returns:
        ``{student_id: {'name': str, 'courses': {course_dir: scores}}}``.
        Reports in which no student ID is found are left out.

    Raises:
        OSError: If *base_dir* (or a course folder) cannot be listed. Word is
            still shut down in that case.
    """
    word = win32com.client.Dispatch('Word.Application')
    all_scores = {}
    try:
        word.Visible = False

        # Sorted so that the result (and the JSON written from it) is
        # reproducible; os.listdir() order is unspecified.
        for course_dir in sorted(os.listdir(base_dir)):
            course_path = os.path.join(base_dir, course_dir)
            if not os.path.isdir(course_path):
                continue

            print(f'\n处理课程: {course_dir}')

            for filename in sorted(os.listdir(course_path)):
                # '~$' files are the lock files Word leaves next to an open
                # document; they are not reports.
                if filename.startswith('~$'):
                    continue
                if not filename.lower().endswith('.doc'):
                    continue

                # Word needs an absolute path.
                doc_path = os.path.abspath(os.path.join(course_path, filename))

                try:
                    # Positional args: FileName, ConfirmConversions, ReadOnly.
                    doc = word.Documents.Open(doc_path, False, True)
                    try:
                        text = doc.Content.Text
                    finally:
                        # 0 = wdDoNotSaveChanges; always release the document,
                        # even when reading its text failed.
                        doc.Close(0)

                    scores = extract_scores_from_text(text)

                    if scores.get('studentId'):
                        student_id = scores['studentId']
                        entry = all_scores.setdefault(
                            student_id,
                            {'name': scores.get('name', ''), 'courses': {}},
                        )
                        # Back-fill the name if an earlier report lacked it.
                        if not entry['name'] and scores.get('name'):
                            entry['name'] = scores['name']
                        entry['courses'][course_dir] = scores
                        print(f'{filename}: {scores.get("name", "")} - {scores.get("totalScore", 0)}')

                except Exception as e:
                    # Best effort: report the file and carry on with the rest.
                    print(f' ✗ 错误 {filename}: {e}')
    finally:
        # Never leave a hidden WINWORD process behind, whatever happened.
        word.Quit()

    return all_scores
if __name__ == '__main__':
    # Script entry point: parse all reports with the default directory and
    # store the collected scores as one JSON file.
    print('开始解析Word文档...\n')

    output_file = '分析报告/extracted_scores.json'
    scores = parse_all_reports()

    # ensure_ascii=False keeps the Chinese keys and names readable in the file.
    payload = json.dumps(scores, ensure_ascii=False, indent=2)
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write(payload)

    # Summary for the operator.
    for line in (
        '\n\n解析完成!',
        f'共处理 {len(scores)} 名学生的数据',
        f'结果已保存至: {output_file}',
    ):
        print(line)