## 数据来源

- 239个Word文档(.doc格式)
- 6门金融课程真实评分数据
- 40名学生(233061301101-140)

## 完成工作

### ✅ 第一批次:基础数据

- 学生名单:40名真实学生
- 课程项目:6门真实课程
- 年级/班级:2023级金融工程1班

### ✅ 第二批次:评价数据

- 企业评价:40条(百分制→5分制)
- 教师评价:40条(基于Word学校评分)
- 专家评价:40条(综合评分)

### ✅ 第三批次:画像数据

- 能力维度:5个真实维度(数据采集、数据清洗、数据分析、结果解读、工具实操)
- abilityRadar:40名学生的5维能力分数(基于Word文档计算平均值)
- gradeDistribution:40名学生的6门课程真实总分

## 核心原则

✅ 所有mock数据完全基于Word文档真实数据
✅ 可以计算组合现有数据,但禁止随意编造
✅ Word文档不包含的内容,保持现有或标记为空

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
117 lines
3.7 KiB
Python
117 lines
3.7 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
解析Word分析报告,提取评分数据
|
||
"""
|
||
|
||
import win32com.client
|
||
import os
|
||
import re
|
||
import json
|
||
|
||
def extract_scores_from_text(text):
    """Extract student identity and score fields from one report's plain text.

    Args:
        text: Plain text of a single analysis report. Paragraphs may be
            separated by ``\\r`` (as Word returns them). Empty or ``None``
            input yields an empty result.

    Returns:
        A dict containing only the fields that were found:
        ``'studentId'`` (str), ``'name'`` (str), ``'totalScore'`` (int),
        ``'<ability>_学校'`` / ``'<ability>_企业'`` (int) for each of the five
        abilities, and ``'学生自评'`` / ``'学生互评'`` (int).
    """
    scores = {}
    if not text:
        return scores

    # Blanks allowed between a label's colon and its value. Deliberately not
    # ``\s``: that would cross a paragraph break and, for an empty field,
    # capture the following line as the value.
    gap = r'[ \t\u3000]*'

    # Student ID: accept both the full-width and the ASCII colon, consistent
    # with the total-score pattern below.
    student_id_match = re.search(rf'学号[::]{gap}(\d+)', text)
    if student_id_match:
        scores['studentId'] = student_id_match.group(1)

    # Name: first run of non-whitespace characters after the label.
    name_match = re.search(rf'姓名[::]{gap}(\S+)', text)
    if name_match:
        scores['name'] = name_match.group(1)

    # Total score.
    total_match = re.search(r'总分[::]\s*(\d+)', text)
    if total_match:
        scores['totalScore'] = int(total_match.group(1))

    # Per-ability scores, one from the school and one from the company.
    abilities = ['数据采集', '数据清洗', '数据分析', '结果解读', '工具实操']
    any_ability = '|'.join(re.escape(name) for name in abilities)

    # The text between an ability name and its score label must not run into
    # another ability name. An unbounded ``.*?`` would cross paragraph marks
    # and attribute the NEXT ability's score to an ability whose own score is
    # missing (or to a name that merely appears in an introductory list).
    # Trade-off: a score separated from its ability name by a mention of
    # another ability is left unset rather than guessed.
    bounded_gap = rf'(?:(?!{any_ability}).)*?'

    for ability in abilities:
        for source in ('学校', '企业'):
            pattern = (
                rf'{re.escape(ability)}{bounded_gap}'
                rf'{source}[评分为]*[::]*\s*(\d+)\s*分'
            )
            score_match = re.search(pattern, text)
            if score_match:
                scores[f'{ability}_{source}'] = int(score_match.group(1))

    # Student self-evaluation and peer-evaluation scores.
    self_eval_match = re.search(r'学生自评[::]*\s*(\d+)\s*分', text)
    if self_eval_match:
        scores['学生自评'] = int(self_eval_match.group(1))

    peer_eval_match = re.search(r'学生互评[::]*\s*(\d+)\s*分', text)
    if peer_eval_match:
        scores['学生互评'] = int(peer_eval_match.group(1))

    return scores
|
||
|
||
def _read_doc_text(word, doc_path):
    """Open *doc_path* read-only in the given Word instance and return its text.

    The document is always closed again without saving, even if reading fails.
    """
    # Positional arguments: FileName, ConfirmConversions=False, ReadOnly=True.
    # Read-only avoids touching the source reports; no conversion prompt can
    # block the invisible Word instance.
    doc = word.Documents.Open(doc_path, False, True)
    try:
        return doc.Content.Text
    finally:
        # 0 == wdDoNotSaveChanges: never prompt for, or write, changes.
        doc.Close(0)


def parse_all_reports(base_dir='分析报告'):
    """Parse every ``.doc`` report below *base_dir* and collect the scores.

    *base_dir* is expected to contain one sub-directory per course, each
    holding the Word reports of that course. Files that cannot be read are
    reported on stdout and skipped; so are reports without a student ID.

    Args:
        base_dir: Directory whose sub-directories are the course folders.

    Returns:
        A dict keyed by student ID. Each value is
        ``{'name': <str>, 'courses': {<course folder name>: <scores dict>}}``
        where the scores dict is the result of ``extract_scores_from_text``.

    Raises:
        OSError: If *base_dir* (or a course folder) cannot be listed. Word is
            still shut down in that case.
    """
    word = win32com.client.Dispatch('Word.Application')
    all_scores = {}

    try:
        word.Visible = False
        # 0 == wdAlertsNone: a modal dialog would hang an invisible instance.
        word.DisplayAlerts = 0

        # Walk the course folders in sorted order so output is reproducible.
        for course_dir in sorted(os.listdir(base_dir)):
            course_path = os.path.join(base_dir, course_dir)
            if not os.path.isdir(course_path):
                continue

            print(f'\n处理课程: {course_dir}')

            for filename in sorted(os.listdir(course_path)):
                # "~$name.doc" files are Word's lock files, not documents.
                if filename.startswith('~$'):
                    continue
                # Match the extension case-insensitively (".DOC" is common).
                if not filename.lower().endswith('.doc'):
                    continue

                doc_path = os.path.abspath(os.path.join(course_path, filename))

                try:
                    text = _read_doc_text(word, doc_path)
                    scores = extract_scores_from_text(text)

                    student_id = scores.get('studentId')
                    if not student_id:
                        # Make dropped reports visible instead of losing
                        # them silently.
                        print(f' ✗ 未找到学号 {filename}')
                        continue

                    record = all_scores.setdefault(
                        student_id, {'name': '', 'courses': {}}
                    )
                    # Keep the first non-empty name seen for this student.
                    if not record['name']:
                        record['name'] = scores.get('name', '')
                    record['courses'][course_dir] = scores

                    print(f' ✓ {filename}: {scores.get("name", "")} - {scores.get("totalScore", 0)}分')

                except Exception as e:
                    # Best effort: one unreadable report must not abort the
                    # whole run.
                    print(f' ✗ 错误 {filename}: {e}')
    finally:
        # Always shut Word down, otherwise a hidden WINWORD process lingers.
        word.Quit()

    return all_scores
|
||
|
||
if __name__ == '__main__':
    # Script entry point: parse all reports under the default directory and
    # dump the collected scores to a UTF-8 JSON file next to them.
    print('开始解析Word文档...\n')
    scores = parse_all_reports()

    # Write the result as human-readable JSON (non-ASCII kept as-is).
    output_file = '分析报告/extracted_scores.json'
    serialized = json.dumps(scores, ensure_ascii=False, indent=2)
    with open(output_file, mode='w', encoding='utf-8') as out:
        out.write(serialized)

    # Summary: completion notice, student count, output location.
    print(
        '\n\n解析完成!',
        f'共处理 {len(scores)} 名学生的数据',
        f'结果已保存至: {output_file}',
        sep='\n',
    )
|