kangda-robot-backend/ruoyi-fastapi-backend/convert_qa.py

102 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import re
# Input Q&A text and output JSON locations. They are resolved relative to this
# script's directory (which holds the doc/ and config/ folders) instead of a
# developer-specific absolute path, so the script runs from any checkout.
_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
qa_file = os.path.join(_BASE_DIR, 'doc', 'qa.txt')
output_file = os.path.join(_BASE_DIR, 'config', 'static_qa.json')

# A line starting with one of these prefixes opens a new top-level category.
_CATEGORY_PREFIXES = ('一、', '二、', '三、', '四、', '五、', '六、')

# Known category names, checked in this order against the heading text; the
# first one contained in the heading wins.
_KNOWN_CATEGORIES = (
    '公司概况',
    '历史发展',
    '生产基地与研发中心',
    '财务数据',
    '业务介绍',
    '日常聊天',
)
_DEFAULT_CATEGORY = '其他'

# Sub-section headings only close the pending Q&A pair. Both half-width and
# full-width parentheses are accepted.
_SUBSECTION_PREFIXES = ('(一)', '(二)', '(三)', '(一)', '(二)', '(三)')

# Answer markers, with a half-width or a full-width colon.
_ANSWER_PREFIXES = ('•答案:', '•答案:')

# A question line starts with "<digits>." or with a Chinese numeral followed by
# a delimiter.
# NOTE(review): the delimiter after the Chinese numeral was unreadable in the
# reviewed copy (the raw string ended in a bare backslash, which cannot be
# parsed). It is assumed to be one of "、", "." or "." -- confirm against
# doc/qa.txt.
_QUESTION_RE = re.compile(r'^(?:\d+\.|[一二三四五六七八九十]+[、..])')


def _flush_pair(qa_pairs, question, answer, category):
    """Append the pending pair to *qa_pairs* if both question and answer are set."""
    if question and answer:
        qa_pairs.append({
            "question": question,
            "answer": answer,
            "category": category,
            "priority": 10,
            "sub_questions": [],
            "variations": [],
        })


def parse_qa_text(content):
    """Parse the Q&A text format into a list of Q&A dictionaries.

    Lines are stripped and blank lines skipped. Each remaining line is
    classified, in this order, as a category heading, a sub-section heading,
    a question, or an answer; any other line is ignored. A pair is emitted
    only when a question has received a non-empty answer, and it is written
    out when the next heading or question is reached or the input ends.

    Args:
        content: Full text of the Q&A document.

    Returns:
        A list of dicts with the keys ``question``, ``answer``, ``category``,
        ``priority``, ``sub_questions`` and ``variations``. ``category`` is
        ``None`` for pairs that appear before any category heading.
    """
    qa_pairs = []
    category = None
    question = None
    answer = None

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(_CATEGORY_PREFIXES):
            _flush_pair(qa_pairs, question, answer, category)
            # A section boundary always ends the pending question, so a later
            # answer line cannot attach to a question from another section.
            question = None
            answer = None
            category = next(
                (name for name in _KNOWN_CATEGORIES if name in line),
                _DEFAULT_CATEGORY,
            )
            continue

        if line.startswith(_SUBSECTION_PREFIXES):
            _flush_pair(qa_pairs, question, answer, category)
            question = None
            answer = None
            continue

        question_match = _QUESTION_RE.match(line)
        if question_match:
            _flush_pair(qa_pairs, question, answer, category)
            # The question text is whatever follows the numbering prefix.
            question = line[question_match.end():].strip()
            answer = None
            continue

        # An answer is only recorded while a question is pending; a repeated
        # answer line replaces the previous one.
        if question:
            for prefix in _ANSWER_PREFIXES:
                if line.startswith(prefix):
                    answer = line[len(prefix):].strip()
                    break

    # Emit the last pair, which has no following heading to trigger a flush.
    _flush_pair(qa_pairs, question, answer, category)
    return qa_pairs


def main():
    """Read ``qa_file``, convert it, and write the result to ``output_file``."""
    with open(qa_file, 'r', encoding='utf-8') as f:
        content = f.read()

    qa_pairs = parse_qa_text(content)
    output = {
        "qa_pairs": qa_pairs
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"成功转换 {len(qa_pairs)} 个问答对到 {output_file}")


if __name__ == '__main__':
    main()