完成--完成获取系统训练历史方法

This commit is contained in:
haotian 2025-02-20 11:46:24 +08:00
parent cb75799ae2
commit b00e6dac71
4 changed files with 191 additions and 8 deletions

View File

@ -644,25 +644,64 @@ Error Response:
### 3.2 获取训练历史
```http
GET /api/system/history
GET /api/system/history?page=1&page_size=10&start_time=2025-02-01&end_time=2025-02-19&status=completed&experiment_name=breast_cancer_classification
Parameters:
- page: 页码 (默认: 1)
- page_size: 每页数量 (默认: 10)
- start_time: 开始时间 (可选, 格式: YYYY-MM-DD)
- end_time: 结束时间 (可选, 格式: YYYY-MM-DD)
- status: 运行状态过滤 (可选: completed, failed, running)
- experiment_name: 实验名称过滤 (可选)
Response:
{
"status": "success",
"history": [
{
"task_id": "train_20230820_001",
"model": "xgboost",
"run_id": "7970364d490f4e0aa0375c2db26215f3",
"experiment_id": "656341556838275234",
"experiment_name": "breast_cancer_classification",
"model_name": "XGBClassifier",
"dataset": "breast_cancer",
"start_time": "2023-08-20T10:00:00",
"end_time": "2023-08-20T10:30:00",
"start_time": "2025-02-19T08:43:02",
"end_time": "2025-02-19T08:43:05",
"duration": "3s",
"status": "completed",
"parameters": {
"max_depth": 6,
"learning_rate": 0.1,
"n_estimators": 100
},
"metrics": {
"accuracy": 0.95,
"f1": 0.94
"accuracy": 0.956,
"precision": 0.962,
"recall": 0.935,
"f1": 0.948
},
"tags": {
"version": "v1.0",
"author": "admin"
}
}
]
],
"pagination": {
"current_page": 1,
"page_size": 10,
"total_pages": 5,
"total_items": 42
}
}
Error Response:
{
"status": "error",
"message": "获取训练历史失败",
"details": {
"error_type": "MLflowError",
"error_message": "Failed to connect to MLflow server"
}
}
```
@ -1017,4 +1056,5 @@ Authorization: Bearer <token>
3. 性能评估:
- 使用多个评估指标
- 考虑模型的稳定性
- 关注模型在特定场景的表现
- 关注模型在特定场景的表现

View File

@ -0,0 +1,4 @@
from function.system_monitor import SystemMonitor
# Manual smoke check: build a monitor with its default configuration, fetch
# the training history using the method's default arguments, and print the
# resulting dictionary to stdout.
system_monitor = SystemMonitor()
history = system_monitor.get_training_history()
print(history)

View File

@ -6,6 +6,9 @@ from typing import Dict, List, Optional
import logging
from pathlib import Path
import time
import mlflow
from mlflow.tracking import MlflowClient
import pandas as pd
class SystemMonitor:
"""系统资源监控类"""
@ -14,6 +17,7 @@ class SystemMonitor:
"""初始化系统监控器"""
self.logger = logging.getLogger(__name__)
self._setup_logging()
self.mlflow_client = MlflowClient()
def _setup_logging(self):
"""设置日志"""
@ -205,4 +209,139 @@ class SystemMonitor:
'error_type': type(e).__name__,
'error_message': str(e)
}
}
def get_training_history(
    self,
    page: int = 1,
    page_size: int = 10,
    start_time: Optional[str] = None,
    end_time: Optional[str] = None,
    status: Optional[str] = None,
    experiment_name: Optional[str] = None
) -> Dict:
    """
    Return one page of MLflow training runs, newest first.

    Args:
        page: 1-based page number; must be a positive integer.
        page_size: Number of runs per page; must be a positive integer.
        start_time: Optional lower bound on the run start time
            (YYYY-MM-DD, inclusive).
        end_time: Optional upper bound on the run start time (YYYY-MM-DD).
            A value that falls exactly on midnight is treated as a calendar
            date and the whole day is included.
        status: Optional run-status filter, case-insensitive. Accepts the
            MLflow status names plus ``completed`` as an alias for MLflow's
            ``FINISHED``.
        experiment_name: Optional experiment name; when omitted every
            experiment returned by ``mlflow.search_experiments()`` is used.

    Returns:
        On success, a dict with ``status='success'``, the ``history`` list
        for the requested page and a ``pagination`` summary. On failure, a
        dict with ``status='error'`` and a ``message``; unexpected
        exceptions additionally carry a ``details`` dict with the exception
        type and text. The method does not raise for these cases.
    """
    # Reject unusable paging values up front: page_size == 0 would divide by
    # zero and page < 1 would produce a negative slice start.
    paging_ok = (
        isinstance(page, int)
        and isinstance(page_size, int)
        and page >= 1
        and page_size >= 1
    )
    if not paging_ok:
        return {
            'status': 'error',
            'message': '分页参数无效: page 和 page_size 必须为正整数'
        }

    # Map the API vocabulary onto MLflow's run statuses and whitelist the
    # result, so arbitrary text never reaches the filter expression.
    mlflow_statuses = ('RUNNING', 'SCHEDULED', 'FINISHED', 'FAILED', 'KILLED')
    status_value = None
    if status:
        status_value = str(status).strip().upper()
        if status_value == 'COMPLETED':
            status_value = 'FINISHED'
        if status_value not in mlflow_statuses:
            return {
                'status': 'error',
                'message': f'无效的运行状态: {status}'
            }

    try:
        # Build the MLflow search filter.
        clauses = []
        if status_value:
            clauses.append(f"attributes.status = '{status_value}'")
        if start_time:
            lower_ms = int(pd.Timestamp(start_time).timestamp() * 1000)
            clauses.append(f"attributes.start_time >= {lower_ms}")
        if end_time:
            upper = pd.Timestamp(end_time)
            if upper == upper.normalize():
                # Date-only bound: include runs started any time that day.
                upper_ms = int(
                    (upper + pd.Timedelta(days=1)).timestamp() * 1000
                )
                clauses.append(f"attributes.start_time < {upper_ms}")
            else:
                upper_ms = int(upper.timestamp() * 1000)
                clauses.append(f"attributes.start_time <= {upper_ms}")
        filter_string = " and ".join(clauses)

        # Resolve which experiments to search.
        if experiment_name:
            experiment = mlflow.get_experiment_by_name(experiment_name)
            if experiment is None:
                return {
                    'status': 'error',
                    'message': f'实验 {experiment_name} 不存在'
                }
            experiments = [experiment]
        else:
            experiments = mlflow.search_experiments()
        experiment_ids = [exp.experiment_id for exp in experiments]
        # Remember names so each run does not trigger another lookup.
        experiment_names = {exp.experiment_id: exp.name for exp in experiments}

        # One search across all experiments keeps the ordering global.
        # Skip the call when there is nothing to search, because an empty
        # id list makes MLflow fall back to the active experiment.
        if experiment_ids:
            runs_frame = mlflow.search_runs(
                experiment_ids=experiment_ids,
                filter_string=filter_string,
                order_by=['start_time DESC']
            )
            all_runs = runs_frame.to_dict('records')
        else:
            all_runs = []

        # Pagination arithmetic.
        total_items = len(all_runs)
        total_pages = (total_items + page_size - 1) // page_size
        start_idx = (page - 1) * page_size
        end_idx = min(start_idx + page_size, total_items)

        time_format = '%Y-%m-%dT%H:%M:%S'
        history = []
        for run in all_runs[start_idx:end_idx]:
            exp_id = run['experiment_id']
            exp_name = experiment_names.get(exp_id)
            if exp_name is None:
                exp_name = mlflow.get_experiment(exp_id).name
                experiment_names[exp_id] = exp_name

            run_start = pd.to_datetime(run['start_time'])
            run_end = run.get('end_time')
            if run_end is None or pd.isna(run_end):
                # Unfinished run: measure up to "now", in the same timezone
                # as the start so the subtraction below is valid.
                run_end = pd.Timestamp.now(tz=run_start.tzinfo)
            else:
                run_end = pd.to_datetime(run_end)
            duration = run_end - run_start

            # Split the flattened "params.x" / "metrics.x" / "tags.x"
            # columns into separate dictionaries.
            params = {}
            metrics = {}
            tags = {}
            groups = {'params': params, 'metrics': metrics, 'tags': tags}
            for column, value in run.items():
                prefix, _, name = column.partition('.')
                if prefix not in groups or not name:
                    continue
                # The frame pads values a run never logged with None/NaN;
                # NaN is the only value that is not equal to itself.
                if value is None or value != value:
                    continue
                groups[prefix][name] = value

            history.append({
                'run_id': run['run_id'],
                'experiment_id': exp_id,
                'experiment_name': exp_name,
                'model_name': params.get('algorithm', 'Unknown'),
                'dataset': params.get('dataset', 'Unknown'),
                'start_time': run_start.strftime(time_format),
                'end_time': run_end.strftime(time_format),
                'duration': str(duration).split('.')[0],  # drop sub-seconds
                'status': run['status'],
                'parameters': params,
                'metrics': metrics,
                'tags': tags
            })

        self.logger.info(f"成功获取训练历史记录, 共{total_items}条记录")
        return {
            'status': 'success',
            'history': history,
            'pagination': {
                'current_page': page,
                'page_size': page_size,
                'total_pages': total_pages,
                'total_items': total_items
            }
        }
    except Exception as e:
        # Boundary handler: report the failure to the caller as data.
        error_msg = f"获取训练历史失败: {str(e)}"
        self.logger.error(error_msg)
        return {
            'status': 'error',
            'message': '获取训练历史失败',
            'details': {
                'error_type': type(e).__name__,
                'error_message': str(e)
            }
        }