完成--完成获取系统训练历史方法
This commit is contained in:
parent
cb75799ae2
commit
b00e6dac71
@ -644,25 +644,64 @@ Error Response:
|
||||
|
||||
### 3.2 获取训练历史
|
||||
```http
|
||||
GET /api/system/history
|
||||
GET /api/system/history?page=1&page_size=10&start_time=2025-02-01&end_time=2025-02-19&status=completed&experiment_name=breast_cancer_classification
|
||||
|
||||
Parameters:
|
||||
- page: 页码 (默认: 1)
|
||||
- page_size: 每页数量 (默认: 10)
|
||||
- start_time: 开始时间 (可选, 格式: YYYY-MM-DD)
|
||||
- end_time: 结束时间 (可选, 格式: YYYY-MM-DD)
|
||||
- status: 运行状态过滤 (可选: completed, failed, running)
|
||||
- experiment_name: 实验名称过滤 (可选)
|
||||
|
||||
|
||||
Response:
|
||||
{
|
||||
"status": "success",
|
||||
"history": [
|
||||
{
|
||||
"task_id": "train_20230820_001",
|
||||
"model": "xgboost",
|
||||
"run_id": "7970364d490f4e0aa0375c2db26215f3",
|
||||
"experiment_id": "656341556838275234",
|
||||
"experiment_name": "breast_cancer_classification",
|
||||
"model_name": "XGBClassifier",
|
||||
"dataset": "breast_cancer",
|
||||
"start_time": "2023-08-20T10:00:00",
|
||||
"end_time": "2023-08-20T10:30:00",
|
||||
"start_time": "2025-02-19T08:43:02",
|
||||
"end_time": "2025-02-19T08:43:05",
|
||||
"duration": "3s",
|
||||
"status": "completed",
|
||||
"parameters": {
|
||||
"max_depth": 6,
|
||||
"learning_rate": 0.1,
|
||||
"n_estimators": 100
|
||||
},
|
||||
"metrics": {
|
||||
"accuracy": 0.95,
|
||||
"f1": 0.94
|
||||
"accuracy": 0.956,
|
||||
"precision": 0.962,
|
||||
"recall": 0.935,
|
||||
"f1": 0.948
|
||||
},
|
||||
"tags": {
|
||||
"version": "v1.0",
|
||||
"author": "admin"
|
||||
}
|
||||
}
|
||||
]
|
||||
],
|
||||
"pagination": {
|
||||
"current_page": 1,
|
||||
"page_size": 10,
|
||||
"total_pages": 5,
|
||||
"total_items": 42
|
||||
}
|
||||
}
|
||||
|
||||
Error Response:
|
||||
{
|
||||
"status": "error",
|
||||
"message": "获取训练历史失败",
|
||||
"details": {
|
||||
"error_type": "MLflowError",
|
||||
"error_message": "Failed to connect to MLflow server"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
@ -1017,4 +1056,5 @@ Authorization: Bearer <token>
|
||||
3. 性能评估:
|
||||
- 使用多个评估指标
|
||||
- 考虑模型的稳定性
|
||||
- 关注模型在特定场景的表现
|
||||
- 关注模型在特定场景的表现
|
||||
4
example_system_moniter_train_history.py
Normal file
4
example_system_moniter_train_history.py
Normal file
@ -0,0 +1,4 @@
|
||||
from function.system_monitor import SystemMonitor
|
||||
|
||||
# Example: build a monitor with its default configuration, fetch the training
# history using the method's default arguments, and print the returned dict.
monitor = SystemMonitor()
training_history = monitor.get_training_history()
print(training_history)
|
||||
Binary file not shown.
@ -6,6 +6,9 @@ from typing import Dict, List, Optional
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import time
|
||||
import mlflow
|
||||
from mlflow.tracking import MlflowClient
|
||||
import pandas as pd
|
||||
|
||||
class SystemMonitor:
|
||||
"""系统资源监控类"""
|
||||
@ -14,6 +17,7 @@ class SystemMonitor:
|
||||
"""初始化系统监控器"""
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._setup_logging()
|
||||
self.mlflow_client = MlflowClient()
|
||||
|
||||
def _setup_logging(self):
|
||||
"""设置日志"""
|
||||
@ -205,4 +209,139 @@ class SystemMonitor:
|
||||
'error_type': type(e).__name__,
|
||||
'error_message': str(e)
|
||||
}
|
||||
}
|
||||
|
||||
def get_training_history(
    self,
    page: int = 1,
    page_size: int = 10,
    start_time: Optional[str] = None,
    end_time: Optional[str] = None,
    status: Optional[str] = None,
    experiment_name: Optional[str] = None
) -> Dict:
    """
    Return one page of MLflow training runs, newest first.

    Args:
        page: 1-based page number.
        page_size: Number of runs per page (must be >= 1).
        start_time: Optional lower bound on the run start time
            (YYYY-MM-DD); parsed with ``pd.Timestamp``.
        end_time: Optional upper bound on the run start time
            (YYYY-MM-DD). A value without a time-of-day covers that
            whole day; a value with a time-of-day is an inclusive bound.
        status: Optional run status filter, case-insensitive. Accepts the
            MLflow statuses (RUNNING, SCHEDULED, FINISHED, FAILED, KILLED)
            plus the alias ``completed`` for FINISHED.
        experiment_name: Optional experiment name; when omitted, every
            experiment returned by ``mlflow.search_experiments`` is used.

    Returns:
        On success, a dict with ``status`` = ``'success'``, a ``history``
        list (one dict per run) and a ``pagination`` dict. On failure, a
        dict with ``status`` = ``'error'`` and a ``message``; validation
        and unexpected errors also carry a ``details`` dict. This method
        does not raise for invalid arguments or MLflow failures.
    """

    def _invalid_request(reason: str) -> Dict:
        # Same shape as the error payload produced by the except branch.
        return {
            'status': 'error',
            'message': '获取训练历史失败',
            'details': {
                'error_type': 'ValueError',
                'error_message': reason
            }
        }

    # Validate paging up front: page_size == 0 would divide by zero and a
    # page below 1 would produce a negative slice start.
    if not isinstance(page, int) or not isinstance(page_size, int):
        return _invalid_request('page and page_size must be integers')
    if page < 1 or page_size < 1:
        return _invalid_request('page and page_size must be >= 1')

    # Normalise the status before it is interpolated into the filter
    # string: map the documented "completed" alias onto MLflow's FINISHED
    # and reject anything outside the known set (which also keeps quotes
    # and other filter syntax out of the query).
    run_status = None
    if status:
        run_status = status.strip().upper()
        run_status = {'COMPLETED': 'FINISHED'}.get(run_status, run_status)
        known_statuses = ('RUNNING', 'SCHEDULED', 'FINISHED', 'FAILED', 'KILLED')
        if run_status not in known_statuses:
            return _invalid_request(f'unsupported status: {status}')

    try:
        # Build the MLflow filter expression.
        filters = []

        if run_status:
            filters.append(f"attributes.status = '{run_status}'")

        if start_time:
            lower_ms = int(pd.Timestamp(start_time).timestamp() * 1000)
            filters.append(f"attributes.start_time >= {lower_ms}")

        if end_time:
            upper = pd.Timestamp(end_time)
            if upper == upper.normalize():
                # Date-only bound: include every run started on that day.
                next_day_ms = int(
                    (upper + pd.Timedelta(days=1)).timestamp() * 1000
                )
                filters.append(f"attributes.start_time < {next_day_ms}")
            else:
                upper_ms = int(upper.timestamp() * 1000)
                filters.append(f"attributes.start_time <= {upper_ms}")

        # Resolve the experiments to search, remembering their names so
        # no per-run experiment lookup is needed later.
        if experiment_name:
            experiment = mlflow.get_experiment_by_name(experiment_name)
            if experiment is None:
                return {
                    'status': 'error',
                    'message': f'实验 {experiment_name} 不存在'
                }
            experiment_names = {experiment.experiment_id: experiment.name}
        else:
            experiment_names = {
                exp.experiment_id: exp.name
                for exp in mlflow.search_experiments()
            }

        # One search across all experiments keeps the result globally
        # ordered by start time. Skip the call when there is nothing to
        # search rather than passing an empty id list to MLflow.
        all_runs = []
        if experiment_names:
            runs = mlflow.search_runs(
                experiment_ids=list(experiment_names),
                filter_string=" and ".join(filters),
                order_by=['attributes.start_time DESC']
            )
            if not runs.empty:
                all_runs = runs.to_dict('records')

        # Pagination.
        total_items = len(all_runs)
        total_pages = (total_items + page_size - 1) // page_size
        first = (page - 1) * page_size

        # Format the runs on the requested page.
        history = []
        for run in all_runs[first:first + page_size]:
            started = pd.to_datetime(run['start_time'])
            ended_raw = run.get('end_time')
            if pd.isna(ended_raw):
                # Run has no end time yet: measure up to "now", using the
                # same timezone awareness as the start time so the
                # subtraction below is valid.
                ended = pd.Timestamp.now(tz=started.tz)
            else:
                ended = pd.to_datetime(ended_raw)
            elapsed = ended - started

            # Split the flattened "params.x" / "metrics.x" / "tags.x"
            # columns into separate dicts.
            params = {}
            metrics = {}
            tags = {}
            sections = {'params': params, 'metrics': metrics, 'tags': tags}
            for key, value in run.items():
                # Columns a run did not log come back as None/NaN; drop
                # them instead of reporting placeholder values.
                if value is None or (isinstance(value, float) and value != value):
                    continue
                group, _, name = key.partition('.')
                if name and group in sections:
                    sections[group][name] = value

            history.append({
                'run_id': run['run_id'],
                'experiment_id': run['experiment_id'],
                'experiment_name': experiment_names.get(
                    run['experiment_id'], 'Unknown'
                ),
                'model_name': params.get('algorithm', 'Unknown'),
                'dataset': params.get('dataset', 'Unknown'),
                'start_time': started.strftime('%Y-%m-%dT%H:%M:%S'),
                'end_time': ended.strftime('%Y-%m-%dT%H:%M:%S'),
                'duration': str(elapsed).split('.')[0],  # drop sub-second part
                'status': run['status'],
                'parameters': params,
                'metrics': metrics,
                'tags': tags
            })

        self.logger.info(f"成功获取训练历史记录, 共{total_items}条记录")

        return {
            'status': 'success',
            'history': history,
            'pagination': {
                'current_page': page,
                'page_size': page_size,
                'total_pages': total_pages,
                'total_items': total_items
            }
        }

    except Exception as e:
        # Boundary handler: report the failure to the caller as data.
        error_msg = f"获取训练历史失败: {str(e)}"
        self.logger.error(error_msg)
        return {
            'status': 'error',
            'message': '获取训练历史失败',
            'details': {
                'error_type': type(e).__name__,
                'error_message': str(e)
            }
        }
|
||||
Loading…
Reference in New Issue
Block a user