完成--完成系统监控获取系统日志方法

This commit is contained in:
haotian 2025-02-21 10:00:42 +08:00
parent c89d1907a4
commit d222cfe968
5 changed files with 203 additions and 5 deletions

View File

@ -706,19 +706,61 @@ Response:
### 3.4 获取系统日志
```http
GET /api/system/logs?level=error&start_time=2023-08-20T00:00:00
GET /api/system/logs?level=error&start_time=2025-02-19T00:00:00&end_time=2025-02-19T23:59:59&module=training&page=1&page_size=20
Parameters:
- level: 日志级别过滤 (可选: debug, info, warning, error, critical)
- start_time: 开始时间 (可选, 格式: YYYY-MM-DDThh:mm:ss)
- end_time: 结束时间 (可选, 格式: YYYY-MM-DDThh:mm:ss)
- module: 模块名称过滤 (可选: training, data_processing, model, system)
- page: 页码 (默认: 1)
- page_size: 每页数量 (默认: 20)
Response:
{
"status": "success",
"logs": [
{
"timestamp": "2025-02-19T10:15:00",
"level": "ERROR",
"module": "training",
"message": "Out of memory error in GPU 0",
"details": {
"error_type": "RuntimeError",
"gpu_id": 0,
"memory_used": "15.6GB",
"memory_total": "16GB"
},
"context": {
"experiment_id": "656341556838275234",
"run_id": "7970364d490f4e0aa0375c2db26215f3",
"model": "XGBClassifier"
}
}
],
"pagination": {
"current_page": 1,
"page_size": 20,
"total_pages": 3,
"total_items": 42
},
"summary": {
"error_count": 5,
"warning_count": 12,
"info_count": 25,
"most_frequent_error": "Out of memory error",
"most_affected_module": "training"
}
}
Error Response:
{
"status": "error",
"message": "获取系统日志失败",
"details": {
"error_type": "FileNotFoundError",
"error_message": "Log file not found"
}
}
```

View File

@ -19,7 +19,7 @@
## 4. 监控系统 (1)
- 20250226
- [ ] 资源监控
- [x] 资源监控
- [ ] 训练监控
- [ ] 告警系统
- [ ] 日志聚合

View File

@ -0,0 +1,5 @@
from function.system_monitor import SystemMonitor


def main() -> None:
    """Fetch system logs with the default filters and print the raw response."""
    system_monitor = SystemMonitor()
    print(system_monitor.get_system_logs())


# Guard the entry point so importing this module does not trigger log reads.
if __name__ == "__main__":
    main()

View File

@ -9,6 +9,9 @@ import time
import mlflow
from mlflow.tracking import MlflowClient
import pandas as pd
import json
from collections import Counter
import re
class SystemMonitor:
"""系统资源监控类"""
@ -344,4 +347,152 @@ class SystemMonitor:
'error_type': type(e).__name__,
'error_message': str(e)
}
}
def get_system_logs(
self,
level: Optional[str] = None,
start_time: Optional[str] = None,
end_time: Optional[str] = None,
module: Optional[str] = None,
page: int = 1,
page_size: int = 20
) -> Dict:
"""
获取系统日志
Args:
level: 日志级别过滤
start_time: 开始时间 (YYYY-MM-DDThh:mm:ss)
end_time: 结束时间 (YYYY-MM-DDThh:mm:ss)
module: 模块名称过滤
page: 页码
page_size: 每页数量
Returns:
系统日志信息
"""
try:
# 获取所有日志文件
log_dir = Path('.log')
log_files = sorted(log_dir.glob('*.log'), reverse=True)
if not log_files:
return {
'status': 'error',
'message': '未找到日志文件',
'details': {
'error_type': 'FileNotFoundError',
'error_message': 'No log files found'
}
}
# 解析时间范围
start_dt = pd.to_datetime(start_time) if start_time else None
end_dt = pd.to_datetime(end_time) if end_time else pd.Timestamp.now()
# 读取并解析日志
all_logs = []
level_counts = Counter()
error_types = Counter()
module_counts = Counter()
log_pattern = re.compile(
r'(?P<timestamp>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2},\d{3})\s+-\s+'
r'(?P<name>[\w.]+)\s+-\s+'
r'(?P<level>\w+)\s+-\s+'
r'(?P<message>.*?)(?:\s+\{(?P<details>.*)\})?$'
)
for log_file in log_files:
with open(log_file, 'r', encoding='utf-8') as f:
for line in f:
match = log_pattern.match(line.strip())
if not match:
continue
log_data = match.groupdict()
log_time = pd.to_datetime(log_data['timestamp'])
# 时间范围过滤
if start_dt and log_time < start_dt:
continue
if log_time > end_dt:
continue
# 级别过滤
log_level = log_data['level'].upper()
if level and log_level != level.upper():
continue
# 模块过滤
log_module = log_data['name']
if module and log_module != module:
continue
# 解析详细信息和上下文
try:
details = json.loads('{' + log_data.get('details', '') + '}')
except:
details = {}
# 统计信息
level_counts[log_level] += 1
if log_level == 'ERROR':
error_types[log_data['message'].split(':')[0]] += 1
module_counts[log_module] += 1
# 格式化日志记录
log_entry = {
'timestamp': log_time.strftime('%Y-%m-%dT%H:%M:%S'),
'level': log_level,
'module': log_module,
'message': log_data['message'],
'details': details,
'context': {
k: v for k, v in details.items()
if k in ['experiment_id', 'run_id', 'model']
}
}
all_logs.append(log_entry)
# 计算分页
total_items = len(all_logs)
total_pages = (total_items + page_size - 1) // page_size
start_idx = (page - 1) * page_size
end_idx = min(start_idx + page_size, total_items)
# 生成摘要信息
summary = {
'error_count': level_counts.get('ERROR', 0),
'warning_count': level_counts.get('WARNING', 0),
'info_count': level_counts.get('INFO', 0),
'most_frequent_error': error_types.most_common(1)[0][0] if error_types else None,
'most_affected_module': module_counts.most_common(1)[0][0] if module_counts else None
}
self.logger.info(f"成功获取系统日志, 共{total_items}条记录")
return {
'status': 'success',
'logs': all_logs[start_idx:end_idx],
'pagination': {
'current_page': page,
'page_size': page_size,
'total_pages': total_pages,
'total_items': total_items
},
'summary': summary
}
except Exception as e:
error_msg = f"获取系统日志失败: {str(e)}"
self.logger.error(error_msg)
return {
'status': 'error',
'message': '获取系统日志失败',
'details': {
'error_type': type(e).__name__,
'error_message': str(e)
}
}