完成--完成系统监控获取系统日志方法
This commit is contained in:
parent
c89d1907a4
commit
d222cfe968
@ -706,19 +706,61 @@ Response:
|
||||
|
||||
### 3.4 获取系统日志
|
||||
```http
|
||||
GET /api/system/logs?level=error&start_time=2023-08-20T00:00:00
|
||||
GET /api/system/logs?level=error&start_time=2025-02-19T00:00:00&end_time=2025-02-19T23:59:59&module=training&page=1&page_size=20
|
||||
|
||||
Parameters:
|
||||
- level: 日志级别过滤 (可选: debug, info, warning, error, critical)
|
||||
- start_time: 开始时间 (可选, 格式: YYYY-MM-DDThh:mm:ss)
|
||||
- end_time: 结束时间 (可选, 格式: YYYY-MM-DDThh:mm:ss)
|
||||
- module: 模块名称过滤 (可选: training, data_processing, model, system)
|
||||
- page: 页码 (默认: 1)
|
||||
- page_size: 每页数量 (默认: 20)
|
||||
|
||||
Response:
|
||||
{
|
||||
"status": "success",
|
||||
"logs": [
|
||||
{
|
||||
"timestamp": "2023-08-20T10:15:00",
|
||||
"timestamp": "2025-02-19T10:15:00",
|
||||
"level": "ERROR",
|
||||
"module": "training",
|
||||
"message": "Out of memory error in GPU 0"
|
||||
"message": "Out of memory error in GPU 0",
|
||||
"details": {
|
||||
"error_type": "RuntimeError",
|
||||
"gpu_id": 0,
|
||||
"memory_used": "15.6GB",
|
||||
"memory_total": "16GB"
|
||||
},
|
||||
"context": {
|
||||
"experiment_id": "656341556838275234",
|
||||
"run_id": "7970364d490f4e0aa0375c2db26215f3",
|
||||
"model": "XGBClassifier"
|
||||
}
|
||||
}
|
||||
]
|
||||
],
|
||||
"pagination": {
|
||||
"current_page": 1,
|
||||
"page_size": 20,
|
||||
"total_pages": 3,
|
||||
"total_items": 42
|
||||
},
|
||||
"summary": {
|
||||
"error_count": 5,
|
||||
"warning_count": 12,
|
||||
"info_count": 25,
|
||||
"most_frequent_error": "Out of memory error",
|
||||
"most_affected_module": "training"
|
||||
}
|
||||
}
|
||||
|
||||
Error Response:
|
||||
{
|
||||
"status": "error",
|
||||
"message": "获取系统日志失败",
|
||||
"details": {
|
||||
"error_type": "FileNotFoundError",
|
||||
"error_message": "Log file not found"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
@ -19,7 +19,7 @@
|
||||
|
||||
## 4. 监控系统 (1)
|
||||
- 20250226
|
||||
- [ ] 资源监控
|
||||
- [x] 资源监控
|
||||
- [ ] 训练监控
|
||||
- [ ] 告警系统
|
||||
- [ ] 日志聚合
|
||||
|
||||
5
example_system_moniter_get_log.py
Normal file
5
example_system_moniter_get_log.py
Normal file
@ -0,0 +1,5 @@
|
||||
from function.system_monitor import SystemMonitor

# Example usage: build a monitor, fetch the system logs with the default
# arguments (no filters, first page) and print the returned response dict.
monitor = SystemMonitor()
log_report = monitor.get_system_logs()
print(log_report)
|
||||
Binary file not shown.
@ -9,6 +9,9 @@ import time
|
||||
import mlflow
|
||||
from mlflow.tracking import MlflowClient
|
||||
import pandas as pd
|
||||
import json
|
||||
from collections import Counter
|
||||
import re
|
||||
|
||||
class SystemMonitor:
|
||||
"""系统资源监控类"""
|
||||
@ -344,4 +347,152 @@ class SystemMonitor:
|
||||
'error_type': type(e).__name__,
|
||||
'error_message': str(e)
|
||||
}
|
||||
}
|
||||
|
||||
def get_system_logs(
    self,
    level: Optional[str] = None,
    start_time: Optional[str] = None,
    end_time: Optional[str] = None,
    module: Optional[str] = None,
    page: int = 1,
    page_size: int = 20
) -> Dict:
    """Read, filter and paginate the log records found in the ``.log`` directory.

    Every ``*.log`` file in ``.log`` (relative to the current working
    directory) is scanned line by line. Only lines of the form
    ``<timestamp> - <logger name> - <LEVEL> - <message> [{json details}]``
    are considered; any other line (for example a traceback continuation)
    is skipped.

    Args:
        level: Keep only records with this level (case-insensitive).
            ``None`` or an empty string disables the filter.
        start_time: Inclusive lower bound (YYYY-MM-DDThh:mm:ss). ``None``
            means no lower bound.
        end_time: Inclusive upper bound (YYYY-MM-DDThh:mm:ss). Defaults to
            the current time when omitted.
        module: Keep only records whose logger name equals this value
            exactly. ``None`` or an empty string disables the filter.
        page: 1-based page number; must be >= 1.
        page_size: Number of records per page; must be >= 1.

    Returns:
        On success, a dict with ``status == 'success'``, the requested
        page in ``logs``, a ``pagination`` dict and a ``summary`` dict
        whose counters cover all records that passed the filters (not only
        the returned page). On failure (no log files, invalid paging
        arguments, unparsable time bounds, I/O errors, ...) a dict with
        ``status == 'error'`` and a ``details`` dict carrying the
        exception type name and message. The method itself never raises.
    """
    try:
        # Reject bad paging arguments up front; previously page_size=0
        # surfaced as an opaque ZeroDivisionError and a non-positive page
        # produced meaningless slices.
        if page < 1 or page_size < 1:
            raise ValueError('page and page_size must be positive integers')

        # Collect the log files, reverse-sorted by file name.
        log_dir = Path('.log')
        log_files = sorted(log_dir.glob('*.log'), reverse=True)

        if not log_files:
            return {
                'status': 'error',
                'message': '未找到日志文件',
                'details': {
                    'error_type': 'FileNotFoundError',
                    'error_message': 'No log files found'
                }
            }

        # Resolve the time window; the upper bound defaults to "now".
        start_dt = pd.to_datetime(start_time) if start_time else None
        end_dt = pd.to_datetime(end_time) if end_time else pd.Timestamp.now()

        # Normalise the level filter once instead of on every line.
        wanted_level = level.upper() if level else None

        all_logs = []
        level_counts = Counter()
        error_types = Counter()
        module_counts = Counter()

        log_pattern = re.compile(
            r'(?P<timestamp>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2},\d{3})\s+-\s+'
            r'(?P<name>[\w.]+)\s+-\s+'
            r'(?P<level>\w+)\s+-\s+'
            r'(?P<message>.*?)(?:\s+\{(?P<details>.*)\})?$'
        )

        for log_file in log_files:
            with open(log_file, 'r', encoding='utf-8') as f:
                for line in f:
                    stripped = line.strip()
                    match = log_pattern.match(stripped)
                    if not match:
                        continue

                    log_data = match.groupdict()

                    # The pattern captures a comma before the milliseconds;
                    # swap it for a dot so pandas receives a plain ISO-style
                    # timestamp.
                    log_time = pd.to_datetime(
                        log_data['timestamp'].replace(',', '.')
                    )

                    # Time window filter (both bounds inclusive).
                    if start_dt is not None and log_time < start_dt:
                        continue
                    if log_time > end_dt:
                        continue

                    # Level filter.
                    log_level = log_data['level'].upper()
                    if wanted_level and log_level != wanted_level:
                        continue

                    # Module (logger name) filter.
                    log_module = log_data['name']
                    if module and log_module != module:
                        continue

                    # Parse the optional trailing "{...}" as JSON details.
                    message = log_data['message']
                    raw_details = log_data['details']
                    details = {}
                    if raw_details is not None:
                        try:
                            details = json.loads('{' + raw_details + '}')
                        except ValueError:
                            # The braces were ordinary message text, not
                            # JSON: keep the message intact instead of
                            # silently dropping that part of it.
                            details = {}
                            message = stripped[match.start('message'):]

                    # Statistics over every record that passed the filters.
                    level_counts[log_level] += 1
                    if log_level == 'ERROR':
                        error_types[message.split(':')[0]] += 1
                    module_counts[log_module] += 1

                    all_logs.append({
                        'timestamp': log_time.strftime('%Y-%m-%dT%H:%M:%S'),
                        'level': log_level,
                        'module': log_module,
                        'message': message,
                        'details': details,
                        # Context is the subset of details that identifies
                        # the experiment/run/model the record belongs to.
                        'context': {
                            k: v for k, v in details.items()
                            if k in ('experiment_id', 'run_id', 'model')
                        }
                    })

        # Pagination (ceil division for the page count).
        total_items = len(all_logs)
        total_pages = (total_items + page_size - 1) // page_size
        start_idx = (page - 1) * page_size
        end_idx = min(start_idx + page_size, total_items)

        summary = {
            'error_count': level_counts.get('ERROR', 0),
            'warning_count': level_counts.get('WARNING', 0),
            'info_count': level_counts.get('INFO', 0),
            'most_frequent_error': error_types.most_common(1)[0][0] if error_types else None,
            'most_affected_module': module_counts.most_common(1)[0][0] if module_counts else None
        }

        self.logger.info(f"成功获取系统日志, 共{total_items}条记录")

        return {
            'status': 'success',
            'logs': all_logs[start_idx:end_idx],
            'pagination': {
                'current_page': page,
                'page_size': page_size,
                'total_pages': total_pages,
                'total_items': total_items
            },
            'summary': summary
        }

    except Exception as e:
        # Boundary handler: callers always get a structured error response
        # rather than an exception.
        error_msg = f"获取系统日志失败: {str(e)}"
        self.logger.error(error_msg)
        return {
            'status': 'error',
            'message': '获取系统日志失败',
            'details': {
                'error_type': type(e).__name__,
                'error_message': str(e)
            }
        }
|
||||
Loading…
Reference in New Issue
Block a user