完成--完成监控系统资源方法

This commit is contained in:
haotian 2025-02-20 11:30:20 +08:00
parent b0609e1ad1
commit cb75799ae2
5 changed files with 374 additions and 9 deletions

View File

@ -559,17 +559,85 @@ Response:
"gpu": [
{
"id": 0,
"name": "NVIDIA T4",
"memory_used": 8542,
"memory_total": 16384,
"utilization": 75
"name": "NVIDIA GeForce RTX 3090",
"memory": {
"total": 24576, // MB
"used": 3678, // MB
"free": 20898 // MB
},
"utilization": {
"gpu": 45, // %
"memory": 15 // %
},
"temperature": 65, // °C
"power": {
"draw": 180.5, // W
"limit": 350.0 // W
},
"processes": [
{
"pid": 1234,
"name": "python",
"memory": 2048 // MB
}
]
}
],
"cpu": {
"usage_percent": 65,
"memory_used": 32768,
"memory_total": 65536
"count": {
"physical": 16,
"logical": 32
},
"utilization": 35.5, // %
"frequency": {
"current": 3.6, // GHz
"min": 2.5, // GHz
"max": 4.2 // GHz
},
"temperature": 45.5, // °C
"memory": {
"total": 32768, // MB
"used": 16384, // MB
"free": 16384, // MB
"percent": 50.0 // %
},
"swap": {
"total": 8192, // MB
"used": 1024, // MB
"free": 7168, // MB
"percent": 12.5 // %
}
},
"disk": {
"/": {
"total": 512000, // MB
"used": 256000, // MB
"free": 256000, // MB
"percent": 50.0 // %
},
"/home": {
"total": 1024000, // MB
"used": 512000, // MB
"free": 512000, // MB
"percent": 50.0 // %
}
},
"processes": {
"total": 256,
"running": 2,
"sleeping": 254
}
},
"timestamp": "2025-02-19T15:30:45"
}
Error Response:
{
"status": "error",
"message": "获取资源信息失败",
"details": {
"error_type": "GPUQueryError",
"error_message": "Failed to query GPU information"
}
}
```
@ -616,8 +684,6 @@ Response:
}
```
## 附录A：方法详细说明
### A1. 数据预处理方法

View File

@ -0,0 +1,5 @@
from function.system_monitor import SystemMonitor
# Build a monitor, take one snapshot of the system resources and print it.
system_monitor = SystemMonitor()
resource_snapshot = system_monitor.get_system_resources()
print(resource_snapshot)

Binary file not shown.

208
function/system_monitor.py Normal file
View File

@ -0,0 +1,208 @@
import psutil
import GPUtil
import platform
from datetime import datetime
from typing import Dict, List, Optional
import logging
from pathlib import Path
import time
class SystemMonitor:
    """Collect a point-in-time snapshot of GPU, CPU, memory, disk and process usage.

    The public entry point is :meth:`get_system_resources`. Every private
    ``_get_*`` collector is best-effort: on failure it logs the error and
    returns an empty container, so one unavailable subsystem never hides the
    others.
    """

    # psutil reports sizes in bytes; the response format uses whole megabytes.
    _BYTES_PER_MB = 1024 * 1024
    # Attribute set on the file handler so it is attached to the logger only once.
    _HANDLER_MARKER = '_system_monitor_file_handler'

    def __init__(self):
        """Create the monitor and make sure file logging is configured."""
        self.logger = logging.getLogger(__name__)
        self._setup_logging()

    def _setup_logging(self):
        """Attach a timestamped file handler under ``.log/`` to the module logger.

        The logger is shared by every instance (it is looked up by module
        name), so the handler is added only the first time. Adding one per
        instance would duplicate every log line and leak open log files.
        """
        self.logger.setLevel(logging.INFO)
        already_configured = any(
            getattr(handler, self._HANDLER_MARKER, False)
            for handler in self.logger.handlers
        )
        if already_configured:
            return

        log_dir = Path('.log')
        log_dir.mkdir(exist_ok=True)
        file_handler = logging.FileHandler(
            log_dir / f'system_monitor_{datetime.now():%Y%m%d_%H%M%S}.log',
            # Log messages contain non-ASCII text; do not depend on the
            # platform's default encoding.
            encoding='utf-8',
        )
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        setattr(file_handler, self._HANDLER_MARKER, True)
        self.logger.addHandler(file_handler)

    @classmethod
    def _usage_to_dict(cls, usage) -> Dict:
        """Convert an object with total/used/free (bytes) and percent into MB fields."""
        mb = cls._BYTES_PER_MB
        return {
            'total': usage.total // mb,  # MB
            'used': usage.used // mb,  # MB
            'free': usage.free // mb,  # MB
            'percent': usage.percent  # %
        }

    @staticmethod
    def _gpu_to_dict(gpu) -> Dict:
        """Translate one GPUtil GPU object into the response structure."""
        return {
            'id': gpu.id,
            'name': gpu.name,
            'memory': {
                'total': gpu.memoryTotal,  # MB
                'used': gpu.memoryUsed,  # MB
                'free': gpu.memoryFree  # MB
            },
            'utilization': {
                'gpu': gpu.load * 100,  # %
                'memory': gpu.memoryUtil * 100  # %
            },
            'temperature': gpu.temperature,  # °C
            'power': {
                # Not every GPU object carries power readings; report None then.
                'draw': getattr(gpu, 'powerDraw', None),  # W
                'limit': getattr(gpu, 'powerLimit', None)  # W
            },
            # TODO: per-process GPU usage is not collected yet (the earlier
            # attempt was disabled), so this list is always empty for now.
            'processes': []
        }

    def _get_gpu_info(self) -> List[Dict]:
        """Return one dict per detected GPU, or ``[]`` if the query fails."""
        try:
            return [self._gpu_to_dict(gpu) for gpu in GPUtil.getGPUs()]
        except Exception as e:
            self.logger.error("获取GPU信息失败: %s", e)
            return []

    @staticmethod
    def _get_cpu_temperature() -> Optional[float]:
        """Return the first ``coretemp`` reading in °C, or ``None`` if unavailable.

        ``psutil.sensors_temperatures`` does not exist on every platform and
        the ``coretemp`` entry may be missing or empty. None of these cases
        should make the whole CPU section fail.
        """
        read_sensors = getattr(psutil, 'sensors_temperatures', None)
        if read_sensors is None:
            return None
        readings = read_sensors().get('coretemp') or []
        first = readings[0] if readings else None
        return first.current if first else None

    @staticmethod
    def _frequency_to_dict(cpu_freq) -> Dict:
        """Convert a psutil frequency record (MHz) to GHz; all ``None`` if absent."""
        if not cpu_freq:
            return {'current': None, 'min': None, 'max': None}
        return {
            'current': cpu_freq.current / 1000,  # GHz
            'min': cpu_freq.min / 1000,  # GHz
            'max': cpu_freq.max / 1000  # GHz
        }

    def _get_cpu_info(self) -> Dict:
        """Return CPU counts, utilization, frequency, temperature, memory and swap.

        Note: utilization is sampled with ``interval=1``, so this call blocks
        for about one second. Returns ``{}`` if the query fails.
        """
        try:
            cpu_freq = psutil.cpu_freq()
            cpu_temperature = self._get_cpu_temperature()
            return {
                'count': {
                    'physical': psutil.cpu_count(logical=False),
                    'logical': psutil.cpu_count(logical=True)
                },
                'utilization': psutil.cpu_percent(interval=1),  # %
                'frequency': self._frequency_to_dict(cpu_freq),
                'temperature': cpu_temperature,  # °C
                'memory': self._get_memory_info(),
                'swap': self._get_swap_info()
            }
        except Exception as e:
            self.logger.error("获取CPU信息失败: %s", e)
            return {}

    def _get_memory_info(self) -> Dict:
        """Return virtual memory usage in MB, or ``{}`` if the query fails."""
        try:
            return self._usage_to_dict(psutil.virtual_memory())
        except Exception as e:
            self.logger.error("获取内存信息失败: %s", e)
            return {}

    def _get_swap_info(self) -> Dict:
        """Return swap usage in MB, or ``{}`` if the query fails."""
        try:
            return self._usage_to_dict(psutil.swap_memory())
        except Exception as e:
            self.logger.error("获取交换内存信息失败: %s", e)
            return {}

    def _get_disk_info(self) -> Dict:
        """Return usage in MB keyed by mount point, or ``{}`` if the query fails.

        Partitions whose usage cannot be read are skipped rather than failing
        the whole section.
        """
        try:
            disk_info = {}
            for partition in psutil.disk_partitions():
                try:
                    usage = psutil.disk_usage(partition.mountpoint)
                except OSError as e:  # PermissionError is a subclass of OSError
                    self.logger.debug(
                        "跳过无法访问的挂载点 %s: %s", partition.mountpoint, e
                    )
                    continue
                disk_info[partition.mountpoint] = self._usage_to_dict(usage)
            return disk_info
        except Exception as e:
            self.logger.error("获取磁盘信息失败: %s", e)
            return {}

    def _get_process_info(self) -> Dict:
        """Return total, running and sleeping process counts, or ``{}`` on failure.

        All three numbers come from a single pass over the process list so
        they describe the same snapshot.
        """
        try:
            processes = {'total': 0, 'running': 0, 'sleeping': 0}
            for proc in psutil.process_iter(['status']):
                processes['total'] += 1
                status = proc.info['status']
                if status == psutil.STATUS_RUNNING:
                    processes['running'] += 1
                elif status == psutil.STATUS_SLEEPING:
                    processes['sleeping'] += 1
            return processes
        except Exception as e:
            self.logger.error("获取进程信息失败: %s", e)
            return {}

    def _safe_collect(self, label: str, collector, fallback):
        """Run ``collector()`` and return ``fallback`` if it raises.

        Args:
            label: Resource name inserted into the log message.
            collector: Zero-argument callable producing the section data.
            fallback: Value returned when the collector raises.
        """
        try:
            return collector()
        except Exception as e:
            self.logger.error("获取%s信息失败: %s", label, e)
            return fallback

    def get_system_resources(self) -> Dict:
        """Return a snapshot of system resource usage.

        Each section is collected independently. If one collector raises, its
        section falls back to an empty container and the remaining sections
        are still returned with ``status == 'success'``.

        Returns:
            On success: ``{'status': 'success', 'resources': {...},
            'timestamp': 'YYYY-MM-DDTHH:MM:SS'}`` with ``resources`` holding
            the keys ``gpu``, ``cpu``, ``disk`` and ``processes``.
            On an unexpected failure: ``{'status': 'error', 'message': ...,
            'details': {'error_type': ..., 'error_message': ...}}``.
        """
        try:
            resources = {
                'gpu': self._safe_collect('GPU', self._get_gpu_info, []),
                'cpu': self._safe_collect('CPU', self._get_cpu_info, {}),
                'disk': self._safe_collect('磁盘', self._get_disk_info, {}),
                'processes': self._safe_collect('进程', self._get_process_info, {})
            }
            self.logger.info("成功获取系统资源信息")
            return {
                'status': 'success',
                'resources': resources,
                'timestamp': datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
            }
        except Exception as e:
            self.logger.error("获取系统资源信息失败: %s", e)
            return {
                'status': 'error',
                'message': '获取资源信息失败',
                'details': {
                    'error_type': type(e).__name__,
                    'error_message': str(e)
                }
            }

86
test_system_monitor.py Normal file
View File

@ -0,0 +1,86 @@
import pytest
from function.system_monitor import SystemMonitor
from typing import Dict
class TestSystemMonitor:
    """Checks for the structure returned by ``SystemMonitor.get_system_resources``."""

    @pytest.fixture
    def system_monitor(self):
        """Provide a fresh ``SystemMonitor`` to each test."""
        monitor = SystemMonitor()
        return monitor

    @staticmethod
    def _assert_has_keys(mapping, *keys):
        """Assert that every name in ``keys`` is present in ``mapping``."""
        for key in keys:
            assert key in mapping

    def test_get_system_resources(self, system_monitor):
        """The snapshot has the expected sections, fields and value ranges."""
        result = system_monitor.get_system_resources()

        # Top-level envelope.
        assert isinstance(result, dict)
        assert 'status' in result
        assert result['status'] == 'success'
        self._assert_has_keys(result, 'resources', 'timestamp')
        resources = result['resources']

        # GPU section: only inspected when at least one GPU was reported.
        assert 'gpu' in resources
        gpu_list = resources['gpu']
        if gpu_list:
            self._assert_has_keys(
                gpu_list[0], 'id', 'name', 'memory', 'utilization', 'temperature'
            )

        # CPU section.
        assert 'cpu' in resources
        cpu = resources['cpu']
        self._assert_has_keys(cpu, 'count', 'utilization', 'memory', 'swap')

        # Memory is nested inside the CPU section.
        memory = cpu['memory']
        self._assert_has_keys(memory, 'total', 'used', 'free', 'percent')
        assert memory['total'] > 0
        assert 0 <= memory['percent'] <= 100

        # Disk section: at least one mount point, each with sane numbers.
        assert 'disk' in resources
        assert len(resources['disk']) > 0
        for disk_info in resources['disk'].values():
            self._assert_has_keys(disk_info, 'total', 'used', 'free', 'percent')
            assert disk_info['total'] > 0
            assert 0 <= disk_info['percent'] <= 100

        # Process counters.
        assert 'processes' in resources
        processes = resources['processes']
        self._assert_has_keys(processes, 'total', 'running', 'sleeping')
        assert processes['total'] > 0
        assert processes['running'] >= 0
        assert processes['sleeping'] >= 0

    def test_error_handling(self, system_monitor, monkeypatch):
        """A failing GPU query must not prevent the other sections from being returned."""

        def raise_gpu_failure(*args, **kwargs):
            raise Exception("GPU query failed")

        # Simulate a GPU query error.
        monkeypatch.setattr(system_monitor, '_get_gpu_info', raise_gpu_failure)
        result = system_monitor.get_system_resources()

        # The call still succeeds and the GPU section degrades to an empty list.
        assert result['status'] == 'success'
        assert result['resources']['gpu'] == []

        # The remaining sections are still present.
        for section in ('cpu', 'disk', 'processes'):
            assert section in result['resources']