# MLPlatform/function/system_monitor.py
#
# 208 lines
# 7.4 KiB
# Python

import psutil
import GPUtil
import platform
from datetime import datetime
from typing import Dict, List, Optional
import logging
from pathlib import Path
import time
class SystemMonitor:
    """Collect point-in-time snapshots of host resource usage.

    Readings come from ``psutil`` (CPU, memory, swap, disks, processes) and
    ``GPUtil`` (GPUs). Every collector is best-effort: a failure is logged and
    replaced by an empty container, so one unavailable subsystem does not
    prevent the rest of the snapshot from being returned.
    """

    # All sizes in the payload are reported in MB (bytes // 1024 // 1024).
    _BYTES_PER_MB = 1024 * 1024
    # Sensor group looked up in psutil.sensors_temperatures() for the CPU.
    _CPU_TEMP_SENSOR = 'coretemp'
    # Attribute set on the file handler installed by this class so that later
    # instances can recognise it and avoid attaching a second one.
    _HANDLER_MARKER = '_system_monitor_file_handler'

    def __init__(self):
        """Bind the module logger and make sure its file handler exists."""
        self.logger = logging.getLogger(__name__)
        self._setup_logging()

    def _setup_logging(self):
        """Attach a timestamped file handler under ``.log`` to the logger.

        The logger is shared by every instance (it is looked up by module
        name), so the handler is installed only once; otherwise each new
        ``SystemMonitor`` would duplicate every log line and leak an open
        file. The file is opened as UTF-8 because the log messages are not
        ASCII.
        """
        self.logger.setLevel(logging.INFO)

        already_installed = any(
            getattr(handler, self._HANDLER_MARKER, False)
            for handler in self.logger.handlers
        )
        if already_installed:
            return

        log_dir = Path('.log')
        log_dir.mkdir(parents=True, exist_ok=True)
        file_handler = logging.FileHandler(
            log_dir / f'system_monitor_{datetime.now():%Y%m%d_%H%M%S}.log',
            encoding='utf-8',
        )
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        setattr(file_handler, self._HANDLER_MARKER, True)
        self.logger.addHandler(file_handler)

    @classmethod
    def _summarize_usage(cls, usage) -> Dict:
        """Convert a usage record with total/used/free/percent into MB.

        Args:
            usage: Object exposing ``total``, ``used`` and ``free`` in bytes
                and ``percent`` (as returned by psutil's memory, swap and
                disk usage calls).

        Returns:
            Dict with ``total``, ``used``, ``free`` in whole MB and the
            unchanged ``percent``.
        """
        return {
            'total': usage.total // cls._BYTES_PER_MB,  # MB
            'used': usage.used // cls._BYTES_PER_MB,  # MB
            'free': usage.free // cls._BYTES_PER_MB,  # MB
            'percent': usage.percent,  # %
        }

    @staticmethod
    def _frequency_in_ghz(freq) -> Dict:
        """Convert a CPU frequency record from MHz to GHz.

        Args:
            freq: Object exposing ``current``, ``min`` and ``max``, or
                ``None`` when the frequency could not be read.

        Returns:
            Dict with ``current``, ``min`` and ``max``; every value is
            ``None`` when ``freq`` is ``None``.
        """
        if freq is None:
            return {'current': None, 'min': None, 'max': None}
        return {
            'current': freq.current / 1000,  # GHz
            'min': freq.min / 1000,  # GHz
            'max': freq.max / 1000,  # GHz
        }

    @classmethod
    def _pick_cpu_temperature(cls, readings: Dict) -> Optional[float]:
        """Return the first CPU sensor reading, or ``None`` if there is none.

        Args:
            readings: Mapping of sensor group name to a list of entries that
                expose ``current``.

        Returns:
            The ``current`` value of the first entry in the CPU sensor group,
            or ``None`` when the group is missing or empty.
        """
        entries = readings.get(cls._CPU_TEMP_SENSOR) or []
        first = entries[0] if entries else None
        return first.current if first else None

    def _read_cpu_frequency(self) -> Dict:
        """Read the CPU frequency, tolerating platforms that cannot report it."""
        try:
            freq = psutil.cpu_freq()
        except (AttributeError, NotImplementedError, OSError):
            # Frequency is optional information; keep the rest of the CPU
            # payload instead of failing the whole collector.
            freq = None
        return self._frequency_in_ghz(freq)

    def _read_cpu_temperature(self) -> Optional[float]:
        """Read the CPU temperature in °C, or ``None`` when unavailable."""
        # psutil only defines sensors_temperatures() on some platforms;
        # calling it unconditionally would raise AttributeError elsewhere.
        read_sensors = getattr(psutil, 'sensors_temperatures', None)
        if read_sensors is None:
            return None
        try:
            readings = read_sensors()
        except (NotImplementedError, OSError):
            return None
        return self._pick_cpu_temperature(readings)

    @staticmethod
    def _describe_gpu(gpu) -> Dict:
        """Build the payload entry for a single GPUtil GPU object."""
        return {
            'id': gpu.id,
            'name': gpu.name,
            'memory': {
                'total': gpu.memoryTotal,  # MB
                'used': gpu.memoryUsed,  # MB
                'free': gpu.memoryFree,  # MB
            },
            'utilization': {
                'gpu': gpu.load * 100,  # %
                'memory': gpu.memoryUtil * 100,  # %
            },
            'temperature': gpu.temperature,  # °C
            'power': {
                # Reported only when the GPU object exposes these attributes.
                'draw': getattr(gpu, 'powerDraw', None),  # W
                'limit': getattr(gpu, 'powerLimit', None),  # W
            },
            # Per-process GPU usage is not collected; the key is kept so the
            # payload shape stays stable for consumers.
            'processes': [],
        }

    def _get_gpu_info(self) -> List[Dict]:
        """Return one entry per GPU reported by GPUtil.

        Returns:
            List of per-GPU dicts (id, name, memory, utilization,
            temperature, power, processes); an empty list on failure.
        """
        try:
            return [self._describe_gpu(gpu) for gpu in GPUtil.getGPUs()]
        except Exception as exc:
            self.logger.exception("获取GPU信息失败: %s", exc)
            return []

    def _get_cpu_info(self) -> Dict:
        """Return CPU counts, utilization, frequency and temperature.

        The memory and swap summaries are nested under this payload as
        ``memory`` and ``swap``. Measuring utilization blocks for one second
        (``cpu_percent(interval=1)``).

        Returns:
            Dict describing the CPU; an empty dict on failure.
        """
        try:
            return {
                'count': {
                    'physical': psutil.cpu_count(logical=False),
                    'logical': psutil.cpu_count(logical=True),
                },
                'utilization': psutil.cpu_percent(interval=1),  # %
                'frequency': self._read_cpu_frequency(),
                'temperature': self._read_cpu_temperature(),  # °C
                'memory': self._get_memory_info(),
                'swap': self._get_swap_info(),
            }
        except Exception as exc:
            self.logger.exception("获取CPU信息失败: %s", exc)
            return {}

    def _get_memory_info(self) -> Dict:
        """Return physical memory usage in MB.

        Returns:
            Dict with ``total``, ``used``, ``free``, ``percent`` and
            ``available``; an empty dict on failure.
        """
        try:
            mem = psutil.virtual_memory()
            info = self._summarize_usage(mem)
            # 'free' excludes reclaimable caches; 'available' is the amount
            # that can actually be handed to processes.
            info['available'] = mem.available // self._BYTES_PER_MB  # MB
            return info
        except Exception as exc:
            self.logger.exception("获取内存信息失败: %s", exc)
            return {}

    def _get_swap_info(self) -> Dict:
        """Return swap usage in MB.

        Returns:
            Dict with ``total``, ``used``, ``free`` and ``percent``; an empty
            dict on failure.
        """
        try:
            return self._summarize_usage(psutil.swap_memory())
        except Exception as exc:
            self.logger.exception("获取交换内存信息失败: %s", exc)
            return {}

    def _get_disk_info(self) -> Dict:
        """Return usage in MB for every mounted partition, keyed by mountpoint.

        Partitions whose usage cannot be read (for example because of missing
        permissions or an unready device) are skipped.

        Returns:
            Dict mapping mountpoint to its usage summary; an empty dict on
            failure.
        """
        try:
            disk_info = {}
            for partition in psutil.disk_partitions():
                try:
                    usage = psutil.disk_usage(partition.mountpoint)
                except OSError:  # includes PermissionError
                    continue
                disk_info[partition.mountpoint] = self._summarize_usage(usage)
            return disk_info
        except Exception as exc:
            self.logger.exception("获取磁盘信息失败: %s", exc)
            return {}

    def _get_process_info(self) -> Dict:
        """Count processes in total and by running/sleeping status.

        All counters come from a single pass over ``psutil.process_iter`` so
        that ``total`` and the per-status counts describe the same snapshot.

        Returns:
            Dict with ``total``, ``running`` and ``sleeping``; an empty dict
            on failure.
        """
        try:
            counts = {'total': 0, 'running': 0, 'sleeping': 0}
            for proc in psutil.process_iter(['status']):
                counts['total'] += 1
                # The prefetched status may be None when access was denied.
                status = proc.info.get('status')
                if status in ('running', 'sleeping'):
                    counts[status] += 1
            return counts
        except Exception as exc:
            self.logger.exception("获取进程信息失败: %s", exc)
            return {}

    def get_system_resources(self) -> Dict:
        """Collect a full snapshot of GPU, CPU, disk and process usage.

        Returns:
            On success, a dict with ``status`` set to ``'success'``, the
            collected ``resources`` and a ``timestamp``. On failure, a dict
            with ``status`` set to ``'error'``, a ``message``, the error
            ``details`` and a ``timestamp``.
        """
        try:
            resources = {
                'gpu': self._get_gpu_info(),
                'cpu': self._get_cpu_info(),
                'disk': self._get_disk_info(),
                'processes': self._get_process_info(),
            }
            self.logger.info("成功获取系统资源信息")
            return {
                'status': 'success',
                'resources': resources,
                'timestamp': datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
            }
        except Exception as exc:
            self.logger.exception("获取系统资源信息失败: %s", exc)
            return {
                'status': 'error',
                'message': '获取资源信息失败',
                'details': {
                    'error_type': type(exc).__name__,
                    'error_message': str(exc),
                },
                'timestamp': datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
            }