EG/plugins/user/realtime_communication/monitoring/communication_monitor.py
2025-12-12 16:16:15 +08:00

825 lines
31 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
通信监控模块
实时监控通信性能和记录日志
"""
import time
import threading
import json
import os
from typing import Dict, Any, List, Optional
from collections import deque
class CommunicationMonitor:
"""
通信监控器
实时监控通信性能和记录日志
"""
def __init__(self, plugin):
"""
初始化通信监控器
Args:
plugin: 实时通信插件实例
"""
self.plugin = plugin
self.enabled = False
self.initialized = False
# 监控配置
self.monitor_config = {
"enable_monitoring": True,
"monitoring_interval": 5.0,
"log_level": "INFO",
"enable_file_logging": True,
"log_file_path": "/home/hello/EG/plugins/user/realtime_communication/logs/communication_monitor.log",
"max_log_file_size": 10485760, # 10MB
"enable_performance_monitoring": True,
"enable_alerts": True,
"alert_thresholds": {
"cpu_usage": 80.0,
"memory_usage": 85.0,
"network_latency": 100.0,
"error_rate": 5.0,
"message_queue_size": 1000
},
"enable_metrics_collection": True,
"metrics_retention_period": 3600, # 1小时
"enable_realtime_dashboard": True
}
# 性能指标存储
self.performance_metrics = {
"cpu_usage": deque(maxlen=1000),
"memory_usage": deque(maxlen=1000),
"network_in": deque(maxlen=1000),
"network_out": deque(maxlen=1000),
"active_connections": deque(maxlen=1000),
"messages_per_second": deque(maxlen=1000),
"message_queue_size": deque(maxlen=1000),
"error_rate": deque(maxlen=1000)
}
# 监控状态
self.monitor_state = {
"last_monitoring_update": 0.0,
"last_performance_check": 0.0,
"total_logs": 0,
"total_alerts": 0,
"active_alerts": 0
}
# 警报系统
self.active_alerts = {}
self.alert_history = deque(maxlen=1000)
# 日志文件
self.log_file = None
self.log_file_size = 0
# 监控统计
self.monitor_stats = {
"metrics_collected": 0,
"logs_written": 0,
"alerts_generated": 0,
"alerts_resolved": 0,
"performance_checks": 0,
"monitor_errors": 0
}
# 线程锁
self.monitor_lock = threading.RLock()
# 回调函数
self.monitor_callbacks = {
"metric_collected": [],
"alert_triggered": [],
"alert_resolved": [],
"log_entry": [],
"performance_degraded": []
}
# 监控线程
self.monitor_thread = None
self.monitor_thread_running = False
# 时间戳记录
self.last_log_write = 0.0
self.last_metric_collection = 0.0
print("✓ 通信监控器已创建")
def initialize(self) -> bool:
"""
初始化通信监控器
Returns:
是否初始化成功
"""
try:
print("正在初始化通信监控器...")
# 创建日志目录
log_dir = os.path.dirname(self.monitor_config["log_file_path"])
if not os.path.exists(log_dir):
os.makedirs(log_dir)
# 初始化日志文件
if self.monitor_config["enable_file_logging"]:
self._initialize_log_file()
# 启动监控线程
self._start_monitor_thread()
self.initialized = True
print("✓ 通信监控器初始化完成")
return True
except Exception as e:
print(f"✗ 通信监控器初始化失败: {e}")
self.monitor_stats["monitor_errors"] += 1
import traceback
traceback.print_exc()
return False
def _initialize_log_file(self):
"""初始化日志文件"""
try:
# 检查文件是否存在
if os.path.exists(self.monitor_config["log_file_path"]):
self.log_file_size = os.path.getsize(self.monitor_config["log_file_path"])
# 打开日志文件
self.log_file = open(self.monitor_config["log_file_path"], "a", encoding="utf-8")
print(f"✓ 日志文件已初始化: {self.monitor_config['log_file_path']}")
except Exception as e:
print(f"✗ 日志文件初始化失败: {e}")
self.monitor_stats["monitor_errors"] += 1
def _start_monitor_thread(self):
"""启动监控线程"""
try:
self.monitor_thread_running = True
self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
self.monitor_thread.start()
print("✓ 监控线程已启动")
except Exception as e:
print(f"✗ 监控线程启动失败: {e}")
self.monitor_stats["monitor_errors"] += 1
def _monitor_loop(self):
"""监控线程循环"""
try:
while self.monitor_thread_running:
try:
if self.enabled and self.monitor_config["enable_monitoring"]:
current_time = time.time()
# 收集性能指标
if current_time - self.last_metric_collection >= self.monitor_config["monitoring_interval"]:
self._collect_performance_metrics()
self.last_metric_collection = current_time
# 检查警报
self._check_alerts()
# 检查日志文件大小
self._check_log_file_size()
# 短暂休眠
time.sleep(1.0)
except Exception as e:
print(f"✗ 监控循环错误: {e}")
self.monitor_stats["monitor_errors"] += 1
time.sleep(5.0) # 出错时延长休眠
except Exception as e:
print(f"✗ 监控线程失败: {e}")
self.monitor_stats["monitor_errors"] += 1
def enable(self) -> bool:
"""
启用通信监控器
Returns:
是否启用成功
"""
try:
if not self.initialized:
print("✗ 通信监控器未初始化")
return False
self.enabled = True
print("✓ 通信监控器已启用")
return True
except Exception as e:
print(f"✗ 通信监控器启用失败: {e}")
self.monitor_stats["monitor_errors"] += 1
import traceback
traceback.print_exc()
return False
def disable(self):
"""禁用通信监控器"""
try:
self.enabled = False
# 停止监控线程
if self.monitor_thread_running:
self.monitor_thread_running = False
if self.monitor_thread and self.monitor_thread.is_alive():
self.monitor_thread.join(timeout=5.0)
# 关闭日志文件
if self.log_file:
self.log_file.close()
self.log_file = None
print("✓ 通信监控器已禁用")
except Exception as e:
print(f"✗ 通信监控器禁用失败: {e}")
self.monitor_stats["monitor_errors"] += 1
import traceback
traceback.print_exc()
def finalize(self):
"""清理通信监控器资源"""
try:
# 禁用通信监控器
if self.enabled:
self.disable()
# 清理回调
self.monitor_callbacks.clear()
self.initialized = False
print("✓ 通信监控器资源已清理")
except Exception as e:
print(f"✗ 通信监控器资源清理失败: {e}")
import traceback
traceback.print_exc()
def update(self, dt: float):
"""
更新通信监控器状态
Args:
dt: 时间增量(秒)
"""
try:
if not self.enabled:
return
current_time = time.time()
self.monitor_state["last_monitoring_update"] = current_time
except Exception as e:
print(f"✗ 通信监控器更新失败: {e}")
self.monitor_stats["monitor_errors"] += 1
import traceback
traceback.print_exc()
def _collect_performance_metrics(self):
"""收集性能指标"""
try:
current_time = time.time()
metrics = {}
# 收集系统资源使用情况
try:
import psutil
# CPU使用率
cpu_percent = psutil.cpu_percent(interval=0.1)
metrics["cpu_usage"] = cpu_percent
self.performance_metrics["cpu_usage"].append((current_time, cpu_percent))
# 内存使用率
memory_info = psutil.virtual_memory()
memory_percent = memory_info.percent
metrics["memory_usage"] = memory_percent
self.performance_metrics["memory_usage"].append((current_time, memory_percent))
# 网络使用情况
net_io = psutil.net_io_counters()
metrics["network_in"] = net_io.bytes_recv
metrics["network_out"] = net_io.bytes_sent
self.performance_metrics["network_in"].append((current_time, net_io.bytes_recv))
self.performance_metrics["network_out"].append((current_time, net_io.bytes_sent))
except ImportError:
# 如果没有psutil使用简化的指标
metrics["cpu_usage"] = 0.0
metrics["memory_usage"] = 0.0
metrics["network_in"] = 0
metrics["network_out"] = 0
# 收集服务器指标
if self.plugin.websocket_server:
server_stats = self.plugin.websocket_server.get_server_stats()
server_state = server_stats.get("state", {})
# 活跃连接数
active_connections = server_state.get("current_connections", 0)
metrics["active_connections"] = active_connections
self.performance_metrics["active_connections"].append((current_time, active_connections))
# 数据传输量
bytes_sent = server_state.get("bytes_sent", 0)
bytes_received = server_state.get("bytes_received", 0)
metrics["bytes_sent"] = bytes_sent
metrics["bytes_received"] = bytes_received
# 收集客户端指标
if self.plugin.client_manager:
client_stats = self.plugin.client_manager.get_client_stats()
client_state = client_stats.get("state", {})
client_count = client_state.get("current_clients", 0)
metrics["client_count"] = client_count
# 收集消息指标
if self.plugin.message_router:
message_stats = self.plugin.message_router.get_message_stats()
message_state = message_stats.get("state", {})
message_stats_data = message_stats.get("stats", {})
# 消息队列大小
pending_messages = message_state.get("pending_messages", 0)
metrics["message_queue_size"] = pending_messages
self.performance_metrics["message_queue_size"].append((current_time, pending_messages))
# 消息处理速率
messages_routed = message_stats_data.get("messages_routed", 0)
if hasattr(self, '_last_messages_routed'):
mps = (messages_routed - self._last_messages_routed) / self.monitor_config["monitoring_interval"]
metrics["messages_per_second"] = max(0, mps)
self.performance_metrics["messages_per_second"].append((current_time, metrics["messages_per_second"]))
else:
metrics["messages_per_second"] = 0
self.performance_metrics["messages_per_second"].append((current_time, 0))
self._last_messages_routed = messages_routed
# 收集房间指标
if self.plugin.room_manager:
room_stats = self.plugin.room_manager.get_room_stats()
room_state = room_stats.get("state", {})
active_rooms = room_state.get("active_rooms", 0)
metrics["active_rooms"] = active_rooms
# 更新统计
self.monitor_stats["metrics_collected"] += 1
self.last_metric_collection = current_time
# 触发指标收集回调
self._trigger_monitor_callback("metric_collected", {
"metrics": metrics,
"timestamp": current_time
})
# 记录日志
if self._should_log("DEBUG"):
self._write_log("DEBUG", f"性能指标收集: {metrics}")
except Exception as e:
print(f"✗ 性能指标收集失败: {e}")
self.monitor_stats["monitor_errors"] += 1
def _check_alerts(self):
"""检查警报条件"""
try:
if not self.monitor_config["enable_alerts"]:
return
current_time = time.time()
thresholds = self.monitor_config["alert_thresholds"]
# 检查CPU使用率
if self.performance_metrics["cpu_usage"]:
latest_cpu = self.performance_metrics["cpu_usage"][-1][1]
if latest_cpu > thresholds["cpu_usage"]:
self._trigger_alert("high_cpu_usage", f"CPU使用率过高: {latest_cpu:.2f}%", current_time)
# 检查内存使用率
if self.performance_metrics["memory_usage"]:
latest_memory = self.performance_metrics["memory_usage"][-1][1]
if latest_memory > thresholds["memory_usage"]:
self._trigger_alert("high_memory_usage", f"内存使用率过高: {latest_memory:.2f}%", current_time)
# 检查消息队列大小
if self.performance_metrics["message_queue_size"]:
latest_queue_size = self.performance_metrics["message_queue_size"][-1][1]
if latest_queue_size > thresholds["message_queue_size"]:
self._trigger_alert("high_message_queue", f"消息队列过大: {latest_queue_size}", current_time)
# 检查错误率
if self.performance_metrics["error_rate"]:
latest_error_rate = self.performance_metrics["error_rate"][-1][1]
if latest_error_rate > thresholds["error_rate"]:
self._trigger_alert("high_error_rate", f"错误率过高: {latest_error_rate:.2f}%", current_time)
# 检查活跃连接数
if self.performance_metrics["active_connections"]:
latest_connections = self.performance_metrics["active_connections"][-1][1]
if self.plugin.websocket_server:
max_connections = self.plugin.websocket_server.websocket_config.get("max_connections", 1000)
if latest_connections > max_connections * 0.9: # 90%阈值
self._trigger_alert("high_connections", f"连接数接近上限: {latest_connections}/{max_connections}", current_time)
except Exception as e:
print(f"✗ 警报检查失败: {e}")
self.monitor_stats["monitor_errors"] += 1
def _trigger_alert(self, alert_type: str, message: str, timestamp: float = None):
"""
触发警报
Args:
alert_type: 警报类型
message: 警报消息
timestamp: 时间戳
"""
try:
if timestamp is None:
timestamp = time.time()
alert_id = f"{alert_type}_{int(timestamp)}"
# 检查是否已存在相同类型的活动警报
with self.monitor_lock:
if alert_type in self.active_alerts:
# 更新现有警报
self.active_alerts[alert_type]["count"] += 1
self.active_alerts[alert_type]["last_triggered"] = timestamp
else:
# 创建新警报
self.active_alerts[alert_type] = {
"id": alert_id,
"type": alert_type,
"message": message,
"first_triggered": timestamp,
"last_triggered": timestamp,
"count": 1,
"resolved": False
}
self.monitor_state["active_alerts"] += 1
self.monitor_stats["alerts_generated"] += 1
# 添加到警报历史
self.alert_history.append({
"id": alert_id,
"type": alert_type,
"message": message,
"timestamp": timestamp,
"resolved": False
})
# 触发警报回调
self._trigger_monitor_callback("alert_triggered", {
"alert_id": alert_id,
"alert_type": alert_type,
"message": message,
"timestamp": timestamp
})
# 记录日志
self._write_log("WARNING", f"警报触发 [{alert_type}]: {message}")
except Exception as e:
print(f"✗ 警报触发失败: {e}")
self.monitor_stats["monitor_errors"] += 1
def resolve_alert(self, alert_type: str):
"""
解决警报
Args:
alert_type: 警报类型
"""
try:
with self.monitor_lock:
if alert_type in self.active_alerts:
alert_data = self.active_alerts[alert_type]
alert_data["resolved"] = True
alert_data["resolved_time"] = time.time()
self.monitor_state["active_alerts"] -= 1
self.monitor_stats["alerts_resolved"] += 1
# 触发警报解决回调
self._trigger_monitor_callback("alert_resolved", {
"alert_type": alert_type,
"alert_data": alert_data
})
# 从活动警报中移除
del self.active_alerts[alert_type]
# 记录日志
self._write_log("INFO", f"警报已解决: {alert_type}")
except Exception as e:
print(f"✗ 警报解决失败: {e}")
self.monitor_stats["monitor_errors"] += 1
def _check_log_file_size(self):
"""检查日志文件大小"""
try:
if not self.monitor_config["enable_file_logging"] or not self.log_file:
return
# 检查文件大小
if os.path.exists(self.monitor_config["log_file_path"]):
current_size = os.path.getsize(self.monitor_config["log_file_path"])
if current_size > self.monitor_config["max_log_file_size"]:
# 轮转日志文件
self._rotate_log_file()
except Exception as e:
print(f"✗ 日志文件大小检查失败: {e}")
self.monitor_stats["monitor_errors"] += 1
def _rotate_log_file(self):
"""轮转日志文件"""
try:
if self.log_file:
self.log_file.close()
# 重命名当前日志文件
timestamp = int(time.time())
rotated_filename = f"{self.monitor_config['log_file_path']}.{timestamp}"
os.rename(self.monitor_config["log_file_path"], rotated_filename)
# 重新初始化日志文件
self._initialize_log_file()
print(f"✓ 日志文件已轮转: {rotated_filename}")
except Exception as e:
print(f"✗ 日志文件轮转失败: {e}")
self.monitor_stats["monitor_errors"] += 1
def _write_log(self, level: str, message: str, extra_data: Dict[str, Any] = None):
"""
写入日志
Args:
level: 日志级别
message: 日志消息
extra_data: 额外数据
"""
try:
if not self._should_log(level):
return
timestamp = time.time()
log_entry = {
"timestamp": timestamp,
"level": level,
"message": message,
"extra_data": extra_data
}
# 控制台输出
print(f"[{level}] {message}")
# 文件日志
if self.monitor_config["enable_file_logging"] and self.log_file:
try:
log_line = json.dumps(log_entry, ensure_ascii=False)
self.log_file.write(log_line + "\n")
self.log_file.flush()
self.log_file_size += len(log_line) + 1
self.monitor_stats["logs_written"] += 1
self.monitor_state["total_logs"] += 1
except Exception as e:
print(f"✗ 文件日志写入失败: {e}")
# 触发日志条目回调
self._trigger_monitor_callback("log_entry", log_entry)
self.last_log_write = timestamp
except Exception as e:
print(f"✗ 日志写入失败: {e}")
self.monitor_stats["monitor_errors"] += 1
def _should_log(self, level: str) -> bool:
"""
检查是否应该记录指定级别的日志
Args:
level: 日志级别
Returns:
是否应该记录日志
"""
try:
log_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
current_level = self.monitor_config["log_level"]
if current_level not in log_levels:
return True # 默认记录所有日志
if level not in log_levels:
return True # 默认记录未知级别的日志
current_index = log_levels.index(current_level)
level_index = log_levels.index(level)
# 只记录等于或高于当前级别的日志
return level_index >= current_index
except Exception as e:
print(f"✗ 日志级别检查失败: {e}")
return True
def get_performance_metrics(self, metric_name: str = None, limit: int = 100) -> Any:
"""
获取性能指标
Args:
metric_name: 指标名称(可选)
limit: 限制返回的数量
Returns:
性能指标数据
"""
try:
with self.monitor_lock:
if metric_name:
if metric_name in self.performance_metrics:
metrics = list(self.performance_metrics[metric_name])
return metrics[-limit:] if len(metrics) > limit else metrics
else:
return []
else:
# 返回所有指标
result = {}
for name, metrics in self.performance_metrics.items():
metric_list = list(metrics)
result[name] = metric_list[-limit:] if len(metric_list) > limit else metric_list
return result
except Exception as e:
print(f"✗ 获取性能指标失败: {e}")
self.monitor_stats["monitor_errors"] += 1
return {}
def get_active_alerts(self) -> Dict[str, Any]:
"""
获取活动警报
Returns:
活动警报字典
"""
try:
with self.monitor_lock:
return self.active_alerts.copy()
except Exception as e:
print(f"✗ 获取活动警报失败: {e}")
self.monitor_stats["monitor_errors"] += 1
return {}
def get_alert_history(self, limit: int = 100) -> List[Dict[str, Any]]:
"""
获取警报历史
Args:
limit: 限制返回的数量
Returns:
警报历史列表
"""
try:
with self.monitor_lock:
history = list(self.alert_history)
return history[-limit:] if len(history) > limit else history
except Exception as e:
print(f"✗ 获取警报历史失败: {e}")
self.monitor_stats["monitor_errors"] += 1
return []
def get_monitor_stats(self) -> Dict[str, Any]:
"""
获取监控统计信息
Returns:
监控统计字典
"""
return {
"state": self.monitor_state.copy(),
"stats": self.monitor_stats.copy(),
"config": self.monitor_config.copy(),
"active_alerts_count": len(self.active_alerts),
"alert_history_count": len(self.alert_history)
}
def reset_stats(self):
"""重置监控统计信息"""
try:
self.monitor_stats = {
"metrics_collected": 0,
"logs_written": 0,
"alerts_generated": 0,
"alerts_resolved": 0,
"performance_checks": 0,
"monitor_errors": 0
}
self.monitor_state["total_logs"] = 0
self.monitor_state["total_alerts"] = 0
self.monitor_state["active_alerts"] = 0
print("✓ 监控统计信息已重置")
except Exception as e:
print(f"✗ 监控统计信息重置失败: {e}")
def set_monitor_config(self, config: Dict[str, Any]) -> bool:
"""
设置监控配置
Args:
config: 监控配置字典
Returns:
是否设置成功
"""
try:
self.monitor_config.update(config)
print(f"✓ 监控配置已更新: {self.monitor_config}")
return True
except Exception as e:
print(f"✗ 监控配置设置失败: {e}")
return False
def get_monitor_config(self) -> Dict[str, Any]:
"""
获取监控配置
Returns:
监控配置字典
"""
return self.monitor_config.copy()
def _trigger_monitor_callback(self, callback_type: str, data: Dict[str, Any]):
"""
触发监控回调
Args:
callback_type: 回调类型
data: 回调数据
"""
try:
if callback_type in self.monitor_callbacks:
for callback in self.monitor_callbacks[callback_type]:
try:
callback(data)
except Exception as e:
print(f"✗ 监控回调执行失败: {callback_type} - {e}")
except Exception as e:
print(f"✗ 监控回调触发失败: {e}")
def register_monitor_callback(self, callback_type: str, callback: callable):
"""
注册监控回调
Args:
callback_type: 回调类型
callback: 回调函数
"""
try:
if callback_type in self.monitor_callbacks:
self.monitor_callbacks[callback_type].append(callback)
print(f"✓ 监控回调已注册: {callback_type}")
else:
print(f"✗ 无效的回调类型: {callback_type}")
except Exception as e:
print(f"✗ 监控回调注册失败: {e}")
def unregister_monitor_callback(self, callback_type: str, callback: callable):
"""
注销监控回调
Args:
callback_type: 回调类型
callback: 回调函数
"""
try:
if callback_type in self.monitor_callbacks:
if callback in self.monitor_callbacks[callback_type]:
self.monitor_callbacks[callback_type].remove(callback)
print(f"✓ 监控回调已注销: {callback_type}")
else:
print(f"✗ 无效的回调类型: {callback_type}")
except Exception as e:
print(f"✗ 监控回调注销失败: {e}")