EG/plugins/user/matchmaking_system/monitoring/match_monitor.py
2025-10-30 15:01:29 +08:00

600 lines
21 KiB
Python

"""
匹配监控系统模块
监控匹配过程和性能
"""
import copy
import time
import traceback
from collections import deque
from typing import Any, Callable, Dict, List, Optional
class MatchMonitor:
    """Monitor for the matchmaking process and its performance.

    The monitor periodically pulls statistics from the managers exposed by the
    owning plugin (``match_manager``, ``queue_manager``, ``algorithm_manager``,
    ``room_allocator``), keeps bounded time series of samples, raises alerts
    when the latest sample crosses a configured threshold, and notifies
    registered callbacks.

    Lifecycle: ``initialize()`` -> ``enable()`` -> ``update(dt)`` repeatedly ->
    ``disable()`` / ``finalize()``.
    """

    def __init__(self, plugin):
        """Create the monitor in a disabled, uninitialized state.

        Args:
            plugin: The matchmaking plugin instance whose managers are polled.
        """
        self.plugin = plugin
        self.enabled = False
        self.initialized = False

        # Monitoring configuration.
        self.monitor_config = {
            "enable_monitoring": True,
            "monitoring_interval": 5.0,
            "log_level": "INFO",
            "enable_performance_monitoring": True,
            "enable_alerts": True,
            "alert_thresholds": {
                "match_queue_time": 60.0,
                "match_failure_rate": 10.0,
                "system_latency": 100.0,
            },
            "enable_metrics_collection": True,
            "metrics_retention_period": 3600,  # one hour
        }

        # Bounded time series; every sample is a (timestamp, value) tuple.
        self.performance_metrics = {
            "match_queue_times": deque(maxlen=1000),
            "match_success_rates": deque(maxlen=1000),
            "system_latencies": deque(maxlen=1000),
            "player_wait_times": deque(maxlen=1000),
            "algorithm_performance": deque(maxlen=1000),
        }

        # Monitoring state.
        self.monitor_state = {
            "last_monitoring_update": 0.0,
            "last_performance_check": 0.0,
            "total_monitored_matches": 0,
            "active_alerts": 0,
        }

        # Alerts: active ones are keyed by alert type; history is bounded.
        self.active_alerts = {}
        self.alert_history = deque(maxlen=1000)

        # Monitoring counters.
        self.monitor_stats = {
            "metrics_collected": 0,
            "alerts_generated": 0,
            "alerts_resolved": 0,
            "performance_checks": 0,
            "monitor_errors": 0,
        }

        # Registered callbacks per event type.
        self.monitor_callbacks = {
            "metric_collected": [],
            "alert_triggered": [],
            "alert_resolved": [],
            "performance_degraded": [],
        }

        # Wall-clock time of the last periodic runs; 0.0 makes the first
        # update() run both immediately.
        self.last_metric_collection = 0.0
        self.last_alert_check = 0.0

        print("✓ 匹配监控器已创建")

    def _report_failure(self, message: str, *, count: bool = True,
                        trace: bool = False) -> None:
        """Print a failure message and optionally count / trace it.

        Must be called from inside an ``except`` block when ``trace`` is true,
        because ``traceback.print_exc()`` prints the exception being handled.

        Args:
            message: Fully formatted message to print.
            count: Whether to increment ``monitor_stats["monitor_errors"]``.
            trace: Whether to print the current exception's traceback.
        """
        print(message)
        if count:
            self.monitor_stats["monitor_errors"] += 1
        if trace:
            traceback.print_exc()

    def initialize(self) -> bool:
        """Mark the monitor as initialized.

        Returns:
            True on success, False if an exception occurred.
        """
        try:
            print("正在初始化匹配监控器...")
            self.initialized = True
            print("✓ 匹配监控器初始化完成")
            return True
        except Exception as e:
            self._report_failure(f"✗ 匹配监控器初始化失败: {e}", trace=True)
            return False

    def enable(self) -> bool:
        """Enable the monitor; requires a prior successful ``initialize()``.

        Returns:
            True if the monitor is now enabled, False otherwise.
        """
        try:
            if not self.initialized:
                print("✗ 匹配监控器未初始化")
                return False
            self.enabled = True
            print("✓ 匹配监控器已启用")
            return True
        except Exception as e:
            self._report_failure(f"✗ 匹配监控器启用失败: {e}", trace=True)
            return False

    def disable(self):
        """Disable the monitor so that ``update()`` becomes a no-op."""
        try:
            self.enabled = False
            print("✓ 匹配监控器已禁用")
        except Exception as e:
            self._report_failure(f"✗ 匹配监控器禁用失败: {e}", trace=True)

    def finalize(self):
        """Release monitor resources: disable, drop callbacks, un-initialize."""
        try:
            if self.enabled:
                self.disable()
            # Empty each list instead of clearing the dict: the keys define
            # the valid callback types and must survive a re-initialization.
            for callbacks in self.monitor_callbacks.values():
                callbacks.clear()
            self.initialized = False
            print("✓ 匹配监控器资源已清理")
        except Exception as e:
            self._report_failure(f"✗ 匹配监控器资源清理失败: {e}",
                                 count=False, trace=True)

    def update(self, dt: float):
        """Run the periodic metric collection and alert check when due.

        Args:
            dt: Time delta in seconds (not used by this method; scheduling is
                based on wall-clock time and ``monitoring_interval``).
        """
        try:
            if not self.enabled:
                return
            current_time = time.time()
            self.monitor_state["last_monitoring_update"] = current_time
            interval = self.monitor_config["monitoring_interval"]

            if current_time - self.last_metric_collection >= interval:
                self._collect_performance_metrics()
                self.last_metric_collection = current_time

            if current_time - self.last_alert_check >= interval:
                self._check_alerts()
                self.last_alert_check = current_time
        except Exception as e:
            self._report_failure(f"✗ 匹配监控器更新失败: {e}", trace=True)

    def _manager_state(self, attr: str, getter: str) -> Optional[Dict[str, Any]]:
        """Return the ``"state"`` dict reported by one plugin manager.

        Args:
            attr: Name of the manager attribute on the plugin.
            getter: Name of the manager's statistics method.

        Returns:
            The ``"state"`` mapping from the manager's statistics (``{}`` if the
            key is absent), or None when the plugin has no such manager.
        """
        # getattr with a default: a missing manager skips only that section
        # instead of aborting the whole collection.
        manager = getattr(self.plugin, attr, None)
        if not manager:
            return None
        return getattr(manager, getter)().get("state", {})

    def _collect_match_metrics(self, metrics: Dict[str, Any], now: float) -> None:
        """Fill match counters and record the success rate sample."""
        state = self._manager_state("match_manager", "get_match_stats")
        if state is None:
            return
        metrics["active_matches"] = state.get("active_matches", 0)
        metrics["successful_matches"] = state.get("successful_matches", 0)
        metrics["failed_matches"] = state.get("failed_matches", 0)
        total = metrics["successful_matches"] + metrics["failed_matches"]
        if total > 0:
            success_rate = (metrics["successful_matches"] / total) * 100
            metrics["match_success_rate"] = success_rate
            self.performance_metrics["match_success_rates"].append((now, success_rate))

    def _collect_queue_metrics(self, metrics: Dict[str, Any]) -> None:
        """Fill queue counters and the queue timeout rate."""
        state = self._manager_state("queue_manager", "get_queue_stats")
        if state is None:
            return
        metrics["total_players"] = state.get("total_players", 0)
        metrics["players_matched"] = state.get("players_matched", 0)
        metrics["players_timed_out"] = state.get("players_timed_out", 0)
        finished = metrics["players_matched"] + metrics["players_timed_out"]
        if finished > 0:
            metrics["queue_timeout_rate"] = (metrics["players_timed_out"] / finished) * 100

    def _collect_algorithm_metrics(self, metrics: Dict[str, Any], now: float) -> None:
        """Fill algorithm counters and record the match-quality sample."""
        state = self._manager_state("algorithm_manager", "get_algorithm_stats")
        if state is None:
            return
        metrics["total_algorithms"] = state.get("total_matches", 0)
        metrics["successful_algorithms"] = state.get("successful_matches", 0)
        metrics["average_match_quality"] = state.get("average_match_quality", 0)
        self.performance_metrics["algorithm_performance"].append(
            (now, metrics["average_match_quality"])
        )

    def _collect_room_metrics(self, metrics: Dict[str, Any]) -> None:
        """Fill room counters."""
        state = self._manager_state("room_allocator", "get_room_stats")
        if state is None:
            return
        metrics["active_rooms"] = state.get("active_rooms", 0)
        metrics["total_rooms"] = state.get("total_rooms", 0)

    def _collect_performance_metrics(self):
        """Poll every available plugin manager and publish one metrics snapshot.

        Fires the ``"metric_collected"`` callbacks with the snapshot and its
        timestamp. Any exception is reported and counted, not propagated.
        """
        try:
            current_time = time.time()
            metrics: Dict[str, Any] = {}
            self._collect_match_metrics(metrics, current_time)
            self._collect_queue_metrics(metrics)
            self._collect_algorithm_metrics(metrics, current_time)
            self._collect_room_metrics(metrics)

            self.monitor_stats["metrics_collected"] += 1
            self.monitor_state["total_monitored_matches"] += 1

            self._trigger_monitor_callback("metric_collected", {
                "metrics": metrics,
                "timestamp": current_time,
            })
        except Exception as e:
            self._report_failure(f"✗ 性能指标收集失败: {e}")

    def _latest_metric(self, metric_name: str) -> Optional[Any]:
        """Return the value of the newest sample of a series, or None if empty."""
        samples = self.performance_metrics[metric_name]
        return samples[-1][1] if samples else None

    def _check_alerts(self):
        """Compare the newest samples against the configured thresholds.

        Does nothing when ``enable_alerts`` is false. Exceptions are reported
        and counted, not propagated.
        """
        try:
            if not self.monitor_config["enable_alerts"]:
                return
            current_time = time.time()
            thresholds = self.monitor_config["alert_thresholds"]

            queue_time = self._latest_metric("match_queue_times")
            if queue_time is not None and queue_time > thresholds["match_queue_time"]:
                self._trigger_alert("high_queue_time",
                                    f"匹配队列时间过长: {queue_time:.2f}", current_time)

            success_rate = self._latest_metric("match_success_rates")
            if success_rate is not None:
                failure_rate = 100 - success_rate
                if failure_rate > thresholds["match_failure_rate"]:
                    self._trigger_alert("high_failure_rate",
                                        f"匹配失败率过高: {failure_rate:.2f}%", current_time)

            latency = self._latest_metric("system_latencies")
            if latency is not None and latency > thresholds["system_latency"]:
                self._trigger_alert("high_latency",
                                    f"系统延迟过高: {latency:.2f}ms", current_time)
        except Exception as e:
            self._report_failure(f"✗ 警报检查失败: {e}")

    def _trigger_alert(self, alert_type: str, message: str,
                       timestamp: Optional[float] = None):
        """Raise a new alert or refresh the active alert of the same type.

        Every call appends a history entry, increments ``alerts_generated`` and
        fires the ``"alert_triggered"`` callbacks.

        Args:
            alert_type: Alert type; also the key in ``active_alerts``.
            message: Human-readable alert message.
            timestamp: Trigger time; defaults to the current time.
        """
        try:
            if timestamp is None:
                timestamp = time.time()
            alert_id = f"{alert_type}_{int(timestamp)}"

            existing = self.active_alerts.get(alert_type)
            if existing is not None:
                existing["count"] += 1
                existing["last_triggered"] = timestamp
                # Keep the stored message current; it embeds the measured value.
                existing["message"] = message
            else:
                self.active_alerts[alert_type] = {
                    "id": alert_id,
                    "type": alert_type,
                    "message": message,
                    "first_triggered": timestamp,
                    "last_triggered": timestamp,
                    "count": 1,
                    "resolved": False,
                }
                # Derive the gauge from the dict so the two cannot drift apart.
                self.monitor_state["active_alerts"] = len(self.active_alerts)

            self.monitor_stats["alerts_generated"] += 1

            self.alert_history.append({
                "id": alert_id,
                "type": alert_type,
                "message": message,
                "timestamp": timestamp,
                "resolved": False,
            })

            self._trigger_monitor_callback("alert_triggered", {
                "alert_id": alert_id,
                "alert_type": alert_type,
                "message": message,
                "timestamp": timestamp,
            })
            print(f"⚠️ 警报触发 [{alert_type}]: {message}")
        except Exception as e:
            self._report_failure(f"✗ 警报触发失败: {e}")

    def resolve_alert(self, alert_type: str):
        """Resolve and remove the active alert of the given type, if any.

        Fires the ``"alert_resolved"`` callbacks before the alert is removed
        from ``active_alerts``.

        Args:
            alert_type: Alert type to resolve.
        """
        try:
            alert_data = self.active_alerts.get(alert_type)
            if alert_data is None:
                return
            alert_data["resolved"] = True
            alert_data["resolved_time"] = time.time()
            # The alert is still in the dict here, hence the "- 1".
            self.monitor_state["active_alerts"] = len(self.active_alerts) - 1
            self.monitor_stats["alerts_resolved"] += 1

            self._trigger_monitor_callback("alert_resolved", {
                "alert_type": alert_type,
                "alert_data": alert_data,
            })

            del self.active_alerts[alert_type]
            print(f"✅ 警报已解决: {alert_type}")
        except Exception as e:
            self._report_failure(f"✗ 警报解决失败: {e}")

    def record_match_queue_time(self, queue_time: float):
        """Record a match queue time sample.

        Args:
            queue_time: Queue time in seconds.
        """
        try:
            self.performance_metrics["match_queue_times"].append((time.time(), queue_time))
        except Exception as e:
            self._report_failure(f"✗ 匹配队列时间记录失败: {e}")

    def record_player_wait_time(self, wait_time: float):
        """Record a player wait time sample.

        Args:
            wait_time: Wait time in seconds.
        """
        try:
            self.performance_metrics["player_wait_times"].append((time.time(), wait_time))
        except Exception as e:
            self._report_failure(f"✗ 玩家等待时间记录失败: {e}")

    def record_system_latency(self, latency: float):
        """Record a system latency sample.

        Args:
            latency: Latency in milliseconds.
        """
        try:
            self.performance_metrics["system_latencies"].append((time.time(), latency))
        except Exception as e:
            self._report_failure(f"✗ 系统延迟记录失败: {e}")

    @staticmethod
    def _tail(items, limit: int) -> list:
        """Return the last ``limit`` items as a list; empty for ``limit <= 0``.

        The explicit guard matters because ``seq[-0:]`` is the whole sequence.
        """
        if limit <= 0:
            return []
        return list(items)[-limit:]

    def get_performance_metrics(self, metric_name: Optional[str] = None,
                                limit: int = 100) -> Any:
        """Return recorded samples, newest last.

        Args:
            metric_name: Series name; when omitted (or empty) all series are
                returned.
            limit: Maximum number of samples per series; values <= 0 yield no
                samples.

        Returns:
            A list of ``(timestamp, value)`` tuples for a named series (empty
            for an unknown name), or a dict mapping every series name to such
            a list.
        """
        try:
            if metric_name:
                if metric_name not in self.performance_metrics:
                    return []
                return self._tail(self.performance_metrics[metric_name], limit)
            return {
                name: self._tail(samples, limit)
                for name, samples in self.performance_metrics.items()
            }
        except Exception as e:
            self._report_failure(f"✗ 获取性能指标失败: {e}")
            return [] if metric_name else {}

    def get_active_alerts(self) -> Dict[str, Any]:
        """Return an independent copy of the active alerts, keyed by type.

        Returns:
            Deep copy of ``active_alerts``; ``{}`` on error.
        """
        try:
            # Deep copy so callers cannot mutate the nested alert records.
            return copy.deepcopy(self.active_alerts)
        except Exception as e:
            self._report_failure(f"✗ 获取活动警报失败: {e}")
            return {}

    def get_alert_history(self, limit: int = 100) -> List[Dict[str, Any]]:
        """Return the most recent alert history entries, newest last.

        Args:
            limit: Maximum number of entries; values <= 0 yield an empty list.

        Returns:
            List of history entries; ``[]`` on error.
        """
        try:
            return self._tail(self.alert_history, limit)
        except Exception as e:
            self._report_failure(f"✗ 获取警报历史失败: {e}")
            return []

    def get_monitor_stats(self) -> Dict[str, Any]:
        """Return a snapshot of state, counters, configuration and alert counts.

        Returns:
            Dict with keys ``state``, ``stats``, ``config``,
            ``active_alerts_count`` and ``alert_history_count``.
        """
        return {
            "state": self.monitor_state.copy(),
            "stats": self.monitor_stats.copy(),
            # Deep copy: the config contains the nested thresholds dict.
            "config": copy.deepcopy(self.monitor_config),
            "active_alerts_count": len(self.active_alerts),
            "alert_history_count": len(self.alert_history),
        }

    def reset_stats(self):
        """Reset the monitoring counters.

        Active alerts are not discarded, so the ``active_alerts`` gauge is
        re-synchronized with them rather than forced to zero.
        """
        try:
            self.monitor_stats = {
                "metrics_collected": 0,
                "alerts_generated": 0,
                "alerts_resolved": 0,
                "performance_checks": 0,
                "monitor_errors": 0,
            }
            self.monitor_state["total_monitored_matches"] = 0
            self.monitor_state["active_alerts"] = len(self.active_alerts)
            print("✓ 监控统计信息已重置")
        except Exception as e:
            self._report_failure(f"✗ 监控统计信息重置失败: {e}", count=False)

    def set_monitor_config(self, config: Dict[str, Any]) -> bool:
        """Update the monitoring configuration.

        Top-level keys are overwritten. ``alert_thresholds`` is merged into the
        existing thresholds so a partial update keeps the other thresholds.

        Args:
            config: Configuration entries to apply.

        Returns:
            True on success, False if the update failed (nothing is applied
            in that case).
        """
        try:
            updates = dict(config)
            thresholds = updates.pop("alert_thresholds", None)
            merged_thresholds = None
            if thresholds is not None:
                # Build the merged dict first so a bad value fails before
                # anything has been applied.
                merged_thresholds = dict(self.monitor_config.get("alert_thresholds", {}))
                merged_thresholds.update(thresholds)

            self.monitor_config.update(updates)
            if merged_thresholds is not None:
                self.monitor_config["alert_thresholds"] = merged_thresholds
            print(f"✓ 监控配置已更新: {self.monitor_config}")
            return True
        except Exception as e:
            self._report_failure(f"✗ 监控配置设置失败: {e}", count=False)
            return False

    def get_monitor_config(self) -> Dict[str, Any]:
        """Return an independent copy of the monitoring configuration.

        Returns:
            Deep copy of the configuration dict.
        """
        return copy.deepcopy(self.monitor_config)

    def _trigger_monitor_callback(self, callback_type: str, data: Dict[str, Any]):
        """Invoke every callback registered for an event type.

        A failing callback is reported and does not stop the remaining ones.

        Args:
            callback_type: Event type.
            data: Payload passed to each callback.
        """
        try:
            # Iterate a snapshot so a callback may (un)register callbacks
            # without disturbing this loop.
            for callback in list(self.monitor_callbacks.get(callback_type, ())):
                try:
                    callback(data)
                except Exception as e:
                    print(f"✗ 监控回调执行失败: {callback_type} - {e}")
        except Exception as e:
            print(f"✗ 监控回调触发失败: {e}")

    def register_monitor_callback(self, callback_type: str,
                                  callback: Callable[[Dict[str, Any]], Any]):
        """Register a callback for an event type.

        Args:
            callback_type: One of the keys of ``monitor_callbacks``.
            callback: Callable taking the event payload dict.
        """
        try:
            if callback_type in self.monitor_callbacks:
                self.monitor_callbacks[callback_type].append(callback)
                print(f"✓ 监控回调已注册: {callback_type}")
            else:
                print(f"✗ 无效的回调类型: {callback_type}")
        except Exception as e:
            print(f"✗ 监控回调注册失败: {e}")

    def unregister_monitor_callback(self, callback_type: str,
                                    callback: Callable[[Dict[str, Any]], Any]):
        """Remove one registration of a callback for an event type.

        Args:
            callback_type: One of the keys of ``monitor_callbacks``.
            callback: The callable previously registered.
        """
        try:
            if callback_type in self.monitor_callbacks:
                if callback in self.monitor_callbacks[callback_type]:
                    self.monitor_callbacks[callback_type].remove(callback)
                    print(f"✓ 监控回调已注销: {callback_type}")
            else:
                print(f"✗ 无效的回调类型: {callback_type}")
        except Exception as e:
            print(f"✗ 监控回调注销失败: {e}")