"""
Match monitoring system module.

Monitors the matchmaking process and its performance.
"""

import copy
import time
import traceback
from collections import deque
from typing import Any, Callable, Dict, List, Optional


class MatchMonitor:
    """
    Match monitor.

    Periodically pulls statistics from the managers exposed by the owning
    plugin, stores timestamped samples in bounded deques, compares the latest
    samples against configurable thresholds and notifies registered callbacks
    when metrics are collected or alerts are triggered/resolved.
    """

    # Callback types accepted by register/unregister_monitor_callback.
    _CALLBACK_TYPES = (
        "metric_collected",
        "alert_triggered",
        "alert_resolved",
        "performance_degraded",
    )

    def __init__(self, plugin):
        """
        Create the match monitor.

        Args:
            plugin: Matchmaking plugin instance. Its ``match_manager``,
                ``queue_manager``, ``algorithm_manager`` and ``room_allocator``
                attributes are read during metric collection.
        """
        self.plugin = plugin
        self.enabled = False
        self.initialized = False

        # Monitoring configuration
        self.monitor_config = {
            "enable_monitoring": True,
            "monitoring_interval": 5.0,
            "log_level": "INFO",
            "enable_performance_monitoring": True,
            "enable_alerts": True,
            "alert_thresholds": {
                "match_queue_time": 60.0,
                "match_failure_rate": 10.0,
                "system_latency": 100.0
            },
            "enable_metrics_collection": True,
            "metrics_retention_period": 3600  # 1 hour
        }

        # Performance sample storage: each entry is a (timestamp, value) tuple
        self.performance_metrics = {
            "match_queue_times": deque(maxlen=1000),
            "match_success_rates": deque(maxlen=1000),
            "system_latencies": deque(maxlen=1000),
            "player_wait_times": deque(maxlen=1000),
            "algorithm_performance": deque(maxlen=1000)
        }

        # Monitoring state
        self.monitor_state = {
            "last_monitoring_update": 0.0,
            "last_performance_check": 0.0,
            "total_monitored_matches": 0,
            "active_alerts": 0
        }

        # Alert system: active alerts are keyed by alert type
        self.active_alerts = {}
        self.alert_history = deque(maxlen=1000)

        # Monitoring statistics
        self.monitor_stats = self._fresh_stats()

        # Callback lists, one per supported callback type
        self.monitor_callbacks = {name: [] for name in self._CALLBACK_TYPES}

        # Timestamps of the last periodic runs
        self.last_metric_collection = 0.0
        self.last_alert_check = 0.0

        print("✓ 匹配监控器已创建")

    @staticmethod
    def _fresh_stats() -> Dict[str, int]:
        """Return a zeroed statistics dictionary."""
        return {
            "metrics_collected": 0,
            "alerts_generated": 0,
            "alerts_resolved": 0,
            "performance_checks": 0,
            "monitor_errors": 0
        }

    @staticmethod
    def _tail(items, limit: int) -> list:
        """Return the last *limit* items as a list ([] for a non-positive limit)."""
        if limit <= 0:
            # seq[-0:] would return the whole sequence, so handle it explicitly.
            return []
        return list(items)[-limit:]

    def _record_error(self, message: str, trace: bool = False) -> None:
        """Print *message*, bump the error counter and optionally print the traceback.

        Must be called from inside an ``except`` block when ``trace`` is True.
        """
        print(message)
        self.monitor_stats["monitor_errors"] += 1
        if trace:
            traceback.print_exc()

    def _record_sample(self, metric_name: str, value: float, label: str) -> None:
        """Append a (timestamp, value) sample to the named metric deque."""
        try:
            self.performance_metrics[metric_name].append((time.time(), value))
        except Exception as e:
            self._record_error(f"✗ {label}记录失败: {e}")

    def initialize(self) -> bool:
        """
        Initialize the match monitor.

        Returns:
            True if initialization succeeded, False otherwise.
        """
        try:
            print("正在初始化匹配监控器...")

            self.initialized = True
            print("✓ 匹配监控器初始化完成")
            return True

        except Exception as e:
            self._record_error(f"✗ 匹配监控器初始化失败: {e}", trace=True)
            return False

    def enable(self) -> bool:
        """
        Enable the match monitor.

        Returns:
            True if the monitor was enabled, False if it has not been
            initialized or enabling failed.
        """
        try:
            if not self.initialized:
                print("✗ 匹配监控器未初始化")
                return False

            self.enabled = True
            print("✓ 匹配监控器已启用")
            return True

        except Exception as e:
            self._record_error(f"✗ 匹配监控器启用失败: {e}", trace=True)
            return False

    def disable(self):
        """Disable the match monitor."""
        try:
            self.enabled = False
            print("✓ 匹配监控器已禁用")

        except Exception as e:
            self._record_error(f"✗ 匹配监控器禁用失败: {e}", trace=True)

    def finalize(self):
        """Release monitor resources: disable it and drop all registered callbacks."""
        try:
            # Disable the monitor first
            if self.enabled:
                self.disable()

            # Drop the callbacks but keep the callback-type keys so that
            # callbacks can be registered again after a re-initialize.
            self.monitor_callbacks = {name: [] for name in self._CALLBACK_TYPES}

            self.initialized = False
            print("✓ 匹配监控器资源已清理")

        except Exception as e:
            print(f"✗ 匹配监控器资源清理失败: {e}")
            traceback.print_exc()

    def update(self, dt: float):
        """
        Advance the monitor; runs metric collection and alert checks when due.

        Args:
            dt: Time delta in seconds (not used; scheduling is based on
                wall-clock time from ``time.time()``).
        """
        try:
            if not self.enabled:
                return

            now = time.time()
            self.monitor_state["last_monitoring_update"] = now
            interval = self.monitor_config["monitoring_interval"]

            # Periodically collect performance metrics
            if now - self.last_metric_collection >= interval:
                self._collect_performance_metrics()
                self.last_metric_collection = now

            # Periodically check alert conditions
            if now - self.last_alert_check >= interval:
                self._check_alerts()
                self.last_alert_check = now

        except Exception as e:
            self._record_error(f"✗ 匹配监控器更新失败: {e}", trace=True)

    def _collect_performance_metrics(self):
        """Collect a metrics snapshot from the plugin's managers and fire ``metric_collected``."""
        try:
            current_time = time.time()
            metrics = {}

            # Match manager metrics
            match_manager = getattr(self.plugin, "match_manager", None)
            if match_manager:
                match_state = match_manager.get_match_stats().get("state", {})

                metrics["active_matches"] = match_state.get("active_matches", 0)
                metrics["successful_matches"] = match_state.get("successful_matches", 0)
                metrics["failed_matches"] = match_state.get("failed_matches", 0)

                # Success rate in percent; skipped while no match has finished
                total_matches = metrics["successful_matches"] + metrics["failed_matches"]
                if total_matches > 0:
                    success_rate = (metrics["successful_matches"] / total_matches) * 100
                    metrics["match_success_rate"] = success_rate
                    self.performance_metrics["match_success_rates"].append(
                        (current_time, success_rate)
                    )

            # Queue manager metrics
            queue_manager = getattr(self.plugin, "queue_manager", None)
            if queue_manager:
                queue_state = queue_manager.get_queue_stats().get("state", {})

                metrics["total_players"] = queue_state.get("total_players", 0)
                metrics["players_matched"] = queue_state.get("players_matched", 0)
                metrics["players_timed_out"] = queue_state.get("players_timed_out", 0)

                # Timeout rate in percent over players that left the queue
                total_players = metrics["players_matched"] + metrics["players_timed_out"]
                if total_players > 0:
                    timeout_rate = (metrics["players_timed_out"] / total_players) * 100
                    metrics["queue_timeout_rate"] = timeout_rate

            # Algorithm manager metrics
            algorithm_manager = getattr(self.plugin, "algorithm_manager", None)
            if algorithm_manager:
                algorithm_state = algorithm_manager.get_algorithm_stats().get("state", {})

                metrics["total_algorithms"] = algorithm_state.get("total_matches", 0)
                metrics["successful_algorithms"] = algorithm_state.get("successful_matches", 0)
                metrics["average_match_quality"] = algorithm_state.get("average_match_quality", 0)

                self.performance_metrics["algorithm_performance"].append(
                    (current_time, metrics["average_match_quality"])
                )

            # Room allocator metrics
            room_allocator = getattr(self.plugin, "room_allocator", None)
            if room_allocator:
                room_state = room_allocator.get_room_stats().get("state", {})

                metrics["active_rooms"] = room_state.get("active_rooms", 0)
                metrics["total_rooms"] = room_state.get("total_rooms", 0)

            # Update statistics (both counters advance once per collection cycle)
            self.monitor_stats["metrics_collected"] += 1
            self.monitor_state["total_monitored_matches"] += 1

            # Fire the metric-collected callback
            self._trigger_monitor_callback("metric_collected", {
                "metrics": metrics,
                "timestamp": current_time
            })

        except Exception as e:
            self._record_error(f"✗ 性能指标收集失败: {e}")

    def _check_alerts(self):
        """Compare the latest stored samples against the alert thresholds."""
        try:
            if not self.monitor_config["enable_alerts"]:
                return

            current_time = time.time()
            thresholds = self.monitor_config["alert_thresholds"]

            # Match queue time
            queue_times = self.performance_metrics["match_queue_times"]
            if queue_times:
                latest_queue_time = queue_times[-1][1]
                if latest_queue_time > thresholds["match_queue_time"]:
                    self._trigger_alert(
                        "high_queue_time",
                        f"匹配队列时间过长: {latest_queue_time:.2f}秒",
                        current_time,
                    )

            # Match failure rate (derived from the latest success rate)
            success_rates = self.performance_metrics["match_success_rates"]
            if success_rates:
                failure_rate = 100 - success_rates[-1][1]
                if failure_rate > thresholds["match_failure_rate"]:
                    self._trigger_alert(
                        "high_failure_rate",
                        f"匹配失败率过高: {failure_rate:.2f}%",
                        current_time,
                    )

            # System latency
            latencies = self.performance_metrics["system_latencies"]
            if latencies:
                latest_latency = latencies[-1][1]
                if latest_latency > thresholds["system_latency"]:
                    self._trigger_alert(
                        "high_latency",
                        f"系统延迟过高: {latest_latency:.2f}ms",
                        current_time,
                    )

        except Exception as e:
            self._record_error(f"✗ 警报检查失败: {e}")

    def _trigger_alert(self, alert_type: str, message: str, timestamp: Optional[float] = None):
        """
        Trigger an alert.

        A repeated trigger of an already-active alert type only bumps that
        alert's ``count`` and ``last_triggered``; a new type creates an active
        alert. Every trigger is appended to the history and fires the
        ``alert_triggered`` callback.

        Args:
            alert_type: Alert type (key of the active alert).
            message: Alert message.
            timestamp: Trigger time; defaults to ``time.time()``.
        """
        try:
            if timestamp is None:
                timestamp = time.time()

            alert_id = f"{alert_type}_{int(timestamp)}"

            existing = self.active_alerts.get(alert_type)
            if existing is not None:
                # Update the existing alert
                existing["count"] += 1
                existing["last_triggered"] = timestamp
            else:
                # Create a new alert
                self.active_alerts[alert_type] = {
                    "id": alert_id,
                    "type": alert_type,
                    "message": message,
                    "first_triggered": timestamp,
                    "last_triggered": timestamp,
                    "count": 1,
                    "resolved": False
                }
                self.monitor_stats["alerts_generated"] += 1

            # Derive the counter from the dict so the two can never drift apart.
            self.monitor_state["active_alerts"] = len(self.active_alerts)

            # Append to the alert history
            self.alert_history.append({
                "id": alert_id,
                "type": alert_type,
                "message": message,
                "timestamp": timestamp,
                "resolved": False
            })

            # Fire the alert callback
            self._trigger_monitor_callback("alert_triggered", {
                "alert_id": alert_id,
                "alert_type": alert_type,
                "message": message,
                "timestamp": timestamp
            })

            print(f"⚠️ 警报触发 [{alert_type}]: {message}")

        except Exception as e:
            self._record_error(f"✗ 警报触发失败: {e}")

    def resolve_alert(self, alert_type: str):
        """
        Resolve an active alert; does nothing if the type is not active.

        Args:
            alert_type: Alert type to resolve.
        """
        try:
            alert_data = self.active_alerts.get(alert_type)
            if alert_data is None:
                return

            alert_data["resolved"] = True
            alert_data["resolved_time"] = time.time()

            self.monitor_stats["alerts_resolved"] += 1

            # Fire the alert-resolved callback while the alert is still listed
            self._trigger_monitor_callback("alert_resolved", {
                "alert_type": alert_type,
                "alert_data": alert_data
            })

            # Remove from the active alerts and resync the counter
            del self.active_alerts[alert_type]
            self.monitor_state["active_alerts"] = len(self.active_alerts)

            print(f"✅ 警报已解决: {alert_type}")

        except Exception as e:
            self._record_error(f"✗ 警报解决失败: {e}")

    def record_match_queue_time(self, queue_time: float):
        """
        Record a match queue time sample.

        Args:
            queue_time: Queue time in seconds.
        """
        self._record_sample("match_queue_times", queue_time, "匹配队列时间")

    def record_player_wait_time(self, wait_time: float):
        """
        Record a player wait time sample.

        Args:
            wait_time: Wait time in seconds.
        """
        self._record_sample("player_wait_times", wait_time, "玩家等待时间")

    def record_system_latency(self, latency: float):
        """
        Record a system latency sample.

        Args:
            latency: Latency in milliseconds.
        """
        self._record_sample("system_latencies", latency, "系统延迟")

    def get_performance_metrics(self, metric_name: Optional[str] = None, limit: int = 100) -> Any:
        """
        Get stored performance samples.

        Args:
            metric_name: Metric name (optional). When omitted, all metrics
                are returned.
            limit: Maximum number of most-recent samples per metric; a
                non-positive limit yields no samples.

        Returns:
            A list of (timestamp, value) tuples for a named metric (empty if
            the name is unknown), or a dict mapping every metric name to such
            a list when no name is given.
        """
        try:
            if metric_name:
                samples = self.performance_metrics.get(metric_name)
                if samples is None:
                    return []
                return self._tail(samples, limit)

            # Return all metrics
            return {
                name: self._tail(samples, limit)
                for name, samples in self.performance_metrics.items()
            }

        except Exception as e:
            self._record_error(f"✗ 获取性能指标失败: {e}")
            # Keep the empty result type consistent with the requested shape.
            return [] if metric_name else {}

    def get_active_alerts(self) -> Dict[str, Any]:
        """
        Get the active alerts.

        Returns:
            Dict of active alerts keyed by alert type. Each entry is a copy,
            so modifying it does not affect the monitor.
        """
        try:
            return {alert_type: dict(data) for alert_type, data in self.active_alerts.items()}
        except Exception as e:
            self._record_error(f"✗ 获取活动警报失败: {e}")
            return {}

    def get_alert_history(self, limit: int = 100) -> List[Dict[str, Any]]:
        """
        Get the alert history.

        Args:
            limit: Maximum number of most-recent entries; a non-positive
                limit yields an empty list.

        Returns:
            List of alert history entries, oldest first.
        """
        try:
            return self._tail(self.alert_history, limit)
        except Exception as e:
            self._record_error(f"✗ 获取警报历史失败: {e}")
            return []

    def get_monitor_stats(self) -> Dict[str, Any]:
        """
        Get monitoring statistics.

        Returns:
            Dict with copies of the state, stats and config plus the number
            of active alerts and history entries.
        """
        return {
            "state": self.monitor_state.copy(),
            "stats": self.monitor_stats.copy(),
            # Deep copy: the config holds the nested alert_thresholds dict.
            "config": copy.deepcopy(self.monitor_config),
            "active_alerts_count": len(self.active_alerts),
            "alert_history_count": len(self.alert_history)
        }

    def reset_stats(self):
        """Reset the monitoring statistics; active alerts themselves are kept."""
        try:
            self.monitor_stats = self._fresh_stats()

            self.monitor_state["total_monitored_matches"] = 0
            # Active alerts are not cleared here, so the counter must keep
            # reflecting them; zeroing it would go negative on the next resolve.
            self.monitor_state["active_alerts"] = len(self.active_alerts)

            print("✓ 监控统计信息已重置")
        except Exception as e:
            print(f"✗ 监控统计信息重置失败: {e}")

    def set_monitor_config(self, config: Dict[str, Any]) -> bool:
        """
        Update the monitoring configuration.

        Top-level keys are overwritten. ``alert_thresholds`` is merged key by
        key so a partial threshold update keeps the remaining thresholds.

        Args:
            config: Configuration values to apply.

        Returns:
            True if the configuration was applied, False otherwise.
        """
        try:
            updates = dict(config)
            threshold_updates = updates.pop("alert_thresholds", None)

            # Build the merged thresholds first so a bad value leaves the
            # current configuration untouched.
            merged_thresholds = None
            if threshold_updates is not None:
                merged_thresholds = dict(self.monitor_config.get("alert_thresholds", {}))
                merged_thresholds.update(threshold_updates)

            self.monitor_config.update(updates)
            if merged_thresholds is not None:
                self.monitor_config["alert_thresholds"] = merged_thresholds

            print(f"✓ 监控配置已更新: {self.monitor_config}")
            return True
        except Exception as e:
            print(f"✗ 监控配置设置失败: {e}")
            return False

    def get_monitor_config(self) -> Dict[str, Any]:
        """
        Get the monitoring configuration.

        Returns:
            A deep copy of the configuration dict.
        """
        return copy.deepcopy(self.monitor_config)

    def _trigger_monitor_callback(self, callback_type: str, data: Dict[str, Any]):
        """
        Invoke every callback registered for a callback type.

        A failing callback is reported and does not prevent the remaining
        callbacks from running.

        Args:
            callback_type: Callback type.
            data: Payload passed to each callback.
        """
        try:
            # Iterate over a snapshot so callbacks may (un)register safely.
            for callback in list(self.monitor_callbacks.get(callback_type, ())):
                try:
                    callback(data)
                except Exception as e:
                    print(f"✗ 监控回调执行失败: {callback_type} - {e}")
        except Exception as e:
            print(f"✗ 监控回调触发失败: {e}")

    def register_monitor_callback(self, callback_type: str,
                                  callback: Callable[[Dict[str, Any]], Any]):
        """
        Register a monitoring callback.

        Args:
            callback_type: Callback type; must be one of the supported types.
            callback: Callable invoked with the event payload dict.
        """
        try:
            if callback_type in self.monitor_callbacks:
                self.monitor_callbacks[callback_type].append(callback)
                print(f"✓ 监控回调已注册: {callback_type}")
            else:
                print(f"✗ 无效的回调类型: {callback_type}")
        except Exception as e:
            print(f"✗ 监控回调注册失败: {e}")

    def unregister_monitor_callback(self, callback_type: str,
                                    callback: Callable[[Dict[str, Any]], Any]):
        """
        Unregister a monitoring callback.

        Args:
            callback_type: Callback type.
            callback: Previously registered callable; ignored if not registered.
        """
        try:
            if callback_type in self.monitor_callbacks:
                if callback in self.monitor_callbacks[callback_type]:
                    self.monitor_callbacks[callback_type].remove(callback)
                    print(f"✓ 监控回调已注销: {callback_type}")
            else:
                print(f"✗ 无效的回调类型: {callback_type}")
        except Exception as e:
            print(f"✗ 监控回调注销失败: {e}")