QAUP_Management/deploy/docker/alert-manager.sh

359 lines
11 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# QAUP alert management script.
# Abort as soon as a command outside a conditional exits non-zero.
set -e
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Absolute path two directory levels above SCRIPT_DIR.
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# File that alert records are appended to.
ALERT_LOG="$PROJECT_ROOT/logs/alerts.log"
# Path of healthcheck.yml next to this script.
# NOTE(review): not referenced anywhere else in the visible part of the file.
ALERT_CONFIG="$SCRIPT_DIR/healthcheck.yml"
# Colored output: escape sequences stored with a literal backslash; they are
# expanded when print_message writes them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
# Resets the terminal color.
NC='\033[0m'
#######################################
# Print a message wrapped in a color escape sequence.
# Globals:   NC (read) - escape sequence that resets the color
# Arguments: $1 - color escape sequence (e.g. $RED)
#            $2 - message text, printed verbatim
# Outputs:   the colored message followed by a newline on stdout
#######################################
print_message() {
  local color=$1
  local message=$2
  # %b expands the backslash escapes stored in the color constants, while %s
  # keeps the message literal, so backslashes that end up in a message (log
  # excerpts, paths, user-supplied filters) are not interpreted.
  printf '%b%s%b\n' "$color" "$message" "$NC"
}
# Emit the current local time as "YYYY-MM-DD HH:MM:SS" on stdout.
get_timestamp() {
  # %F is shorthand for %Y-%m-%d and %T for %H:%M:%S.
  date '+%F %T'
}
#######################################
# Record an alert: append it to the alert log and echo it to the console.
# Globals:   ALERT_LOG (read), RED / YELLOW / BLUE / GREEN (read)
# Arguments: $1 - severity (CRITICAL, WARNING, INFO; anything else is
#                 shown with the green check-mark style)
#            $2 - service the alert is about
#            $3 - alert text
# Outputs:   one line appended to ALERT_LOG, one colored line on stdout
#######################################
log_alert() {
  local level=$1 origin=$2 text=$3
  local stamp=$(get_timestamp)
  local tint banner

  # The log directory may not exist yet.
  mkdir -p "$(dirname "$ALERT_LOG")"

  # Persist the record first, then report it on the console.
  printf '%s\n' "[$stamp] [$level] [$origin] $text" >> "$ALERT_LOG"

  case "$level" in
    "CRITICAL")
      tint=$RED
      banner="🚨 CRITICAL [$origin]: $text"
      ;;
    "WARNING")
      tint=$YELLOW
      banner="⚠️ WARNING [$origin]: $text"
      ;;
    "INFO")
      tint=$BLUE
      banner=" INFO [$origin]: $text"
      ;;
    *)
      tint=$GREEN
      banner="✓ [$origin]: $text"
      ;;
  esac
  print_message "$tint" "$banner"
}
#######################################
# Check that each expected container is running and not unhealthy.
# Logs CRITICAL when a container is absent from `docker ps` and WARNING when
# its status text contains "unhealthy".
# Globals:   none
# Arguments: none
# Outputs:   alerts via log_alert
#######################################
check_container_alerts() {
  local containers=("qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx")
  local container ps_output name state found

  # Query docker once for all containers. If the call fails, fall back to an
  # empty listing so every container is reported as stopped.
  ps_output=$(docker ps --format '{{.Names}}\t{{.Status}}') || ps_output=""

  for container in "${containers[@]}"; do
    found=0
    # Compare the name field exactly: a substring match would let e.g.
    # "qaup-app-worker" be mistaken for "qaup-app".
    while IFS=$'\t' read -r name state; do
      if [[ "$name" == "$container" ]]; then
        found=1
        break
      fi
    done <<<"$ps_output"

    if (( ! found )); then
      log_alert "CRITICAL" "$container" "容器已停止运行"
    elif [[ "$state" == *unhealthy* ]]; then
      log_alert "WARNING" "$container" "容器健康检查失败"
    fi
  done
}
# Succeed when $1 is a plain decimal number strictly greater than $2.
# Non-numeric values (empty, "--") never exceed the limit.
_resource_pct_exceeds() {
  [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]] || return 1
  awk -v value="$1" -v limit="$2" 'BEGIN { exit !(value + 0 > limit + 0) }'
}

#######################################
# Check host disk/memory usage and per-container CPU/memory usage.
# Thresholds: disk > 80%, host memory > 85%, container CPU > 80%,
# container memory > 85%.
# Globals:   none
# Arguments: none
# Outputs:   WARNING alerts via log_alert
#######################################
check_resource_alerts() {
  local disk_usage mem_usage
  local container cpu_usage container_mem rest

  # Disk usage of the filesystem holding the current directory. -P keeps each
  # filesystem on a single line so the NR==2 lookup is reliable.
  disk_usage=$(df -P . | awk 'NR==2 { sub(/%/, "", $5); print $5 }') || disk_usage=""
  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && (( disk_usage > 80 )); then
    log_alert "WARNING" "SYSTEM" "磁盘使用率过高: ${disk_usage}%"
  fi

  # Host memory usage: used / total from the second line of `free`.
  mem_usage=$(free | awk 'NR==2 && $2 > 0 { printf "%.0f", $3*100/$2 }') || mem_usage=""
  if [[ "$mem_usage" =~ ^[0-9]+$ ]] && (( mem_usage > 85 )); then
    log_alert "WARNING" "SYSTEM" "内存使用率过高: ${mem_usage}%"
  fi

  # Per-container usage. MemUsage prints as "<used> / <limit>" (three words),
  # so the memory percentage is the LAST word of the line, not a fixed index.
  while read -r container cpu_usage rest; do
    [[ -n "$container" ]] || continue
    cpu_usage=${cpu_usage%\%}
    container_mem=${rest##* }
    container_mem=${container_mem%\%}
    if _resource_pct_exceeds "$cpu_usage" 80; then
      log_alert "WARNING" "$container" "CPU使用率过高: ${cpu_usage}%"
    fi
    if _resource_pct_exceeds "$container_mem" 85; then
      log_alert "WARNING" "$container" "内存使用率过高: ${container_mem}%"
    fi
  done < <(docker stats --no-stream --format "{{.Container}} {{.CPUPerc}} {{.MemUsage}} {{.MemPerc}}" 2>/dev/null)
}
# Probe one URL and log a WARNING for service $1 when the total request time
# for URL $2 exceeds $3 milliseconds. A failed or timed-out request counts as
# 999 seconds so that an unreachable service always raises the alert.
_probe_response_time() {
  local service=$1 url=$2 limit_ms=$3
  local seconds elapsed_ms

  # curl still prints its -w output when the request fails, so the exit status
  # must be checked separately instead of appending a fallback to the output.
  # --max-time keeps a hung endpoint from blocking the whole check run.
  seconds=$(curl -o /dev/null -s --max-time 30 -w '%{time_total}' "$url" 2>/dev/null) \
    || seconds=999
  seconds=${seconds/,/.}
  [[ "$seconds" =~ ^[0-9]+([.][0-9]+)?$ ]] || seconds=999

  elapsed_ms=$(awk -v s="$seconds" 'BEGIN { printf "%.0f", s * 1000 }')
  if (( elapsed_ms > limit_ms )); then
    log_alert "WARNING" "$service" "响应时间过长: ${elapsed_ms}ms"
  fi
}

#######################################
# Check HTTP response times of the application and of Nginx.
# Thresholds: application health endpoint 5000 ms, Nginx health endpoint
# 2000 ms.
# Globals:   none
# Arguments: none
# Outputs:   WARNING alerts via log_alert
#######################################
check_response_time_alerts() {
  # Application response time.
  _probe_response_time "qaup-app" "http://localhost:8080/actuator/health" 5000
  # Nginx response time.
  _probe_response_time "qaup-nginx" "http://localhost/health" 2000
}
#######################################
# Scan the last five minutes of each container's logs for error keywords and
# log a WARNING when one keyword appears on more than 10 lines.
# Globals:   none
# Arguments: none
# Outputs:   WARNING alerts via log_alert
#######################################
check_error_log_alerts() {
  local error_patterns=("ERROR" "FATAL" "Exception" "failed" "timeout")
  local containers=("qaup-app" "qaup-nginx" "qaup-postgres" "qaup-redis")
  local container pattern recent_logs error_count

  for container in "${containers[@]}"; do
    # Logs from the last 5 minutes (stdout and stderr). A failing `docker logs`
    # is tolerated on purpose: whatever it printed is scanned like any output.
    recent_logs=$(docker logs "$container" --since 5m 2>&1) || true

    for pattern in "${error_patterns[@]}"; do
      # grep -c already prints 0 when nothing matches but exits 1; only the
      # exit status is discarded, so the count stays a single number.
      error_count=$(grep -c -- "$pattern" <<<"$recent_logs") || true
      if (( ${error_count:-0} > 10 )); then
        log_alert "WARNING" "$container" "检测到大量错误日志: $pattern ($error_count 次)"
      fi
    done
  done
}
# Run a single-value SQL query ($1) inside the qaup-postgres container and
# print the result as a non-negative integer. Prints 0 when the query fails or
# returns anything that is not a plain integer.
_pg_scalar_count() {
  local value
  value=$(docker exec qaup-postgres psql -U postgres -d qaup -t -c "$1" 2>/dev/null \
    | tr -d '[:space:]') || value=""
  [[ "$value" =~ ^[0-9]+$ ]] || value=0
  printf '%s\n' "$value"
}

#######################################
# Check PostgreSQL connection count and lock waits.
# Thresholds: more than 150 sessions in pg_stat_activity, more than 5
# sessions waiting on a lock.
# Globals:   none
# Arguments: none
# Outputs:   WARNING alerts via log_alert
#######################################
check_database_alerts() {
  local connection_count lock_waits

  # Number of database connections.
  connection_count=$(_pg_scalar_count "SELECT count(*) FROM pg_stat_activity;")
  if (( connection_count > 150 )); then
    log_alert "WARNING" "qaup-postgres" "数据库连接数过多: $connection_count"
  fi

  # Sessions currently waiting on a lock.
  lock_waits=$(_pg_scalar_count "SELECT count(*) FROM pg_stat_activity WHERE wait_event_type = 'Lock';")
  if (( lock_waits > 5 )); then
    log_alert "WARNING" "qaup-postgres" "检测到数据库锁等待: $lock_waits 个连接"
  fi
}
#######################################
# Placeholder for e-mail alerts: only prints what would be sent.
# A real mail integration (sendmail, postfix, a third-party service) can be
# hooked in here.
# Globals:   BLUE, YELLOW (read)
# Arguments: $1 - subject
#            $2 - message body
#            $3 - recipients (default: admin@example.com)
# Outputs:   four colored lines on stdout
#######################################
send_email_alert() {
  local subject=$1
  local message=$2
  local recipients=${3:-"admin@example.com"}
  local detail

  print_message "$BLUE" "邮件告警功能需要配置邮件服务器"
  for detail in "主题: $subject" "内容: $message" "收件人: $recipients"; do
    print_message "$YELLOW" "$detail"
  done
}
#######################################
# Send an alert to a DingTalk / WeCom style webhook as {"text": "<message>"}.
# Does nothing when the webhook URL is empty.
# Globals:   GREEN, RED (read)
# Arguments: $1 - webhook URL
#            $2 - message text
# Outputs:   a success or failure line on stdout
#######################################
send_webhook_alert() {
  local webhook_url=$1
  local message=$2
  local bs='\'
  local escaped payload

  if [[ -z "$webhook_url" ]]; then
    return 0
  fi

  # Escape the characters that would otherwise break the JSON string literal.
  # Backslash must be handled first. Other control characters are not handled.
  escaped=${message//"$bs"/"$bs$bs"}
  escaped=${escaped//'"'/"$bs"'"'}
  escaped=${escaped//$'\n'/"${bs}n"}
  escaped=${escaped//$'\r'/"${bs}r"}
  escaped=${escaped//$'\t'/"${bs}t"}
  payload="{\"text\":\"${escaped}\"}"

  # Testing curl directly keeps the failure branch reachable under `set -e`.
  # --fail turns HTTP 4xx/5xx into a non-zero exit, and --max-time bounds the
  # wait on an unresponsive endpoint.
  if curl -X POST "$webhook_url" \
    --fail --max-time 10 \
    -H 'Content-Type: application/json' \
    -d "$payload" \
    &>/dev/null; then
    print_message "$GREEN" "Webhook 告警发送成功"
  else
    print_message "$RED" "Webhook 告警发送失败"
  fi
}
#######################################
# Run every alert check once, in a fixed order, framed by a start and a
# completion message.
# Globals:   BLUE, GREEN (read)
# Arguments: none
#######################################
run_all_checks() {
  local stage
  local -a stages=(
    check_container_alerts
    check_resource_alerts
    check_response_time_alerts
    check_error_log_alerts
    check_database_alerts
  )

  print_message "$BLUE" "开始系统告警检查..."
  for stage in "${stages[@]}"; do
    "$stage"
  done
  print_message "$GREEN" "告警检查完成"
}
#######################################
# Continuous monitoring: run all checks, sleep, repeat until interrupted.
# Globals:   RED, GREEN, YELLOW (read)
# Arguments: $1 - pause between rounds, in the form accepted by sleep(1):
#                 a number with an optional s/m/h/d suffix
#                 (default: 300, i.e. five minutes)
# Returns:   1 when the interval is malformed; otherwise does not return
#######################################
continuous_monitoring() {
  local interval=${1:-300} # default: 5-minute interval

  # Reject a malformed interval up front instead of letting `sleep` fail on
  # every iteration.
  if [[ ! "$interval" =~ ^[0-9]+([.][0-9]+)?[smhd]?$ ]]; then
    print_message "$RED" "无效的监控间隔: $interval" >&2
    return 1
  fi

  print_message "$GREEN" "开始持续告警监控 (间隔: ${interval}秒)"
  print_message "$YELLOW" "按 Ctrl+C 停止监控"
  while true; do
    run_all_checks
    sleep "$interval"
  done
}
#######################################
# Show the most recent alert records, optionally filtered.
# Globals:   ALERT_LOG (read), BLUE, YELLOW (read)
# Arguments: $1 - number of trailing log lines to inspect (default: 50)
#            $2 - optional grep pattern; only matching lines are shown
# Outputs:   a header line followed by the selected records on stdout
# Returns:   0 also when the filter matches nothing; non-zero on real errors
#######################################
view_alert_history() {
  local lines=${1:-50}
  local filter=${2:-""}

  if [[ ! -f "$ALERT_LOG" ]]; then
    print_message "$YELLOW" "告警日志文件不存在: $ALERT_LOG"
    return 0
  fi

  print_message "$BLUE" "告警历史 (最近 $lines 条):"
  if [[ -n "$filter" ]]; then
    # `--` keeps a filter that starts with "-" from being parsed as an option.
    # grep exits 1 when nothing matches; for a history query that is an empty
    # result, not a failure, so only other statuses are passed on.
    tail -n "$lines" "$ALERT_LOG" | grep -- "$filter" || [[ $? -eq 1 ]]
  else
    tail -n "$lines" "$ALERT_LOG"
  fi
}
#######################################
# Trim the alert log to the records of the last N days, after backing up the
# untrimmed file.
# Globals:   ALERT_LOG (read), RED, GREEN, BLUE, YELLOW (read)
# Arguments: $1 - number of days to keep (non-negative integer, default: 30)
# Outputs:   status lines on stdout; error lines on stderr
# Returns:   1 when the argument is invalid or the cutoff/backup step fails
#######################################
cleanup_alert_logs() {
  local days=${1:-30}
  local cutoff_date backup_base backup_file
  local suffix=1

  if [[ ! -f "$ALERT_LOG" ]]; then
    print_message "$YELLOW" "告警日志文件不存在"
    return 0
  fi

  if [[ ! "$days" =~ ^[0-9]+$ ]]; then
    print_message "$RED" "无效的天数: $days" >&2
    return 1
  fi

  # Work out the cutoff before touching any file, so a failing `date` leaves
  # the log untouched instead of filtering with an empty cutoff.
  if ! cutoff_date=$(date -d "$days days ago" '+%Y-%m-%d'); then
    print_message "$RED" "无法计算截止日期" >&2
    return 1
  fi

  # Back up the current log. An existing backup from an earlier run on the
  # same day is never overwritten: that run already trimmed the log, so its
  # backup is the only copy of the removed records.
  backup_base="${ALERT_LOG}.backup.$(date +%Y%m%d)"
  backup_file=$backup_base
  while [[ -e "$backup_file" ]]; do
    backup_file="${backup_base}.${suffix}"
    suffix=$((suffix + 1))
  done
  cp "$ALERT_LOG" "$backup_file" || return 1

  # Keep only lines that sort at or after "[<cutoff date>"; records start with
  # "[YYYY-MM-DD", so a plain string comparison orders them by date.
  awk -v cutoff="$cutoff_date" '$0 >= "["cutoff {print}' "$ALERT_LOG" > "${ALERT_LOG}.tmp"
  mv "${ALERT_LOG}.tmp" "$ALERT_LOG"

  print_message "$GREEN" "告警日志已清理,保留 $days 天内的记录"
  print_message "$BLUE" "备份文件: $backup_file"
}
#######################################
# Print an alert statistics report: counts per severity, counts per service,
# and the number of alerts raised in the last 24 hours.
# Records are expected in the form "[timestamp] [severity] [service] text".
# Globals:   ALERT_LOG (read), BLUE, YELLOW (read)
# Arguments: none
# Outputs:   the report on stdout
#######################################
generate_alert_stats() {
  local severity cutoff recent_alerts

  if [[ ! -f "$ALERT_LOG" ]]; then
    print_message "$YELLOW" "告警日志文件不存在"
    return 0
  fi

  print_message "$BLUE" "告警统计报告:"
  echo ""

  # Counts per severity. Splitting on "[" and "]" puts the severity in field 4,
  # so a keyword that merely appears in the message text is not counted, and a
  # zero count prints as a single "0".
  print_message "$BLUE" "按严重程度统计:"
  for severity in CRITICAL WARNING INFO; do
    printf '%s: %s\n' "$severity" \
      "$(awk -F'[][]' -v sev="$severity" '$4 == sev { n++ } END { print n + 0 }' "$ALERT_LOG")"
  done
  echo ""

  # Counts per service: with the same split the service name is field 6.
  print_message "$BLUE" "按服务统计:"
  awk -F'[][]' 'NF >= 6 { print $6 }' "$ALERT_LOG" | sort | uniq -c | sort -nr
  echo ""

  # Alerts in the last 24 hours: compare each record's timestamp with the
  # moment 24 hours ago (timestamps in this format sort chronologically as
  # strings). If `date -d` is unavailable the cutoff is empty and every
  # record is counted.
  cutoff=$(date -d "24 hours ago" '+%Y-%m-%d %H:%M:%S') || cutoff=""
  recent_alerts=$(awk -v cutoff="$cutoff" \
    'substr($0, 1, 1) == "[" && substr($0, 2, 19) >= cutoff { n++ } END { print n + 0 }' \
    "$ALERT_LOG")
  print_message "$BLUE" "最近24小时告警数: $recent_alerts"
}
#######################################
# Exercise the alert pipeline by logging one sample alert per path:
# INFO, WARNING, CRITICAL, then a closing INFO.
# Globals:   ALERT_LOG (read), BLUE, GREEN (read)
# Arguments: none
#######################################
test_alerts() {
  local -a levels=("INFO" "WARNING" "CRITICAL" "INFO")
  local -a sources=("SYSTEM" "TEST" "TEST" "SYSTEM")
  local -a notes=(
    "告警系统测试开始"
    "这是一个测试警告"
    "这是一个测试严重告警"
    "告警系统测试完成"
  )
  local i

  print_message "$BLUE" "测试告警系统..."
  for i in "${!levels[@]}"; do
    log_alert "${levels[i]}" "${sources[i]}" "${notes[i]}"
  done
  print_message "$GREEN" "告警系统测试完成,请检查日志文件: $ALERT_LOG"
}
# Print the usage text (commands and examples) on stdout.
show_help() {
  printf '%s\n' \
    "QAUP 告警管理脚本" \
    "" \
    "用法: $0 [命令] [选项]" \
    "" \
    "命令:" \
    " check 运行所有告警检查" \
    " monitor [interval] 持续监控模式 (默认300秒)" \
    " history [lines] 查看告警历史" \
    " cleanup [days] 清理告警日志 (默认30天)" \
    " stats 生成告警统计报告" \
    " test 测试告警系统" \
    "" \
    "示例:" \
    " $0 check # 运行一次检查" \
    " $0 monitor 60 # 每60秒监控一次" \
    " $0 history 100 # 查看最近100条告警" \
    " $0 cleanup 7 # 清理7天前的告警日志"
}
#######################################
# Entry point: dispatch the first argument to the matching sub-command and
# pass the remaining arguments through.
# Arguments: $1 - command (check, monitor, history, cleanup, stats, test,
#                 help); remaining arguments go to the command's handler
# Returns:   exits 0 after printing help when called without arguments;
#            exits 1 on an unknown command
#######################################
main() {
  # Without a command there is nothing to dispatch: show usage and stop.
  if (( $# == 0 )); then
    show_help
    exit 0
  fi

  local action=$1
  shift

  case "$action" in
    check)   run_all_checks ;;
    monitor) continuous_monitoring "$@" ;;
    history) view_alert_history "$@" ;;
    cleanup) cleanup_alert_logs "$@" ;;
    stats)   generate_alert_stats ;;
    test)    test_alerts ;;
    help | --help | -h)
      show_help
      ;;
    *)
      print_message "$RED" "未知命令: $action"
      show_help
      exit 1
      ;;
  esac
}
# Run the command dispatcher with all command-line arguments.
main "$@"