#!/bin/bash
# QAUP alert management script.
#
# Runs health/alert checks against the QAUP containers (postgres, redis, app,
# nginx) and the host, appends every alert to $ALERT_LOG and prints it to the
# console. Run without arguments (or with "help") to list the commands.

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
ALERT_LOG="$PROJECT_ROOT/logs/alerts.log"
# Not referenced by any function in this script; kept so the variable stays
# defined exactly as before.
ALERT_CONFIG="$SCRIPT_DIR/healthcheck.yml"

# Upper bound, in seconds, for every HTTP request issued by this script.
HTTP_TIMEOUT_SECONDS=30

# Colored output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

#######################################
# Print a message wrapped in a color escape sequence.
# Arguments: $1 - color escape code, $2 - message text
# Outputs:   the colored message on stdout
#######################################
print_message() {
  local color=$1
  local message=$2
  # %b expands the escapes of the color codes only; the message itself is
  # printed verbatim so backslashes in log text are not reinterpreted.
  printf '%b%s%b\n' "$color" "$message" "$NC"
}

# Print the current local time as "YYYY-MM-DD HH:MM:SS".
get_timestamp() {
  date '+%Y-%m-%d %H:%M:%S'
}

# Succeed when $1 is a non-negative decimal integer.
_is_uint() {
  [[ ${1:-} =~ ^[0-9]+$ ]]
}

# Succeed when $1 is a non-negative decimal number (integer or fraction).
_is_number() {
  [[ ${1:-} =~ ^[0-9]+([.][0-9]+)?$ ]]
}

# Succeed when number $1 is strictly greater than number $2 (floats allowed).
_float_gt() {
  awk -v a="$1" -v b="$2" 'BEGIN { exit !(a + 0 > b + 0) }'
}

#######################################
# Record an alert in the log file and echo it to the console.
# Globals:   ALERT_LOG (read)
# Arguments: $1 - severity (CRITICAL, WARNING, INFO or anything else)
#            $2 - service name, $3 - message
# Outputs:   appends "[timestamp] [severity] [service] message" to ALERT_LOG
#            and prints a colored line on stdout
#######################################
log_alert() {
  local severity=$1
  local service=$2
  local message=$3
  local timestamp
  timestamp=$(get_timestamp)

  # Make sure the log directory exists
  mkdir -p "$(dirname "$ALERT_LOG")"

  # Append to the log file
  echo "[$timestamp] [$severity] [$service] $message" >> "$ALERT_LOG"

  # Echo to the console
  case "$severity" in
    "CRITICAL")
      print_message "$RED" "🚨 CRITICAL [$service]: $message"
      ;;
    "WARNING")
      print_message "$YELLOW" "⚠️ WARNING [$service]: $message"
      ;;
    "INFO")
      print_message "$BLUE" "ℹ️ INFO [$service]: $message"
      ;;
    *)
      print_message "$GREEN" "✓ [$service]: $message"
      ;;
  esac
}

#######################################
# Check container state: CRITICAL when an expected container is not listed
# by "docker ps", WARNING when its status contains "unhealthy".
#######################################
check_container_alerts() {
  local containers=("qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx")
  local listing container state

  # One listing for all containers instead of two "docker ps" calls each.
  listing=$(docker ps --format "{{.Names}}\t{{.Status}}" 2>/dev/null) || listing=""

  for container in "${containers[@]}"; do
    # Compare the name field exactly so that e.g. "qaup-app-2" is never
    # mistaken for "qaup-app".
    state=$(awk -v name="$container" '
      $1 == name { found = 1; if (index($0, "unhealthy")) bad = 1 }
      END { print (found ? (bad ? "unhealthy" : "running") : "missing") }
    ' <<<"$listing")

    case "$state" in
      missing)
        log_alert "CRITICAL" "$container" "容器已停止运行"
        ;;
      unhealthy)
        log_alert "WARNING" "$container" "容器健康检查失败"
        ;;
    esac
  done
}

#######################################
# Check resource usage: disk usage of the current directory's filesystem
# (> 80%), host memory (> 85%) and per-container CPU (> 80%) / memory (> 85%).
# Values that cannot be read as numbers are skipped instead of raising errors.
#######################################
check_resource_alerts() {
  local disk_usage mem_usage
  local container cpu_pct mem_pct

  # Disk usage. -P keeps each filesystem on one line even for long device
  # names, so the percentage is always field 5 of line 2.
  disk_usage=$(df -P . | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || disk_usage=""
  if _is_uint "$disk_usage" && [ "$disk_usage" -gt 80 ]; then
    log_alert "WARNING" "SYSTEM" "磁盘使用率过高: ${disk_usage}%"
  fi

  # Host memory usage (used / total from the second line of "free").
  mem_usage=$(free 2>/dev/null \
    | awk 'NR == 2 && $2 > 0 { printf "%.0f", $3 * 100 / $2 }') || mem_usage=""
  if _is_uint "$mem_usage" && [ "$mem_usage" -gt 85 ]; then
    log_alert "WARNING" "SYSTEM" "内存使用率过高: ${mem_usage}%"
  fi

  # Per-container usage. Only single-token columns are requested:
  # {{.MemUsage}} renders as "used / limit" (three words), which previously
  # shifted the memory percentage out of the field that was being read.
  while read -r container cpu_pct mem_pct; do
    cpu_pct=${cpu_pct%\%}
    mem_pct=${mem_pct%\%}

    if _is_number "$cpu_pct" && _float_gt "$cpu_pct" 80; then
      log_alert "WARNING" "$container" "CPU使用率过高: ${cpu_pct}%"
    fi

    if _is_number "$mem_pct" && _float_gt "$mem_pct" 85; then
      log_alert "WARNING" "$container" "内存使用率过高: ${mem_pct}%"
    fi
  done < <(docker stats --no-stream \
    --format "{{.Container}} {{.CPUPerc}} {{.MemPerc}}" 2>/dev/null)
}

#######################################
# Measure the total request time of a URL.
# Arguments: $1 - URL
# Outputs:   elapsed time in whole milliseconds; 999000 when the request
#            fails or times out
#######################################
_response_time_ms() {
  local url=$1
  local seconds

  # curl prints the -w value even when the transfer fails, so the fallback
  # must replace the captured value rather than be appended to it.
  seconds=$(curl -o /dev/null -s --max-time "$HTTP_TIMEOUT_SECONDS" \
    -w '%{time_total}' "$url" 2>/dev/null) || seconds=""
  _is_number "$seconds" || seconds=999

  awk -v s="$seconds" 'BEGIN { printf "%.0f\n", s * 1000 }'
}

#######################################
# Check service response times: the application health endpoint (> 5000 ms)
# and the Nginx health endpoint (> 2000 ms).
#######################################
check_response_time_alerts() {
  local app_time_ms nginx_time_ms

  # Application response time
  app_time_ms=$(_response_time_ms "http://localhost:8080/actuator/health")
  if [ "$app_time_ms" -gt 5000 ]; then
    log_alert "WARNING" "qaup-app" "响应时间过长: ${app_time_ms}ms"
  fi

  # Nginx response time
  nginx_time_ms=$(_response_time_ms "http://localhost/health")
  if [ "$nginx_time_ms" -gt 2000 ]; then
    log_alert "WARNING" "qaup-nginx" "响应时间过长: ${nginx_time_ms}ms"
  fi
}

#######################################
# Check error logs: scan the last 5 minutes of each container's log and warn
# when any pattern occurs on more than 10 lines.
#######################################
check_error_log_alerts() {
  local error_patterns=("ERROR" "FATAL" "Exception" "failed" "timeout")
  local containers=("qaup-app" "qaup-nginx" "qaup-postgres" "qaup-redis")
  local container pattern recent_logs error_count

  for container in "${containers[@]}"; do
    # Logs of the last 5 minutes; an unreadable log counts as empty.
    recent_logs=$(docker logs --since 5m "$container" 2>&1) || recent_logs=""

    for pattern in "${error_patterns[@]}"; do
      # grep -c already prints 0 when nothing matches (and exits 1), so the
      # exit status is neutralised without emitting a second "0".
      error_count=$(grep -c -- "$pattern" <<<"$recent_logs" || true)
      if _is_uint "$error_count" && [ "$error_count" -gt 10 ]; then
        log_alert "WARNING" "$container" "检测到大量错误日志: $pattern ($error_count 次)"
      fi
    done
  done
}

#######################################
# Run a counting query inside the qaup-postgres container.
# Arguments: $1 - SQL statement returning a single integer
# Outputs:   the integer, or 0 when the query fails or returns no number
#######################################
_pg_count() {
  local sql=$1
  local value

  value=$(docker exec qaup-postgres psql -U postgres -d qaup -t -c "$sql" 2>/dev/null) \
    || value=""
  value=${value//[[:space:]]/}
  _is_uint "$value" || value=0

  printf '%s\n' "$value"
}

#######################################
# Check the database: number of connections (> 150) and number of sessions
# waiting on a lock (> 5).
#######################################
check_database_alerts() {
  local connection_count lock_waits

  # Connection count
  connection_count=$(_pg_count "SELECT count(*) FROM pg_stat_activity;")
  if [ "$connection_count" -gt 150 ]; then
    log_alert "WARNING" "qaup-postgres" "数据库连接数过多: $connection_count"
  fi

  # Lock waits
  lock_waits=$(_pg_count "SELECT count(*) FROM pg_stat_activity WHERE wait_event_type = 'Lock';")
  if [ "$lock_waits" -gt 5 ]; then
    log_alert "WARNING" "qaup-postgres" "检测到数据库锁等待: $lock_waits 个连接"
  fi
}

#######################################
# Send an e-mail alert (placeholder: only prints what would be sent).
# Arguments: $1 - subject, $2 - message,
#            $3 - recipients (default admin@example.com)
#######################################
send_email_alert() {
  local subject=$1
  local message=$2
  local recipients=${3:-"admin@example.com"}

  # A real mail service can be integrated here,
  # e.g. sendmail, postfix or a third-party mail provider.
  print_message "$BLUE" "邮件告警功能需要配置邮件服务器"
  print_message "$YELLOW" "主题: $subject"
  print_message "$YELLOW" "内容: $message"
  print_message "$YELLOW" "收件人: $recipients"
}

#######################################
# Send a DingTalk / WeCom style webhook alert.
# Arguments: $1 - webhook URL (nothing is sent when empty), $2 - message
# Outputs:   a success or failure line on stdout
# Returns:   0 in both cases, so a failed delivery never aborts the caller
#######################################
send_webhook_alert() {
  local webhook_url=$1
  local message=$2
  local escaped

  if [ -z "$webhook_url" ]; then
    return 0
  fi

  # Escape the characters that would otherwise break the JSON string.
  escaped=${message//\\/\\\\}
  escaped=${escaped//\"/\\\"}
  escaped=${escaped//$'\n'/\\n}

  # curl is tested directly: with "set -e" a separate "$?" check would never
  # be reached after a failure. --fail turns HTTP errors into failures too.
  if curl --fail --max-time "$HTTP_TIMEOUT_SECONDS" -X POST "$webhook_url" \
    -H 'Content-Type: application/json' \
    -d "{\"text\":\"${escaped}\"}" \
    &>/dev/null; then
    print_message "$GREEN" "Webhook 告警发送成功"
  else
    print_message "$RED" "Webhook 告警发送失败"
  fi
}

# Run every check once.
run_all_checks() {
  print_message "$BLUE" "开始系统告警检查..."

  check_container_alerts
  check_resource_alerts
  check_response_time_alerts
  check_error_log_alerts
  check_database_alerts

  print_message "$GREEN" "告警检查完成"
}

#######################################
# Continuous monitoring mode: run all checks forever, sleeping in between.
# Arguments: $1 - interval in seconds (default 300, i.e. 5 minutes)
# Returns:   1 when the interval is not a positive number
#######################################
continuous_monitoring() {
  local interval=${1:-300}

  if ! _is_number "$interval" || ! _float_gt "$interval" 0; then
    print_message "$RED" "无效的监控间隔: $interval" >&2
    return 1
  fi

  print_message "$GREEN" "开始持续告警监控 (间隔: ${interval}秒)"
  print_message "$YELLOW" "按 Ctrl+C 停止监控"

  while true; do
    # Called on the left of "||" so that "set -e" cannot terminate the
    # monitoring loop when a single check fails.
    run_all_checks || print_message "$RED" "告警检查执行失败"
    sleep "$interval"
  done
}

#######################################
# Show the alert history.
# Arguments: $1 - number of trailing log lines to read (default 50)
#            $2 - optional grep pattern applied to those lines
#######################################
view_alert_history() {
  local lines=${1:-50}
  local filter=${2:-""}

  if [ ! -f "$ALERT_LOG" ]; then
    print_message "$YELLOW" "告警日志文件不存在: $ALERT_LOG"
    return
  fi

  print_message "$BLUE" "告警历史 (最近 $lines 条):"
  if [ -n "$filter" ]; then
    # grep exits 1 when nothing matches; that is a valid (empty) result and
    # must not abort the script. Real grep errors (status 2) still propagate.
    tail -n "$lines" "$ALERT_LOG" | grep -- "$filter" || [ $? -eq 1 ]
  else
    tail -n "$lines" "$ALERT_LOG"
  fi
}

#######################################
# Trim the alert log, keeping only recent entries.
# Arguments: $1 - number of days to keep (default 30)
# Outputs:   writes a backup copy next to the log, rewrites the log
# Returns:   1 when the day count is not a non-negative integer
#######################################
cleanup_alert_logs() {
  local days=${1:-30}
  local backup_base backup_file suffix cutoff_date

  if ! _is_uint "$days"; then
    print_message "$RED" "无效的天数: $days" >&2
    return 1
  fi

  if [ ! -f "$ALERT_LOG" ]; then
    print_message "$YELLOW" "告警日志文件不存在"
    return 0
  fi

  # Back up the current log. Never reuse an existing backup name: a second
  # cleanup on the same day would otherwise replace the full backup with the
  # already-trimmed log.
  backup_base="${ALERT_LOG}.backup.$(date +%Y%m%d)"
  backup_file=$backup_base
  suffix=1
  while [ -e "$backup_file" ]; do
    backup_file="${backup_base}.${suffix}"
    suffix=$((suffix + 1))
  done
  cp -- "$ALERT_LOG" "$backup_file"

  # Keep only the entries of the last $days days. Every entry starts with
  # "[YYYY-MM-DD", so a plain string comparison orders them chronologically.
  cutoff_date=$(date -d "$days days ago" '+%Y-%m-%d')
  awk -v cutoff="[$cutoff_date" '$0 >= cutoff' "$ALERT_LOG" > "${ALERT_LOG}.tmp"
  mv -- "${ALERT_LOG}.tmp" "$ALERT_LOG"

  print_message "$GREEN" "告警日志已清理,保留 $days 天内的记录"
  print_message "$BLUE" "备份文件: $backup_file"
}

#######################################
# Print an alert statistics report: counts per severity, counts per service
# and the number of alerts logged during the last 24 hours.
#######################################
generate_alert_stats() {
  local parsed cutoff recent_alerts

  if [ ! -f "$ALERT_LOG" ]; then
    print_message "$YELLOW" "告警日志文件不存在"
    return
  fi

  print_message "$BLUE" "告警统计报告:"
  echo ""

  # Reduce every "[time] [severity] [service] message" line to
  # "severity|service" so the message text can never be counted by mistake.
  parsed=$(sed -n 's/^\[[^]]*] \[\([^]]*\)] \[\([^]]*\)].*$/\1|\2/p' "$ALERT_LOG")

  # Per severity
  print_message "$BLUE" "按严重程度统计:"
  awk -F'|' '
    NF { count[$1]++ }
    END {
      printf "CRITICAL: %d\n", count["CRITICAL"]
      printf "WARNING: %d\n", count["WARNING"]
      printf "INFO: %d\n", count["INFO"]
    }
  ' <<<"$parsed"
  echo ""

  # Per service
  print_message "$BLUE" "按服务统计:"
  awk -F'|' 'NF { print $2 }' <<<"$parsed" | sort | uniq -c | sort -nr
  echo ""

  # Last 24 hours: compare the full timestamp prefix against the cutoff.
  cutoff=$(date -d "24 hours ago" '+%Y-%m-%d %H:%M:%S')
  recent_alerts=$(awk -v cutoff="[$cutoff" '
    /^\[/ && $0 >= cutoff { n++ }
    END { print n + 0 }
  ' "$ALERT_LOG")
  print_message "$BLUE" "最近24小时告警数: $recent_alerts"
}

# Exercise the alert pipeline by logging one alert of each severity.
test_alerts() {
  print_message "$BLUE" "测试告警系统..."

  log_alert "INFO" "SYSTEM" "告警系统测试开始"
  log_alert "WARNING" "TEST" "这是一个测试警告"
  log_alert "CRITICAL" "TEST" "这是一个测试严重告警"
  log_alert "INFO" "SYSTEM" "告警系统测试完成"

  print_message "$GREEN" "告警系统测试完成,请检查日志文件: $ALERT_LOG"
}

# Print usage information.
show_help() {
  echo "QAUP 告警管理脚本"
  echo ""
  echo "用法: $0 [命令] [选项]"
  echo ""
  echo "命令:"
  echo " check 运行所有告警检查"
  echo " monitor [interval] 持续监控模式 (默认300秒)"
  echo " history [lines] [filter] 查看告警历史"
  echo " cleanup [days] 清理告警日志 (默认30天)"
  echo " stats 生成告警统计报告"
  echo " test 测试告警系统"
  echo ""
  echo "示例:"
  echo " $0 check # 运行一次检查"
  echo " $0 monitor 60 # 每60秒监控一次"
  echo " $0 history 100 # 查看最近100条告警"
  echo " $0 cleanup 7 # 清理7天前的告警日志"
}

#######################################
# Entry point: dispatch the first argument to the matching command.
# Arguments: $1 - command, remaining arguments are passed to the command
# Returns:   0 after printing help when no command is given; exits 1 on an
#            unknown command
#######################################
main() {
  if [ $# -eq 0 ]; then
    show_help
    # "return" instead of "exit": main is the last statement of the script,
    # so the exit status is the same.
    return 0
  fi

  local command=$1
  shift

  case "$command" in
    check)
      run_all_checks
      ;;
    monitor)
      continuous_monitoring "$@"
      ;;
    history)
      view_alert_history "$@"
      ;;
    cleanup)
      cleanup_alert_logs "$@"
      ;;
    stats)
      generate_alert_stats
      ;;
    test)
      test_alerts
      ;;
    help|--help|-h)
      show_help
      ;;
    *)
      print_message "$RED" "未知命令: $command"
      show_help
      exit 1
      ;;
  esac
}

main "$@"