QAUP_Management/deploy/docker/alert-manager.sh

#!/bin/bash

# QAUP alert management script

# Stop at the first command that fails outside of a conditional context.
set -o errexit

# Absolute directory holding this script, and the project root two levels up.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
PROJECT_ROOT=$(cd "$SCRIPT_DIR/../.." && pwd)

# Health-check config path next to this script, and the alert log that
# log_alert appends to.
ALERT_CONFIG="$SCRIPT_DIR/healthcheck.yml"
ALERT_LOG="$PROJECT_ROOT/logs/alerts.log"

# ANSI escape sequences for colored console output (NC resets the color).
NC='\033[0m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'

# Print a message wrapped in a color escape sequence.
# Globals:   NC (read) - reset sequence appended after the message
# Arguments: $1 - color escape sequence, e.g. "$RED"
#            $2 - message text; backslash escapes in it are expanded
# Outputs:   the colored message followed by a newline on stdout
print_message() {
    local color=$1
    local message=$2
    # printf '%b' expands backslash escapes the way `echo -e` does, but it
    # never treats its operand as an option, so text such as "-n" or "-e"
    # is printed instead of being silently swallowed.
    printf '%b\n' "${color}${message}${NC}"
}

# Emit the current local time as "YYYY-MM-DD HH:MM:SS" on stdout.
get_timestamp() {
    # %F expands to %Y-%m-%d and %T to %H:%M:%S.
    date '+%F %T'
}

# Record one alert: append it to the alert log and echo it to the console.
# Globals:   ALERT_LOG (read), RED/YELLOW/BLUE/GREEN (read)
# Arguments: $1 - severity (CRITICAL, WARNING, INFO; anything else is
#                 printed in green with a check mark)
#            $2 - service or container name
#            $3 - alert message
# Outputs:   one "[timestamp] [severity] [service] message" line appended to
#            ALERT_LOG, and one colored line printed through print_message
log_alert() {
    local severity=$1 service=$2 message=$3
    local timestamp="$(get_timestamp)"
    local color label

    # Create the log directory on demand, then append the log record.
    mkdir -p "$(dirname "$ALERT_LOG")"
    echo "[$timestamp] [$severity] [$service] $message" >> "$ALERT_LOG"

    # Choose the console color and prefix from the severity.
    case "$severity" in
        CRITICAL)
            color=$RED
            label="🚨 CRITICAL [$service]"
            ;;
        WARNING)
            color=$YELLOW
            label="⚠️  WARNING [$service]"
            ;;
        INFO)
            color=$BLUE
            label="ℹ️  INFO [$service]"
            ;;
        *)
            color=$GREEN
            label="✓ [$service]"
            ;;
    esac

    print_message $color "$label: $message"
}

# Check that every expected QAUP container is running and healthy.
# Raises a CRITICAL alert for a container missing from `docker ps`, and a
# WARNING alert for one whose status text contains "unhealthy".
# Arguments: none
# Outputs:   alerts via log_alert
check_container_alerts() {
    local containers=("qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx")
    local snapshot container status

    # One `docker ps` call for all containers, so every check sees the same
    # state. A docker failure leaves the snapshot empty, which reports every
    # container as stopped.
    snapshot=$(docker ps --format '{{.Names}} {{.Status}}') || snapshot=""

    for container in "${containers[@]}"; do
        # Compare the name field exactly: a substring match would let an
        # unhealthy "qaup-app-worker" raise an alert for "qaup-app".
        # awk exits non-zero when no line carries this exact name.
        if ! status=$(awk -v name="$container" \
            '$1 == name { $1 = ""; print; found = 1; exit } END { exit !found }' \
            <<<"$snapshot"); then
            log_alert "CRITICAL" "$container" "容器已停止运行"
        elif [[ "$status" == *unhealthy* ]]; then
            log_alert "WARNING" "$container" "容器健康检查失败"
        fi
    done
}

# Succeed when a non-negative decimal value is strictly above a limit.
# Arguments: $1 - value to test; anything that is not a plain decimal
#                 number (empty, "--", ...) counts as "not above"
#            $2 - numeric limit
_value_exceeds() {
    [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]] || return 1
    # awk does the floating-point comparison, so no dependency on bc.
    awk -v value="$1" -v limit="$2" 'BEGIN { exit !(value + 0 > limit + 0) }'
}

# Check host disk/memory usage and per-container CPU/memory usage.
# Thresholds: disk > 80%, host memory > 85%, container CPU > 80%,
# container memory > 85%.
# Arguments: none
# Outputs:   WARNING alerts via log_alert
check_resource_alerts() {
    local disk_usage host_mem_usage
    local container cpu_usage mem_usage

    # Disk usage of the filesystem holding the current directory. -P keeps
    # each filesystem on one line so row 2 / column 5 is always the figure.
    disk_usage=$(df -P . | awk 'NR==2 { sub(/%/, "", $5); print $5 }') || disk_usage=""
    if _value_exceeds "$disk_usage" 80; then
        log_alert "WARNING" "SYSTEM" "磁盘使用率过高: ${disk_usage}%"
    fi

    # Host memory: used / total taken from the second line of `free`.
    # The $2 > 0 guard avoids a division by zero on unexpected output.
    host_mem_usage=$(free | awk 'NR==2 && $2 > 0 { printf "%.0f", $3 * 100 / $2 }') || host_mem_usage=""
    if _value_exceeds "$host_mem_usage" 85; then
        log_alert "WARNING" "SYSTEM" "内存使用率过高: ${host_mem_usage}%"
    fi

    # Per-container usage. Only single-word columns are requested:
    # {{.MemUsage}} expands to "used / limit" (three words), which would
    # shift the memory percentage out of the column being read.
    # {{.Name}} reports the container name, like the other container alerts.
    while read -r container cpu_usage mem_usage; do
        cpu_usage=${cpu_usage%\%}
        mem_usage=${mem_usage%\%}

        if _value_exceeds "$cpu_usage" 80; then
            log_alert "WARNING" "$container" "CPU使用率过高: ${cpu_usage}%"
        fi

        if _value_exceeds "$mem_usage" 85; then
            log_alert "WARNING" "$container" "内存使用率过高: ${mem_usage}%"
        fi
    done < <(docker stats --no-stream --format "{{.Name}} {{.CPUPerc}} {{.MemPerc}}" 2>/dev/null)
}

# Measure the total response time of a URL in whole milliseconds.
# Arguments: $1 - URL to request
# Outputs:   the response time in ms on stdout; 999000 (999 s) when the
#            request fails, times out, or yields no usable number
_response_time_ms() {
    local url=$1
    local seconds

    # curl prints the -w value even when the transfer fails, so the exit
    # status decides whether the number is trusted; appending a fallback to
    # curl's output would produce values like "0.002999".
    # --max-time keeps an unresponsive service from blocking the checks.
    seconds=$(curl -o /dev/null -s --max-time 10 -w '%{time_total}' "$url" 2>/dev/null) || seconds=""
    if [[ ! "$seconds" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        seconds=999
    fi

    awk -v s="$seconds" 'BEGIN { printf "%.0f\n", s * 1000 }'
}

# Check the response time of the application and Nginx health endpoints.
# Thresholds: application > 5000 ms, Nginx > 2000 ms. An unreachable
# endpoint is reported as 999000 ms and therefore always alerts.
# Arguments: none
# Outputs:   WARNING alerts via log_alert
check_response_time_alerts() {
    local app_time_ms nginx_time_ms

    # Application health endpoint
    app_time_ms=$(_response_time_ms "http://localhost:8080/actuator/health")
    if (( app_time_ms > 5000 )); then
        log_alert "WARNING" "qaup-app" "响应时间过长: ${app_time_ms}ms"
    fi

    # Nginx health endpoint
    nginx_time_ms=$(_response_time_ms "http://localhost/health")
    if (( nginx_time_ms > 2000 )); then
        log_alert "WARNING" "qaup-nginx" "响应时间过长: ${nginx_time_ms}ms"
    fi
}

# Scan the last five minutes of each container's logs for error keywords.
# Raises a WARNING when one keyword appears on more than 10 lines.
# Arguments: none
# Outputs:   WARNING alerts via log_alert
check_error_log_alerts() {
    local error_patterns=("ERROR" "FATAL" "Exception" "failed" "timeout")
    local containers=("qaup-app" "qaup-nginx" "qaup-postgres" "qaup-redis")
    local container pattern recent_logs error_count

    for container in "${containers[@]}"; do
        # Logs from the last 5 minutes, stdout and stderr combined. A docker
        # failure is tolerated; whatever it printed is scanned like any log.
        recent_logs=$(docker logs "$container" --since 5m 2>&1) || true

        for pattern in "${error_patterns[@]}"; do
            # grep -c prints the count itself, "0" included, and only signals
            # "no match" through exit status 1. Echoing another "0" on that
            # status would make the value two lines and break the test below.
            error_count=$(grep -c -- "$pattern" <<<"$recent_logs") || true

            if [[ "$error_count" =~ ^[0-9]+$ ]] && [ "$error_count" -gt 10 ]; then
                log_alert "WARNING" "$container" "检测到大量错误日志: $pattern ($error_count 次)"
            fi
        done
    done
}

# Run a counting query inside the qaup-postgres container.
# Arguments: $1 - SQL statement expected to return a single integer
# Outputs:   the integer on stdout, or 0 when the query fails or returns
#            anything that is not a plain non-negative integer
_pg_count() {
    local sql=$1
    local value

    # The pipeline's status is that of tr, not docker, so a failure shows up
    # as empty output and is caught by the numeric check below.
    value=$(docker exec qaup-postgres psql -U postgres -d qaup -t -c "$sql" 2>/dev/null \
        | tr -d '[:space:]') || value=""

    if [[ "$value" =~ ^[0-9]+$ ]]; then
        printf '%s\n' "$value"
    else
        printf '0\n'
    fi
}

# Check PostgreSQL connection pressure.
# Thresholds: more than 150 sessions in pg_stat_activity, or more than 5
# sessions waiting on a lock.
# Arguments: none
# Outputs:   WARNING alerts via log_alert
check_database_alerts() {
    local connection_count lock_waits

    # Total number of sessions
    connection_count=$(_pg_count "SELECT count(*) FROM pg_stat_activity;")
    if [ "$connection_count" -gt 150 ]; then
        log_alert "WARNING" "qaup-postgres" "数据库连接数过多: $connection_count"
    fi

    # Sessions currently waiting on a lock
    lock_waits=$(_pg_count "SELECT count(*) FROM pg_stat_activity WHERE wait_event_type = 'Lock';")
    if [ "$lock_waits" -gt 5 ]; then
        log_alert "WARNING" "qaup-postgres" "检测到数据库锁等待: $lock_waits 个连接"
    fi
}

# Email alert placeholder: no mail is sent; the function only prints what
# would be sent. A real mail transport still has to be integrated here.
# Arguments: $1 - subject
#            $2 - message body
#            $3 - recipients (defaults to admin@example.com)
# Outputs:   four lines printed through print_message
send_email_alert() {
    local subject=$1 message=$2
    local recipients=${3:-"admin@example.com"}
    local detail

    print_message $BLUE "邮件告警功能需要配置邮件服务器"

    # Echo the would-be mail fields, one per line.
    for detail in "主题: $subject" "内容: $message" "收件人: $recipients"; do
        print_message $YELLOW "$detail"
    done
}

# Send an alert to a chat webhook (DingTalk / WeCom style) as JSON.
# Arguments: $1 - webhook URL; nothing is sent when it is empty
#            $2 - message text
# Outputs:   a success or failure line through print_message
send_webhook_alert() {
    local webhook_url=$1
    local message=$2
    local escaped

    [ -n "$webhook_url" ] || return 0

    # Escape the characters that would otherwise break the JSON string.
    # Backslashes go first so the escapes added afterwards stay intact.
    escaped=$message
    escaped=${escaped//\\/\\\\}
    escaped=${escaped//\"/\\\"}
    escaped=${escaped//$'\n'/\\n}
    escaped=${escaped//$'\r'/\\r}
    escaped=${escaped//$'\t'/\\t}

    # curl runs as the `if` condition so that a failure reaches the else
    # branch even under `set -e`. --fail turns HTTP 4xx/5xx into a failure
    # and --max-time bounds how long a dead endpoint can block the caller.
    if curl -X POST "$webhook_url" \
        --fail --max-time 10 \
        -H 'Content-Type: application/json' \
        -d "{\"text\":\"$escaped\"}" \
        >/dev/null 2>&1; then
        print_message "$GREEN" "Webhook 告警发送成功"
    else
        print_message "$RED" "Webhook 告警发送失败"
    fi
}

# Run every alert check once, in a fixed order, between a start and an end
# banner.
# Arguments: none
# Outputs:   banners via print_message plus whatever the checks report
run_all_checks() {
    local check

    print_message $BLUE "开始系统告警检查..."

    for check in \
        check_container_alerts \
        check_resource_alerts \
        check_response_time_alerts \
        check_error_log_alerts \
        check_database_alerts; do
        "$check"
    done

    print_message $GREEN "告警检查完成"
}

# Run all checks in an endless loop, pausing between rounds.
# Arguments: $1 - pause between rounds, passed to `sleep`: a non-negative
#                 number with an optional s/m/h/d suffix (default 300,
#                 i.e. five minutes)
# Returns:   1 without running anything when the interval is malformed;
#            otherwise it loops until interrupted
continuous_monitoring() {
    local interval=${1:-300}  # default: one round every 5 minutes

    # Reject a bad interval up front. Otherwise `sleep` fails only after the
    # first full round, and without `set -e` the loop would spin with no
    # pause at all.
    if [[ ! "$interval" =~ ^[0-9]+([.][0-9]+)?[smhd]?$ ]]; then
        print_message "$RED" "无效的监控间隔: $interval" >&2
        return 1
    fi

    print_message "$GREEN" "开始持续告警监控 (间隔: ${interval}秒)"
    print_message "$YELLOW" "按 Ctrl+C 停止监控"

    while true; do
        run_all_checks
        sleep "$interval"
    done
}

# Show the most recent entries of the alert log.
# Globals:   ALERT_LOG (read)
# Arguments: $1 - number of trailing log lines to read (default 50)
#            $2 - optional grep pattern; only matching lines are shown
# Outputs:   a header via print_message followed by the selected lines
# Returns:   0 even when the filter matches nothing; non-zero only when
#            tail or grep itself fails
view_alert_history() {
    local lines=${1:-50}
    local filter=${2:-""}

    if [ ! -f "$ALERT_LOG" ]; then
        print_message "$YELLOW" "告警日志文件不存在: $ALERT_LOG"
        return
    fi

    print_message "$BLUE" "告警历史 (最近 $lines 条):"

    if [ -n "$filter" ]; then
        # "--" keeps a filter such as "-c" from being parsed as a grep option.
        # grep exits 1 when nothing matches; for a viewer that is an empty
        # result, not an error, so only statuses above 1 are passed on.
        tail -n "$lines" "$ALERT_LOG" | grep -- "$filter" || [ $? -eq 1 ]
    else
        tail -n "$lines" "$ALERT_LOG"
    fi
}

# Trim the alert log to recent entries after taking a backup copy.
# Globals:   ALERT_LOG (read; the file is rewritten)
# Arguments: $1 - number of days to keep (default 30); must be a
#                 non-negative integer
# Outputs:   status lines via print_message
# Returns:   0 on success or when there is no log; 1 when the argument is
#            invalid, the cutoff date cannot be computed, or a file
#            operation fails (the log is left untouched in those cases)
cleanup_alert_logs() {
    local days=${1:-30}
    local cutoff_date backup_base backup_file tmp_file
    local suffix=0

    if [[ ! "$days" =~ ^[0-9]+$ ]]; then
        print_message "$RED" "无效的保留天数: $days" >&2
        return 1
    fi

    if [ ! -f "$ALERT_LOG" ]; then
        print_message "$YELLOW" "告警日志文件不存在"
        return 0
    fi

    # Compute the cutoff before touching any file. GNU date is tried first,
    # then the BSD/macOS spelling.
    cutoff_date=$(date -d "$days days ago" '+%Y-%m-%d' 2>/dev/null) \
        || cutoff_date=$(date -v "-${days}d" '+%Y-%m-%d' 2>/dev/null) \
        || cutoff_date=""
    if [ -z "$cutoff_date" ]; then
        print_message "$RED" "无法计算截止日期" >&2
        return 1
    fi

    # Back up the current log. Never overwrite an existing backup: a second
    # run on the same day would otherwise replace the full copy with the
    # already-trimmed log and lose the old entries for good.
    backup_base="${ALERT_LOG}.backup.$(date +%Y%m%d)"
    backup_file=$backup_base
    while [ -e "$backup_file" ]; do
        suffix=$((suffix + 1))
        backup_file="${backup_base}.${suffix}"
    done
    cp "$ALERT_LOG" "$backup_file" || return 1

    # Keep lines whose "[YYYY-MM-DD" prefix sorts at or after the cutoff.
    tmp_file="${ALERT_LOG}.tmp"
    if ! awk -v cutoff="$cutoff_date" '$0 >= "[" cutoff { print }' "$ALERT_LOG" > "$tmp_file"; then
        rm -f -- "$tmp_file"
        return 1
    fi
    mv "$tmp_file" "$ALERT_LOG" || return 1

    print_message "$GREEN" "告警日志已清理，保留 $days 天内的记录"
    print_message "$BLUE" "备份文件: $backup_file"
}

# Print a statistics report for the alert log: counts per severity, counts
# per service, and the number of alerts in the last 24 hours.
# Only lines shaped like "[timestamp] [severity] [service] message" (the
# format log_alert writes) are counted.
# Globals:   ALERT_LOG (read)
# Arguments: none
# Outputs:   the report on stdout
generate_alert_stats() {
    if [ ! -f "$ALERT_LOG" ]; then
        print_message "$YELLOW" "告警日志文件不存在"
        return
    fi

    # awk helper shared by the passes below. It splits a log line into the
    # globals ts / sev / svc using literal string searches only, so words
    # that merely appear in the message text are never counted as a
    # severity or a service.
    local awk_lib='
        function parse_entry(line,    i, j, k, rest) {
            ts = ""; sev = ""; svc = ""
            if (substr(line, 1, 1) != "[") return 0
            i = index(line, "] [")
            if (i == 0) return 0
            ts = substr(line, 2, i - 2)
            rest = substr(line, i + 3)
            j = index(rest, "] [")
            if (j == 0) return 0
            sev = substr(rest, 1, j - 1)
            rest = substr(rest, j + 3)
            k = index(rest, "]")
            if (k == 0) return 0
            svc = substr(rest, 1, k - 1)
            return 1
        }'
    local cutoff recent_alerts

    print_message "$BLUE" "告警统计报告:"
    echo ""

    # Counts per severity; %d prints a missing counter as a single 0.
    print_message "$BLUE" "按严重程度统计:"
    awk "$awk_lib"'
        parse_entry($0) { count[sev]++ }
        END {
            printf "CRITICAL: %d\n", count["CRITICAL"]
            printf "WARNING:  %d\n", count["WARNING"]
            printf "INFO:     %d\n", count["INFO"]
        }' "$ALERT_LOG"
    echo ""

    # Counts per service (third bracketed field), most frequent first.
    print_message "$BLUE" "按服务统计:"
    awk "$awk_lib"'
        parse_entry($0) { print svc }' "$ALERT_LOG" | sort | uniq -c | sort -nr
    echo ""

    # Alerts whose timestamp lies within the last 24 hours. The timestamp
    # format sorts chronologically, so a string comparison is enough.
    # GNU date is tried first, then the BSD/macOS spelling; if neither
    # works the cutoff is empty and every parsed entry is counted.
    cutoff=$(date -d '24 hours ago' '+%Y-%m-%d %H:%M:%S' 2>/dev/null) \
        || cutoff=$(date -v-24H '+%Y-%m-%d %H:%M:%S' 2>/dev/null) \
        || cutoff=""
    recent_alerts=$(awk -v cutoff="$cutoff" "$awk_lib"'
        parse_entry($0) && ts >= cutoff { n++ }
        END { print n + 0 }' "$ALERT_LOG")
    print_message "$BLUE" "最近24小时告警数: $recent_alerts"
}

# Exercise the alert pipeline by logging four sample alerts (INFO, WARNING,
# CRITICAL, INFO) and pointing the user at the log file.
# Globals:   ALERT_LOG (read, only for the closing message)
# Arguments: none
test_alerts() {
    # Each sample is "severity|service|message".
    local samples=(
        "INFO|SYSTEM|告警系统测试开始"
        "WARNING|TEST|这是一个测试警告"
        "CRITICAL|TEST|这是一个测试严重告警"
        "INFO|SYSTEM|告警系统测试完成"
    )
    local sample severity service message

    print_message $BLUE "测试告警系统..."

    for sample in "${samples[@]}"; do
        IFS='|' read -r severity service message <<<"$sample"
        log_alert "$severity" "$service" "$message"
    done

    print_message $GREEN "告警系统测试完成，请检查日志文件: $ALERT_LOG"
}

# Print the usage text: available commands and a few example invocations.
# Arguments: none
# Outputs:   the help text on stdout, with $0 as the program name
show_help() {
    # One argument per output line; empty strings produce the blank lines.
    printf '%s\n' \
        "QAUP 告警管理脚本" \
        "" \
        "用法: $0 [命令] [选项]" \
        "" \
        "命令:" \
        "  check               运行所有告警检查" \
        "  monitor [interval]  持续监控模式 (默认300秒)" \
        "  history [lines]     查看告警历史" \
        "  cleanup [days]      清理告警日志 (默认30天)" \
        "  stats               生成告警统计报告" \
        "  test                测试告警系统" \
        "" \
        "示例:" \
        "  $0 check                    # 运行一次检查" \
        "  $0 monitor 60               # 每60秒监控一次" \
        "  $0 history 100              # 查看最近100条告警" \
        "  $0 cleanup 7                # 清理7天前的告警日志"
}

# Command-line entry point: dispatch the first argument to its handler and
# pass the remaining arguments through.
# Arguments: $1  - command (check, monitor, history, cleanup, stats, test,
#                  help / --help / -h)
#            $2+ - options forwarded to monitor, history and cleanup
# Exits:     0 after printing help when called without arguments,
#            1 on an unknown command
main() {
    # Nothing to dispatch: show the usage text and stop.
    if (( $# == 0 )); then
        show_help
        exit 0
    fi

    local command=$1
    shift

    case "$command" in
        help | --help | -h) show_help ;;
        check)   run_all_checks ;;
        stats)   generate_alert_stats ;;
        test)    test_alerts ;;
        monitor) continuous_monitoring "$@" ;;
        history) view_alert_history "$@" ;;
        cleanup) cleanup_alert_logs "$@" ;;
        *)
            print_message $RED "未知命令: $command"
            show_help
            exit 1
            ;;
    esac
}

# Script entry point: hand every command-line argument to main unchanged.
main "$@"