#!/bin/bash
#
# QAUP alert management script.
#
# Runs alert checks, appends alerts to a log file and provides helpers to
# view, prune and summarise that log.

set -e

# Directory containing this script, and the project root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# File that alerts are appended to.
ALERT_LOG="$PROJECT_ROOT/logs/alerts.log"
# Health-check configuration path next to this script.
ALERT_CONFIG="$SCRIPT_DIR/healthcheck.yml"

# Colour output: ANSI escape sequences stored with literal backslashes, so
# they must be printed with an escape-interpreting command.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Print a message wrapped in a colour escape sequence.
# Globals:   NC (read) - reset sequence appended after the message
# Arguments: $1 - colour escape sequence (e.g. "$RED")
#            $2 - message text, printed verbatim
# Outputs:   the coloured message followed by a newline on stdout
print_message() {
  local color=$1
  local message=$2
  # %b expands the backslash escapes stored in the colour variables; %s keeps
  # the message literal so backslashes in alert text are not reinterpreted.
  printf '%b%s%b\n' "$color" "$message" "$NC"
}

# Get a timestamp.
# Outputs: the current date and time as "YYYY-MM-DD HH:MM:SS" on stdout
get_timestamp() {
  date '+%Y-%m-%d %H:%M:%S'
}

# Record an alert: append it to the alert log and print it to the console.
# Globals:   ALERT_LOG (read); RED, YELLOW, BLUE, GREEN (read)
# Arguments: $1 - severity: CRITICAL, WARNING or INFO (anything else is
#                 printed in green with a check mark)
#            $2 - service / component name
#            $3 - alert message
# Outputs:   one coloured line on stdout; one line of the form
#            "[timestamp] [severity] [service] message" appended to ALERT_LOG
log_alert() {
  local severity=$1
  local service=$2
  local message=$3
  # Declared separately so a failing get_timestamp is not masked by `local`.
  local timestamp
  timestamp=$(get_timestamp)

  # Make sure the log directory exists.
  mkdir -p "$(dirname "$ALERT_LOG")"

  # Append to the log file.
  printf '[%s] [%s] [%s] %s\n' "$timestamp" "$severity" "$service" "$message" >> "$ALERT_LOG"

  # Echo to the console, coloured by severity.
  case "$severity" in
    CRITICAL)
      print_message "$RED" "🚨 CRITICAL [$service]: $message"
      ;;
    WARNING)
      print_message "$YELLOW" "⚠️ WARNING [$service]: $message"
      ;;
    INFO)
      print_message "$BLUE" "ℹ️ INFO [$service]: $message"
      ;;
    *)
      print_message "$GREEN" "✓ [$service]: $message"
      ;;
  esac
}

# Check container status.
# Raises a CRITICAL alert for every expected container that is not listed by
# `docker ps`, and a WARNING for every listed one whose status text contains
# "unhealthy".
# Globals:   none
# Arguments: none
# Outputs:   whatever log_alert emits for each problem found
check_container_alerts() {
  local containers=("qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx")
  local -A status_by_name=()
  local listing name status container

  # Query Docker once; if the call fails, treat every container as not running.
  listing=$(docker ps --format "{{.Names}}\t{{.Status}}") || listing=""

  while IFS=$'\t' read -r name status; do
    if [[ -n "$name" ]]; then
      status_by_name["$name"]=$status
    fi
  done <<<"$listing"

  for container in "${containers[@]}"; do
    # Exact-name lookup, so "qaup-app-2" can never be mistaken for "qaup-app".
    if [[ -z "${status_by_name[$container]+set}" ]]; then
      log_alert "CRITICAL" "$container" "容器已停止运行"
    elif [[ "${status_by_name[$container]}" == *unhealthy* ]]; then
      log_alert "WARNING" "$container" "容器健康检查失败"
    fi
  done
}

# Succeed when $1 is a plain non-negative decimal number greater than $2.
_qaup_pct_exceeds() {
  [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]] || return 1
  awk -v value="$1" -v limit="$2" 'BEGIN { exit !(value + 0 > limit + 0) }'
}

# Check resource usage.
# Raises WARNING alerts when disk usage of the current directory's filesystem
# exceeds 80%, host memory usage exceeds 85%, or any container reported by
# `docker stats` exceeds 80% CPU or 85% memory.
# Globals:   none
# Arguments: none
# Outputs:   whatever log_alert emits for each threshold exceeded
check_resource_alerts() {
  local disk_usage mem_usage container cpu_pct mem_pct

  # Disk usage. -P keeps each filesystem on a single line.
  disk_usage=$(df -P . | awk 'NR==2 { sub(/%/, "", $5); print $5 }') || disk_usage=""
  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && [ "$disk_usage" -gt 80 ]; then
    log_alert "WARNING" "SYSTEM" "磁盘使用率过高: ${disk_usage}%"
  fi

  # Host memory usage, taken from line 2 of `free` (used / total).
  mem_usage=$(free | awk 'NR==2 && $2 > 0 { printf "%.0f", $3 * 100 / $2 }') || mem_usage=""
  if [[ "$mem_usage" =~ ^[0-9]+$ ]] && [ "$mem_usage" -gt 85 ]; then
    log_alert "WARNING" "SYSTEM" "内存使用率过高: ${mem_usage}%"
  fi

  # Per-container usage. Only space-free fields are requested so `read` can
  # split them reliably; {{.MemUsage}} prints as "used / limit" and would
  # shift the columns.
  while read -r container cpu_pct mem_pct; do
    cpu_pct=${cpu_pct%\%}
    mem_pct=${mem_pct%\%}

    if _qaup_pct_exceeds "$cpu_pct" 80; then
      log_alert "WARNING" "$container" "CPU使用率过高: ${cpu_pct}%"
    fi

    if _qaup_pct_exceeds "$mem_pct" 85; then
      log_alert "WARNING" "$container" "内存使用率过高: ${mem_pct}%"
    fi
  done < <(docker stats --no-stream --format "{{.Container}} {{.CPUPerc}} {{.MemPerc}}" 2>/dev/null)
}

# Measure one endpoint and raise a WARNING when it is slower than the limit.
# Arguments: $1 - service name used in the alert
#            $2 - URL to request
#            $3 - threshold in whole milliseconds
_qaup_check_response_time() {
  local service=$1
  local url=$2
  local limit_ms=$3
  local seconds elapsed_ms

  # curl prints %{time_total} even when the request fails, so its output
  # cannot be combined with a fallback; on failure or unusable output the
  # measurement is replaced by 999 seconds to force an alert.
  # --max-time keeps an unresponsive endpoint from blocking the checks.
  if ! seconds=$(curl -o /dev/null -s --max-time 10 -w '%{time_total}' "$url" 2>/dev/null); then
    seconds=999
  fi
  if [[ ! "$seconds" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    seconds=999
  fi

  elapsed_ms=$(awk -v s="$seconds" 'BEGIN { printf "%.0f", s * 1000 }')

  if (( elapsed_ms > limit_ms )); then
    log_alert "WARNING" "$service" "响应时间过长: ${elapsed_ms}ms"
  fi
}

# Check service response times.
# Warns when the application health endpoint takes longer than 5000 ms or the
# Nginx health endpoint takes longer than 2000 ms; an unreachable endpoint is
# reported as 999000 ms.
# Globals:   none
# Arguments: none
# Outputs:   whatever log_alert emits for each slow or unreachable endpoint
check_response_time_alerts() {
  # Application response time.
  _qaup_check_response_time "qaup-app" "http://localhost:8080/actuator/health" 5000

  # Nginx response time.
  _qaup_check_response_time "qaup-nginx" "http://localhost/health" 2000
}

# Check error logs.
# For each container, counts the lines of the last 5 minutes of
# `docker logs` output that contain each error keyword, and raises a WARNING
# when a keyword appears on more than 10 lines.
# Globals:   none
# Arguments: none
# Outputs:   whatever log_alert emits for each noisy keyword
check_error_log_alerts() {
  local error_patterns=("ERROR" "FATAL" "Exception" "failed" "timeout")
  local containers=("qaup-app" "qaup-nginx" "qaup-postgres" "qaup-redis")
  local container pattern recent_logs error_count

  for container in "${containers[@]}"; do
    # Logs from the last 5 minutes (stdout and stderr). A failing docker call
    # must not abort the remaining checks.
    recent_logs=$(docker logs "$container" --since 5m 2>&1) || true

    for pattern in "${error_patterns[@]}"; do
      # grep -c already prints 0 when nothing matches but exits 1, so only
      # the exit status is neutralised here.
      error_count=$(grep -c -F -- "$pattern" <<<"$recent_logs") || true

      if [ "${error_count:-0}" -gt 10 ]; then
        log_alert "WARNING" "$container" "检测到大量错误日志: $pattern ($error_count 次)"
      fi
    done
  done
}

# 检查数据库连接
|
||
check_database_alerts() {
|
||
# 检查数据库连接数
|
||
local connection_count=$(docker exec qaup-postgres psql -U postgres -d qaup -t -c "SELECT count(*) FROM pg_stat_activity;" 2>/dev/null | xargs || echo "0")
|
||
|
||
if [ $connection_count -gt 150 ]; then
|
||
log_alert "WARNING" "qaup-postgres" "数据库连接数过多: $connection_count"
|
||
fi
|
||
|
||
# 检查数据库锁等待
|
||
local lock_waits=$(docker exec qaup-postgres psql -U postgres -d qaup -t -c "SELECT count(*) FROM pg_stat_activity WHERE wait_event_type = 'Lock';" 2>/dev/null | xargs || echo "0")
|
||
|
||
if [ $lock_waits -gt 5 ]; then
|
||
log_alert "WARNING" "qaup-postgres" "检测到数据库锁等待: $lock_waits 个连接"
|
||
fi
|
||
}
|
||
|
||
# Send an e-mail alert.
# Placeholder: no mail is sent; the subject, body and recipients are only
# printed. A real mail service (e.g. sendmail, postfix or a third-party
# provider) can be integrated here.
# Globals:   BLUE, YELLOW (read)
# Arguments: $1 - subject
#            $2 - message body
#            $3 - recipients (default: admin@example.com)
# Outputs:   four coloured lines on stdout
send_email_alert() {
  local subject=$1
  local message=$2
  local recipients=${3:-"admin@example.com"}

  print_message "$BLUE" "邮件告警功能需要配置邮件服务器"
  print_message "$YELLOW" "主题: $subject"
  print_message "$YELLOW" "内容: $message"
  print_message "$YELLOW" "收件人: $recipients"
}

# Send a DingTalk / WeCom style webhook alert.
# POSTs {"text": "<message>"} as JSON to the webhook and reports the result.
# Does nothing when the URL is empty.
# Globals:   GREEN, RED (read)
# Arguments: $1 - webhook URL
#            $2 - message text
# Outputs:   one coloured success/failure line on stdout
# Returns:   0 whether or not delivery succeeded (failure is only reported)
send_webhook_alert() {
  local webhook_url=$1
  local message=$2
  local escaped

  if [[ -z "$webhook_url" ]]; then
    return 0
  fi

  # Escape the characters that would otherwise break or alter the JSON string.
  escaped=${message//\\/\\\\}
  escaped=${escaped//\"/\\\"}
  escaped=${escaped//$'\n'/\\n}
  escaped=${escaped//$'\r'/\\r}
  escaped=${escaped//$'\t'/\\t}

  # curl is tested directly in the `if`: under `set -e` a bare failing curl
  # would terminate the script before its status could be inspected.
  # --fail turns HTTP error responses into a non-zero exit status and
  # --max-time bounds how long an unresponsive endpoint can block.
  if curl -sS --fail --max-time 10 -X POST "$webhook_url" \
    -H 'Content-Type: application/json' \
    -d "{\"text\":\"$escaped\"}" \
    &>/dev/null; then
    print_message "$GREEN" "Webhook 告警发送成功"
  else
    print_message "$RED" "Webhook 告警发送失败"
  fi
}

# Run all checks.
# Executes every alert check once, in a fixed order, between a start and a
# completion banner.
# Globals:   BLUE, GREEN (read)
# Arguments: none
# Outputs:   the banners plus whatever the individual checks print
run_all_checks() {
  print_message "$BLUE" "开始系统告警检查..."

  check_container_alerts
  check_resource_alerts
  check_response_time_alerts
  check_error_log_alerts
  check_database_alerts

  print_message "$GREEN" "告警检查完成"
}

# Continuous monitoring mode.
# Runs all checks in an endless loop, sleeping between rounds, until the
# process is interrupted.
# Globals:   RED, GREEN, YELLOW (read)
# Arguments: $1 - seconds between rounds, a positive integer (default: 300)
# Returns:   1 when the interval is invalid; otherwise does not return
continuous_monitoring() {
  local interval=${1:-300} # default: 5-minute interval

  # Reject 0 and non-numeric values up front: they would make `sleep` fail or
  # turn the loop into a busy loop.
  if [[ ! "$interval" =~ ^[1-9][0-9]*$ ]]; then
    print_message "$RED" "无效的监控间隔: $interval (必须为正整数秒)"
    return 1
  fi

  print_message "$GREEN" "开始持续告警监控 (间隔: ${interval}秒)"
  print_message "$YELLOW" "按 Ctrl+C 停止监控"

  while true; do
    # A failing round must not terminate the monitor; report it and retry
    # after the next sleep.
    run_all_checks || print_message "$RED" "本轮告警检查异常结束,将在下一轮重试"
    sleep "$interval"
  done
}

# View alert history.
# Prints the last N lines of the alert log, optionally keeping only the lines
# that match a grep pattern.
# Globals:   ALERT_LOG (read); RED, YELLOW, BLUE (read)
# Arguments: $1 - number of trailing log lines to read (default: 50)
#            $2 - optional grep pattern applied to those lines
# Returns:   0 on success, including when the filter matches nothing or the
#            log does not exist; 1 when the line count is not a non-negative
#            integer; grep's status when grep itself fails
view_alert_history() {
  local lines=${1:-50}
  local filter=${2:-""}
  local rc=0

  if [[ ! "$lines" =~ ^[0-9]+$ ]]; then
    print_message "$RED" "无效的行数: $lines"
    return 1
  fi

  if [ ! -f "$ALERT_LOG" ]; then
    print_message "$YELLOW" "告警日志文件不存在: $ALERT_LOG"
    return
  fi

  print_message "$BLUE" "告警历史 (最近 $lines 条):"

  if [ -n "$filter" ]; then
    # grep exits 1 when nothing matches; that is not an error here and must
    # not abort the script under `set -e`. `--` protects patterns that start
    # with a dash.
    tail -n "$lines" "$ALERT_LOG" | grep -- "$filter" || rc=$?
    if (( rc > 1 )); then
      return "$rc"
    fi
  else
    tail -n "$lines" "$ALERT_LOG"
  fi
}

# Clean up the alert log.
# Copies the current log to a dated backup, then rewrites the log keeping
# only the lines whose text sorts at or after "[<cutoff date>".
# Relies on `date -d`, i.e. a `date` that accepts relative dates such as
# "30 days ago".
# Globals:   ALERT_LOG (read; file rewritten); RED, GREEN, BLUE, YELLOW (read)
# Arguments: $1 - number of days to keep, a non-negative integer (default: 30)
# Returns:   0 on success or when the log does not exist; 1 on an invalid
#            argument or when the cutoff date cannot be computed
cleanup_alert_logs() {
  local days=${1:-30}
  local backup_file cutoff_date suffix

  if [[ ! "$days" =~ ^[0-9]+$ ]]; then
    print_message "$RED" "无效的天数: $days"
    return 1
  fi

  if [ ! -f "$ALERT_LOG" ]; then
    print_message "$YELLOW" "告警日志文件不存在"
    return
  fi

  # Compute the cutoff before touching any file, so a failing `date` leaves
  # the log untouched.
  cutoff_date=$(date -d "$days days ago" '+%Y-%m-%d') || return 1

  # Back up the current log. Never overwrite an earlier backup from the same
  # day: it may be the only complete copy left after a previous cleanup.
  backup_file="${ALERT_LOG}.backup.$(date +%Y%m%d)"
  if [ -e "$backup_file" ]; then
    suffix=1
    while [ -e "${backup_file}.${suffix}" ]; do
      suffix=$((suffix + 1))
    done
    backup_file="${backup_file}.${suffix}"
  fi
  cp "$ALERT_LOG" "$backup_file"

  # Keep only the entries from the last $days days. The comparison is a plain
  # string comparison on the leading "[YYYY-MM-DD" of each line.
  awk -v cutoff="$cutoff_date" '$0 >= "["cutoff {print}' "$ALERT_LOG" > "${ALERT_LOG}.tmp"
  mv "${ALERT_LOG}.tmp" "$ALERT_LOG"

  print_message "$GREEN" "告警日志已清理,保留 $days 天内的记录"
  print_message "$BLUE" "备份文件: $backup_file"
}

# Generate the alert statistics report.
# Summarises the alert log, whose lines have the form
# "[timestamp] [severity] [service] message": counts per severity, counts per
# service, and the number of alerts recorded in the last 24 hours.
# Globals:   ALERT_LOG (read); YELLOW, BLUE (read)
# Arguments: none
# Outputs:   the report on stdout
generate_alert_stats() {
  local cutoff recent_alerts

  if [ ! -f "$ALERT_LOG" ]; then
    print_message "$YELLOW" "告警日志文件不存在"
    return
  fi

  print_message "$BLUE" "告警统计报告:"
  echo ""

  # Counts per severity. Splitting on either bracket makes field 4 the
  # severity and field 6 the service, so keywords that merely occur in the
  # message text are not counted.
  print_message "$BLUE" "按严重程度统计:"
  awk -F'[][]' '
    { count[$4]++ }
    END {
      printf "CRITICAL: %d\n", count["CRITICAL"]
      printf "WARNING: %d\n", count["WARNING"]
      printf "INFO: %d\n", count["INFO"]
    }' "$ALERT_LOG"
  echo ""

  # Counts per service, most frequent first.
  print_message "$BLUE" "按服务统计:"
  awk -F'[][]' 'NF >= 6 { print $6 }' "$ALERT_LOG" | sort | uniq -c | sort -nr
  echo ""

  # Alerts in the last 24 hours. Timestamps are "YYYY-MM-DD HH:MM:SS", so a
  # string comparison orders them chronologically. `date -d` is tried first
  # and `date -v` is the fallback for `date` implementations without -d.
  cutoff=$(date -d "24 hours ago" '+%Y-%m-%d %H:%M:%S' 2>/dev/null) \
    || cutoff=$(date -v-24H '+%Y-%m-%d %H:%M:%S')
  recent_alerts=$(awk -v cutoff="$cutoff" \
    '/^\[[0-9]/ && substr($0, 2, 19) >= cutoff { n++ } END { print n + 0 }' "$ALERT_LOG")
  print_message "$BLUE" "最近24小时告警数: $recent_alerts"
}

# Test the alert system.
# Emits one sample alert of each severity through log_alert so the log file
# and console output can be inspected.
# Globals:   ALERT_LOG (read); BLUE, GREEN (read)
# Arguments: none
# Outputs:   two banners plus whatever log_alert prints
test_alerts() {
  print_message "$BLUE" "测试告警系统..."

  log_alert "INFO" "SYSTEM" "告警系统测试开始"
  log_alert "WARNING" "TEST" "这是一个测试警告"
  log_alert "CRITICAL" "TEST" "这是一个测试严重告警"
  log_alert "INFO" "SYSTEM" "告警系统测试完成"

  print_message "$GREEN" "告警系统测试完成,请检查日志文件: $ALERT_LOG"
}

# Show the help text.
# Globals:   none
# Arguments: none
# Outputs:   usage information on stdout; $0 is expanded into the examples
show_help() {
  cat <<EOF
QAUP 告警管理脚本

用法: $0 [命令] [选项]

命令:
  check                     运行所有告警检查
  monitor [interval]        持续监控模式 (默认300秒)
  history [lines] [filter]  查看告警历史 (默认50条, 可按 filter 过滤)
  cleanup [days]            清理告警日志 (默认30天)
  stats                     生成告警统计报告
  test                      测试告警系统
  help                      显示帮助信息

示例:
  $0 check                  # 运行一次检查
  $0 monitor 60             # 每60秒监控一次
  $0 history 100            # 查看最近100条告警
  $0 history 100 WARNING    # 查看最近100条告警中的 WARNING
  $0 cleanup 7              # 清理7天前的告警日志
EOF
}

# Main function.
# Dispatches the first command-line argument to the matching sub-command and
# forwards the remaining arguments where the sub-command accepts any.
# Globals:   RED (read)
# Arguments: $1 - command (check, monitor, history, cleanup, stats, test,
#                 help/--help/-h); remaining arguments are command-specific
# Returns:   exits 0 after printing help when called without arguments;
#            exits 1 on an unknown command
main() {
  if [ $# -eq 0 ]; then
    show_help
    exit 0
  fi

  local command=$1
  shift

  case "$command" in
    check)
      run_all_checks
      ;;
    monitor)
      continuous_monitoring "$@"
      ;;
    history)
      view_alert_history "$@"
      ;;
    cleanup)
      cleanup_alert_logs "$@"
      ;;
    stats)
      generate_alert_stats
      ;;
    test)
      test_alerts
      ;;
    help | --help | -h)
      show_help
      ;;
    *)
      print_message "$RED" "未知命令: $command"
      show_help
      exit 1
      ;;
  esac
}

# Script entry point: hand all command-line arguments to main.
main "$@"