#!/bin/bash
#
# QAUP system monitoring script.
#
# Inspects the QAUP Docker deployment (qaup-postgres, qaup-redis, qaup-app,
# qaup-nginx): container status, service health probes, resource usage,
# log sizes, inter-container connectivity and simple threshold alerts.
#
# Usage: <script> <command> [options]     (run without arguments for help)
#
# Log and report paths (./logs/...) and disk checks (df .) are relative to the
# current working directory.
#
# Exit status: 0 on success; non-zero when the selected check reports a
# problem or the command is unknown.

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Not referenced by any function in this script; kept as a file-level variable.
ENV_FILE="$PROJECT_ROOT/.env"

# Colour escape sequences (expanded by printf %b in print_message).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

#######################################
# Print a message wrapped in a colour escape and a reset.
# Globals:   NC (read)
# Arguments: $1 - colour escape sequence, $2 - message text
# Outputs:   the coloured message on stdout
#######################################
print_message() {
  local color=$1
  local message=$2
  # %b expands the escapes in the colour codes only; the message itself is
  # printed verbatim so backslashes in command output are not reinterpreted.
  printf '%b%s%b\n' "$color" "$message" "$NC"
}

#######################################
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
#######################################
get_timestamp() {
  date '+%Y-%m-%d %H:%M:%S'
}

#######################################
# Check that every expected container is running and not unhealthy.
# Outputs: one status line per container, plus a summary of failures
# Returns: 0 if all containers are fine, 1 otherwise
#######################################
check_container_status() {
  print_message "$BLUE" "检查容器状态..."

  local containers=("qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx")
  local unhealthy_containers=()
  local listing container name state status found

  # One docker call for all containers. '|' is not valid in a container name,
  # so it is a safe field separator. A docker failure leaves the listing empty
  # and every container is then reported as not running.
  listing=$(docker ps --format '{{.Names}}|{{.Status}}') || listing=""

  for container in "${containers[@]}"; do
    found=0
    status=""
    # Exact name comparison: a substring match would let e.g. "qaup-app-old"
    # stand in for "qaup-app".
    while IFS='|' read -r name state; do
      if [[ "$name" == "$container" ]]; then
        found=1
        status=$state
        break
      fi
    done <<< "$listing"

    if [[ "$found" -eq 0 ]]; then
      print_message "$RED" "✗ $container: 未运行"
      unhealthy_containers+=("$container")
    elif [[ "$status" == *unhealthy* ]]; then
      # Must be tested first: "unhealthy" contains "healthy", and an
      # unhealthy container's status still starts with "Up".
      print_message "$RED" "✗ $container: $status"
      unhealthy_containers+=("$container")
    elif [[ "$status" == *healthy* || "$status" == *Up* ]]; then
      print_message "$GREEN" "✓ $container: 运行正常"
    else
      print_message "$RED" "✗ $container: $status"
      unhealthy_containers+=("$container")
    fi
  done

  if [[ ${#unhealthy_containers[@]} -gt 0 ]]; then
    print_message "$YELLOW" "异常容器: ${unhealthy_containers[*]}"
    return 1
  fi
  return 0
}

#######################################
# Probe PostgreSQL, Redis, the application health endpoint and Nginx.
# Outputs: one result line per probe
# Returns: the number of failed probes (0-4)
#######################################
check_service_health() {
  print_message "$BLUE" "检查服务健康状态..."

  local errors=0

  # Database connection
  if docker exec qaup-postgres pg_isready -h localhost -p 5432 -U postgres &>/dev/null; then
    print_message "$GREEN" "✓ PostgreSQL: 连接正常"
  else
    print_message "$RED" "✗ PostgreSQL: 连接失败"
    errors=$((errors + 1))
  fi

  # Redis connection
  if docker exec qaup-redis redis-cli ping &>/dev/null; then
    print_message "$GREEN" "✓ Redis: 连接正常"
  else
    print_message "$RED" "✗ Redis: 连接失败"
    errors=$((errors + 1))
  fi

  # Application health endpoint. Timeouts keep a hung service from blocking
  # the whole check (and watch mode) indefinitely.
  if curl -f -s --connect-timeout 3 --max-time 10 \
    http://localhost:8080/actuator/health &>/dev/null; then
    print_message "$GREEN" "✓ QAUP 应用: 健康检查通过"
  else
    print_message "$RED" "✗ QAUP 应用: 健康检查失败"
    errors=$((errors + 1))
  fi

  # Nginx
  if curl -f -s --connect-timeout 3 --max-time 10 \
    http://localhost/health &>/dev/null; then
    print_message "$GREEN" "✓ Nginx: 健康检查通过"
  else
    print_message "$RED" "✗ Nginx: 健康检查失败"
    errors=$((errors + 1))
  fi

  return "$errors"
}

#######################################
# Show container resource usage plus host CPU, memory and disk usage.
# Outputs: docker stats table and three summary lines
#######################################
check_resource_usage() {
  print_message "$BLUE" "检查资源使用情况..."

  local cpu_usage mem_human mem_pct mem_info disk_usage

  # Container resource usage
  echo ""
  print_message "$BLUE" "容器资源使用:"
  docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}"

  echo ""
  print_message "$BLUE" "系统资源使用:"

  # CPU usage
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')
  print_message "$GREEN" "CPU 使用率: $cpu_usage"

  # Memory usage. Human-readable figures come from `free -h`, but the
  # percentage must be computed from byte counts: dividing "-h" values with
  # different unit suffixes (e.g. 512Mi / 15Gi) gives nonsense.
  mem_human=$(free -h | awk 'NR==2{printf "%s/%s", $3, $2}')
  mem_pct=$(free -b | awk 'NR==2 && $2 > 0 {printf "%.2f", $3*100/$2}')
  mem_info="已使用: ${mem_human} (${mem_pct}%)"
  print_message "$GREEN" "内存使用: $mem_info"

  # Disk usage of the filesystem holding the current directory
  disk_usage=$(df -h . | awk 'NR==2{printf "已使用: %s/%s (%s)", $3,$2,$5}')
  print_message "$GREEN" "磁盘使用: $disk_usage"
}

#######################################
# Report the size of each log directory and list *.log files over 100 MiB.
# Outputs: one line per existing log directory, then any oversized files
#######################################
check_log_sizes() {
  print_message "$BLUE" "检查日志文件大小..."

  local log_dirs=("./logs/postgres" "./logs/redis" "./logs/app" "./logs/nginx")
  local large_logs=()
  local max_bytes=104857600 # 100 MiB
  local log_dir total_size file log size

  for log_dir in "${log_dirs[@]}"; do
    [[ -d "$log_dir" ]] || continue

    total_size=$(du -sh "$log_dir" 2>/dev/null | cut -f1)
    print_message "$GREEN" "✓ $log_dir: $total_size"

    # Let find do the size filtering ("+Nc" = strictly more than N bytes)
    # instead of calling a platform-specific stat for every file.
    while IFS= read -r -d '' file; do
      large_logs+=("$file")
    done < <(find "$log_dir" -type f -name "*.log" -size "+${max_bytes}c" -print0 2>/dev/null)
  done

  if [[ ${#large_logs[@]} -gt 0 ]]; then
    print_message "$YELLOW" "发现大日志文件:"
    for log in "${large_logs[@]}"; do
      size=$(du -sh "$log" | cut -f1)
      print_message "$YELLOW" " $log: $size"
    done
    print_message "$YELLOW" "建议配置日志轮转"
  fi
}

#######################################
# Check TCP connectivity between containers using `nc -z` inside them.
# Outputs: one result line per probe
# Returns: always 0; failures are only reported
#######################################
check_network_connectivity() {
  print_message "$BLUE" "检查网络连接..."

  # Each entry: source container | target host | target port | label
  local probes=(
    "qaup-app|qaup-postgres|5432|应用 -> 数据库"
    "qaup-app|qaup-redis|6379|应用 -> Redis"
    "qaup-nginx|qaup-app|8080|Nginx -> 应用"
  )
  local probe from host port label

  for probe in "${probes[@]}"; do
    IFS='|' read -r from host port label <<< "$probe"
    if docker exec "$from" nc -z "$host" "$port" &>/dev/null; then
      print_message "$GREEN" "✓ $label: 连接正常"
    else
      print_message "$RED" "✗ $label: 连接失败"
    fi
  done
}

#######################################
# Write a timestamped monitoring report under ./logs.
# Outputs: progress messages on stdout; the report file itself
#######################################
generate_monitoring_report() {
  local timestamp report_file
  timestamp=$(get_timestamp)
  report_file="./logs/monitoring_report_$(date +%Y%m%d_%H%M%S).txt"

  print_message "$BLUE" "生成监控报告: $report_file"

  # The redirect below fails if the directory does not exist yet.
  mkdir -p ./logs

  {
    echo "QAUP 系统监控报告"
    echo "生成时间: $timestamp"
    echo "========================================"
    echo ""

    echo "容器状态:"
    docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
    echo ""

    echo "资源使用情况:"
    docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}"
    echo ""

    echo "系统资源:"
    free -h
    echo ""
    df -h
    echo ""

    echo "网络状态:"
    # grep exits 1 when no port is listening; that is a valid result, not an
    # error, so it must not abort the script under `set -e`. Fall back to ss
    # on hosts without netstat.
    if command -v netstat >/dev/null 2>&1; then
      netstat -tuln
    elif command -v ss >/dev/null 2>&1; then
      ss -tuln
    fi | grep -E ":(80|443|8080|5432|6379) " || true
    echo ""
  } > "$report_file"

  print_message "$GREEN" "监控报告已生成: $report_file"
}

#######################################
# Continuous monitoring: repeat the status, health and resource checks.
# Arguments: $1 - interval in seconds (positive integer, default 60)
# Returns:   1 if the interval is invalid; otherwise loops until interrupted
#######################################
continuous_monitoring() {
  local interval=${1:-60} # default: 60 seconds
  local next_check

  if ! [[ "$interval" =~ ^[0-9]+$ ]] || [ "$interval" -le 0 ]; then
    print_message "$RED" "无效的监控间隔: $interval (需要正整数秒数)" >&2
    return 1
  fi

  print_message "$GREEN" "开始持续监控模式 (间隔: ${interval}秒)"
  print_message "$YELLOW" "按 Ctrl+C 停止监控"

  while true; do
    clear || true # e.g. TERM unset; not a reason to stop monitoring

    print_message "$BLUE" "========================================="
    print_message "$BLUE" "QAUP 系统监控 - $(get_timestamp)"
    print_message "$BLUE" "========================================="

    # A failing check is exactly what watch mode is meant to display; without
    # the guards `set -e` would terminate the loop on the first problem.
    check_container_status || true
    echo ""
    check_service_health || true
    echo ""
    check_resource_usage || true

    print_message "$BLUE" "========================================="

    # GNU date first, BSD date as fallback.
    next_check=$(date -d "+${interval} seconds" '+%H:%M:%S' 2>/dev/null) \
      || next_check=$(date -v "+${interval}S" '+%H:%M:%S' 2>/dev/null) \
      || next_check=""
    print_message "$YELLOW" "下次检查: $next_check"

    sleep "$interval"
  done
}

#######################################
# Evaluate alert conditions: unhealthy containers, disk usage above 80%,
# memory usage above 85%.
# Outputs: the list of alerts, or a "no alerts" line
# Returns: 1 if any alert fired, 0 otherwise
#######################################
check_alerts() {
  print_message "$BLUE" "检查系统告警..."

  local alerts=()
  local unhealthy_containers disk_usage mem_usage alert

  # Containers whose health check is failing (names joined by spaces).
  unhealthy_containers=$(docker ps -a --format "{{.Names}}" --filter "health=unhealthy" | tr '\n' ' ')
  unhealthy_containers=${unhealthy_containers% }
  if [[ -n "$unhealthy_containers" ]]; then
    alerts+=("不健康的容器: $unhealthy_containers")
  fi

  # Disk usage; -P keeps each filesystem on a single line so field 5 is
  # reliably the percentage.
  disk_usage=$(df -P . | awk 'NR==2 {print $5}' | tr -d '%')
  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && [ "$disk_usage" -gt 80 ]; then
    alerts+=("磁盘使用率过高: ${disk_usage}%")
  fi

  # Memory usage
  mem_usage=$(free | awk 'NR==2 && $2 > 0 {printf "%.0f", $3*100/$2}')
  if [[ "$mem_usage" =~ ^[0-9]+$ ]] && [ "$mem_usage" -gt 85 ]; then
    alerts+=("内存使用率过高: ${mem_usage}%")
  fi

  # Report
  if [[ ${#alerts[@]} -gt 0 ]]; then
    print_message "$RED" "发现告警:"
    for alert in "${alerts[@]}"; do
      print_message "$RED" " ⚠ $alert"
    done
    return 1
  fi

  print_message "$GREEN" "✓ 无告警"
  return 0
}

#######################################
# Print usage information.
#######################################
show_help() {
  echo "QAUP 系统监控脚本"
  echo ""
  echo "用法: $0 [命令] [选项]"
  echo ""
  echo "命令:"
  echo " status 检查系统状态"
  echo " health 检查服务健康状态"
  echo " resources 检查资源使用情况"
  echo " logs 检查日志文件"
  echo " network 检查网络连接"
  echo " alerts 检查系统告警"
  echo " report 生成监控报告"
  echo " watch 持续监控模式"
  echo ""
  echo "示例:"
  echo " $0 status # 检查系统状态"
  echo " $0 watch 30 # 每30秒监控一次"
  echo " $0 report # 生成监控报告"
}

#######################################
# Entry point: dispatch the sub-command.
# Arguments: $1 - command, remaining arguments are passed to the command
# Returns:   0 for help / success, 1 for an unknown command; a failing check
#            terminates the script with its status via `set -e`
#######################################
main() {
  if [[ $# -eq 0 ]]; then
    show_help
    return 0
  fi

  local command=$1
  shift

  case "$command" in
    status)
      check_container_status
      ;;
    health)
      check_service_health
      ;;
    resources)
      check_resource_usage
      ;;
    logs)
      check_log_sizes
      ;;
    network)
      check_network_connectivity
      ;;
    alerts)
      check_alerts
      ;;
    report)
      generate_monitoring_report
      ;;
    watch)
      continuous_monitoring "$@"
      ;;
    help | --help | -h)
      show_help
      ;;
    *)
      print_message "$RED" "未知命令: $command"
      show_help
      return 1
      ;;
  esac
}

main "$@"