QAUP_Management/deploy/docker/monitor.sh

348 lines
9.9 KiB
Bash
Executable File

#!/bin/bash
# QAUP system monitoring script.

# Abort on the first unhandled command failure.
set -o errexit

# Absolute directory of this script, the project root two levels above it,
# and the path of the project's .env file.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
PROJECT_ROOT=$(cd "${SCRIPT_DIR}/../.." && pwd)
ENV_FILE="${PROJECT_ROOT}/.env"

# Colour output: escape sequences are stored with literal backslashes and
# are only interpreted when printed.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # reset / "no colour"
# Print a message wrapped in a colour escape and followed by the reset code.
# Arguments:
#   $1 - colour escape sequence to emit before the message
#   $2 - message text (backslash escapes inside it are interpreted as well)
# Globals:
#   NC (read) - reset sequence appended after the message
# Outputs:
#   the decorated message and a trailing newline on stdout
print_message() {
  local tint=$1
  local text=$2
  # %b expands backslash escapes in its argument the same way `echo -e` does.
  printf '%b\n' "${tint}${text}${NC}"
}
# Get a timestamp.
# Outputs: the current local time as "YYYY-MM-DD HH:MM:SS" on stdout.
get_timestamp() {
  # %F is shorthand for %Y-%m-%d and %T for %H:%M:%S.
  date +'%F %T'
}
# Check container status.
#
# Verifies that each expected QAUP container is listed by `docker ps`, that
# its status starts with "Up", and that it is not flagged "(unhealthy)".
# Container names are compared exactly, so e.g. "qaup-app-2" does not
# satisfy the check for "qaup-app".
#
# Globals:  RED, GREEN, YELLOW, BLUE (read)
# Outputs:  one line per container, plus a summary of abnormal containers
# Returns:  0 when every container is fine, 1 otherwise
check_container_status() {
  print_message "$BLUE" "检查容器状态..."
  local containers=("qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx")
  local unhealthy_containers=()
  local ps_lines container line status found

  # Query docker once. '|' is used as the field separator because it cannot
  # occur in a container name. If docker itself fails, treat every container
  # as not running instead of aborting.
  ps_lines=$(docker ps --format '{{.Names}}|{{.Status}}') || ps_lines=""

  for container in "${containers[@]}"; do
    found=0
    status=""
    while IFS= read -r line; do
      if [[ "${line%%|*}" == "$container" ]]; then
        status=${line#*|}
        found=1
        break
      fi
    done <<<"$ps_lines"

    if [[ "$found" -eq 0 ]]; then
      print_message "$RED" "$container: 未运行"
      unhealthy_containers+=("$container")
    elif [[ "$status" == Up* && "$status" != *"(unhealthy)"* ]]; then
      # "Up ..." covers healthy containers and ones without a health check.
      print_message "$GREEN" "$container: 运行正常"
    else
      # Restarting, unhealthy, or any other non-"Up" state.
      print_message "$RED" "$container: $status"
      unhealthy_containers+=("$container")
    fi
  done

  if [ "${#unhealthy_containers[@]}" -gt 0 ]; then
    print_message "$YELLOW" "异常容器: ${unhealthy_containers[*]}"
    return 1
  fi
  return 0
}
# Check service health.
#
# Probes PostgreSQL (pg_isready inside qaup-postgres), Redis (redis-cli ping
# inside qaup-redis), the application health endpoint on localhost:8080 and
# the Nginx health endpoint on localhost.
#
# Globals:  RED, GREEN, BLUE (read)
#           HEALTH_CHECK_TIMEOUT (read, optional) - max seconds per HTTP
#           probe, default 10
# Outputs:  one result line per probe on stdout
# Returns:  the number of failed probes (0-4)
check_service_health() {
  print_message "$BLUE" "检查服务健康状态..."
  local errors=0
  # Bound every HTTP probe so a stalled endpoint cannot hang the monitor.
  local http_timeout="${HEALTH_CHECK_TIMEOUT:-10}"

  # Database connection
  if docker exec qaup-postgres pg_isready -h localhost -p 5432 -U postgres &>/dev/null; then
    print_message "$GREEN" "✓ PostgreSQL: 连接正常"
  else
    print_message "$RED" "✗ PostgreSQL: 连接失败"
    errors=$((errors + 1))
  fi

  # Redis connection
  if docker exec qaup-redis redis-cli ping &>/dev/null; then
    print_message "$GREEN" "✓ Redis: 连接正常"
  else
    print_message "$RED" "✗ Redis: 连接失败"
    errors=$((errors + 1))
  fi

  # Application health endpoint
  if curl -f -s --max-time "$http_timeout" http://localhost:8080/actuator/health &>/dev/null; then
    print_message "$GREEN" "✓ QAUP 应用: 健康检查通过"
  else
    print_message "$RED" "✗ QAUP 应用: 健康检查失败"
    errors=$((errors + 1))
  fi

  # Nginx
  if curl -f -s --max-time "$http_timeout" http://localhost/health &>/dev/null; then
    print_message "$GREEN" "✓ Nginx: 健康检查通过"
  else
    print_message "$RED" "✗ Nginx: 健康检查失败"
    errors=$((errors + 1))
  fi

  return "$errors"
}
# Check resource usage.
#
# Prints per-container statistics from `docker stats`, followed by host CPU,
# memory and disk usage (disk usage is for the filesystem holding the
# current directory).
#
# Globals:  BLUE, GREEN (read)
# Outputs:  human-readable report on stdout
check_resource_usage() {
  print_message "$BLUE" "检查资源使用情况..."

  # Container resource usage
  echo ""
  print_message "$BLUE" "容器资源使用:"
  docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}"
  echo ""
  print_message "$BLUE" "系统资源使用:"

  # CPU usage: second field of top's "Cpu(s)" line, with an old-style
  # "%us," suffix stripped if present.
  local cpu_usage
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//') || true
  print_message "$GREEN" "CPU 使用率: $cpu_usage"

  # Memory usage. The human-readable figures come from `free -h`, but the
  # percentage must be computed from the raw numbers of plain `free`:
  # dividing "-h" values mixes units (e.g. 512Mi / 15Gi would give 3413%).
  local mem_human mem_percent
  mem_human=$(free -h | awk 'NR==2 {printf "%s/%s", $3, $2}') || true
  mem_percent=$(free | awk 'NR==2 && $2 > 0 {printf "%.2f", $3 * 100 / $2}') || true
  print_message "$GREEN" "内存使用: 已使用: ${mem_human} (${mem_percent:-?}%)"

  # Disk usage
  local disk_usage
  disk_usage=$(df -h . | awk 'NR==2{printf "已使用: %s/%s (%s)", $3,$2,$5}') || true
  print_message "$GREEN" "磁盘使用: $disk_usage"
}
# Check log file sizes.
#
# For each known log directory (relative to the current working directory)
# that exists, prints its total size and collects every *.log file larger
# than 100 MiB; if any are found, lists them and suggests log rotation.
#
# Globals:  BLUE, GREEN, YELLOW (read)
# Outputs:  report on stdout
check_log_sizes() {
  print_message "$BLUE" "检查日志文件大小..."
  local log_dirs=("./logs/postgres" "./logs/redis" "./logs/app" "./logs/nginx")
  local large_logs=()
  local max_bytes=104857600 # 100 MiB
  local log_dir total_size file log size

  for log_dir in "${log_dirs[@]}"; do
    [ -d "$log_dir" ] || continue

    total_size=$(du -sh "$log_dir" 2>/dev/null | cut -f1) || true
    print_message "$GREEN" "$log_dir: $total_size"

    # Let find do the size comparison ("+Nc" = strictly more than N bytes).
    # This avoids one stat call per file and the GNU/BSD stat flag
    # differences, and cannot feed an empty size into a numeric test.
    while IFS= read -r -d '' file; do
      large_logs+=("$file")
    done < <(find "$log_dir" -type f -name "*.log" -size "+${max_bytes}c" -print0 2>/dev/null)
  done

  if [ "${#large_logs[@]}" -gt 0 ]; then
    print_message "$YELLOW" "发现大日志文件:"
    for log in "${large_logs[@]}"; do
      size=$(du -sh "$log" | cut -f1) || true
      print_message "$YELLOW" " $log: $size"
    done
    print_message "$YELLOW" "建议配置日志轮转"
  fi
}
# Check network connectivity.
#
# Uses `nc -z` inside the source container to test three links between
# containers: app -> postgres:5432, app -> redis:6379 and nginx -> app:8080.
# Each result is printed; failures are not counted or returned explicitly.
#
# Globals:  BLUE, GREEN, RED (read)
# Outputs:  one result line per link on stdout
check_network_connectivity() {
  print_message "$BLUE" "检查网络连接..."

  # Connectivity between containers; each probe's own output is discarded.
  if ! docker exec qaup-app nc -z qaup-postgres 5432 >/dev/null 2>&1; then
    print_message "$RED" "✗ 应用 -> 数据库: 连接失败"
  else
    print_message "$GREEN" "✓ 应用 -> 数据库: 连接正常"
  fi

  if ! docker exec qaup-app nc -z qaup-redis 6379 >/dev/null 2>&1; then
    print_message "$RED" "✗ 应用 -> Redis: 连接失败"
  else
    print_message "$GREEN" "✓ 应用 -> Redis: 连接正常"
  fi

  if ! docker exec qaup-nginx nc -z qaup-app 8080 >/dev/null 2>&1; then
    print_message "$RED" "✗ Nginx -> 应用: 连接失败"
  else
    print_message "$GREEN" "✓ Nginx -> 应用: 连接正常"
  fi
}
# Generate a monitoring report.
#
# Writes container status, container resource usage, host memory/disk usage
# and listening ports (80, 443, 8080, 5432, 6379) to a timestamped file under
# ./logs (relative to the current working directory).
#
# The individual sections are best-effort: the script runs under `set -e`,
# so a failing tool, or a grep that matches no listening port, would
# otherwise abort the script halfway through the report.
#
# Globals:  BLUE, GREEN (read)
# Outputs:  progress messages on stdout; the report file on disk
generate_monitoring_report() {
  local timestamp report_file
  timestamp=$(get_timestamp)
  report_file="./logs/monitoring_report_$(date +%Y%m%d_%H%M%S).txt"

  # The redirection below fails if the target directory does not exist yet.
  mkdir -p "$(dirname "$report_file")"

  print_message "$BLUE" "生成监控报告: $report_file"
  {
    echo "QAUP 系统监控报告"
    echo "生成时间: $timestamp"
    echo "========================================"
    echo ""
    echo "容器状态:"
    docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" || true
    echo ""
    echo "资源使用情况:"
    docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" || true
    echo ""
    echo "系统资源:"
    free -h || true
    echo ""
    df -h || true
    echo ""
    echo "网络状态:"
    # netstat is absent on many current systems; fall back to ss. grep exits
    # non-zero when no port matches, which is not an error here.
    if command -v netstat >/dev/null 2>&1; then
      netstat -tuln | grep -E ":(80|443|8080|5432|6379) " || true
    elif command -v ss >/dev/null 2>&1; then
      ss -tuln | grep -E ":(80|443|8080|5432|6379) " || true
    else
      echo "(netstat/ss 不可用)"
    fi
    echo ""
  } > "$report_file"
  print_message "$GREEN" "监控报告已生成: $report_file"
}
# Continuous monitoring mode.
#
# Repeatedly clears the screen and runs the container, service-health and
# resource checks, then sleeps until the next round. Loops until the process
# is interrupted (Ctrl+C).
#
# Arguments: $1 - seconds between rounds (default 60)
# Globals:   GREEN, YELLOW, BLUE (read)
continuous_monitoring() {
  local interval=${1:-60} # default: 60-second interval
  local next_check

  print_message "$GREEN" "开始持续监控模式 (间隔: ${interval}秒)"
  print_message "$YELLOW" "按 Ctrl+C 停止监控"

  while true; do
    # `clear` can fail (e.g. TERM unset); that must not end the loop.
    clear || true
    print_message "$BLUE" "========================================="
    print_message "$BLUE" "QAUP 系统监控 - $(get_timestamp)"
    print_message "$BLUE" "========================================="

    # The checks return non-zero when they find problems. Under `set -e` an
    # unguarded call would terminate the watch loop exactly when something
    # is wrong, so every check is guarded.
    check_container_status || true
    echo ""
    check_service_health || true
    echo ""
    check_resource_usage || true

    print_message "$BLUE" "========================================="
    # GNU date understands -d; BSD/macOS date needs -v. Empty if both fail.
    next_check=$(date -d "+${interval} seconds" '+%H:%M:%S' 2>/dev/null) \
      || next_check=$(date -v "+${interval}S" '+%H:%M:%S' 2>/dev/null) \
      || next_check=""
    print_message "$YELLOW" "下次检查: ${next_check}"
    sleep "$interval"
  done
}
# Alert check.
#
# Raises an alert when any container is reported unhealthy by docker, when
# the filesystem holding the current directory is more than 80% full, or
# when memory usage exceeds 85%.
#
# Globals:  BLUE, RED, GREEN (read)
# Outputs:  the list of alerts, or an all-clear line, on stdout
# Returns:  1 if at least one alert was raised, 0 otherwise
check_alerts() {
  print_message "$BLUE" "检查系统告警..."
  local alerts=()
  local unhealthy_containers disk_usage mem_usage alert

  # Container status
  unhealthy_containers=$(docker ps -a --format "{{.Names}}" --filter "health=unhealthy") \
    || unhealthy_containers=""
  if [ -n "$unhealthy_containers" ]; then
    # docker prints one name per line; join them so the alert stays on one line.
    alerts+=("不健康的容器: ${unhealthy_containers//$'\n'/ }")
  fi

  # Disk usage. -P forces one output line per filesystem; without it a long
  # device name is wrapped and the percentage column is not on line 2.
  disk_usage=$(df -P . | awk 'NR==2 {sub(/%/, "", $5); print $5}') || disk_usage=""
  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && [ "$disk_usage" -gt 80 ]; then
    alerts+=("磁盘使用率过高: ${disk_usage}%")
  fi

  # Memory usage (guarded against a zero/absent total).
  mem_usage=$(free | awk 'NR==2 && $2 > 0 {printf "%.0f", $3*100/$2}') || mem_usage=""
  if [[ "$mem_usage" =~ ^[0-9]+$ ]] && [ "$mem_usage" -gt 85 ]; then
    alerts+=("内存使用率过高: ${mem_usage}%")
  fi

  # Report alerts
  if [ "${#alerts[@]}" -gt 0 ]; then
    print_message "$RED" "发现告警:"
    for alert in "${alerts[@]}"; do
      print_message "$RED" "$alert"
    done
    return 1
  else
    print_message "$GREEN" "✓ 无告警"
    return 0
  fi
}
# Show help information.
# Outputs: usage text listing the available commands and some examples on
#          stdout; "$0" is used as the program name.
show_help() {
  # One argument per output line; %s prints each argument verbatim.
  printf '%s\n' \
    "QAUP 系统监控脚本" \
    "" \
    "用法: $0 [命令] [选项]" \
    "" \
    "命令:" \
    " status 检查系统状态" \
    " health 检查服务健康状态" \
    " resources 检查资源使用情况" \
    " logs 检查日志文件" \
    " network 检查网络连接" \
    " alerts 检查系统告警" \
    " report 生成监控报告" \
    " watch 持续监控模式" \
    "" \
    "示例:" \
    " $0 status # 检查系统状态" \
    " $0 watch 30 # 每30秒监控一次" \
    " $0 report # 生成监控报告"
}
# Main function.
#
# Dispatches the first argument to the matching check; remaining arguments
# are forwarded only to watch mode. With no arguments the help text is shown
# and the script exits 0; an unknown command prints an error plus the help
# text and exits 1.
#
# Arguments: $1 - command name, $2.. - command options
# Globals:   RED (read)
main() {
  if (( $# == 0 )); then
    show_help
    exit 0
  fi

  local action="$1"
  shift

  case "$action" in
    status)    check_container_status ;;
    health)    check_service_health ;;
    resources) check_resource_usage ;;
    logs)      check_log_sizes ;;
    network)   check_network_connectivity ;;
    alerts)    check_alerts ;;
    report)    generate_monitoring_report ;;
    watch)     continuous_monitoring "$@" ;;
    help | --help | -h)
      show_help
      ;;
    *)
      print_message "$RED" "未知命令: $action"
      show_help
      exit 1
      ;;
  esac
}
# Script entry point: pass every command-line argument through to main.
main "$@"