#!/bin/bash
#
# QAUP system monitoring script.
#
# Inspects the QAUP Docker stack (PostgreSQL, Redis, application, Nginx):
# container state, service health, resource usage, log sizes, network
# connectivity and alert thresholds. Run without arguments for usage.

# Abort on any unhandled command failure.
set -e

# Directory containing this script, and the project root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Path of the project's .env file (defined here for use by the script).
ENV_FILE="$PROJECT_ROOT/.env"

# Colour output: ANSI escape sequences, stored unexpanded and interpreted
# at print time by print_message.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'

#######################################
# Print a message wrapped in a colour escape and the reset sequence.
# Globals:   NC (read) - colour reset sequence
# Arguments: $1 - colour escape sequence (backslash escapes are expanded)
#            $2 - message text (printed literally)
# Outputs:   the coloured message and a newline on stdout
#######################################
print_message() {
  local color=${1-}
  local message=${2-}
  # %b expands the escapes in the colour codes only; %s keeps the message
  # verbatim so backslashes in paths or status text are not reinterpreted.
  printf '%b%s%b\n' "$color" "$message" "${NC-}"
}

#######################################
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
# Outputs: the timestamp on stdout
#######################################
get_timestamp() {
  date '+%Y-%m-%d %H:%M:%S'
}

#######################################
# Check that every expected QAUP container is running and not unhealthy.
# Globals:   BLUE, GREEN, RED, YELLOW (read)
# Outputs:   one status line per container, plus a summary of failures
# Returns:   0 if all containers are fine, 1 otherwise
#######################################
check_container_status() {
  print_message "$BLUE" "检查容器状态..."

  local -a containers=("qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx")
  local -a unhealthy_containers=()
  local listing container name state status found

  # Take one snapshot of running containers (name<TAB>status) instead of
  # querying docker twice per container. If docker itself fails, every
  # container is reported as not running.
  listing=$(docker ps --format '{{.Names}}\t{{.Status}}') || listing=""

  for container in "${containers[@]}"; do
    found=0
    status=""
    # Exact name comparison: a substring match would let e.g.
    # "qaup-app-worker" stand in for "qaup-app".
    while IFS=$'\t' read -r name state; do
      if [[ "$name" == "$container" ]]; then
        found=1
        status=$state
        break
      fi
    done <<< "$listing"

    if (( found == 0 )); then
      print_message "$RED" "✗ $container: 未运行"
      unhealthy_containers+=("$container")
    elif [[ "$status" == Up* && "$status" != *"(unhealthy)"* ]]; then
      # "Up ..." with no failing health check (healthy, starting, or no
      # health check configured) counts as running normally.
      print_message "$GREEN" "✓ $container: 运行正常"
    else
      print_message "$RED" "✗ $container: $status"
      unhealthy_containers+=("$container")
    fi
  done

  if (( ${#unhealthy_containers[@]} > 0 )); then
    print_message "$YELLOW" "异常容器: ${unhealthy_containers[*]}"
    return 1
  fi

  return 0
}

#######################################
# Run a probe command silently and report whether it succeeded.
# Globals:   GREEN, RED (read)
# Arguments: $1 - text printed after "✓ " on success
#            $2 - text printed after "✗ " on failure
#            $3... - the probe command and its arguments
# Returns:   0 if the probe succeeded, 1 otherwise
#######################################
_report_probe() {
  local ok_text=$1
  local fail_text=$2
  shift 2

  if "$@" &>/dev/null; then
    print_message "$GREEN" "✓ $ok_text"
    return 0
  fi
  print_message "$RED" "✗ $fail_text"
  return 1
}

#######################################
# Probe PostgreSQL, Redis, the application health endpoint and Nginx.
# Globals:   BLUE (read)
#            HEALTH_CHECK_TIMEOUT (read, optional) - per-request curl
#              timeout in seconds, default 10
# Outputs:   one result line per service
# Returns:   the number of failed probes (0-4)
#######################################
check_service_health() {
  print_message "$BLUE" "检查服务健康状态..."

  local errors=0
  # Bound each HTTP probe so a hung endpoint cannot block the monitor.
  local -r http_timeout="${HEALTH_CHECK_TIMEOUT:-10}"

  # Database connection
  _report_probe "PostgreSQL: 连接正常" "PostgreSQL: 连接失败" \
    docker exec qaup-postgres pg_isready -h localhost -p 5432 -U postgres \
    || errors=$((errors + 1))

  # Redis connection
  _report_probe "Redis: 连接正常" "Redis: 连接失败" \
    docker exec qaup-redis redis-cli ping \
    || errors=$((errors + 1))

  # Application health endpoint
  _report_probe "QAUP 应用: 健康检查通过" "QAUP 应用: 健康检查失败" \
    curl -f -s --max-time "$http_timeout" http://localhost:8080/actuator/health \
    || errors=$((errors + 1))

  # Nginx
  _report_probe "Nginx: 健康检查通过" "Nginx: 健康检查失败" \
    curl -f -s --max-time "$http_timeout" http://localhost/health \
    || errors=$((errors + 1))

  return "$errors"
}

#######################################
# Print container resource usage plus host CPU, memory and disk usage.
# Globals:   BLUE, GREEN (read)
# Outputs:   a docker stats table followed by three host summary lines
#######################################
check_resource_usage() {
  print_message "$BLUE" "检查资源使用情况..."

  # Per-container resource usage
  echo ""
  print_message "$BLUE" "容器资源使用:"
  docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}"

  echo ""
  print_message "$BLUE" "系统资源使用:"

  local cpu_usage mem_human mem_percent mem_info disk_usage

  # CPU: second field of top's "Cpu(s)" summary line, i.e. the user-space
  # share; the sed strips the "%us," suffix some top versions attach to it.
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')
  print_message "$GREEN" "CPU 使用率: $cpu_usage"

  # Memory: take the display values from `free -h`, but compute the
  # percentage from the unscaled figures. Human-readable values may carry
  # different unit suffixes (e.g. 512Mi used of 15Gi), which would make a
  # ratio of their numeric prefixes meaningless.
  mem_human=$(free -h | awk 'NR==2 {printf "%s/%s", $3, $2}')
  mem_percent=$(free | awk 'NR==2 && $2 > 0 {printf "%.2f", $3 * 100 / $2}')
  mem_info="已使用: ${mem_human} (${mem_percent:-0.00}%)"
  print_message "$GREEN" "内存使用: $mem_info"

  # Disk: -P keeps each filesystem on a single line even when the device
  # name is long, so the NR==2 row always has all columns.
  disk_usage=$(df -hP . | awk 'NR==2 {printf "已使用: %s/%s (%s)", $3, $2, $5}')
  print_message "$GREEN" "磁盘使用: $disk_usage"
}

#######################################
# Report the total size of each log directory and list oversized *.log files.
# Globals:   BLUE, GREEN, YELLOW (read)
# Arguments: $@ - optional log directories to inspect; defaults to the
#                 four ./logs/<service> directories (relative to the
#                 current working directory)
# Outputs:   one size line per existing directory, then any *.log file
#            larger than 100 MiB with a log-rotation hint
#######################################
check_log_sizes() {
  print_message "$BLUE" "检查日志文件大小..."

  local -r threshold_bytes=104857600 # 100 MiB
  local -a log_dirs=("$@")
  if (( ${#log_dirs[@]} == 0 )); then
    log_dirs=("./logs/postgres" "./logs/redis" "./logs/app" "./logs/nginx")
  fi
  local -a large_logs=()
  local log_dir total_size file log size

  for log_dir in "${log_dirs[@]}"; do
    # Directories that do not exist are skipped silently.
    [[ -d "$log_dir" ]] || continue

    total_size=$(du -sh "$log_dir" 2>/dev/null | cut -f1)
    print_message "$GREEN" "✓ $log_dir: $total_size"

    # Let find apply the size filter ("+Nc" = strictly more than N bytes).
    # This avoids the platform-specific stat call and the broken numeric
    # test that resulted when stat produced no output.
    while IFS= read -r -d '' file; do
      large_logs+=("$file")
    done < <(find "$log_dir" -type f -name "*.log" \
      -size "+${threshold_bytes}c" -print0 2>/dev/null)
  done

  if (( ${#large_logs[@]} > 0 )); then
    print_message "$YELLOW" "发现大日志文件:"
    for log in "${large_logs[@]}"; do
      size=$(du -sh "$log" | cut -f1)
      print_message "$YELLOW" " $log: $size"
    done
    print_message "$YELLOW" "建议配置日志轮转"
  fi
}

#######################################
# Check TCP reachability between the QAUP containers using nc inside them.
# Globals:   BLUE, GREEN, RED (read)
# Outputs:   one result line per link
# Returns:   the number of links that could not be established (0-3), so
#            callers can detect failures as with the other checks
#######################################
check_network_connectivity() {
  print_message "$BLUE" "检查网络连接..."

  # Each entry: source container | target host | target port | label
  local -a links=(
    "qaup-app|qaup-postgres|5432|应用 -> 数据库"
    "qaup-app|qaup-redis|6379|应用 -> Redis"
    "qaup-nginx|qaup-app|8080|Nginx -> 应用"
  )
  local -r connect_timeout=3 # seconds; keeps an unreachable peer from stalling
  local failures=0
  local link source target port label

  for link in "${links[@]}"; do
    IFS='|' read -r source target port label <<< "$link"
    if docker exec "$source" nc -z -w "$connect_timeout" "$target" "$port" &>/dev/null; then
      print_message "$GREEN" "✓ ${label}: 连接正常"
    else
      print_message "$RED" "✗ ${label}: 连接失败"
      failures=$((failures + 1))
    fi
  done

  return "$failures"
}

#######################################
# Write a plain-text monitoring report (containers, resources, ports).
# Globals:   BLUE, GREEN (read)
# Arguments: $1 - optional output directory (default: ./logs); created if
#                 missing
# Outputs:   progress messages on stdout; the report in
#            <dir>/monitoring_report_<YYYYmmdd_HHMMSS>.txt
# Returns:   0 on success, 1 if the directory or file cannot be written
#######################################
generate_monitoring_report() {
  local report_dir=${1:-./logs}
  local timestamp report_file

  timestamp=$(get_timestamp)
  report_file="${report_dir}/monitoring_report_$(date +%Y%m%d_%H%M%S).txt"

  print_message "$BLUE" "生成监控报告: $report_file"

  mkdir -p -- "$report_dir" || return 1

  # Each section is best-effort: a failing tool leaves its section empty
  # instead of aborting the whole script under `set -e` with a truncated
  # report.
  {
    echo "QAUP 系统监控报告"
    echo "生成时间: $timestamp"
    echo "========================================"
    echo ""

    echo "容器状态:"
    docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" || true
    echo ""

    echo "资源使用情况:"
    docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" || true
    echo ""

    echo "系统资源:"
    free -h || true
    echo ""
    df -h || true
    echo ""

    echo "网络状态:"
    # Use ss when netstat is not installed. grep exits 1 when none of the
    # ports is listening, which must not be treated as a failure.
    if command -v netstat >/dev/null 2>&1; then
      netstat -tuln
    elif command -v ss >/dev/null 2>&1; then
      ss -tuln
    fi | grep -E ":(80|443|8080|5432|6379) " || true
    echo ""
  } > "$report_file" || return 1

  print_message "$GREEN" "监控报告已生成: $report_file"
}

#######################################
# Repeatedly clear the screen and run the status, health and resource
# checks until interrupted.
# Globals:   BLUE, GREEN, YELLOW, RED (read)
# Arguments: $1 - seconds between rounds (non-negative number, default 60)
# Returns:   1 if the interval is not a number; otherwise loops until the
#            process is interrupted (Ctrl+C)
#######################################
continuous_monitoring() {
  local interval=${1:-60} # default: 60-second interval
  local -r number_re='^[0-9]+([.][0-9]+)?$'
  local next_check

  if ! [[ "$interval" =~ $number_re ]]; then
    print_message "$RED" "无效的监控间隔: $interval" >&2
    return 1
  fi

  print_message "$GREEN" "开始持续监控模式 (间隔: ${interval}秒)"
  print_message "$YELLOW" "按 Ctrl+C 停止监控"

  while true; do
    # clear can fail without a usable terminal; that must not end the loop.
    clear || true
    print_message "$BLUE" "========================================="
    print_message "$BLUE" "QAUP 系统监控 - $(get_timestamp)"
    print_message "$BLUE" "========================================="

    # The checks return non-zero when they find problems. Under `set -e`
    # an unguarded call would terminate monitoring at exactly the moment
    # something is wrong, so their status is deliberately ignored here.
    check_container_status || true
    echo ""
    check_service_health || true
    echo ""
    check_resource_usage || true

    print_message "$BLUE" "========================================="
    # GNU date first, BSD date as a fallback; blank if neither form works.
    next_check=$(date -d "+${interval} seconds" '+%H:%M:%S' 2>/dev/null) \
      || next_check=$(date -v "+${interval}S" '+%H:%M:%S' 2>/dev/null) \
      || next_check=""
    print_message "$YELLOW" "下次检查: $next_check"

    sleep "$interval"
  done
}

#######################################
# Evaluate alert conditions: unhealthy containers, disk and memory usage.
# Globals:   BLUE, GREEN, RED (read)
# Arguments: $1 - disk usage alert threshold in percent (default 80)
#            $2 - memory usage alert threshold in percent (default 85)
# Outputs:   the list of alerts, or a "no alerts" line
# Returns:   1 if any alert fired, 0 otherwise
#######################################
check_alerts() {
  local disk_threshold=${1:-80}
  local mem_threshold=${2:-85}
  local -r integer_re='^[0-9]+$'

  print_message "$BLUE" "检查系统告警..."

  local -a alerts=()
  local unhealthy_containers disk_usage mem_usage alert

  # Containers whose health check reports unhealthy, joined on one line.
  unhealthy_containers=$(docker ps -a --format "{{.Names}}" --filter "health=unhealthy" | tr '\n' ' ')
  unhealthy_containers=${unhealthy_containers% }
  if [[ -n "$unhealthy_containers" ]]; then
    alerts+=("不健康的容器: $unhealthy_containers")
  fi

  # Disk usage of the filesystem holding the current directory. -P keeps
  # the data row on one line. A value that is missing or not an integer
  # is skipped rather than fed to the numeric comparison.
  disk_usage=$(df -P . | awk 'NR==2 {sub(/%/, "", $5); print $5}')
  if [[ "$disk_usage" =~ $integer_re ]] && [ "$disk_usage" -gt "$disk_threshold" ]; then
    alerts+=("磁盘使用率过高: ${disk_usage}%")
  fi

  # Memory usage: used/total from the second row of free, guarded against
  # a zero total.
  mem_usage=$(free | awk 'NR==2 && $2 > 0 {printf "%.0f", $3 * 100 / $2}')
  if [[ "$mem_usage" =~ $integer_re ]] && [ "$mem_usage" -gt "$mem_threshold" ]; then
    alerts+=("内存使用率过高: ${mem_usage}%")
  fi

  # Report
  if (( ${#alerts[@]} > 0 )); then
    print_message "$RED" "发现告警:"
    for alert in "${alerts[@]}"; do
      print_message "$RED" " ⚠ $alert"
    done
    return 1
  fi

  print_message "$GREEN" "✓ 无告警"
  return 0
}

#######################################
# Print usage information: available commands and examples.
# Outputs: the help text on stdout
#######################################
show_help() {
  # Unquoted delimiter: $0 is expanded to the name the script was run as.
  cat <<EOF
QAUP 系统监控脚本

用法: $0 [命令] [选项]

命令:
 status 检查系统状态
 health 检查服务健康状态
 resources 检查资源使用情况
 logs 检查日志文件
 network 检查网络连接
 alerts 检查系统告警
 report 生成监控报告
 watch 持续监控模式
 help 显示帮助信息

示例:
 $0 status # 检查系统状态
 $0 watch 30 # 每30秒监控一次
 $0 report # 生成监控报告
EOF
}

#######################################
# Entry point: dispatch the first argument to the matching check.
# Globals:   RED (read)
# Arguments: $1 - command name (see show_help); remaining arguments are
#                 forwarded to the watch command
# Returns:   the status of the selected check; exits 0 after printing
#            help when no command is given, exits 1 on an unknown command
#######################################
main() {
  if (( $# == 0 )); then
    show_help
    exit 0
  fi

  local subcommand=$1
  shift

  case "$subcommand" in
    status)
      check_container_status
      ;;
    health)
      check_service_health
      ;;
    resources)
      check_resource_usage
      ;;
    logs)
      check_log_sizes
      ;;
    network)
      check_network_connectivity
      ;;
    alerts)
      check_alerts
      ;;
    report)
      generate_monitoring_report
      ;;
    watch)
      continuous_monitoring "$@"
      ;;
    help | --help | -h)
      show_help
      ;;
    *)
      # Usage errors are diagnostics, so they go to stderr.
      print_message "$RED" "未知命令: $subcommand" >&2
      show_help >&2
      exit 1
      ;;
  esac
}

# Run the dispatcher with the script's command-line arguments.
main "$@"