#!/bin/bash
# QAUP operations management script.
#
# Provides a system overview, quick diagnosis, health checks, log viewing,
# cleanup, performance analysis, security checks, backup and service
# management, and a Markdown operations report. Run with `help` for the
# list of commands.
#
# Only `set -e` is enabled. `pipefail` is deliberately left off: several
# checks pipe into `grep -q` / `head`, which close the pipe early, and the
# producer's resulting SIGPIPE must not be treated as a failure.
set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Colored output (the escape sequences are interpreted by print_message).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Containers that make up the QAUP stack.
QAUP_CONTAINERS=("qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx")

# File name pattern of full backups stored under $PROJECT_ROOT/backup.
BACKUP_PATTERN='qaup_full_backup_*.tar.gz'

#######################################
# Print a message wrapped in an ANSI color.
# Arguments: $1 - color escape sequence, $2 - message text
# Outputs:   the colored message on stdout
#######################################
print_message() {
  local color=$1
  local message=$2
  # %b expands the escapes in the color codes only; the message itself is
  # printed verbatim so stray backslashes in it are never interpreted.
  printf '%b%s%b\n' "$color" "$message" "$NC"
}

# Succeeds when a container with exactly the given name is running.
is_container_running() {
  docker ps --format '{{.Names}}' | grep -Fxq -- "$1"
}

# Prints the status of the running container with exactly the given name.
# (`docker ps --filter name=` is a substring match, so it is not used here.)
container_status() {
  docker ps --format $'{{.Names}}\t{{.Status}}' |
    awk -F'\t' -v name="$1" '$1 == name { print $2; exit }'
}

# Prints the listening TCP/UDP sockets using netstat, or ss when netstat is
# not installed. Returns non-zero when neither tool is available.
list_listening_sockets() {
  if command -v netstat >/dev/null 2>&1; then
    netstat -tuln 2>/dev/null
  elif command -v ss >/dev/null 2>&1; then
    ss -tuln 2>/dev/null
  else
    return 1
  fi
}

# Reads a socket listing on stdin and succeeds when port $1 is bound to a
# wildcard address (0.0.0.0, *, [::] or the netstat form ":::port").
listing_exposes_port() {
  local port=$1
  grep -Eq "(^|[[:space:]])(0\.0\.0\.0|\*|\[::\]|::):${port}([[:space:]]|$)"
}

# Succeeds when the URL answers with a successful HTTP status within 10s.
http_ok() {
  curl -f -s --max-time 10 "$1" &>/dev/null
}

# Prints the uptime in human-readable form, falling back to parsing plain
# `uptime` output where `uptime -p` is unsupported.
uptime_summary() {
  uptime -p 2>/dev/null || uptime | awk '{print $3,$4}' | sed 's/,//'
}

# Prints the system load averages.
load_average() {
  uptime | awk -F'load average:' '{print $2}' | xargs
}

# Prints "used/total (pct%)" for RAM. The percentage is computed from the
# plain `free` output: the `free -h` columns carry unit suffixes, so dividing
# them gives nonsense whenever used and total are in different units.
memory_usage_summary() {
  local human percent
  human=$(free -h 2>/dev/null | awk 'NR==2 { printf "%s/%s", $3, $2 }') || human=""
  percent=$(free 2>/dev/null |
    awk 'NR==2 && $2 > 0 { printf "%.2f", $3 * 100 / $2 }') || percent=""
  if [[ -n "$human" ]]; then
    printf '%s (%s%%)' "$human" "$percent"
  fi
}

# Prints "used/total (pct)" for the filesystem holding the current directory.
# -P keeps each filesystem on one line even when the device name is long.
disk_usage_summary() {
  df -hP . | awk 'NR==2 { printf "%s/%s (%s)", $3, $2, $5 }'
}

# Prints the Docker client version number.
docker_version() {
  docker --version | cut -d' ' -f3 | sed 's/,//'
}

# Prints the number of full backup files in the backup directory.
count_backups() {
  find "$PROJECT_ROOT/backup" -type f -name "$BACKUP_PATTERN" 2>/dev/null | wc -l
}

# Prints the path of the most recently modified full backup file.
latest_backup() {
  find "$PROJECT_ROOT/backup" -type f -name "$BACKUP_PATTERN" \
    -printf '%T@ %p\n' 2>/dev/null |
    sort -n | tail -1 | cut -d' ' -f2-
}

# Prints the "YYYYMMDD HHMMSS" timestamp embedded in a backup file name.
# Only the base name is searched so digits in directory names cannot match.
backup_timestamp() {
  basename -- "$1" | grep -o '[0-9]\{8\}_[0-9]\{6\}' | head -1 | sed 's/_/ /'
}

# Runs $PROJECT_ROOT/deploy.sh with the given arguments; fails with a
# message when the deploy script is missing or not executable.
run_deploy() {
  local deploy_script="$PROJECT_ROOT/deploy.sh"
  if [[ ! -x "$deploy_script" ]]; then
    print_message "$RED" "部署脚本不存在: $deploy_script"
    return 1
  fi
  "$deploy_script" "$@"
}

#######################################
# Show the system overview: host info, resource usage, Docker info,
# per-container status and listening ports.
#######################################
show_system_overview() {
  local container status port socket_listing

  print_message "$GREEN" "========================================="
  print_message "$GREEN" "QAUP 系统概览"
  print_message "$GREEN" "========================================="

  # Basic system information
  echo ""
  print_message "$BLUE" "系统信息:"
  echo " 主机名: $(hostname)"
  echo " 操作系统: $(uname -s) $(uname -r)"
  echo " 当前时间: $(date)"
  echo " 运行时间: $(uptime_summary)"

  # Resource usage
  echo ""
  print_message "$BLUE" "资源使用:"
  echo " CPU 负载: $(load_average)"
  echo " 内存使用: $(memory_usage_summary)"
  echo " 磁盘使用: $(disk_usage_summary)"

  # Docker information
  echo ""
  print_message "$BLUE" "Docker 信息:"
  echo " Docker 版本: $(docker_version)"
  echo " 运行容器数: $(docker ps -q | wc -l)"
  echo " 总容器数: $(docker ps -aq | wc -l)"
  echo " 镜像数量: $(docker images -q | wc -l)"

  # Service status
  echo ""
  print_message "$BLUE" "服务状态:"
  for container in "${QAUP_CONTAINERS[@]}"; do
    if is_container_running "$container"; then
      status=$(container_status "$container") || status=""
      print_message "$GREEN" " ✓ $container: $status"
    else
      print_message "$RED" " ✗ $container: 未运行"
    fi
  done

  # Network status (the socket listing is captured once and reused)
  echo ""
  print_message "$BLUE" "网络状态:"
  socket_listing=$(list_listening_sockets) || socket_listing=""
  for port in 80 443 8080; do
    if grep -q ":$port " <<<"$socket_listing"; then
      print_message "$GREEN" " ✓ 端口 $port: 监听中"
    else
      print_message "$YELLOW" " ⚠ 端口 $port: 未监听"
    fi
  done

  echo ""
}

#######################################
# Quick diagnosis: collects a list of problems (Docker daemon, containers,
# disk and memory thresholds, HTTP endpoints) and prints it with advice.
#######################################
quick_diagnosis() {
  local -a issues=()
  local container disk_usage mem_usage issue

  print_message "$BLUE" "执行快速诊断..."

  # Docker daemon: accept either a systemd "active" unit or a reachable
  # daemon, so hosts without systemd are not reported as broken.
  if ! systemctl is-active --quiet docker 2>/dev/null && ! docker info &>/dev/null; then
    issues+=("Docker 服务未运行")
  fi

  # Container state
  for container in "${QAUP_CONTAINERS[@]}"; do
    if ! is_container_running "$container"; then
      issues+=("容器 $container 未运行")
    fi
  done

  # Disk space (only compared when df produced a number)
  disk_usage=$(df -P . 2>/dev/null | awk 'NR==2 { sub(/%/, "", $5); print $5 }') || disk_usage=""
  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && ((disk_usage > 85)); then
    issues+=("磁盘使用率过高: ${disk_usage}%")
  fi

  # Memory usage (only compared when free produced a number)
  mem_usage=$(free 2>/dev/null |
    awk 'NR==2 && $2 > 0 { printf "%.0f", $3 * 100 / $2 }') || mem_usage=""
  if [[ "$mem_usage" =~ ^[0-9]+$ ]] && ((mem_usage > 90)); then
    issues+=("内存使用率过高: ${mem_usage}%")
  fi

  # Service responsiveness
  if ! http_ok "http://localhost/health"; then
    issues+=("前端服务无响应")
  fi
  if ! http_ok "http://localhost:8080/actuator/health"; then
    issues+=("后端服务无响应")
  fi

  # Diagnosis result
  echo ""
  if ((${#issues[@]} == 0)); then
    print_message "$GREEN" "✓ 系统状态正常,未发现问题"
  else
    print_message "$YELLOW" "发现以下问题:"
    for issue in "${issues[@]}"; do
      echo " - $issue"
    done
    echo ""
    print_message "$BLUE" "建议操作:"
    echo " 1. 运行详细检查: $0 health-check"
    echo " 2. 查看系统日志: $0 logs"
    echo " 3. 重启服务: ./deploy.sh restart"
  fi
  echo ""
}

#######################################
# Health check. Delegates to docker/integration-test.sh when it is
# executable; otherwise checks containers and HTTP endpoints directly.
# Returns: 0 when every check passes, 1 otherwise.
#######################################
health_check() {
  local integration_test="$PROJECT_ROOT/docker/integration-test.sh"
  local health_ok=true
  local container

  print_message "$BLUE" "执行系统健康检查..."

  # Prefer the health-check part of the integration tests.
  if [[ -x "$integration_test" ]]; then
    "$integration_test" quick
    return
  fi

  print_message "$YELLOW" "集成测试脚本不存在,执行基本健康检查"

  # Container state
  for container in "${QAUP_CONTAINERS[@]}"; do
    if is_container_running "$container"; then
      print_message "$GREEN" "✓ $container 运行正常"
    else
      print_message "$RED" "✗ $container 未运行"
      health_ok=false
    fi
  done

  # Service responsiveness
  if http_ok "http://localhost/health"; then
    print_message "$GREEN" "✓ 前端服务响应正常"
  else
    print_message "$RED" "✗ 前端服务无响应"
    health_ok=false
  fi

  if http_ok "http://localhost:8080/actuator/health"; then
    print_message "$GREEN" "✓ 后端服务响应正常"
  else
    print_message "$RED" "✗ 后端服务无响应"
    health_ok=false
  fi

  if [[ "$health_ok" == true ]]; then
    print_message "$GREEN" "✓ 系统健康检查通过"
    return 0
  fi
  print_message "$RED" "✗ 系统健康检查失败"
  return 1
}

# Prints one titled section of container logs for `view_logs all`.
# Arguments: $1 - section title, $2 - container name,
#            $3 - message shown when logs are unavailable, $4 - line count
print_log_section() {
  local title=$1 container=$2 missing_msg=$3 lines=$4

  echo ""
  print_message "$BLUE" "$title"
  if docker container inspect "$container" &>/dev/null; then
    # `docker logs` replays the container's stderr on stderr; merge it so
    # services that log to stderr are not shown as empty.
    docker logs --tail "$lines" "$container" 2>&1 || echo "$missing_msg"
  else
    echo "$missing_msg"
  fi
}

#######################################
# View logs.
# Arguments: $1 - service (all, app, nginx, postgres, redis; default all)
#            $2 - number of trailing lines (default 50)
# Returns:   1 for an unknown service. A single service is followed (-f).
#######################################
view_logs() {
  local service=${1:-"all"}
  local lines=${2:-50}

  print_message "$BLUE" "查看系统日志 (最近 $lines 行)..."

  case "$service" in
    all)
      print_log_section "=== 应用日志 ===" "qaup-app" "应用容器未运行" "$lines"
      print_log_section "=== Nginx 日志 ===" "qaup-nginx" "Nginx 容器未运行" "$lines"
      print_log_section "=== 数据库日志 ===" "qaup-postgres" "数据库容器未运行" "$lines"
      print_log_section "=== Redis 日志 ===" "qaup-redis" "Redis 容器未运行" "$lines"
      ;;
    app | application)
      docker logs --tail "$lines" -f qaup-app
      ;;
    nginx | web)
      docker logs --tail "$lines" -f qaup-nginx
      ;;
    postgres | database | db)
      docker logs --tail "$lines" -f qaup-postgres
      ;;
    redis | cache)
      docker logs --tail "$lines" -f qaup-redis
      ;;
    *)
      print_message "$RED" "未知服务: $service"
      print_message "$BLUE" "可用服务: all, app, nginx, postgres, redis"
      return 1
      ;;
  esac
}

#######################################
# System cleanup after interactive confirmation: exited containers,
# dangling images, unused networks and volumes, old logs and temp files.
# Each step is best-effort; a failing step is reported and the rest still run.
#######################################
system_cleanup() {
  local reply=""
  local cleaned_items=0
  local -a stopped_containers=()
  local -a unused_images=()
  local log_manager="$PROJECT_ROOT/docker/log-manager.sh"

  print_message "$BLUE" "执行系统清理..."

  # Confirmation. A failed read (EOF / non-interactive stdin) counts as "no"
  # instead of silently killing the script through `set -e`.
  read -r -n 1 -p "确定要执行系统清理吗? (y/N): " reply || reply=""
  echo
  if [[ ! "$reply" =~ ^[Yy]$ ]]; then
    print_message "$BLUE" "清理操作已取消"
    return
  fi

  print_message "$BLUE" "清理 Docker 资源..."

  # Exited containers
  mapfile -t stopped_containers < <(docker ps -aq --filter "status=exited")
  if ((${#stopped_containers[@]} > 0)); then
    if docker rm "${stopped_containers[@]}"; then
      cleaned_items=$((cleaned_items + ${#stopped_containers[@]}))
      print_message "$GREEN" "✓ 已清理停止的容器"
    else
      print_message "$YELLOW" "⚠ 部分停止的容器清理失败"
    fi
  fi

  # Dangling images
  mapfile -t unused_images < <(docker images -f "dangling=true" -q)
  if ((${#unused_images[@]} > 0)); then
    if docker rmi "${unused_images[@]}"; then
      cleaned_items=$((cleaned_items + ${#unused_images[@]}))
      print_message "$GREEN" "✓ 已清理未使用的镜像"
    else
      print_message "$YELLOW" "⚠ 部分未使用的镜像清理失败"
    fi
  fi

  # Unused networks
  if docker network prune -f &>/dev/null; then
    print_message "$GREEN" "✓ 已清理未使用的网络"
  else
    print_message "$YELLOW" "⚠ 清理未使用的网络失败"
  fi

  # Unused volumes.
  # NOTE(review): exited containers were removed above, so any volume that
  # only they referenced is now "unused". Depending on the Docker version,
  # `volume prune` may delete named data volumes too - confirm this is
  # intended for stopped database containers.
  if docker volume prune -f &>/dev/null; then
    print_message "$GREEN" "✓ 已清理未使用的卷"
  else
    print_message "$YELLOW" "⚠ 清理未使用的卷失败"
  fi

  # Old log files
  if [[ -x "$log_manager" ]]; then
    if "$log_manager" cleanup 30; then
      print_message "$GREEN" "✓ 已清理旧日志文件"
    else
      print_message "$YELLOW" "⚠ 清理旧日志文件失败"
    fi
  fi

  # Temporary files older than one day (best-effort by design)
  find /tmp -name "*qaup*" -mtime +1 -delete 2>/dev/null || true
  print_message "$GREEN" "✓ 已清理临时文件"

  print_message "$GREEN" "系统清理完成,共清理 $cleaned_items 个项目"
}

# Runs a single-value SQL query inside the qaup-postgres container and
# prints the trimmed result (empty on failure).
psql_scalar() {
  docker exec qaup-postgres psql -U postgres -d qaup -t -c "$1" 2>/dev/null | xargs
}

#######################################
# Performance analysis: CPU, memory, disk, container stats, sockets,
# top processes, PostgreSQL and Redis figures. Every section is
# best-effort so one unavailable tool does not hide the other sections.
#######################################
performance_analysis() {
  local cpu_usage db_connections db_size redis_memory redis_keys

  print_message "$BLUE" "执行性能分析..."

  echo ""
  print_message "$BLUE" "=== 系统资源使用 ==="

  # CPU usage
  cpu_usage=$(top -bn1 | awk '/Cpu\(s\)/ { print $2; exit }' | sed 's/%us,//') || cpu_usage=""
  echo "CPU 使用率: $cpu_usage"

  # Memory details
  echo ""
  echo "内存使用详情:"
  free -h || true

  # Disk usage
  echo ""
  echo "磁盘使用情况:"
  df -h || true

  # Container resource usage
  echo ""
  print_message "$BLUE" "=== 容器资源使用 ==="
  docker stats --no-stream \
    --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}" || true

  # Network connections
  echo ""
  print_message "$BLUE" "=== 网络连接状态 ==="
  list_listening_sockets | grep -E ":(80|443|8080|5432|6379) " | head -10

  # Processes
  echo ""
  print_message "$BLUE" "=== 高资源使用进程 ==="
  ps aux --sort=-%cpu | head -10

  # Database
  echo ""
  print_message "$BLUE" "=== 数据库性能 ==="
  if is_container_running "qaup-postgres"; then
    # The fallback is applied to the captured value: `pipeline || echo N/A`
    # never fires because the pipeline status is that of its last stage.
    db_connections=$(psql_scalar "SELECT count(*) FROM pg_stat_activity;") || db_connections=""
    echo "数据库连接数: ${db_connections:-N/A}"

    db_size=$(psql_scalar "SELECT pg_size_pretty(pg_database_size('qaup'));") || db_size=""
    echo "数据库大小: ${db_size:-N/A}"
  else
    echo "数据库容器未运行"
  fi

  # Redis
  echo ""
  print_message "$BLUE" "=== Redis 性能 ==="
  if is_container_running "qaup-redis"; then
    redis_memory=$(docker exec qaup-redis redis-cli info memory 2>/dev/null |
      grep "used_memory_human" | cut -d: -f2 | tr -d '\r') || redis_memory=""
    echo "Redis 内存使用: ${redis_memory:-N/A}"

    redis_keys=$(docker exec qaup-redis redis-cli dbsize 2>/dev/null) || redis_keys=""
    echo "Redis 键数量: ${redis_keys:-N/A}"
  else
    echo "Redis 容器未运行"
  fi
}

#######################################
# Security check: file permissions, container users, exposed sensitive
# ports and SSL material. Prints the issues found together with advice.
#######################################
security_check() {
  local -a security_issues=()
  local env_perms key_perms container user port socket_listing issue

  print_message "$BLUE" "执行安全检查..."

  # File permissions (owner-only: 600, or the stricter read-only 400)
  echo ""
  print_message "$BLUE" "=== 文件权限检查 ==="
  if [[ -f "$PROJECT_ROOT/.env" ]]; then
    env_perms=$(stat -c "%a" "$PROJECT_ROOT/.env") || env_perms=""
    if [[ "$env_perms" == "600" || "$env_perms" == "400" ]]; then
      print_message "$GREEN" "✓ .env 文件权限正确 ($env_perms)"
    else
      print_message "$YELLOW" "⚠ .env 文件权限不安全 ($env_perms),建议设置为 600"
      security_issues+=("环境文件权限不安全")
    fi
  fi

  # Container users
  echo ""
  print_message "$BLUE" "=== 容器安全检查 ==="
  for container in "${QAUP_CONTAINERS[@]}"; do
    if ! is_container_running "$container"; then
      continue
    fi
    user=$(docker exec "$container" whoami 2>/dev/null) || user=""
    case "$user" in
      root)
        print_message "$YELLOW" "⚠ $container 以 root 用户运行"
        security_issues+=("$container 以 root 用户运行")
        ;;
      "")
        # An unknown user must not be reported as "non-root".
        print_message "$YELLOW" "⚠ $container 无法确定运行用户"
        ;;
      *)
        print_message "$GREEN" "✓ $container 使用非 root 用户: $user"
        ;;
    esac
  done

  # Sensitive ports that should not be reachable from outside
  echo ""
  print_message "$BLUE" "=== 网络安全检查 ==="
  if socket_listing=$(list_listening_sockets); then
    for port in 5432 6379; do
      if listing_exposes_port "$port" <<<"$socket_listing"; then
        print_message "$YELLOW" "⚠ 敏感端口 $port 对外暴露"
        security_issues+=("端口 $port 对外暴露")
      else
        print_message "$GREEN" "✓ 端口 $port 未对外暴露"
      fi
    done
  else
    # Without a socket listing nothing can be verified; do not claim "safe".
    print_message "$YELLOW" "⚠ 无法检查端口状态 (未找到 netstat 或 ss)"
    security_issues+=("无法检查端口暴露情况")
  fi

  # SSL/TLS
  if [[ -d "$PROJECT_ROOT/ssl" ]]; then
    print_message "$GREEN" "✓ SSL 证书目录存在"
    if [[ -f "$PROJECT_ROOT/ssl/server.key" ]]; then
      key_perms=$(stat -c "%a" "$PROJECT_ROOT/ssl/server.key") || key_perms=""
      if [[ "$key_perms" == "600" || "$key_perms" == "400" ]]; then
        print_message "$GREEN" "✓ SSL 私钥权限正确"
      else
        print_message "$YELLOW" "⚠ SSL 私钥权限不安全"
        security_issues+=("SSL 私钥权限不安全")
      fi
    fi
  else
    print_message "$YELLOW" "⚠ 未配置 SSL 证书"
    security_issues+=("未配置 SSL 证书")
  fi

  # Result
  echo ""
  if ((${#security_issues[@]} == 0)); then
    print_message "$GREEN" "✓ 安全检查通过,未发现安全问题"
  else
    print_message "$YELLOW" "发现以下安全问题:"
    for issue in "${security_issues[@]}"; do
      echo " - $issue"
    done
    echo ""
    print_message "$BLUE" "建议操作:"
    echo " 1. 运行安全配置脚本: ./docker/security-setup.sh all"
    echo " 2. 检查防火墙设置: sudo ufw status"
    echo " 3. 更新系统和软件包"
  fi
}

# Runs docker/backup-restore.sh with the given arguments; fails with a
# message when the backup script is missing or not executable.
run_backup_script() {
  local backup_script="$PROJECT_ROOT/docker/backup-restore.sh"
  if [[ ! -x "$backup_script" ]]; then
    print_message "$RED" "备份脚本不存在"
    return 1
  fi
  "$backup_script" "$@"
}

#######################################
# Backup management.
# Arguments: $1 - action: status (default), create, list, cleanup
# Returns:   1 for an unknown action or a missing backup script.
#######################################
backup_management() {
  local action=${1:-"status"}
  local backup_count latest

  case "$action" in
    status)
      print_message "$BLUE" "备份状态:"
      if [[ ! -d "$PROJECT_ROOT/backup" ]]; then
        print_message "$YELLOW" " 备份目录不存在"
        return 0
      fi
      backup_count=$(count_backups) || backup_count=0
      echo " 备份文件数量: $backup_count"
      if ((backup_count > 0)); then
        latest=$(latest_backup) || latest=""
        echo " 最新备份: $(basename -- "$latest")"
        echo " 备份时间: $(backup_timestamp "$latest")"
        echo " 备份大小: $(du -h -- "$latest" | cut -f1)"
      else
        print_message "$YELLOW" " 未找到备份文件"
      fi
      ;;
    create)
      print_message "$BLUE" "创建系统备份..."
      run_backup_script full
      ;;
    list)
      run_backup_script list
      ;;
    cleanup)
      print_message "$BLUE" "清理旧备份..."
      run_backup_script cleanup 30
      ;;
    *)
      print_message "$RED" "未知备份操作: $action"
      echo "可用操作: status, create, list, cleanup"
      return 1
      ;;
  esac
}

#######################################
# Service management.
# Arguments: $1 - action: start, stop, restart, status (default)
#            $2 - service: all (default) or a name mapped to qaup-<name>
# Returns:   1 for an unknown action.
#######################################
service_management() {
  local action=${1:-"status"}
  local service=${2:-"all"}
  local label=""

  case "$action" in
    start | stop | restart)
      case "$action" in
        start) label="启动" ;;
        stop) label="停止" ;;
        restart) label="重启" ;;
      esac
      print_message "$BLUE" "${label}服务: $service"
      if [[ "$service" == "all" ]]; then
        run_deploy "$action"
      else
        docker "$action" "qaup-$service"
      fi
      ;;
    status)
      print_message "$BLUE" "服务状态:"
      docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" --filter "name=qaup-"
      ;;
    *)
      print_message "$RED" "未知服务操作: $action"
      echo "可用操作: start, stop, restart, status"
      return 1
      ;;
  esac
}

#######################################
# Generate a Markdown operations report in $PROJECT_ROOT/logs.
# Outputs: progress messages on stdout; the report in a timestamped file.
#######################################
generate_ops_report() {
  local report_dir="$PROJECT_ROOT/logs"
  local report_file socket_listing backup_count latest

  report_file="$report_dir/ops_report_$(date +%Y%m%d_%H%M%S).md"
  print_message "$BLUE" "生成运维报告: $report_file"

  mkdir -p -- "$report_dir"
  socket_listing=$(list_listening_sockets) || socket_listing=""

  {
    echo "# QAUP 系统运维报告"
    echo ""
    echo "生成时间: $(date)"
    echo "报告类型: 系统状态报告"
    echo ""
    echo "## 系统概览"
    echo ""
    echo "- 主机名: $(hostname)"
    echo "- 操作系统: $(uname -s) $(uname -r)"
    echo "- 运行时间: $(uptime_summary)"
    echo "- Docker 版本: $(docker_version)"
    echo ""
    echo "## 资源使用"
    echo ""
    echo "- CPU 负载: $(load_average)"
    echo "- 内存使用: $(memory_usage_summary)"
    echo "- 磁盘使用: $(disk_usage_summary)"
    echo ""
    echo "## 服务状态"
    echo ""
    # Fences are single-quoted: inside double quotes backticks start a
    # command substitution and the fence would never be written.
    echo '```'
    docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" --filter "name=qaup-" || true
    echo '```'
    echo ""
    echo "## 容器资源使用"
    echo ""
    echo '```'
    docker stats --no-stream \
      --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" || true
    echo '```'
    echo ""
    echo "## 网络状态"
    echo ""
    echo '```'
    grep -E ":(80|443|8080|5432|6379) " <<<"$socket_listing" | head -10
    echo '```'
    echo ""
    echo "## 备份状态"
    echo ""
    if [[ -d "$PROJECT_ROOT/backup" ]]; then
      backup_count=$(count_backups) || backup_count=0
      echo "- 备份文件数量: $backup_count"
      if ((backup_count > 0)); then
        latest=$(latest_backup) || latest=""
        echo "- 最新备份: $(basename -- "$latest")"
        echo "- 备份时间: $(backup_timestamp "$latest")"
        echo "- 备份大小: $(du -h -- "$latest" | cut -f1)"
      fi
    else
      echo "- 备份目录不存在"
    fi
    echo ""
    echo "## 建议操作"
    echo ""
    echo "- 定期检查系统资源使用情况"
    echo "- 定期执行系统备份"
    echo "- 监控服务健康状态"
    echo "- 清理旧日志和临时文件"
    echo "- 更新系统和软件包"
    echo ""
  } >"$report_file"

  print_message "$GREEN" "运维报告已生成: $report_file"
}

# Print the help text.
show_help() {
  cat <<EOF
QAUP 运维管理脚本

用法: $0 [命令] [参数]

系统管理命令:
 overview 显示系统概览
 diagnosis 快速诊断
 health-check 健康检查
 performance 性能分析
 security 安全检查
 cleanup 系统清理
 report 生成运维报告

服务管理命令:
 service <action> [service] 服务管理
 actions: start, stop, restart, status
 services: all, app, nginx, postgres, redis

日志管理命令:
 logs [service] [lines] 查看日志
 services: all, app, nginx, postgres, redis

备份管理命令:
 backup <action> 备份管理
 actions: status, create, list, cleanup

示例:
 $0 overview # 显示系统概览
 $0 diagnosis # 快速诊断
 $0 service restart app # 重启应用服务
 $0 logs app 100 # 查看应用最近100行日志
 $0 backup create # 创建系统备份
 $0 cleanup # 系统清理
EOF
}

#######################################
# Entry point: dispatches the first argument to the matching command.
# With no arguments the system overview is shown.
# Returns: 1 for an unknown command, otherwise the command's status.
#######################################
main() {
  if (($# == 0)); then
    show_system_overview
    return 0
  fi

  local cmd=$1
  shift

  case "$cmd" in
    overview)
      show_system_overview
      ;;
    diagnosis)
      quick_diagnosis
      ;;
    health-check | health)
      health_check
      ;;
    performance | perf)
      performance_analysis
      ;;
    security | sec)
      security_check
      ;;
    cleanup | clean)
      system_cleanup
      ;;
    report)
      generate_ops_report
      ;;
    service | svc)
      service_management "$@"
      ;;
    logs | log)
      view_logs "$@"
      ;;
    backup | bak)
      backup_management "$@"
      ;;
    help | --help | -h)
      show_help
      ;;
    *)
      print_message "$RED" "未知命令: $cmd"
      show_help
      return 1
      ;;
  esac
}

main "$@"