QAUP_Management/deploy/docker/ops-manager.sh

711 lines
23 KiB
Bash
Executable File

#!/bin/bash
# QAUP operations management script: system overview, quick diagnosis, health
# checks, log viewing, cleanup, performance/security checks, backups, service
# control and report generation for the QAUP Docker deployment.
# Abort on the first failing command that is not part of a condition.
set -e
# Absolute directory that contains this script, resolved from $0.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Two directory levels above the script directory. The functions below look up
# helper scripts, .env, ssl/, backup/ and logs/ relative to this path.
# NOTE(review): helper scripts are referenced as $PROJECT_ROOT/docker/*.sh -
# confirm that matches the repository layout this script is installed in.
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# ANSI colour escape sequences passed to print_message; NC resets the colour.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
#######################################
# Print a message wrapped in a colour escape sequence.
# Globals:   NC (read) - colour-reset sequence appended after the message
# Arguments: $1 - colour escape sequence (for example $RED)
#            $2 - message text
# Outputs:   the coloured message followed by a newline on stdout
#######################################
print_message() {
  local color=$1
  local message=$2
  # %b expands backslash escapes in the colour codes only. The message itself
  # is printed verbatim with %s, so backslashes in caller-supplied text
  # (service names, paths, ...) are not turned into control characters.
  printf '%b%s%b\n' "$color" "$message" "$NC"
}
#######################################
# Show the system overview: host details, resource usage, Docker summary,
# status of the four QAUP containers and the state of ports 80/443/8080.
# Globals:   RED, GREEN, YELLOW, BLUE (read)
# Arguments: none
# Outputs:   the overview on stdout
#######################################
show_system_overview() {
  print_message "$GREEN" "========================================="
  print_message "$GREEN" "QAUP 系统概览"
  print_message "$GREEN" "========================================="

  # Basic host information.
  echo ""
  print_message "$BLUE" "系统信息:"
  echo " 主机名: $(hostname)"
  echo " 操作系统: $(uname -s) $(uname -r)"
  echo " 当前时间: $(date)"
  echo " 运行时间: $(uptime -p 2>/dev/null || uptime | awk '{print $3,$4}' | sed 's/,//')"

  # Resource usage.
  echo ""
  print_message "$BLUE" "资源使用:"
  echo " CPU 负载: $(uptime | awk -F'load average:' '{print $2}' | xargs)"
  # The human-readable figures come from `free -h`, but the percentage must be
  # computed from the raw numbers of plain `free`: dividing "781Mi" by "15Gi"
  # compares values in different units and yields a meaningless ratio.
  local mem_human mem_percent
  mem_human=$(free -h | awk 'NR==2{printf "%s/%s", $3, $2}') || mem_human=""
  mem_percent=$(free | awk 'NR==2 && $2 > 0 {printf "%.2f%%", $3*100/$2}') || mem_percent=""
  echo " 内存使用: ${mem_human} (${mem_percent})"
  echo " 磁盘使用: $(df -h . | awk 'NR==2{printf "%s/%s (%s)", $3,$2,$5}')"

  # Docker summary.
  echo ""
  print_message "$BLUE" "Docker 信息:"
  echo " Docker 版本: $(docker --version | cut -d' ' -f3 | sed 's/,//')"
  echo " 运行容器数: $(docker ps -q | wc -l)"
  echo " 总容器数: $(docker ps -aq | wc -l)"
  echo " 镜像数量: $(docker images -q | wc -l)"

  # Container status. One `docker ps` call is parsed for all containers, and
  # names are compared exactly so that e.g. "qaup-app-worker" is never reported
  # as the status of "qaup-app".
  echo ""
  print_message "$BLUE" "服务状态:"
  local ps_table container status
  ps_table=$(docker ps --format $'{{.Names}}\t{{.Status}}') || ps_table=""
  for container in "qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx"; do
    status=$(awk -F'\t' -v name="$container" '$1 == name {print $2; exit}' <<<"$ps_table")
    if [[ -n "$status" ]]; then
      print_message "$GREEN" "$container: $status"
    else
      print_message "$RED" "$container: 未运行"
    fi
  done

  # Listening ports. Fall back to `ss` when netstat is not installed, so a
  # missing tool does not make every port look closed.
  echo ""
  print_message "$BLUE" "网络状态:"
  local listeners port
  listeners=$(netstat -tuln 2>/dev/null || ss -tuln 2>/dev/null || true)
  for port in "80" "443" "8080"; do
    if grep -q ":$port " <<<"$listeners"; then
      print_message "$GREEN" " ✓ 端口 $port: 监听中"
    else
      print_message "$YELLOW" " ⚠ 端口 $port: 未监听"
    fi
  done
  echo ""
}
#######################################
# Run a quick diagnosis: Docker daemon, QAUP containers, disk and memory
# usage, and the front-end / back-end health endpoints.
# Globals:   GREEN, YELLOW, BLUE (read)
# Arguments: none
# Outputs:   the list of detected problems (or an all-clear line) on stdout
# Returns:   0 whether or not problems were found
#######################################
quick_diagnosis() {
  print_message "$BLUE" "执行快速诊断..."
  local issues=()

  # Docker daemon. Use systemd's view where systemctl exists; on hosts without
  # systemd ask the daemon directly instead of always reporting it as down.
  if command -v systemctl >/dev/null 2>&1; then
    if ! systemctl is-active --quiet docker 2>/dev/null; then
      issues+=("Docker 服务未运行")
    fi
  elif ! docker info >/dev/null 2>&1; then
    issues+=("Docker 服务未运行")
  fi

  # Containers: a single `docker ps` call, names compared as whole lines.
  local running_names container
  running_names=$(docker ps --format "{{.Names}}") || running_names=""
  for container in "qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx"; do
    if ! grep -qxF -- "$container" <<<"$running_names"; then
      issues+=("容器 $container 未运行")
    fi
  done

  # Disk usage. `df -P` keeps every filesystem on one line even when the device
  # name is long; the numeric guard avoids a test error on unexpected output.
  local disk_usage
  disk_usage=$(df -P . | awk 'NR==2 {gsub(/%/, "", $5); print $5}') || disk_usage=""
  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && (( disk_usage > 85 )); then
    issues+=("磁盘使用率过高: ${disk_usage}%")
  fi

  # Memory usage, in whole percent.
  local mem_usage
  mem_usage=$(free | awk 'NR==2 && $2 > 0 {printf "%.0f", $3*100/$2}') || mem_usage=""
  if [[ "$mem_usage" =~ ^[0-9]+$ ]] && (( mem_usage > 90 )); then
    issues+=("内存使用率过高: ${mem_usage}%")
  fi

  # HTTP health endpoints.
  if ! curl -f -s http://localhost/health &>/dev/null; then
    issues+=("前端服务无响应")
  fi
  if ! curl -f -s http://localhost:8080/actuator/health &>/dev/null; then
    issues+=("后端服务无响应")
  fi

  # Report.
  echo ""
  if (( ${#issues[@]} == 0 )); then
    print_message "$GREEN" "✓ 系统状态正常,未发现问题"
  else
    print_message "$YELLOW" "发现以下问题:"
    local issue
    for issue in "${issues[@]}"; do
      echo " - $issue"
    done
    echo ""
    print_message "$BLUE" "建议操作:"
    echo " 1. 运行详细检查: $0 health-check"
    echo " 2. 查看系统日志: $0 logs"
    echo " 3. 重启服务: ./deploy.sh restart"
  fi
  echo ""
}
#######################################
# Health check. Delegates to the integration test script when it is
# executable; otherwise checks the four QAUP containers and both HTTP health
# endpoints itself.
# Globals:   PROJECT_ROOT, RED, GREEN, YELLOW, BLUE (read)
# Arguments: none
# Returns:   the integration script's status when it is used; otherwise 0 if
#            every basic check passed and 1 if any failed
#######################################
health_check() {
  print_message "$BLUE" "执行系统健康检查..."

  # Preferred path: the quick mode of the integration test script.
  local integration_script="$PROJECT_ROOT/docker/integration-test.sh"
  if [ -x "$integration_script" ]; then
    "$integration_script" quick
    return
  fi

  print_message "$YELLOW" "集成测试脚本不存在,执行基本健康检查"
  local failures=0

  # Containers must appear in `docker ps` under their exact name.
  local name
  for name in "qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx"; do
    if docker ps --format "{{.Names}}" | grep -q "^${name}$"; then
      print_message "$GREEN" "$name 运行正常"
    else
      print_message "$RED" "$name 未运行"
      failures=$((failures + 1))
    fi
  done

  # Front-end endpoint.
  if curl -f -s http://localhost/health &>/dev/null; then
    print_message "$GREEN" "✓ 前端服务响应正常"
  else
    print_message "$RED" "✗ 前端服务无响应"
    failures=$((failures + 1))
  fi

  # Back-end endpoint.
  if curl -f -s http://localhost:8080/actuator/health &>/dev/null; then
    print_message "$GREEN" "✓ 后端服务响应正常"
  else
    print_message "$RED" "✗ 后端服务无响应"
    failures=$((failures + 1))
  fi

  if [ "$failures" -ne 0 ]; then
    print_message "$RED" "✗ 系统健康检查失败"
    return 1
  fi
  print_message "$GREEN" "✓ 系统健康检查通过"
  return 0
}
#######################################
# Show container logs.
# Globals:   RED, BLUE (read)
# Arguments: $1 - service: all (default), app|application, nginx|web,
#                 postgres|database|db, redis|cache
#            $2 - number of trailing lines to show (default 50)
# Outputs:   for "all", the last lines of every QAUP container; for a single
#            service, the last lines followed by a live stream (docker logs -f)
# Returns:   1 for an unknown service, otherwise the status of the last
#            command run
#######################################
view_logs() {
  local service=${1:-"all"}
  local lines=${2:-50}
  print_message "$BLUE" "查看系统日志 (最近 $lines 行)..."

  local container
  case "$service" in
    all)
      # "title|container|message shown when docker logs fails"
      local sections=(
        "应用日志|qaup-app|应用容器未运行"
        "Nginx 日志|qaup-nginx|Nginx 容器未运行"
        "数据库日志|qaup-postgres|数据库容器未运行"
        "Redis 日志|qaup-redis|Redis 容器未运行"
      )
      local entry title fallback
      for entry in "${sections[@]}"; do
        IFS='|' read -r title container fallback <<<"$entry"
        echo ""
        print_message "$BLUE" "=== $title ==="
        # stderr is deliberately NOT discarded: `docker logs` replays the
        # container's stderr stream on stderr, so silencing it would hide every
        # log line the service wrote there.
        docker logs --tail "$lines" "$container" || echo "$fallback"
      done
      return 0
      ;;
    app | application) container="qaup-app" ;;
    nginx | web) container="qaup-nginx" ;;
    postgres | database | db) container="qaup-postgres" ;;
    redis | cache) container="qaup-redis" ;;
    *)
      print_message "$RED" "未知服务: $service"
      print_message "$BLUE" "可用服务: all, app, nginx, postgres, redis"
      # Report the usage error to the caller instead of exiting successfully.
      return 1
      ;;
  esac

  docker logs --tail "$lines" -f "$container"
}
#######################################
# Interactive cleanup: exited containers, dangling images, unused networks
# and volumes, old logs (via log-manager.sh when executable) and QAUP temp
# files under /tmp.
# Globals:   PROJECT_ROOT, GREEN, YELLOW, BLUE (read)
# Arguments: none; reads a single-key y/N confirmation from stdin
# Outputs:   progress messages on stdout
# Returns:   0; a failing step is reported and the remaining steps still run
#######################################
system_cleanup() {
  print_message "$BLUE" "执行系统清理..."

  # Confirmation. `|| true` keeps a closed stdin from aborting the script under
  # `set -e`; an empty reply is treated as "no".
  read -p "确定要执行系统清理吗? (y/N): " -n 1 -r || true
  echo
  if [[ ! ${REPLY:-} =~ ^[Yy]$ ]]; then
    print_message "$BLUE" "清理操作已取消"
    return 0
  fi

  local cleaned_items=0
  local id
  print_message "$BLUE" "清理 Docker 资源..."

  # Every docker call below is tested in an `if`, so a single failure (for
  # example a dangling image still used by a running container) is reported
  # instead of terminating the whole cleanup through `set -e`.

  # Exited containers.
  local -a stopped_containers=()
  while IFS= read -r id; do
    if [[ -n "$id" ]]; then
      stopped_containers+=("$id")
    fi
  done < <(docker ps -aq --filter "status=exited")
  if (( ${#stopped_containers[@]} > 0 )); then
    if docker rm "${stopped_containers[@]}"; then
      cleaned_items=$((cleaned_items + ${#stopped_containers[@]}))
      print_message "$GREEN" "✓ 已清理停止的容器"
    else
      print_message "$YELLOW" "⚠ 部分停止的容器清理失败"
    fi
  fi

  # Dangling images.
  local -a unused_images=()
  while IFS= read -r id; do
    if [[ -n "$id" ]]; then
      unused_images+=("$id")
    fi
  done < <(docker images -f "dangling=true" -q)
  if (( ${#unused_images[@]} > 0 )); then
    if docker rmi "${unused_images[@]}"; then
      cleaned_items=$((cleaned_items + ${#unused_images[@]}))
      print_message "$GREEN" "✓ 已清理未使用的镜像"
    else
      print_message "$YELLOW" "⚠ 部分未使用的镜像清理失败"
    fi
  fi

  # Unused networks.
  if docker network prune -f &>/dev/null; then
    print_message "$GREEN" "✓ 已清理未使用的网络"
  else
    print_message "$YELLOW" "⚠ 未使用的网络清理失败"
  fi

  # Unused volumes.
  # NOTE(review): this prunes unused volumes host-wide, right after exited
  # containers were removed above. Confirm it cannot delete the data volume of
  # a QAUP container that merely happened to be stopped.
  if docker volume prune -f &>/dev/null; then
    print_message "$GREEN" "✓ 已清理未使用的卷"
  else
    print_message "$YELLOW" "⚠ 未使用的卷清理失败"
  fi

  # Old logs, delegated to the log manager with the argument 30.
  if [ -x "$PROJECT_ROOT/docker/log-manager.sh" ]; then
    if "$PROJECT_ROOT/docker/log-manager.sh" cleanup 30; then
      print_message "$GREEN" "✓ 已清理旧日志文件"
    else
      print_message "$YELLOW" "⚠ 旧日志文件清理失败"
    fi
  fi

  # Temp files matching *qaup* not modified for more than a day (best effort).
  find /tmp -name "*qaup*" -mtime +1 -delete 2>/dev/null || true
  print_message "$GREEN" "✓ 已清理临时文件"

  print_message "$GREEN" "系统清理完成,共清理 $cleaned_items 个项目"
}
#######################################
# Print a performance snapshot: CPU, memory, disk, per-container stats,
# listening sockets, top CPU processes, PostgreSQL and Redis figures.
# Globals:   BLUE (read)
# Arguments: none
# Outputs:   the snapshot on stdout; figures that cannot be obtained from the
#            database or Redis are shown as "N/A"
#######################################
performance_analysis() {
  print_message "$BLUE" "执行性能分析..."

  echo ""
  print_message "$BLUE" "=== 系统资源使用 ==="

  # CPU usage as reported by a single batch iteration of top.
  local cpu_usage
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//') || cpu_usage=""
  echo "CPU 使用率: $cpu_usage"

  # Memory details.
  echo ""
  echo "内存使用详情:"
  free -h

  # Disk usage.
  echo ""
  echo "磁盘使用情况:"
  df -h

  # Per-container resource usage.
  echo ""
  print_message "$BLUE" "=== 容器资源使用 ==="
  docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}"

  # Listening sockets on the service ports.
  echo ""
  print_message "$BLUE" "=== 网络连接状态 ==="
  netstat -tuln | grep -E ":(80|443|8080|5432|6379) " | head -10

  # Processes sorted by CPU usage.
  echo ""
  print_message "$BLUE" "=== 高资源使用进程 ==="
  ps aux --sort=-%cpu | head -10

  # Names of running containers, matched below as whole lines so that a
  # container such as "qaup-postgres-backup" does not count as the database.
  local running_names
  running_names=$(docker ps --format "{{.Names}}") || running_names=""

  # Database figures. A pipeline's exit status is that of its LAST stage, so
  # `docker exec ... | xargs || echo "N/A"` can never reach the fallback; the
  # default is therefore applied to the captured (possibly empty) value.
  echo ""
  print_message "$BLUE" "=== 数据库性能 ==="
  if grep -qx "qaup-postgres" <<<"$running_names"; then
    local db_connections db_size
    db_connections=$(docker exec qaup-postgres psql -U postgres -d qaup -t -c "SELECT count(*) FROM pg_stat_activity;" 2>/dev/null | xargs) || db_connections=""
    echo "数据库连接数: ${db_connections:-N/A}"
    db_size=$(docker exec qaup-postgres psql -U postgres -d qaup -t -c "SELECT pg_size_pretty(pg_database_size('qaup'));" 2>/dev/null | xargs) || db_size=""
    echo "数据库大小: ${db_size:-N/A}"
  else
    echo "数据库容器未运行"
  fi

  # Redis figures, with the same empty-value fallback.
  echo ""
  print_message "$BLUE" "=== Redis 性能 ==="
  if grep -qx "qaup-redis" <<<"$running_names"; then
    local redis_memory redis_keys
    redis_memory=$(docker exec qaup-redis redis-cli info memory 2>/dev/null | grep "^used_memory_human:" | cut -d: -f2 | tr -d '\r') || redis_memory=""
    echo "Redis 内存使用: ${redis_memory:-N/A}"
    redis_keys=$(docker exec qaup-redis redis-cli dbsize 2>/dev/null) || redis_keys=""
    echo "Redis 键数量: ${redis_keys:-N/A}"
  else
    echo "Redis 容器未运行"
  fi
}
#######################################
# Security check: .env permissions, user each QAUP container runs as,
# exposure of the database/cache ports, and SSL key permissions.
# Globals:   PROJECT_ROOT, GREEN, YELLOW, BLUE (read)
# Arguments: none
# Outputs:   per-check results and a summary of findings on stdout
#######################################
security_check() {
  print_message "$BLUE" "执行安全检查..."
  local security_issues=()

  # File permissions: .env should be readable by its owner only.
  echo ""
  print_message "$BLUE" "=== 文件权限检查 ==="
  if [ -f "$PROJECT_ROOT/.env" ]; then
    local env_perms
    env_perms=$(stat -c "%a" "$PROJECT_ROOT/.env") || env_perms=""
    if [ "$env_perms" = "600" ]; then
      print_message "$GREEN" "✓ .env 文件权限正确 ($env_perms)"
    else
      print_message "$YELLOW" "⚠ .env 文件权限不安全 ($env_perms),建议设置为 600"
      security_issues+=("环境文件权限不安全")
    fi
  fi

  # Container users.
  echo ""
  print_message "$BLUE" "=== 容器安全检查 ==="
  local running_names container user
  running_names=$(docker ps --format "{{.Names}}") || running_names=""
  for container in "qaup-postgres" "qaup-redis" "qaup-app" "qaup-nginx"; do
    if ! grep -qxF -- "$container" <<<"$running_names"; then
      continue
    fi
    if user=$(docker exec "$container" whoami 2>/dev/null) && [[ -n "$user" ]]; then
      if [ "$user" = "root" ]; then
        print_message "$YELLOW" "$container 以 root 用户运行"
        security_issues+=("$container 以 root 用户运行")
      else
        print_message "$GREEN" "$container 使用非 root 用户: $user"
      fi
    else
      # whoami failed or is missing from the image: the user is unknown, which
      # must not be presented as a passed non-root check.
      print_message "$YELLOW" "$container 无法确定运行用户"
    fi
  done

  # Network exposure of ports that should stay internal.
  echo ""
  print_message "$BLUE" "=== 网络安全检查 ==="
  local listeners=""
  if command -v netstat >/dev/null 2>&1; then
    listeners=$(netstat -tuln 2>/dev/null) || listeners=""
  elif command -v ss >/dev/null 2>&1; then
    listeners=$(ss -tuln 2>/dev/null) || listeners=""
  fi
  if [[ -z "$listeners" ]]; then
    # Without socket data nothing can be asserted; reporting "not exposed"
    # here would be a false all-clear.
    print_message "$YELLOW" "⚠ 无法获取端口监听信息 (需要 netstat 或 ss)"
    security_issues+=("无法检查敏感端口暴露情况")
  else
    local port wildcard_re
    for port in "5432" "6379"; do
      # Wildcard binds appear as 0.0.0.0:PORT, :::PORT (netstat, IPv6),
      # *:PORT or [::]:PORT (ss). Loopback binds (127.0.0.1, ::1) do not match.
      wildcard_re="(^|[[:space:]])(0\.0\.0\.0|\*|\[::\]|::):${port}([[:space:]]|\$)"
      if grep -Eq -- "$wildcard_re" <<<"$listeners"; then
        print_message "$YELLOW" "⚠ 敏感端口 $port 对外暴露"
        security_issues+=("端口 $port 对外暴露")
      else
        print_message "$GREEN" "✓ 端口 $port 未对外暴露"
      fi
    done
  fi

  # SSL/TLS material.
  if [ -d "$PROJECT_ROOT/ssl" ]; then
    print_message "$GREEN" "✓ SSL 证书目录存在"
    if [ -f "$PROJECT_ROOT/ssl/server.key" ]; then
      local key_perms
      key_perms=$(stat -c "%a" "$PROJECT_ROOT/ssl/server.key") || key_perms=""
      if [ "$key_perms" = "600" ]; then
        print_message "$GREEN" "✓ SSL 私钥权限正确"
      else
        print_message "$YELLOW" "⚠ SSL 私钥权限不安全"
        security_issues+=("SSL 私钥权限不安全")
      fi
    fi
  else
    print_message "$YELLOW" "⚠ 未配置 SSL 证书"
    security_issues+=("未配置 SSL 证书")
  fi

  # Summary.
  echo ""
  if (( ${#security_issues[@]} == 0 )); then
    print_message "$GREEN" "✓ 安全检查通过,未发现安全问题"
  else
    print_message "$YELLOW" "发现以下安全问题:"
    local issue
    for issue in "${security_issues[@]}"; do
      echo " - $issue"
    done
    echo ""
    print_message "$BLUE" "建议操作:"
    echo " 1. 运行安全配置脚本: ./docker/security-setup.sh all"
    echo " 2. 检查防火墙设置: sudo ufw status"
    echo " 3. 更新系统和软件包"
  fi
}
#######################################
# Backup management.
# Globals:   PROJECT_ROOT, RED, YELLOW, BLUE (read)
# Arguments: $1 - action: status (default), create, list or cleanup
# Outputs:   backup status, or the output of backup-restore.sh
# Returns:   1 for an unknown action or when backup-restore.sh is not
#            executable; otherwise the status of the last command run
#######################################
backup_management() {
  local action=${1:-"status"}
  local backup_script="$PROJECT_ROOT/docker/backup-restore.sh"

  # Sub-command handed to backup-restore.sh; stays empty for "status".
  local -a script_args=()
  case "$action" in
    status) ;;
    create)
      print_message "$BLUE" "创建系统备份..."
      script_args=(full)
      ;;
    list)
      script_args=(list)
      ;;
    cleanup)
      print_message "$BLUE" "清理旧备份..."
      script_args=(cleanup 30)
      ;;
    *)
      print_message "$RED" "未知备份操作: $action"
      echo "可用操作: status, create, list, cleanup"
      # Usage errors must not look like success to the caller.
      return 1
      ;;
  esac

  if (( ${#script_args[@]} > 0 )); then
    if [ ! -x "$backup_script" ]; then
      print_message "$RED" "备份脚本不存在"
      return 1
    fi
    "$backup_script" "${script_args[@]}"
    return
  fi

  # --- status ---
  print_message "$BLUE" "备份状态:"
  local backup_dir="$PROJECT_ROOT/backup"
  if [ ! -d "$backup_dir" ]; then
    print_message "$YELLOW" " 备份目录不存在"
    return 0
  fi

  # Count the archives and pick the most recently modified one in a single
  # NUL-delimited pass; this avoids the GNU-only `find -printf` and is safe for
  # unusual file names.
  local backup_count=0 latest_backup="" candidate
  while IFS= read -r -d '' candidate; do
    backup_count=$((backup_count + 1))
    if [[ -z "$latest_backup" || "$candidate" -nt "$latest_backup" ]]; then
      latest_backup=$candidate
    fi
  done < <(find "$backup_dir" -name "qaup_full_backup_*.tar.gz" -type f -print0)

  echo " 备份文件数量: $backup_count"
  if (( backup_count == 0 )); then
    print_message "$YELLOW" " 未找到备份文件"
    return 0
  fi

  local latest_name backup_date backup_size
  latest_name=$(basename "$latest_backup")
  # The timestamp is taken from the file name only, so digits in a parent
  # directory name cannot be mistaken for it.
  backup_date=$(grep -o '[0-9]\{8\}_[0-9]\{6\}' <<<"$latest_name" | head -n 1 | sed 's/_/ /') || backup_date=""
  backup_size=$(du -h "$latest_backup" | cut -f1) || backup_size=""
  echo " 最新备份: $latest_name"
  echo " 备份时间: $backup_date"
  echo " 备份大小: $backup_size"
}
#######################################
# Start, stop, restart or list QAUP services.
# Globals:   PROJECT_ROOT, RED, BLUE (read)
# Arguments: $1 - action: start, stop, restart or status (default)
#            $2 - service: "all" (default) or the suffix of a container name;
#                 "app" addresses the container "qaup-app"
# Outputs:   progress message plus the output of deploy.sh / docker
# Returns:   1 for an unknown action; otherwise the status of the command run
#######################################
service_management() {
  local action=${1:-"status"}
  local service=${2:-"all"}

  local label
  case "$action" in
    start) label="启动服务" ;;
    stop) label="停止服务" ;;
    restart) label="重启服务" ;;
    status)
      print_message "$BLUE" "服务状态:"
      docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" --filter "name=qaup-"
      return
      ;;
    *)
      print_message "$RED" "未知服务操作: $action"
      echo "可用操作: start, stop, restart, status"
      # An unknown action is a usage error and is reported as a failure, in
      # line with how the script's main dispatcher treats unknown commands.
      return 1
      ;;
  esac

  # start / stop / restart share one code path: the action word is passed
  # through to deploy.sh (whole stack) or to docker (single container).
  print_message "$BLUE" "${label}: $service"
  if [ "$service" = "all" ]; then
    "$PROJECT_ROOT/deploy.sh" "$action"
  else
    docker "$action" "qaup-$service"
  fi
}
#######################################
# Write a Markdown operations report to
# $PROJECT_ROOT/logs/ops_report_<timestamp>.md.
# Globals:   PROJECT_ROOT, GREEN, BLUE (read)
# Arguments: none
# Outputs:   progress messages on stdout; the report file on disk
#######################################
generate_ops_report() {
  local report_dir="$PROJECT_ROOT/logs"
  local report_file="$report_dir/ops_report_$(date +%Y%m%d_%H%M%S).md"
  print_message "$BLUE" "生成运维报告: $report_file"

  # The redirection below fails if the logs directory does not exist yet.
  mkdir -p "$report_dir"

  # Markdown code fence. It MUST be single-quoted: inside double quotes a
  # backtick starts a command substitution, so a double-quoted fence swallows
  # the following lines up to the next fence and neither the fence nor the
  # command output between the fences ever reaches the report.
  local fence='```'

  # Human-readable figures from `free -h`, percentage from the raw numbers of
  # plain `free` (the -h values may carry different unit suffixes).
  local mem_human mem_percent
  mem_human=$(free -h | awk 'NR==2{printf "%s/%s", $3, $2}') || mem_human=""
  mem_percent=$(free | awk 'NR==2 && $2 > 0 {printf "%.2f%%", $3*100/$2}') || mem_percent=""

  {
    echo "# QAUP 系统运维报告"
    echo ""
    echo "生成时间: $(date)"
    echo "报告类型: 系统状态报告"
    echo ""
    echo "## 系统概览"
    echo ""
    echo "- 主机名: $(hostname)"
    echo "- 操作系统: $(uname -s) $(uname -r)"
    echo "- 运行时间: $(uptime -p 2>/dev/null || uptime | awk '{print $3,$4}' | sed 's/,//')"
    echo "- Docker 版本: $(docker --version | cut -d' ' -f3 | sed 's/,//')"
    echo ""
    echo "## 资源使用"
    echo ""
    echo "- CPU 负载: $(uptime | awk -F'load average:' '{print $2}' | xargs)"
    echo "- 内存使用: ${mem_human} (${mem_percent})"
    echo "- 磁盘使用: $(df -h . | awk 'NR==2{printf "%s/%s (%s)", $3,$2,$5}')"
    echo ""
    echo "## 服务状态"
    echo ""
    # `|| true`: a failing data source leaves its section empty rather than
    # aborting the script with a half-written report.
    echo "$fence"
    docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" --filter "name=qaup-" || true
    echo "$fence"
    echo ""
    echo "## 容器资源使用"
    echo ""
    echo "$fence"
    docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" || true
    echo "$fence"
    echo ""
    echo "## 网络状态"
    echo ""
    echo "$fence"
    netstat -tuln | grep -E ":(80|443|8080|5432|6379) " | head -10 || true
    echo "$fence"
    echo ""
    echo "## 备份状态"
    echo ""
    if [ -d "$PROJECT_ROOT/backup" ]; then
      local backup_count
      backup_count=$(find "$PROJECT_ROOT/backup" -name "qaup_full_backup_*.tar.gz" | wc -l)
      echo "- 备份文件数量: $backup_count"
      if [ "$backup_count" -gt 0 ]; then
        local latest_backup backup_date backup_size
        latest_backup=$(find "$PROJECT_ROOT/backup" -name "qaup_full_backup_*.tar.gz" -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -d' ' -f2-)
        backup_date=$(echo "$latest_backup" | grep -o '[0-9]\{8\}_[0-9]\{6\}' | sed 's/_/ /') || backup_date=""
        backup_size=$(du -h "$latest_backup" | cut -f1) || backup_size=""
        echo "- 最新备份: $(basename "$latest_backup")"
        echo "- 备份时间: $backup_date"
        echo "- 备份大小: $backup_size"
      fi
    else
      echo "- 备份目录不存在"
    fi
    echo ""
    echo "## 建议操作"
    echo ""
    echo "- 定期检查系统资源使用情况"
    echo "- 定期执行系统备份"
    echo "- 监控服务健康状态"
    echo "- 清理旧日志和临时文件"
    echo "- 更新系统和软件包"
    echo ""
  } > "$report_file"

  print_message "$GREEN" "运维报告已生成: $report_file"
}
#######################################
# Print the usage text: command groups, their arguments and examples.
# Arguments: none ($0 is interpolated into the usage and example lines)
# Outputs:   the help text on stdout
#######################################
show_help() {
  # One argument per output line; printf repeats the format for each of them.
  printf '%s\n' \
    "QAUP 运维管理脚本" \
    "" \
    "用法: $0 [命令] [参数]" \
    "" \
    "系统管理命令:" \
    " overview 显示系统概览" \
    " diagnosis 快速诊断" \
    " health-check 健康检查" \
    " performance 性能分析" \
    " security 安全检查" \
    " cleanup 系统清理" \
    " report 生成运维报告" \
    "" \
    "服务管理命令:" \
    " service <action> [service] 服务管理" \
    " actions: start, stop, restart, status" \
    " services: all, app, nginx, postgres, redis" \
    "" \
    "日志管理命令:" \
    " logs [service] [lines] 查看日志" \
    " services: all, app, nginx, postgres, redis" \
    "" \
    "备份管理命令:" \
    " backup <action> 备份管理" \
    " actions: status, create, list, cleanup" \
    "" \
    "示例:" \
    " $0 overview # 显示系统概览" \
    " $0 diagnosis # 快速诊断" \
    " $0 service restart app # 重启应用服务" \
    " $0 logs app 100 # 查看应用最近100行日志" \
    " $0 backup create # 创建系统备份" \
    " $0 cleanup # 系统清理"
}
#######################################
# Command dispatcher.
# Globals:   RED (read)
# Arguments: $1 - command (or one of its aliases); remaining arguments are
#            forwarded to the service, logs and backup handlers only
# Behaviour: with no arguments shows the system overview and exits 0; an
#            unknown command prints an error plus the help text and exits 1
#######################################
main() {
  # No command given: default to the overview.
  if (( $# == 0 )); then
    show_system_overview
    exit 0
  fi

  local subcommand=$1
  shift

  # Resolve the command (or alias) to its handler; only some handlers receive
  # the remaining command-line arguments.
  local handler
  local forward_args=false
  case "$subcommand" in
    overview) handler=show_system_overview ;;
    diagnosis) handler=quick_diagnosis ;;
    health-check | health) handler=health_check ;;
    performance | perf) handler=performance_analysis ;;
    security | sec) handler=security_check ;;
    cleanup | clean) handler=system_cleanup ;;
    report) handler=generate_ops_report ;;
    help | --help | -h) handler=show_help ;;
    service | svc)
      handler=service_management
      forward_args=true
      ;;
    logs | log)
      handler=view_logs
      forward_args=true
      ;;
    backup | bak)
      handler=backup_management
      forward_args=true
      ;;
    *)
      print_message "$RED" "未知命令: $subcommand"
      show_help
      exit 1
      ;;
  esac

  if [[ "$forward_args" == true ]]; then
    "$handler" "$@"
  else
    "$handler"
  fi
}
# Script entry point: forward all command-line arguments to main.
main "$@"