#!/bin/bash
#
# Verify a K3s cluster deployment and the services expected on it
# (Gitea, ArgoCD, cert-manager, storage).
#
# Every check is counted as passed, failed (critical) or warning
# (non-critical). A summary is printed at the end.
#
# Exit status: 0 when no critical check failed, 1 otherwise.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
CONFIG_FILE="$PROJECT_DIR/config/cluster-vars.yml"

# Source common library if available; otherwise fall back to minimal loggers.
if [ -f "$SCRIPT_DIR/lib/common.sh" ]; then
  # shellcheck source=/dev/null
  source "$SCRIPT_DIR/lib/common.sh"
else
  log() { echo "[INFO] $1"; }
  log_error() { echo "[ERROR] $1" >&2; }
  log_warn() { echo "[WARN] $1"; }
fi

# Counters
TOTAL_CHECKS=0
PASSED_CHECKS=0
FAILED_CHECKS=0
WARNING_CHECKS=0

# Access info shown in the per-service sections and in the final guide.
# Pre-initialised so that `set -u` never trips when a service is absent.
NODE_IP=""
GITEA_NODEPORT=""
ARGOCD_NODEPORT=""

#######################################
# Run a single check and record its result.
# Globals:   TOTAL_CHECKS, PASSED_CHECKS, FAILED_CHECKS, WARNING_CHECKS (written)
# Arguments: $1 - label printed to the user
#            $2 - shell command string, run through eval (output discarded)
#            $3 - "true" (default): failure counts as failed;
#                 anything else: failure counts as a warning
# Returns:   0 if the command succeeded, 1 otherwise
#######################################
check() {
  local name="$1"
  local command="$2"
  local is_critical="${3:-true}"

  TOTAL_CHECKS=$((TOTAL_CHECKS + 1))
  echo -n "检查: $name ... "

  if eval "$command" &>/dev/null; then
    echo "✓ 通过"
    PASSED_CHECKS=$((PASSED_CHECKS + 1))
    return 0
  fi

  if [ "$is_critical" = "true" ]; then
    echo "✗ 失败"
    FAILED_CHECKS=$((FAILED_CHECKS + 1))
  else
    echo "⚠ 警告"
    WARNING_CHECKS=$((WARNING_CHECKS + 1))
  fi
  return 1
}

#######################################
# Same as check(), but never returns non-zero. Use it for stand-alone
# checks: a bare `check ...` that fails would abort the whole script
# under `set -e` before the summary is printed.
# Arguments: same as check()
#######################################
run_check() {
  check "$@" || true
}

#######################################
# Print a three-line banner followed by a blank line.
# Arguments: $1 - banner title
#######################################
section() {
  echo "=========================================="
  echo " $1"
  echo "=========================================="
  echo ""
}

#######################################
# Print a titled block containing the output of a command.
# A failing command is reported as a warning instead of aborting the script.
# Arguments: $1 - title
#            $2 - shell command string, run through eval (output shown)
#######################################
check_detailed() {
  local name="$1"
  local command="$2"

  echo ""
  echo "=========================================="
  echo " $name"
  echo "=========================================="
  eval "$command" || log_warn "命令执行失败: $command"
  echo ""
}

#######################################
# Succeed only when the cluster reports at least one node and every
# node's STATUS column starts with "Ready" (so "Ready,SchedulingDisabled"
# is accepted, "NotReady" and "Unknown" are not).
#######################################
all_nodes_ready() {
  kubectl get nodes --no-headers \
    | awk '$2 !~ /^Ready/ { bad = 1 } END { exit (bad || NR == 0) }'
}

#######################################
# Fill NODE_IP with the first node's ExternalIP, falling back to its
# InternalIP. Leaves NODE_IP empty when neither can be read.
# Globals:   NODE_IP (written)
#######################################
resolve_node_ip() {
  if [ -n "$NODE_IP" ]; then
    return 0
  fi
  NODE_IP=$(kubectl get nodes \
    -o jsonpath='{.items[0].status.addresses[?(@.type=="ExternalIP")].address}' \
    2>/dev/null || true)
  if [ -z "$NODE_IP" ]; then
    NODE_IP=$(kubectl get nodes \
      -o jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' \
      2>/dev/null || true)
  fi
}

log "=== 验证K3s集群部署 ==="
echo ""

section "1. 基础环境检查"

run_check "kubectl命令可用" "command -v kubectl"
run_check "kubectl连接集群" "kubectl cluster-info"
# Single-quoted so the path is expanded (and quoted) at eval time.
run_check "配置文件存在" 'test -f "$CONFIG_FILE"'
# yq is optional: a missing binary is only a warning.
run_check "yq工具可用" "command -v yq" "false"

echo ""
section "2. K3s集群状态"

run_check "所有节点Ready" "all_nodes_ready"
run_check "kube-system命名空间存在" "kubectl get namespace kube-system"
run_check "CoreDNS运行正常" "kubectl get deployment coredns -n kube-system -o jsonpath='{.status.availableReplicas}' | grep -v '^0$'"

check_detailed "节点状态" "kubectl get nodes -o wide"
check_detailed "系统Pod状态" "kubectl get pods -n kube-system"

echo ""
section "3. Gitea服务检查"

# A missing namespace means the service is simply not deployed: warning only.
if check "Gitea命名空间存在" "kubectl get namespace gitea" "false"; then
  if check "Gitea部署存在" "kubectl get deployment gitea -n gitea"; then
    run_check "Gitea Pod运行正常" "kubectl get pods -n gitea -l app.kubernetes.io/name=gitea -o jsonpath='{.items[0].status.phase}' | grep Running"
    run_check "Gitea服务可访问" "kubectl get svc gitea-http -n gitea"

    check_detailed "Gitea服务详情" "kubectl get all -n gitea"

    # Get Gitea access info
    GITEA_NODEPORT=$(kubectl get svc gitea-http -n gitea \
      -o jsonpath='{.spec.ports[0].nodePort}' 2>/dev/null || echo "N/A")
    GITEA_NODEPORT="${GITEA_NODEPORT:-N/A}"
    resolve_node_ip

    echo "Gitea访问信息:"
    echo " URL: http://$NODE_IP:$GITEA_NODEPORT"
    echo ""
  fi
else
  log_warn "Gitea未部署"
fi

echo ""
section "4. ArgoCD服务检查"

if check "ArgoCD命名空间存在" "kubectl get namespace argocd" "false"; then
  if check "ArgoCD Server部署存在" "kubectl get deployment argocd-server -n argocd"; then
    run_check "ArgoCD Server运行正常" "kubectl get pods -n argocd -l app.kubernetes.io/name=argocd-server -o jsonpath='{.items[0].status.phase}' | grep Running"
    run_check "ArgoCD Application Controller运行正常" "kubectl get pods -n argocd -l app.kubernetes.io/name=argocd-application-controller -o jsonpath='{.items[0].status.phase}' | grep Running"
    run_check "ArgoCD Repo Server运行正常" "kubectl get pods -n argocd -l app.kubernetes.io/name=argocd-repo-server -o jsonpath='{.items[0].status.phase}' | grep Running"

    check_detailed "ArgoCD服务详情" "kubectl get all -n argocd"

    # Get ArgoCD access info
    ARGOCD_NODEPORT=$(kubectl get svc argocd-server -n argocd \
      -o jsonpath='{.spec.ports[0].nodePort}' 2>/dev/null || echo "N/A")
    ARGOCD_NODEPORT="${ARGOCD_NODEPORT:-N/A}"
    # Resolve here as well: the Gitea section may have been skipped.
    resolve_node_ip

    echo "ArgoCD访问信息:"
    echo " URL: https://$NODE_IP:$ARGOCD_NODEPORT"
    echo " 用户名: admin"
    echo ""
  fi
else
  log_warn "ArgoCD未部署"
fi

echo ""
section "5. HTTPS证书检查"

if check "cert-manager命名空间存在" "kubectl get namespace cert-manager" "false"; then
  if check "cert-manager部署存在" "kubectl get deployment cert-manager -n cert-manager"; then
    run_check "cert-manager运行正常" "kubectl get pods -n cert-manager -l app=cert-manager -o jsonpath='{.items[0].status.phase}' | grep Running"

    # Check ClusterIssuers
    if kubectl get clusterissuer &>/dev/null; then
      check_detailed "ClusterIssuer状态" "kubectl get clusterissuer"
    fi

    # Check Certificates
    if kubectl get certificate -A &>/dev/null; then
      check_detailed "证书状态" "kubectl get certificate -A"
    fi
  fi
else
  log_warn "cert-manager未部署,HTTPS功能不可用"
fi

echo ""
section "6. GitOps工作流检查"

if kubectl get namespace argocd &>/dev/null; then
  # Check for ArgoCD Applications
  if kubectl get application -n argocd &>/dev/null; then
    # `|| true`: wc still prints a count even if kubectl fails in between.
    APP_COUNT=$(kubectl get application -n argocd --no-headers 2>/dev/null | wc -l) || true
    if [ "$APP_COUNT" -gt 0 ]; then
      run_check "ArgoCD应用已创建" "test $APP_COUNT -gt 0"
      check_detailed "ArgoCD应用状态" "kubectl get application -n argocd"
    else
      run_check "ArgoCD应用已创建" "false" "false"
      log_warn "未找到ArgoCD应用"
    fi
  else
    run_check "ArgoCD应用已创建" "false" "false"
    log_warn "ArgoCD CRD可能未就绪"
  fi
else
  log_warn "ArgoCD未部署,跳过GitOps检查"
fi

echo ""
section "7. 存储检查"

run_check "PersistentVolume存在" "kubectl get pv" "false"
run_check "PersistentVolumeClaim存在" "kubectl get pvc -A" "false"

if kubectl get pvc -A &>/dev/null; then
  check_detailed "存储卷状态" "kubectl get pv,pvc -A"
fi

echo ""
section "验证总结"

echo "总检查项: $TOTAL_CHECKS"
echo "通过: $PASSED_CHECKS ✓"
echo "失败: $FAILED_CHECKS ✗"
echo "警告: $WARNING_CHECKS ⚠"
echo ""

if [ "$FAILED_CHECKS" -ne 0 ]; then
  log_error "发现 $FAILED_CHECKS 个失败项,请检查并修复"
  echo ""
  echo "故障排查建议:"
  echo " 1. 查看Pod日志: kubectl logs <pod-name> -n <namespace>"
  echo " 2. 查看Pod详情: kubectl describe pod <pod-name> -n <namespace>"
  echo " 3. 查看事件: kubectl get events -A --sort-by='.lastTimestamp'"
  echo " 4. 重新部署: ./scripts/deploy-all.sh"
  echo ""
  exit 1
fi

log "✓ 所有关键检查通过!"

if [ "$WARNING_CHECKS" -gt 0 ]; then
  log_warn "存在 $WARNING_CHECKS 个警告项,建议检查"
fi

echo ""
section "快速访问指南"

if [ -n "$NODE_IP" ]; then
  if [ -n "$GITEA_NODEPORT" ] && [ "$GITEA_NODEPORT" != "N/A" ]; then
    echo "Gitea:"
    echo " http://$NODE_IP:$GITEA_NODEPORT"
    echo ""
  fi

  if [ -n "$ARGOCD_NODEPORT" ] && [ "$ARGOCD_NODEPORT" != "N/A" ]; then
    echo "ArgoCD:"
    echo " https://$NODE_IP:$ARGOCD_NODEPORT"
    echo " 用户名: admin"
    echo ""
  fi
fi

echo "常用命令:"
echo " 查看所有Pod: kubectl get pods -A"
echo " 查看节点: kubectl get nodes"
echo " 查看服务: kubectl get svc -A"
echo ""

exit 0