# K8s Service Discovery Bug: Final Fix (with Debug Logging)

## Table of Contents

- [1. Fix Overview](#1-fix-overview)
- [2. Complete Fixed Code](#2-complete-fixed-code)
- [3. Debug Logging](#3-debug-logging)
- [4. Test Verification](#4-test-verification)
- [5. Troubleshooting Guide](#5-troubleshooting-guide)

---

## 1. Fix Overview

### 1.1 What the Fix Includes

Based on the lessons learned from previous fix attempts, this fix covers the following:

1. ✅ **Watch the correct resource type**: switch from Pod to Endpoints
2. ✅ **GetConn uses DNS**: prevents connections from being force-closed (critical!)
3. ✅ **GetConns uses Endpoints**: supports load balancing and automatic updates
4. ✅ **Delayed closing of old connections**: avoids failing in-flight requests
5. ✅ **Health checks**: ensures connections are still valid
6. ✅ **KeepAlive**: enables automatic reconnection
7. ✅ **Detailed debug logging**: makes troubleshooting easier

### 1.2 Core Principles

- **GetConn → DNS**: prevents connections from being force-closed, which previously caused message sync and push failures
- **GetConns → Endpoints**: supports load balancing and automatic connection updates
- **Delayed close**: gives in-flight requests time to complete
- **Detailed logging**: records every key operation for easier debugging

---

## 2. Complete Fixed Code

### 2.1 The Complete Fixed File

```go
package kubernetes

import (
	"context"
	"fmt"
	"log"
	"os"
	"sync"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/connectivity"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/keepalive"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/cache"
)

type KubernetesConnManager struct {
	clientset   *kubernetes.Clientset
	namespace   string
	dialOptions []grpc.DialOption

	rpcTargets map[string]string
	selfTarget string

	mu      sync.RWMutex
	connMap map[string][]*grpc.ClientConn
}

// NewKubernetesConnManager creates a new connection manager that uses Kubernetes services for service discovery.
func NewKubernetesConnManager(namespace string, options ...grpc.DialOption) (*KubernetesConnManager, error) {
	log.Printf("[K8s Discovery] Initializing Kubernetes connection manager, namespace: %s", namespace)

	// Build the in-cluster configuration.
	config, err := rest.InClusterConfig()
	if err != nil {
		log.Printf("[K8s Discovery] ERROR: Failed to create in-cluster config: %v", err)
		return nil, fmt.Errorf("failed to create in-cluster config: %v", err)
	}
	log.Printf("[K8s Discovery] Successfully created in-cluster config")

	// Create the Kubernetes API client.
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		log.Printf("[K8s Discovery] ERROR: Failed to create clientset: %v", err)
		return nil, fmt.Errorf("failed to create clientset: %v", err)
	}
	log.Printf("[K8s Discovery] Successfully created clientset")

	// Initialize the connection manager.
	k := &KubernetesConnManager{
		clientset:   clientset,
		namespace:   namespace,
		dialOptions: options,
		connMap:     make(map[string][]*grpc.ClientConn),
		rpcTargets:  make(map[string]string),
	}

	// Start a background goroutine that watches Endpoints changes.
	log.Printf("[K8s Discovery] Starting Endpoints watcher")
	go k.watchEndpoints()

	log.Printf("[K8s Discovery] Kubernetes connection manager initialized successfully")
	return k, nil
}

// initializeConns (re)creates all gRPC connections for the given service.
func (k *KubernetesConnManager) initializeConns(serviceName string) error {
	log.Printf("[K8s Discovery] [%s] Starting to initialize connections", serviceName)

	// Step 1: resolve the Service's RPC port.
	port, err := k.getServicePort(serviceName)
	if err != nil {
		log.Printf("[K8s Discovery] [%s] ERROR: Failed to get service port: %v", serviceName, err)
		return fmt.Errorf("failed to get service port: %w", err)
	}
	log.Printf("[K8s Discovery] [%s] Service port: %d", serviceName, port)

	// Step 2: fetch the Endpoints object that backs the Service.
	endpoints, err := k.clientset.CoreV1().Endpoints(k.namespace).Get(
		context.Background(),
		serviceName,
		metav1.GetOptions{},
	)
	if err != nil {
		log.Printf("[K8s Discovery] [%s] ERROR: Failed to get endpoints: %v", serviceName, err)
		return fmt.Errorf("failed to get endpoints for service %s: %w", serviceName, err)
	}

	// Count the endpoint addresses.
	var totalAddresses int
	for _, subset := range endpoints.Subsets {
		totalAddresses += len(subset.Addresses)
	}
	log.Printf("[K8s Discovery] [%s] Found %d endpoint addresses", serviceName, totalAddresses)
	// Step 3: create one gRPC connection per Pod IP.
	var newConns []*grpc.ClientConn
	var newTargets []string
	var failedTargets []string

	for _, subset := range endpoints.Subsets {
		for _, address := range subset.Addresses {
			target := fmt.Sprintf("%s:%d", address.IP, port)
			log.Printf("[K8s Discovery] [%s] Creating connection to %s", serviceName, target)

			// Dial the Pod directly, with KeepAlive configured.
			conn, err := grpc.Dial(
				target,
				append(k.dialOptions,
					grpc.WithTransportCredentials(insecure.NewCredentials()),
					grpc.WithKeepaliveParams(keepalive.ClientParameters{
						Time:                10 * time.Second,
						Timeout:             3 * time.Second,
						PermitWithoutStream: true,
					}),
				)...,
			)
			if err != nil {
				log.Printf("[K8s Discovery] [%s] ERROR: Failed to dial %s: %v", serviceName, target, err)
				failedTargets = append(failedTargets, target)
				// On failure, close the connections created so far and return the error.
				for _, c := range newConns {
					_ = c.Close()
				}
				return fmt.Errorf("failed to dial endpoint %s: %w", target, err)
			}

			// Log the initial connection state.
			state := conn.GetState()
			log.Printf("[K8s Discovery] [%s] Connection to %s created, state: %v", serviceName, target, state)

			newConns = append(newConns, conn)
			newTargets = append(newTargets, target)
		}
	}

	if len(failedTargets) > 0 {
		log.Printf("[K8s Discovery] [%s] WARNING: Failed to connect to %d targets: %v",
			serviceName, len(failedTargets), failedTargets)
	}
	log.Printf("[K8s Discovery] [%s] Successfully created %d connections", serviceName, len(newConns))

	// Step 4: look up the old connections so they can be closed later.
	k.mu.Lock()
	oldConns, exists := k.connMap[serviceName]
	var oldConnCount int
	if exists {
		oldConnCount = len(oldConns)
		log.Printf("[K8s Discovery] [%s] Found %d old connections to close", serviceName, oldConnCount)
	}

	// Step 5: swap in the new connections immediately so new requests use them.
	k.connMap[serviceName] = newConns
	k.mu.Unlock()
	log.Printf("[K8s Discovery] [%s] Connection map updated: %d old -> %d new", serviceName, oldConnCount, len(newConns))

	// Step 6: close the old connections after a delay so in-flight requests can finish.
	if exists && len(oldConns) > 0 {
		log.Printf("[K8s Discovery] [%s] Scheduling delayed close for %d old connections (5 seconds delay)",
			serviceName, len(oldConns))
		go func() {
			// Wait 5 seconds for in-flight requests to complete.
			time.Sleep(5 * time.Second)
			log.Printf("[K8s Discovery] [%s] Closing %d old connections", serviceName, len(oldConns))
			closedCount := 0
			for _, oldConn := range oldConns {
				if err := oldConn.Close(); err != nil {
					log.Printf("[K8s Discovery] [%s] ERROR: Failed to close old connection: %v", serviceName, err)
				} else {
					closedCount++
				}
			}
			log.Printf("[K8s Discovery] [%s] Closed %d/%d old connections", serviceName, closedCount, len(oldConns))
		}()
	}

	log.Printf("[K8s Discovery] [%s] Connection initialization completed successfully", serviceName)
	return nil
}

// GetConns returns gRPC client connections for a given Kubernetes service name.
func (k *KubernetesConnManager) GetConns(ctx context.Context, serviceName string, opts ...grpc.DialOption) ([]*grpc.ClientConn, error) {
	log.Printf("[K8s Discovery] [%s] GetConns called", serviceName)

	// Step 1: first cache lookup (read lock).
	k.mu.RLock()
	conns, exists := k.connMap[serviceName]
	k.mu.RUnlock()

	// Step 2: if cached connections exist, check their health.
	if exists {
		log.Printf("[K8s Discovery] [%s] Found %d connections in cache, checking health", serviceName, len(conns))

		// Filter out unhealthy connections.
		validConns := k.filterValidConns(serviceName, conns)

		// If any valid connections remain, update the cache and return them.
		if len(validConns) > 0 {
			// If some connections were dropped, update the cache.
			if len(validConns) < len(conns) {
				log.Printf("[K8s Discovery] [%s] Removed %d invalid connections, %d valid connections remaining",
					serviceName, len(conns)-len(validConns), len(validConns))
				k.mu.Lock()
				k.connMap[serviceName] = validConns
				k.mu.Unlock()
			} else {
				log.Printf("[K8s Discovery] [%s] All %d connections are healthy", serviceName, len(validConns))
			}
			return validConns, nil
		}

		// All connections are invalid: clear the cache and reinitialize.
		log.Printf("[K8s Discovery] [%s] All connections are invalid, clearing cache and reinitializing", serviceName)
		k.mu.Lock()
		delete(k.connMap, serviceName)
		k.mu.Unlock()
	} else {
		log.Printf("[K8s Discovery] [%s] No connections in cache, initializing", serviceName)
	}

	// Step 3: nothing cached (or everything invalid), so initialize new connections.
	k.mu.Lock()
	// Double-check after acquiring the write lock to avoid duplicate initialization.
	conns, exists = k.connMap[serviceName]
	if exists {
		log.Printf("[K8s Discovery] [%s] Connections were initialized by another goroutine", serviceName)
		k.mu.Unlock()
		return conns, nil
	}
	k.mu.Unlock()

	// Initialize new connections.
	log.Printf("[K8s Discovery] [%s] Initializing new connections", serviceName)
	if err := k.initializeConns(serviceName); err != nil {
		log.Printf("[K8s Discovery] [%s] ERROR: Failed to initialize connections: %v", serviceName, err)
		return nil, fmt.Errorf("failed to initialize connections for service %s: %w", serviceName, err)
	}

	// Return the freshly initialized connections.
	k.mu.RLock()
	conns = k.connMap[serviceName]
	k.mu.RUnlock()
	log.Printf("[K8s Discovery] [%s] Returning %d connections", serviceName, len(conns))
	return conns, nil
}

// filterValidConns returns only the connections that are still usable.
func (k *KubernetesConnManager) filterValidConns(serviceName string, conns []*grpc.ClientConn) []*grpc.ClientConn {
	validConns := make([]*grpc.ClientConn, 0, len(conns))
	invalidStates := make(map[connectivity.State]int)

	for i, conn := range conns {
		state := conn.GetState()
		// Keep only connections in the Ready or Idle state.
		if state == connectivity.Ready || state == connectivity.Idle {
			validConns = append(validConns, conn)
		} else {
			invalidStates[state]++
			log.Printf("[K8s Discovery] [%s] Connection #%d is invalid, state: %v, closing", serviceName, i, state)
			// The connection is no longer usable; close it.
			if err := conn.Close(); err != nil {
				log.Printf("[K8s Discovery] [%s] ERROR: Failed to close invalid connection #%d: %v", serviceName, i, err)
			}
		}
	}

	if len(invalidStates) > 0 {
		log.Printf("[K8s Discovery] [%s] Invalid connection states: %v", serviceName, invalidStates)
	}
	return validConns
}

// GetConn returns a single gRPC client connection for a given Kubernetes service name.
// Important: GetConn dials the Service's DNS name, so the connection is never force-closed by this manager.
// Rationale:
//  1. The connection returned by GetConn may be reused for a long time.
//  2. If it were dialed against an Endpoints address, it would be closed whenever the Endpoints refresh.
//  3. That caused in-flight requests to fail with: grpc: the client connection is closing.
//  4. With DNS, the gRPC client manages the connection itself and is unaffected by Endpoints refreshes.
func (k *KubernetesConnManager) GetConn(ctx context.Context, serviceName string, opts ...grpc.DialOption) (*grpc.ClientConn, error) {
	log.Printf("[K8s Discovery] [%s] GetConn called (using DNS)", serviceName)

	var target string

	// Check whether a custom target was configured for this service.
	if k.rpcTargets[serviceName] == "" {
		// Resolve the Service port.
		svcPort, err := k.getServicePort(serviceName)
		if err != nil {
			log.Printf("[K8s Discovery] [%s] ERROR: Failed to get service port: %v", serviceName, err)
			return nil, err
		}

		// Build the Kubernetes DNS name.
		// Format: <service>.<namespace>.svc.cluster.local:<port>
		// Cluster DNS resolves this name to the Service, and kube-proxy distributes
		// connections across the backing Pods.
		target = fmt.Sprintf("%s.%s.svc.cluster.local:%d", serviceName, k.namespace, svcPort)
		log.Printf("[K8s Discovery] [%s] Using DNS target: %s", serviceName, target)
	} else {
		// Use the custom target if one is configured.
		target = k.rpcTargets[serviceName]
		log.Printf("[K8s Discovery] [%s] Using custom target: %s", serviceName, target)
	}

	// Dial the target.
	log.Printf("[K8s Discovery] [%s] Dialing DNS target: %s", serviceName, target)
	conn, err := grpc.DialContext(
		ctx,
		target,
		append([]grpc.DialOption{
			grpc.WithTransportCredentials(insecure.NewCredentials()),
			grpc.WithDefaultCallOptions(
				grpc.MaxCallRecvMsgSize(1024*1024*10), // max receive message size: 10MB
				grpc.MaxCallSendMsgSize(1024*1024*20), // max send message size: 20MB
			),
			// Configure KeepAlive.
			grpc.WithKeepaliveParams(keepalive.ClientParameters{
				Time:                10 * time.Second,
				Timeout:             3 * time.Second,
				PermitWithoutStream: true,
			}),
		}, k.dialOptions...)...,
	)
	if err != nil {
		log.Printf("[K8s Discovery] [%s] ERROR: Failed to dial DNS target %s: %v", serviceName, target, err)
		return nil, err
	}

	// Log the initial connection state.
	state := conn.GetState()
	log.Printf("[K8s Discovery] [%s] Connection created, state: %v", serviceName, state)

	// If the connection is not Ready yet, wait briefly for a state change.
	if state != connectivity.Ready {
		log.Printf("[K8s Discovery] [%s] Connection not ready, waiting for state change", serviceName)
		ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
		defer cancel()
		if conn.WaitForStateChange(ctx, state) {
			newState := conn.GetState()
			log.Printf("[K8s Discovery] [%s] Connection state changed: %v -> %v", serviceName, state, newState)
		} else {
			log.Printf("[K8s Discovery] [%s] WARNING: Connection state change timeout", serviceName)
		}
	}

	log.Printf("[K8s Discovery] [%s] GetConn completed successfully", serviceName)
	return conn, nil
}

// watchEndpoints watches Endpoints resources for changes.
func (k *KubernetesConnManager) watchEndpoints() {
	log.Printf("[K8s Discovery] Starting Endpoints watcher")

	// Create the Informer factory.
	// resyncPeriod: 10 minutes, to periodically re-sync resources.
	informerFactory := informers.NewSharedInformerFactory(k.clientset, time.Minute*10)

	// Create the Endpoints Informer.
	// Note: this is where the original bug was fixed --
	// the code used to watch Pods, now it watches Endpoints.
	informer := informerFactory.Core().V1().Endpoints().Informer()
	log.Printf("[K8s Discovery] Endpoints Informer created")

	// Register the event handlers.
	informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		// AddFunc fires when a new Endpoints resource is created.
		AddFunc: func(obj interface{}) {
			endpoint := obj.(*v1.Endpoints)
			log.Printf("[K8s Discovery] [Watcher] Endpoints ADDED: %s", endpoint.Name)
			k.handleEndpointChange(obj)
		},
		// UpdateFunc fires when an Endpoints resource is updated.
		UpdateFunc: func(oldObj, newObj interface{}) {
			oldEndpoint := oldObj.(*v1.Endpoints)
			newEndpoint := newObj.(*v1.Endpoints)
			// Only act when the address set actually changed.
			if k.endpointsChanged(oldEndpoint, newEndpoint) {
				log.Printf("[K8s Discovery] [Watcher] Endpoints UPDATED: %s", newEndpoint.Name)
				k.handleEndpointChange(newObj)
			} else {
				log.Printf("[K8s Discovery] [Watcher] Endpoints %s updated but no meaningful change", newEndpoint.Name)
			}
		},
		// DeleteFunc fires when an Endpoints resource is deleted.
		DeleteFunc: func(obj interface{}) {
			endpoint := obj.(*v1.Endpoints)
			log.Printf("[K8s Discovery] [Watcher] Endpoints DELETED: %s", endpoint.Name)
			k.handleEndpointChange(obj)
		},
	})

	// Start the Informer.
	log.Printf("[K8s Discovery] Starting Informer factory")
	informerFactory.Start(context.Background().Done())

	// Wait for the Informer cache to sync.
	log.Printf("[K8s Discovery] Waiting for Informer cache to sync")
	if !cache.WaitForCacheSync(context.Background().Done(), informer.HasSynced) {
		log.Printf("[K8s Discovery] ERROR: Failed to sync Informer cache")
		return
	}
	log.Printf("[K8s Discovery] Informer cache synced successfully")

	// Block until the program exits.
	log.Printf("[K8s Discovery] Endpoints watcher is running")
	<-context.Background().Done()
	log.Printf("[K8s Discovery] Endpoints watcher stopped")
}

// endpointsChanged reports whether the address set of an Endpoints resource actually changed.
func (k *KubernetesConnManager) endpointsChanged(old, new *v1.Endpoints) bool {
	// Collect the old addresses.
	oldAddresses := make(map[string]bool)
	for _, subset := range old.Subsets {
		for _, address := range subset.Addresses {
			oldAddresses[address.IP] = true
		}
	}

	// Collect the new addresses.
	newAddresses := make(map[string]bool)
	for _, subset := range new.Subsets {
		for _, address := range subset.Addresses {
			newAddresses[address.IP] = true
		}
	}

	// Compare the counts.
	if len(oldAddresses) != len(newAddresses) {
		return true
	}

	// Compare the contents.
	for ip := range oldAddresses {
		if !newAddresses[ip] {
			return true
		}
	}
	return false
}

// handleEndpointChange reacts to an Endpoints change event.
func (k *KubernetesConnManager) handleEndpointChange(obj interface{}) {
	// Type assertion.
	endpoint, ok := obj.(*v1.Endpoints)
	if !ok {
		// Unexpected type: log it but keep the watcher running.
		log.Printf("[K8s Discovery] [Watcher] ERROR: Expected *v1.Endpoints, got %T", obj)
		return
	}

	serviceName := endpoint.Name
	log.Printf("[K8s Discovery] [Watcher] Handling Endpoints change for service: %s", serviceName)

	// Log the endpoint address count.
	var totalAddresses int
	for _, subset := range endpoint.Subsets {
		totalAddresses += len(subset.Addresses)
	}
	log.Printf("[K8s Discovery] [Watcher] Service %s has %d endpoint addresses", serviceName, totalAddresses)

	// Re-initialize the connections for this service.
	if err := k.initializeConns(serviceName); err != nil {
		// Initialization failed: log the error but keep the watcher running.
		log.Printf("[K8s Discovery] [Watcher] ERROR: Failed to initialize connections for %s: %v", serviceName, err)
	} else {
		log.Printf("[K8s Discovery] [Watcher] Successfully updated connections for %s", serviceName)
	}
}

// getServicePort returns the RPC port of a Service.
func (k *KubernetesConnManager) getServicePort(serviceName string) (int32, error) {
	log.Printf("[K8s Discovery] [%s] Getting service port", serviceName)

	svc, err := k.clientset.CoreV1().Services(k.namespace).Get(
		context.Background(),
		serviceName,
		metav1.GetOptions{},
	)
	if err != nil {
		log.Printf("[K8s Discovery] [%s] ERROR: Failed to get service: %v", serviceName, err)
		return 0, fmt.Errorf("failed to get service %s: %w", serviceName, err)
	}

	if len(svc.Spec.Ports) == 0 {
		log.Printf("[K8s Discovery] [%s] ERROR: Service has no ports defined", serviceName)
		return 0, fmt.Errorf("service %s has no ports defined", serviceName)
	}

	// Pick the RPC port (any port other than 10001).
	var svcPort int32
	for _, port := range svc.Spec.Ports {
		if port.Port != 10001 {
			svcPort = port.Port
			break
		}
	}
	if svcPort == 0 {
		log.Printf("[K8s Discovery] [%s] ERROR: Service has no RPC port (all ports are 10001)", serviceName)
		return 0, fmt.Errorf("service %s has no RPC port (all ports are 10001)", serviceName)
	}

	log.Printf("[K8s Discovery] [%s] Service port: %d", serviceName, svcPort)
	return svcPort, nil
}
// Close closes all cached connections.
func (k *KubernetesConnManager) Close() {
	log.Printf("[K8s Discovery] Closing all connections")
	k.mu.Lock()
	defer k.mu.Unlock()

	totalConns := 0
	for serviceName, conns := range k.connMap {
		log.Printf("[K8s Discovery] Closing %d connections for service %s", len(conns), serviceName)
		for i, conn := range conns {
			if err := conn.Close(); err != nil {
				log.Printf("[K8s Discovery] ERROR: Failed to close connection #%d for service %s: %v", i, serviceName, err)
			}
		}
		totalConns += len(conns)
	}
	log.Printf("[K8s Discovery] Closed %d total connections", totalConns)
	k.connMap = make(map[string][]*grpc.ClientConn)
}

// GetSelfConnTarget returns the connection target for the current service.
func (k *KubernetesConnManager) GetSelfConnTarget() string {
	if k.selfTarget == "" {
		hostName := os.Getenv("HOSTNAME")
		log.Printf("[K8s Discovery] Getting self connection target, HOSTNAME: %s", hostName)

		pod, err := k.clientset.CoreV1().Pods(k.namespace).Get(context.Background(), hostName, metav1.GetOptions{})
		if err != nil {
			log.Printf("[K8s Discovery] ERROR: Failed to get pod %s: %v", hostName, err)
		}
		// Wait until the Pod has been assigned an IP (the nil check guards against a failed Get).
		for pod == nil || pod.Status.PodIP == "" {
			log.Printf("[K8s Discovery] Waiting for pod %s IP to be assigned", hostName)
			pod, err = k.clientset.CoreV1().Pods(k.namespace).Get(context.TODO(), hostName, metav1.GetOptions{})
			if err != nil {
				log.Printf("[K8s Discovery] ERROR: Failed to get pod: %v", err)
			}
			time.Sleep(3 * time.Second)
		}

		// Pick the container's RPC port (any port other than 10001).
		var selfPort int32
		for _, port := range pod.Spec.Containers[0].Ports {
			if port.ContainerPort != 10001 {
				selfPort = port.ContainerPort
				break
			}
		}

		k.selfTarget = fmt.Sprintf("%s:%d", pod.Status.PodIP, selfPort)
		log.Printf("[K8s Discovery] Self connection target: %s", k.selfTarget)
	}
	return k.selfTarget
}

// AddOption appends gRPC dial options to the existing options.
func (k *KubernetesConnManager) AddOption(opts ...grpc.DialOption) {
	k.mu.Lock()
	defer k.mu.Unlock()
	k.dialOptions = append(k.dialOptions, opts...)
	log.Printf("[K8s Discovery] Added %d dial options", len(opts))
}

// CloseConn closes a given gRPC client connection.
func (k *KubernetesConnManager) CloseConn(conn *grpc.ClientConn) {
	log.Printf("[K8s Discovery] Closing single connection")
	conn.Close()
}

// Register is a no-op: Kubernetes itself handles service registration.
func (k *KubernetesConnManager) Register(serviceName, host string, port int, opts ...grpc.DialOption) error {
	return nil
}

// UnRegister is a no-op: Kubernetes itself handles service deregistration.
func (k *KubernetesConnManager) UnRegister() error {
	return nil
}

// GetUserIdHashGatewayHost is not supported in the Kubernetes deployment and returns an empty string.
func (k *KubernetesConnManager) GetUserIdHashGatewayHost(ctx context.Context, userId string) (string, error) {
	return "", nil
}
```
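To go with the file above, the sketch below shows a minimal in-package test for `endpointsChanged` (it could live next to the fixed file as `kubernetes_test.go`), and it can serve as a starting point for the automated test script mentioned in section 7. The helper name, table-driven cases, and IP addresses are illustrative assumptions, not part of the fix.

```go
package kubernetes

import (
	"testing"

	v1 "k8s.io/api/core/v1"
)

// endpointsWith builds a minimal Endpoints object containing the given Pod IPs.
func endpointsWith(ips ...string) *v1.Endpoints {
	addrs := make([]v1.EndpointAddress, 0, len(ips))
	for _, ip := range ips {
		addrs = append(addrs, v1.EndpointAddress{IP: ip})
	}
	return &v1.Endpoints{Subsets: []v1.EndpointSubset{{Addresses: addrs}}}
}

func TestEndpointsChanged(t *testing.T) {
	k := &KubernetesConnManager{} // endpointsChanged does not touch any fields

	cases := []struct {
		name     string
		oldEp    *v1.Endpoints
		newEp    *v1.Endpoints
		expected bool
	}{
		{"same addresses", endpointsWith("10.244.1.5", "10.244.2.8"), endpointsWith("10.244.1.5", "10.244.2.8"), false},
		{"address replaced", endpointsWith("10.244.1.5"), endpointsWith("10.244.3.9"), true},
		{"address added", endpointsWith("10.244.1.5"), endpointsWith("10.244.1.5", "10.244.2.8"), true},
	}
	for _, c := range cases {
		if got := k.endpointsChanged(c.oldEp, c.newEp); got != c.expected {
			t.Errorf("%s: endpointsChanged() = %v, want %v", c.name, got, c.expected)
		}
	}
}
```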
---

## 3. Debug Logging

### 3.1 Log Format

All log lines share the prefix `[K8s Discovery]`, which makes them easy to filter and search.

### 3.2 Log Levels

- **INFO**: normal operation flow
- **WARNING**: worth noting, but does not affect functionality
- **ERROR**: errors that need attention

### 3.3 Key Log Points

#### 3.3.1 Initialization Logs

```
[K8s Discovery] Initializing Kubernetes connection manager, namespace: default
[K8s Discovery] Successfully created in-cluster config
[K8s Discovery] Successfully created clientset
[K8s Discovery] Starting Endpoints watcher
[K8s Discovery] Kubernetes connection manager initialized successfully
```

#### 3.3.2 Connection Initialization Logs

```
[K8s Discovery] [user-rpc-service] Starting to initialize connections
[K8s Discovery] [user-rpc-service] Service port: 10320
[K8s Discovery] [user-rpc-service] Found 3 endpoint addresses
[K8s Discovery] [user-rpc-service] Creating connection to 10.244.1.5:10320
[K8s Discovery] [user-rpc-service] Connection to 10.244.1.5:10320 created, state: Connecting
[K8s Discovery] [user-rpc-service] Successfully created 3 connections
[K8s Discovery] [user-rpc-service] Found 2 old connections to close
[K8s Discovery] [user-rpc-service] Connection map updated: 2 old -> 3 new
[K8s Discovery] [user-rpc-service] Scheduling delayed close for 2 old connections (5 seconds delay)
[K8s Discovery] [user-rpc-service] Connection initialization completed successfully
```

#### 3.3.3 Endpoints Watcher Logs

```
[K8s Discovery] [Watcher] Endpoints UPDATED: user-rpc-service
[K8s Discovery] [Watcher] Handling Endpoints change for service: user-rpc-service
[K8s Discovery] [Watcher] Service user-rpc-service has 3 endpoint addresses
[K8s Discovery] [Watcher] Successfully updated connections for user-rpc-service
```

#### 3.3.4 Connection Health Check Logs

```
[K8s Discovery] [user-rpc-service] GetConns called
[K8s Discovery] [user-rpc-service] Found 3 connections in cache, checking health
[K8s Discovery] [user-rpc-service] Connection #1 is invalid, state: Shutdown, closing
[K8s Discovery] [user-rpc-service] Removed 1 invalid connections, 2 valid connections remaining
```

#### 3.3.5 GetConn Logs

```
[K8s Discovery] [msg-rpc-service] GetConn called (using DNS)
[K8s Discovery] [msg-rpc-service] Using DNS target: msg-rpc-service.default.svc.cluster.local:10280
[K8s Discovery] [msg-rpc-service] Dialing DNS target: msg-rpc-service.default.svc.cluster.local:10280
[K8s Discovery] [msg-rpc-service] Connection created, state: Ready
[K8s Discovery] [msg-rpc-service] GetConn completed successfully
```

---

## 4. Test Verification

### 4.1 Scenario 1: Pod Recreation

**Steps**:

```bash
# 1. Tail the logs
kubectl logs -f <pod-name> | grep "K8s Discovery"

# 2. Trigger a Pod recreation
kubectl delete pod <pod-name>

# 3. Watch the log output. You should see:
#    - an Endpoints UPDATED event
#    - connections being re-initialized
#    - the old connections being closed after the delay
```

**Expected logs**:

```
[K8s Discovery] [Watcher] Endpoints UPDATED: user-rpc-service
[K8s Discovery] [user-rpc-service] Starting to initialize connections
[K8s Discovery] [user-rpc-service] Found 2 old connections to close
[K8s Discovery] [user-rpc-service] Scheduling delayed close for 2 old connections (5 seconds delay)
[K8s Discovery] [user-rpc-service] Connection initialization completed successfully
[K8s Discovery] [user-rpc-service] Closing 2 old connections
[K8s Discovery] [user-rpc-service] Closed 2/2 old connections
```

### 4.2 Scenario 2: Message Sync and Push

**Steps**:

```bash
# 1. Send a message
# 2. Trigger a Pod recreation
kubectl delete pod <pod-name>
# 3. Verify that message sync and push do not fail
# 4. Check the logs to confirm the connections are healthy
```

**Expected results**:

- ✅ Message sync does not fail (GetConn uses DNS and is unaffected)
- ✅ Message push does not fail (GetConns refreshes its connections automatically)
- ✅ The logs show the connections recovering automatically
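Scenario 2 can also be verified programmatically. The sketch below keeps issuing RPCs over a single GetConn connection while a Pod is deleted in another terminal; with the fix in place there should be no `the client connection is closing` errors. It assumes the target service registers the standard gRPC health service (`grpc_health_v1`); if it does not, substitute any idempotent RPC from the service's generated client. The module import path and the `msg-rpc-service` name are placeholders — adjust them to your project.

```go
package main

import (
	"context"
	"log"
	"time"

	"google.golang.org/grpc/health/grpc_health_v1"

	// Placeholder import path for the fixed package; replace with your real module path.
	kdiscovery "your-module/pkg/common/discovery/kubernetes"
)

func main() {
	mgr, err := kdiscovery.NewKubernetesConnManager("default")
	if err != nil {
		log.Fatalf("init conn manager: %v", err)
	}
	defer mgr.Close()

	// One long-lived connection resolved through the Service DNS name (the GetConn path).
	conn, err := mgr.GetConn(context.Background(), "msg-rpc-service")
	if err != nil {
		log.Fatalf("GetConn: %v", err)
	}
	client := grpc_health_v1.NewHealthClient(conn)

	// Call once per second for two minutes while `kubectl delete pod <pod-name>` runs elsewhere.
	for i := 0; i < 120; i++ {
		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		resp, err := client.Check(ctx, &grpc_health_v1.HealthCheckRequest{})
		cancel()
		if err != nil {
			log.Printf("#%03d FAILED: %v", i, err)
		} else {
			log.Printf("#%03d ok, status=%s", i, resp.Status)
		}
		time.Sleep(time.Second)
	}
}
```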
### 4.3 Scenario 3: Connection Health Check

**Steps**:

```bash
# 1. Check the current connection state
kubectl logs <pod-name> | grep "checking health"

# 2. Simulate connection failure (scale the target service to zero)
kubectl scale deployment <deployment-name> --replicas=0

# 3. Check the logs again after a short while.
#    Connections should be marked invalid and removed.
```

**Expected logs**:

```
[K8s Discovery] [user-rpc-service] GetConns called
[K8s Discovery] [user-rpc-service] Found 3 connections in cache, checking health
[K8s Discovery] [user-rpc-service] Connection #0 is invalid, state: TransientFailure, closing
[K8s Discovery] [user-rpc-service] Connection #1 is invalid, state: TransientFailure, closing
[K8s Discovery] [user-rpc-service] Connection #2 is invalid, state: TransientFailure, closing
[K8s Discovery] [user-rpc-service] All connections are invalid, clearing cache and reinitializing
```

---

## 5. Troubleshooting Guide

### 5.1 Common Problems

#### Problem 1: Connection Initialization Fails

**Symptom**:

```
[K8s Discovery] [user-rpc-service] ERROR: Failed to dial endpoint 10.244.1.5:10320: connection refused
```

**Possible causes**:

- The Pod is not ready yet
- Network issues
- Wrong port configuration

**How to investigate**:

1. Check the Pod status: `kubectl get pods`
2. Check the Service configuration: `kubectl get svc user-rpc-service -o yaml`
3. Check the Endpoints: `kubectl get endpoints user-rpc-service`

#### Problem 2: The Endpoints Watcher Is Not Working

**Symptoms**:

- Connections are not updated after a Pod is recreated
- No `[Watcher] Endpoints UPDATED` log lines

**Possible causes**:

- The Informer did not start
- Missing RBAC permissions
- Network issues

**How to investigate**:

1. Check the logs for `[K8s Discovery] Starting Endpoints watcher`
2. Check the ServiceAccount permissions
3. Watch the Endpoints manually: `kubectl get endpoints -w`

#### Problem 3: Connections Are Force-Closed

**Symptom**:

```
grpc: the client connection is closing
```

**Possible causes**:

- GetConn dials Endpoints addresses directly (wrong)
- Old connections are closed immediately instead of after a delay

**How to investigate**:

1. Check the logs for `[K8s Discovery] [xxx] GetConn called (using DNS)`
2. Confirm that GetConn uses DNS rather than Endpoints
3. Confirm that the delayed close is taking effect

#### Problem 4: Connection Leak

**Symptoms**:

- The number of connections keeps growing
- Memory usage keeps growing

**Possible causes**:

- Old connections are not closed properly
- The delayed-close goroutine never runs

**How to investigate**:

1. Check the logs for `[K8s Discovery] [xxx] Closing X old connections`
2. Check the delayed-close log lines
3. Monitor how the connection count changes over time

### 5.2 Log Filtering Commands

**All K8s Discovery logs**:

```bash
kubectl logs <pod-name> | grep "K8s Discovery"
```

**Logs for a specific service**:

```bash
kubectl logs <pod-name> | grep "K8s Discovery.*user-rpc-service"
```

**Error logs only**:

```bash
kubectl logs <pod-name> | grep "K8s Discovery.*ERROR"
```

**Watcher logs only**:

```bash
kubectl logs <pod-name> | grep "K8s Discovery.*Watcher"
```

**Follow the logs in real time**:

```bash
kubectl logs -f <pod-name> | grep "K8s Discovery"
```

### 5.3 Debugging Tips

1. **Turn up verbosity**: if the default logs are not enough, raise the log level
2. **Monitor connection state**: periodically check the number and state of connections
3. **Compare against Endpoints**: manually check that the Endpoints match the connection list
4. **Exercise Pod recreation**: deliberately recreate Pods and watch the connections update

---

## 6. Summary

### 6.1 Key Points of the Fix

1. ✅ **Watch the correct resource type**: switch from Pod to Endpoints
2. ✅ **GetConn uses DNS**: prevents connections from being force-closed
3. ✅ **GetConns uses Endpoints**: supports load balancing and automatic updates
4. ✅ **Delayed closing of old connections**: avoids failing in-flight requests
5. ✅ **Health checks**: ensures connections are still valid
6. ✅ **KeepAlive**: enables automatic reconnection
7. ✅ **Detailed logging**: makes troubleshooting easier

### 6.2 Key Improvements

- **Resolves the problem seen in previous fix attempts**: GetConn uses DNS, so connections are no longer force-closed
- **Complete debug logging**: every key operation is logged
- **Better error handling**: clearer error messages and recovery behavior

### 6.3 Recommendations

1. **Before deploying**: thoroughly test the Pod-recreation scenario
2. **After deploying**: monitor the logs and watch the connection-update process
3. **Troubleshooting**: use the log-filtering commands to locate problems quickly
4. **Ongoing tuning**: adjust the delayed-close interval based on real-world usage

---

## 7. Related Files

- Fixed file: `pkg/common/discovery/kubernetes/kubernetes.go`
- Test script: an automated test script can be written to verify the fix
- Monitoring metrics: Prometheus metrics can be added to monitor connection state (a sketch follows below)
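For the monitoring-metrics item above, one possible sketch (not part of the fix) adds a small Prometheus gauge to the same `kubernetes` package using `prometheus/client_golang`. The metric name, label, and function names below are illustrative assumptions; `reportConnCounts` would need to be called after `initializeConns` updates the map, or periodically from a background goroutine.

```go
package kubernetes

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// connGauge reports the number of cached gRPC connections per service.
var connGauge = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "k8s_discovery_grpc_connections", // illustrative metric name
		Help: "Number of cached gRPC client connections per service.",
	},
	[]string{"service"},
)

func init() {
	prometheus.MustRegister(connGauge)
}

// reportConnCounts snapshots the current connMap sizes into the gauge.
func (k *KubernetesConnManager) reportConnCounts() {
	k.mu.RLock()
	defer k.mu.RUnlock()
	for serviceName, conns := range k.connMap {
		connGauge.WithLabelValues(serviceName).Set(float64(len(conns)))
	}
}

// ServeMetrics exposes a /metrics endpoint for Prometheus to scrape.
func ServeMetrics(addr string) error {
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler())
	return http.ListenAndServe(addr, mux)
}
```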