How to Troubleshoot Kubernetes Problems
Kubernetes is a mature project, but as it keeps growing and new features and workflows are introduced, problems inevitably appear along with them. Automated testing weeds out most of these, yet complex flows and corner cases are hard to cover completely, so in practice you still need to locate and fix faults in running clusters. A prerequisite for effective troubleshooting is a solid grasp of Kubernetes concepts and workflows, plus the ability to read the relevant source code. Take a common failure as an example: a kubelet stops synchronizing pods, and its log reports that the PLEG (Pod Lifecycle Event Generator) has been inactive far longer than the 3-minute threshold:
I1105 09:50:27.583544 548093 kubelet.go:1829] skipping pod synchronization - [PLEG is not healthy: pleg was last seen active 58h29m3.043779855s ago; threshold is 3m0s]
That log line comes from the kubelet's main sync loop. PLEG health is one of the runtime checks aggregated by runtimeState.runtimeErrors(); as long as any check fails, the kubelet skips pod synchronization entirely and retries with exponential backoff:

func (kl *Kubelet) syncLoop(updates <-chan kubetypes.PodUpdate, handler SyncHandler) {
......
	for {
		if rs := kl.runtimeState.runtimeErrors(); len(rs) != 0 {
			glog.Infof("skipping pod synchronization - %v", rs)
			// exponential backoff
			time.Sleep(duration)
			duration = time.Duration(math.Min(float64(max), factor*float64(duration)))
			continue
		}
		// reset backoff if we have a success
		duration = base
		kl.syncLoopMonitor.Store(kl.clock.Now())
		if !kl.syncLoopIteration(updates, handler, syncTicker.C, housekeepingTicker.C, plegCh) {
			break
		}
		kl.syncLoopMonitor.Store(kl.clock.Now())
	}
......
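As an aside, the backoff arithmetic above is a reusable pattern. Here is a minimal runnable sketch of it with illustrative constants (the kubelet defines its own base, factor, and max inside syncLoop): the delay starts at base, doubles on every failed iteration, and is capped at max.

package main

import (
	"fmt"
	"math"
	"time"
)

func main() {
	base := 100 * time.Millisecond // first retry delay (illustrative)
	max := 5 * time.Second         // cap on the delay (illustrative)
	factor := 2.0                  // growth rate per failed iteration

	duration := base
	for i := 0; i < 8; i++ {
		fmt.Printf("iteration %d: would sleep %v\n", i, duration)
		// time.Sleep(duration) // elided so the example finishes instantly
		duration = time.Duration(math.Min(float64(max), factor*float64(duration)))
	}
}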
The check itself (GenericPLEG.Healthy) is simple: the PLEG records a timestamp after each relist, and Healthy() reports failure once more than relistThreshold (the 3m0s in the log) has elapsed since then. Here the PLEG had been inactive for over 58 hours, so something inside relist was stuck:

func (g *GenericPLEG) Healthy() (bool, error) {
	relistTime := g.getRelistTime()
	elapsed := g.clock.Since(relistTime)
	if elapsed > relistThreshold {
		return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, relistThreshold)
	}
	return true, nil
}
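The same watchdog pattern is handy in your own long-running daemons. Below is a minimal self-contained sketch (the names worker and healthy are ours, not kubelet's): the worker stamps an atomic timestamp on every loop iteration, and the health check compares the elapsed time against a threshold, exactly as Healthy() does with relistTime.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

const threshold = 3 * time.Minute // how long the worker may stay silent

var lastActive atomic.Int64 // unix nanoseconds of the last completed iteration

// worker simulates a periodic task like relist: it stamps the watchdog
// after every successful iteration.
func worker() {
	for {
		// ... do one unit of work ...
		lastActive.Store(time.Now().UnixNano())
		time.Sleep(100 * time.Millisecond)
	}
}

// healthy mirrors GenericPLEG.Healthy: report failure if the worker has
// been silent for longer than the threshold.
func healthy() (bool, error) {
	elapsed := time.Since(time.Unix(0, lastActive.Load()))
	if elapsed > threshold {
		return false, fmt.Errorf("worker was last seen active %v ago; threshold is %v", elapsed, threshold)
	}
	return true, nil
}

func main() {
	lastActive.Store(time.Now().UnixNano())
	go worker()
	time.Sleep(300 * time.Millisecond)
	fmt.Println(healthy())
}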
To find out where relist was stuck, grab a goroutine dump from the kubelet's pprof debug endpoint on port 10250 (use ?debug=2 instead if you want the full stack of every goroutine):

curl -k --cert admin.pem --key admin-key.pem "https://127.0.0.1:10250/debug/pprof/goroutine?debug=1" > stack.txt
goroutine profile: total 2814
...
1 @ 0x42fcaa 0x42fd5e 0x4066cb 0x406465 0x30d4f3e 0x30d682a 0xad7474 0xad6a0d 0xad693d 0x45d2f1
# 0x30d4f3d k8s.io/kubernetes/pkg/kubelet/pleg.(*GenericPLEG).relist+0x74d /go/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/pkg/kubelet/pleg/generic.go:261
# 0x30d6829 k8s.io/kubernetes/pkg/kubelet/pleg.(*GenericPLEG).(k8s.io/kubernetes/pkg/kubelet/pleg.relist)-fm+0x29 /go/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/pkg/kubelet/pleg/generic.go:130
# 0xad7473 k8s.io/kubernetes/vendor/k8s.io/apimachinery/pkg/util/wait.JitterUntil.func1+0x53 /go/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/vendor/k8s.io/apimachinery/pkg/util/wait/wait.go:133
# 0xad6a0c k8s.io/kubernetes/vendor/k8s.io/apimachinery/pkg/util/wait.JitterUntil+0xbc /go/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/vendor/k8s.io/apimachinery/pkg/util/wait/wait.go:134
# 0xad693c k8s.io/kubernetes/vendor/k8s.io/apimachinery/pkg/util/wait.Until+0x4c /go/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/vendor/k8s.io/apimachinery/pkg/util/wait/wait.go:88
The dump shows a single goroutine parked in GenericPLEG.relist, blocked at generic.go:261. That line is the send into the PLEG event channel. The channel has a fixed capacity, so if its consumer stops draining it, the send blocks forever: relist never finishes, the relist timestamp is never refreshed, and Healthy() fails from then on.

func (g *GenericPLEG) relist() {
......
	for i := range events {
		// Filter out events that are not reliable and no other components use yet.
		if events[i].Type == ContainerChanged {
			continue
		}
		g.eventChannel <- events[i] // line 261: blocking send
	}

Later versions of relist replace the blocking send with a non-blocking select: when the channel is full, the event is dropped and counted instead, so relist always completes and the PLEG stays healthy:

	for i := range events {
		// Filter out events that are not reliable and no other components use yet.
		if events[i].Type == ContainerChanged {
			continue
		}
		select {
		case g.eventChannel <- events[i]:
		default:
			// The channel is full; drop the event and bump a counter rather than block relist.
			metrics.PLEGDiscardEvents.WithLabelValues().Inc()
			klog.Error("event channel is full, discard this relist() cycle event")
		}
	}
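To make the difference concrete, here is a minimal self-contained sketch (our own illustration, not kubelet code) of the two send styles: a plain send on a full channel blocks the goroutine indefinitely, while a select with a default branch degrades gracefully by dropping the event.

package main

import "fmt"

func main() {
	events := make(chan int, 2) // a tiny buffer standing in for eventChannel
	events <- 1
	events <- 2 // the channel is now full and nobody is draining it

	// Old behavior: a plain send would block this goroutine forever,
	// which is exactly the hang seen in the stack trace above.
	// events <- 3

	// Fixed behavior: a select with a default branch never blocks;
	// a full channel just means the event is dropped and counted.
	dropped := 0
	for i := 3; i <= 5; i++ {
		select {
		case events <- i:
		default:
			dropped++
			fmt.Printf("event %d dropped: channel is full\n", i)
		}
	}
	fmt.Println("dropped events:", dropped)
}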