Skip to main content

Monitoring & Troubleshooting

Debugging techniques, monitoring tools, logging, and observability for Kubernetes applications and clusters.

Resource Monitoring

Resource Usage Commands

# Node resource usage (requires metrics-server)
kubectl top nodes
kubectl top nodes --sort-by=cpu
kubectl top nodes --sort-by=memory

# Pod resource usage
kubectl top pods
kubectl top pods --all-namespaces
kubectl top pods --sort-by=cpu
kubectl top pods --sort-by=memory
kubectl top pods -n namespace-name

# Container resource usage
kubectl top pods --containers
kubectl top pods pod-name --containers

# Resource usage for specific namespace
kubectl top pods -n kube-system

Metrics Server

apiVersion: apps/v1
kind: Deployment
metadata:
  name: metrics-server
  namespace: kube-system
spec:
  selector:
    matchLabels:
      k8s-app: metrics-server
  template:
    metadata:
      labels:
        k8s-app: metrics-server
    spec:
      containers:
        - name: metrics-server
          image: registry.k8s.io/metrics-server/metrics-server:v0.6.1
          args:
            - --cert-dir=/tmp
            - --secure-port=4443
            - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
            - --kubelet-use-node-status-port
            - --kubelet-insecure-tls
          ports:
            - containerPort: 4443
              name: https
              protocol: TCP

Pod Troubleshooting

Pod Status Debugging

# Check pod status
kubectl get pods
kubectl get pods -o wide
kubectl get pods --field-selector=status.phase=Failed

# Describe pod for events and details
kubectl describe pod pod-name

# Check pod events
kubectl get events --field-selector involvedObject.name=pod-name
kubectl get events --field-selector involvedObject.name=pod-name --sort-by='.lastTimestamp'

# Check pod conditions
kubectl get pod pod-name -o jsonpath='{.status.conditions[*].type}'
kubectl get pod pod-name -o jsonpath='{.status.conditions[*].message}'

# Check container statuses
kubectl get pod pod-name -o jsonpath='{.status.containerStatuses[*].state}'

Pod Logs

# Basic log viewing
kubectl logs pod-name
kubectl logs pod-name -c container-name

# Follow logs
kubectl logs -f pod-name
kubectl logs -f pod-name -c container-name

# Previous container logs
kubectl logs pod-name --previous
kubectl logs pod-name -c container-name --previous

# Logs with timestamps
kubectl logs pod-name --timestamps

# Tail logs
kubectl logs pod-name --tail=100
kubectl logs pod-name --tail=50 --timestamps

# Logs since specific time
kubectl logs pod-name --since=1h
kubectl logs pod-name --since-time=2023-01-01T00:00:00Z

# All containers in pod
kubectl logs pod-name --all-containers=true

# Logs from all pods with label
kubectl logs -l app=nginx
kubectl logs -l app=nginx --all-containers=true

Interactive Debugging

# Execute commands in running container
kubectl exec -it pod-name -- /bin/bash
kubectl exec -it pod-name -c container-name -- /bin/sh

# Run commands without interactive session
kubectl exec pod-name -- ps aux
kubectl exec pod-name -- cat /etc/hosts
kubectl exec pod-name -c container-name -- env

# Debug with an ephemeral container attached to the running pod (--target shares that container's process namespace)
kubectl debug pod-name -it --image=busybox --target=container-name

# Create debugging copy of pod
kubectl debug pod-name -it --copy-to=debug-pod --container=debug-container --image=busybox

Node Troubleshooting

Node Information

# Get node information
kubectl get nodes
kubectl get nodes -o wide
kubectl describe node node-name

# Node conditions
kubectl get nodes -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}'

# Node capacity and allocatable resources
kubectl describe node node-name | grep -A 5 "Capacity:"
kubectl describe node node-name | grep -A 5 "Allocatable:"

# Node taints and labels
kubectl get nodes --show-labels
kubectl describe node node-name | grep Taints

# Pods running on specific node
kubectl get pods --all-namespaces --field-selector spec.nodeName=node-name

# Node resource usage
kubectl top node node-name

Node Debugging

# SSH to node (if accessible)
ssh user@node-ip

# Create privileged pod for node debugging
kubectl apply -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: node-debugger
spec:
  hostNetwork: true
  hostPID: true
  hostIPC: true
  containers:
    - name: debugger
      image: busybox
      command: ["sleep", "3600"]
      securityContext:
        privileged: true
      volumeMounts:
        - name: host-root
          mountPath: /host
  volumes:
    - name: host-root
      hostPath:
        path: /
  nodeSelector:
    kubernetes.io/hostname: node-name
EOF

# Access node filesystem
kubectl exec -it node-debugger -- chroot /host

Event Debugging

Event Analysis

# Get all events
kubectl get events
kubectl get events --all-namespaces

# Sort events by timestamp
kubectl get events --sort-by='.lastTimestamp'
kubectl get events --sort-by='.firstTimestamp'

# Filter events by object
kubectl get events --field-selector involvedObject.name=pod-name
kubectl get events --field-selector involvedObject.kind=Pod

# Filter events by type
kubectl get events --field-selector type=Warning
kubectl get events --field-selector type=Normal

# Filter events by reason
kubectl get events --field-selector reason=Failed
kubectl get events --field-selector reason=Pulled

# Recent events
kubectl get events --field-selector involvedObject.name=pod-name --sort-by='.lastTimestamp' | tail -20

# Watch events in real-time
kubectl get events --watch
kubectl get events --watch --field-selector involvedObject.name=pod-name

Cluster Debugging

Cluster Information

# Cluster info
kubectl cluster-info
kubectl cluster-info dump

# Component status (the ComponentStatus API is deprecated since Kubernetes v1.19; output may be incomplete)
kubectl get componentstatuses
kubectl get cs

# API server info
kubectl get --raw /api/v1
kubectl get --raw /version

# Cluster nodes and system pods
kubectl get nodes
kubectl get pods -n kube-system

# Check cluster DNS
kubectl get svc -n kube-system
kubectl get pods -n kube-system -l k8s-app=kube-dns

Resource Validation

# Validate YAML without applying
kubectl apply --dry-run=client -f manifest.yaml
kubectl apply --dry-run=server -f manifest.yaml

# Validate and show diff
kubectl diff -f manifest.yaml

# Explain resource fields
kubectl explain pod
kubectl explain pod.spec
kubectl explain pod.spec.containers

# Get API resources
kubectl api-resources
kubectl api-resources --verbs=list --namespaced=true
kubectl api-resources --api-group=apps

Logging and Observability

Centralized Logging Setup

# Fluentd DaemonSet for log collection
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: fluentd
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: fluentd
  template:
    metadata:
      labels:
        name: fluentd
    spec:
      containers:
        - name: fluentd
          image: fluent/fluentd-kubernetes-daemonset:v1-debian-elasticsearch
          env:
            - name: FLUENT_ELASTICSEARCH_HOST
              value: 'elasticsearch.logging.svc.cluster.local'
            - name: FLUENT_ELASTICSEARCH_PORT
              value: '9200'
          volumeMounts:
            - name: varlog
              mountPath: /var/log
            - name: varlibdockercontainers
              mountPath: /var/lib/docker/containers
              readOnly: true
      volumes:
        - name: varlog
          hostPath:
            path: /var/log
        - name: varlibdockercontainers
          hostPath:
            path: /var/lib/docker/containers

Prometheus Monitoring

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
    scrape_configs:
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
        - name: prometheus
          image: prom/prometheus:latest
          args:
            - --config.file=/etc/prometheus/prometheus.yml
            - --storage.tsdb.path=/prometheus/
            - --web.console.libraries=/etc/prometheus/console_libraries
            - --web.console.templates=/etc/prometheus/consoles
          ports:
            - containerPort: 9090
          volumeMounts:
            - name: prometheus-config
              mountPath: /etc/prometheus/
            - name: prometheus-storage
              mountPath: /prometheus/
      volumes:
        - name: prometheus-config
          configMap:
            name: prometheus-config
        - name: prometheus-storage
          emptyDir: {}

Performance Troubleshooting

Resource Constraints

# Check resource limits and requests
kubectl describe pod pod-name | grep -A 5 "Limits:"
kubectl describe pod pod-name | grep -A 5 "Requests:"

# Check if pods are hitting resource limits
kubectl top pods --sort-by=memory
kubectl top pods --sort-by=cpu

# Check node resource availability
kubectl describe nodes | grep -A 5 "Allocated resources"

# Check pod QoS class
kubectl get pod pod-name -o jsonpath='{.status.qosClass}'

Performance Metrics

# Pod with resource monitoring
apiVersion: v1
kind: Pod
metadata:
  name: resource-monitor
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: '8080'
    prometheus.io/path: '/metrics'
spec:
  containers:
    - name: app
      image: nginx
      resources:
        requests:
          memory: '128Mi'
          cpu: '100m'
        limits:
          memory: '256Mi'
          cpu: '200m'
      livenessProbe:
        httpGet:
          path: /
          port: 80
        initialDelaySeconds: 30
        periodSeconds: 10
      readinessProbe:
        httpGet:
          path: /
          port: 80
        initialDelaySeconds: 5
        periodSeconds: 5

Health Checks and Probes

Liveness and Readiness Probes

apiVersion: v1
kind: Pod
metadata:
  name: probe-example
spec:
  containers:
    - name: app
      image: myapp:latest
      ports:
        - containerPort: 8080
      livenessProbe:
        httpGet:
          path: /health
          port: 8080
        initialDelaySeconds: 30
        periodSeconds: 10
        timeoutSeconds: 5
        failureThreshold: 3
      readinessProbe:
        httpGet:
          path: /ready
          port: 8080
        initialDelaySeconds: 5
        periodSeconds: 5
        timeoutSeconds: 3
        failureThreshold: 3
      startupProbe:
        httpGet:
          path: /startup
          port: 8080
        initialDelaySeconds: 10
        periodSeconds: 10
        timeoutSeconds: 5
        failureThreshold: 30

Probe Debugging

# Check probe status
kubectl describe pod pod-name | grep -A 10 "Conditions:"

# Check probe failures in events
kubectl get events --field-selector involvedObject.name=pod-name,reason=Unhealthy

# Test probe endpoints manually
kubectl exec -it pod-name -- curl http://localhost:8080/health
# port-forward runs in the foreground; run the curl below from a second terminal
kubectl port-forward pod-name 8080:8080
curl http://localhost:8080/health

Common Troubleshooting Scenarios

Pod Stuck in Pending

# Check events for scheduling issues
kubectl describe pod pod-name

# Check node resources
kubectl top nodes
kubectl describe nodes

# Check pod resource requests
kubectl describe pod pod-name | grep -A 5 "Requests:"

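# Check node taints and pod tolerations
kubectl describe nodes | grep Taints
kubectl describe pod pod-name | grep -A 5 "Tolerations:"

# Check node labels (for nodeSelector / affinity mismatches)
kubectl get nodes --show-labels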

Pod CrashLoopBackOff

# Check logs
kubectl logs pod-name
kubectl logs pod-name --previous

# Check resource limits
kubectl describe pod pod-name | grep -A 5 "Limits:"

# Check liveness probe configuration
kubectl describe pod pod-name | grep -A 10 "Liveness:"

# Debug with different command
kubectl run debug-pod --image=same-image --rm -it -- /bin/bash

ImagePullBackOff

# Check events
kubectl describe pod pod-name

# Check image name and tag
kubectl get pod pod-name -o jsonpath='{.spec.containers[*].image}'

# Check image pull secrets
kubectl get pod pod-name -o jsonpath='{.spec.imagePullSecrets[*].name}'
kubectl describe secret secret-name

# Test image pull manually
docker pull image-name

Service Not Accessible

# Check service endpoints
kubectl get endpoints service-name
kubectl describe service service-name

# Check pod labels and selectors
kubectl get pods --show-labels
kubectl describe service service-name | grep Selector

# Test service connectivity
kubectl run test-pod --image=busybox --rm -it -- wget -qO- http://service-name:port

# Check network policies
kubectl get networkpolicies
kubectl describe networkpolicy policy-name

Debugging Tools and Utilities

Essential Debugging Images

# Multi-tool debugging container
kubectl run debug --image=nicolaka/netshoot --rm -it -- bash

# Minimal debugging with busybox
kubectl run debug --image=busybox --rm -it -- sh

# Alpine with package manager
kubectl run debug --image=alpine --rm -it -- sh

# Ubuntu for full toolset
kubectl run debug --image=ubuntu --rm -it -- bash

Debugging Sidecar

apiVersion: v1
kind: Pod
metadata:
  name: app-with-debug
spec:
  containers:
    - name: app
      image: myapp:latest
      ports:
        - containerPort: 8080
    - name: debug
      image: nicolaka/netshoot
      command: ['sleep', '3600']
      volumeMounts:
        - name: shared-data
          mountPath: /shared
  volumes:
    - name: shared-data
      emptyDir: {}

Quick Reference

Essential Debugging Commands

  • kubectl describe pod pod-name - Detailed pod information
  • kubectl logs pod-name - Container logs
  • kubectl exec -it pod-name -- bash - Access pod shell
  • kubectl get events - Cluster events
  • kubectl top pods - Resource usage

Common Issues and Solutions

  • Pending pods - Check node resources, taints, and scheduling constraints
  • CrashLoopBackOff - Check logs, resource limits, and liveness probes
  • ImagePullBackOff - Verify image name, registry access, and pull secrets
  • Service issues - Check endpoints, selectors, and network policies

Monitoring Tools

  • metrics-server - Basic resource metrics
  • Prometheus - Comprehensive monitoring
  • Grafana - Metrics visualization
  • Fluentd/ELK - Centralized logging
  • Jaeger - Distributed tracing

Performance Troubleshooting

# Resource usage
kubectl top nodes
kubectl top pods --sort-by=memory

# Resource constraints
kubectl describe nodes | grep -A 5 "Allocated"

# Pod quality of service
kubectl get pods -o custom-columns=NAME:.metadata.name,QOS:.status.qosClass