Skip to main content

Troubleshooting

Memory Troubleshooting

Memory Usage Analysis

# Check memory usage
free -h # Human-readable memory info
cat /proc/meminfo # Detailed memory information
vmstat 1 5 # Memory statistics (1 sec intervals, 5 times)
vmstat -s # Memory statistics summary

# Memory usage by process
ps aux --sort=-%mem | head -10 # Top 10 memory consumers
ps -eo pid,ppid,cmd,%mem,%cpu --sort=-%mem | head
top -o %MEM # Sort processes by memory usage

Memory Issues

# Check for memory leaks
valgrind --leak-check=full ./program

# Monitor memory usage over time
watch -n 1 'free -h'

# Check swap usage
swapon --show # Show swap partitions
cat /proc/swaps # Swap file information

Out of Memory (OOM) Investigation

# Check OOM killer logs
dmesg | grep -i "killed process"
journalctl -k | grep -i "killed process"
grep -i "killed process" /var/log/messages

# Check memory limits
ulimit -v # Virtual memory limit
cat /proc/sys/vm/overcommit_memory

Disk Troubleshooting

Disk Usage Analysis

# Check disk usage
df -h # Filesystem usage
df -i # Inode usage
du -sh /path/* # Directory sizes
du -ah /path | sort -rh | head -20 # Largest files/directories

# Find large files
find /path -type f -size +100M -exec ls -lh {} \; | awk '{ print $9 ": " $5 }'
find /path -type f -printf '%s %p\n' | sort -rn | head -20

Disk Performance

# Check disk I/O
iostat -x 1 # Extended I/O statistics
iotop # I/O monitoring by process
sar -d 1 5 # Disk activity

# Test disk performance
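dd if=/dev/zero of=/tmp/test bs=1M count=1000 # Write test (goes through the page cache; add conv=fdatasync or oflag=direct for a realistic figure)
dd if=/tmp/test of=/dev/null bs=1M # Read test (may be served from the page cache if the file was just written)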
hdparm -tT /dev/sda # Disk speed test

Disk Errors

# Check filesystem errors
fsck /dev/sda1 # Check filesystem (unmount first!)
fsck -f /dev/sda1 # Force check
e2fsck -f /dev/sda1 # Check ext2/3/4 filesystem

# Check disk health
smartctl -a /dev/sda # SMART disk health
badblocks -v /dev/sda # Check for bad blocks

Full Disk Issues

# Clean up disk space
# Find large log files (this only lists them; review before truncating or removing)
find /var/log -name "*.log" -size +100M

# Clean package cache
apt-get clean # Debian/Ubuntu
yum clean all # Red Hat/CentOS

# Find files by age
find /path -type f -mtime +30 # Files last modified more than 30 days ago
find /path -type f -atime +30 # Files not accessed in 30 days

Network Troubleshooting

Network Connectivity

# Basic connectivity tests
ping -c 4 google.com # Test internet connectivity
ping -c 4 192.168.1.1 # Test gateway connectivity
traceroute google.com # Trace network path
mtr google.com # Real-time network diagnostics

# DNS troubleshooting
nslookup google.com # DNS lookup
dig google.com # Detailed DNS info
host google.com # Simple DNS lookup
cat /etc/resolv.conf # DNS configuration

Network Configuration

# Check network interfaces
ip addr show # Show IP addresses
ip link show # Show network interfaces
ip route show # Show routing table
route -n # Numeric routing table

# Network interface status
ethtool eth0 # Ethernet interface info
iwconfig # Wireless interface info

Network Performance

# Network statistics
netstat -i # Interface statistics
cat /proc/net/dev # Network device statistics
ss -s # Socket statistics
sar -n DEV 1 5 # Network device activity

# Bandwidth testing
iperf3 -s # Server mode
iperf3 -c server_ip # Client mode

Port and Service Issues

# Check listening ports
netstat -tuln # TCP/UDP listening ports
ss -tuln # Modern replacement for netstat
lsof -i :80 # What's using port 80
nmap -p 80 localhost # Port scan

# Check services
systemctl status service_name
systemctl list-units --failed

Firewall Issues

# Check firewall status
ufw status # Ubuntu firewall
iptables -L # List iptables rules
firewall-cmd --list-all # CentOS/RHEL firewall

# Check SELinux (if applicable)
getenforce # SELinux status
sestatus # SELinux detailed status

CPU Troubleshooting

CPU Usage Analysis

# Check CPU usage
top # Real-time process viewer
htop # Enhanced process viewer
uptime # System load average
cat /proc/loadavg # Load average

# CPU usage by process
ps aux --sort=-%cpu | head -10 # Top CPU consumers
ps -eo pid,ppid,cmd,%cpu --sort=-%cpu | head

CPU Performance

# CPU information
lscpu # CPU details
cat /proc/cpuinfo # Detailed CPU info
nproc # Number of processors

# CPU statistics
sar -u 1 5 # CPU utilization
vmstat 1 5 # System statistics including CPU
mpstat 1 5 # Multi-processor statistics

High CPU Load Investigation

# Find processes causing high CPU
ps -eo pid,ppid,cmd,%cpu --sort=-%cpu | head -20
top -c -o %CPU # Sort by CPU usage

# Lower the priority of CPU-intensive processes
nice -n 19 command # Run command with low priority
renice -n 19 -p PID # Change process priority

CPU Throttling

# Check CPU frequency
cat /proc/cpuinfo | grep MHz
cpufreq-info # CPU frequency information
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq

# Check thermal throttling
sensors # Temperature monitoring
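cat /proc/acpi/thermal_zone/*/temperature # Legacy ACPI path; on modern kernels read /sys/class/thermal/thermal_zone*/temp (millidegrees Celsius)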

System Performance Issues

System Load

# Check system load
uptime # Load average
w # Who is logged in and what they're doing
who # Who is logged in

# System activity
sar -A # All system activity
sar -u 1 60 # CPU usage for 1 minute
sar -r 1 60 # Memory usage for 1 minute

Process Investigation

# Process tree
pstree # Show process tree
ps -ejH # Process hierarchy
ps aux --forest # Process tree with details

# Process details
lsof -p PID # Files opened by process
strace -p PID # Trace system calls
cat /proc/PID/status # Process status

System Limits

# Check system limits
ulimit -a # All limits for current user
cat /proc/sys/fs/file-max # Maximum file descriptors
cat /proc/sys/kernel/pid_max # Maximum PID value
sysctl -a | grep -i limit # System limits

Log Analysis

System Logs

# View system logs
journalctl # Systemd journal
journalctl -xe # Recent entries with explanations
journalctl -f # Follow log entries
journalctl -u service # Logs for specific service

# Traditional log files
tail -f /var/log/syslog # Follow system log
tail -f /var/log/messages # System messages
tail -f /var/log/auth.log # Authentication log

Log Analysis Tools

# Search logs
grep -i error /var/log/syslog
grep -i "failed\|error" /var/log/messages

# Force log rotation now (-f rotates even if logrotate thinks it is not yet due)
logrotate -f /etc/logrotate.conf

Common Issues and Solutions

Boot Issues

# Check boot messages
dmesg | less # Kernel ring buffer
journalctl -b # Boot log
cat /var/log/boot.log # Boot messages

Permission Issues

# Check file permissions
ls -la filename # File permissions
namei -l /path/to/file # Permissions along path
getfacl filename # Access control lists

Service Issues

# Service troubleshooting
systemctl status service_name
systemctl --failed # Failed services
journalctl -u service_name -f

Performance Degradation

# System performance overview
dstat # System resource statistics
glances # System monitoring
nmon # System performance monitor

Emergency Procedures

System Recovery

# Single user mode (from GRUB)
# Add 'single' or 'init=/bin/bash' to kernel parameters

# Reset root password
passwd root # Change root password

# Filesystem repair
fsck -y /dev/sda1 # Automatic yes to all questions

Process Management Emergency

# Kill processes by user or by name (try the default SIGTERM before resorting to -9)
pkill -u username # Kill all processes for user
killall -9 process_name # Force kill all instances

# System restart/shutdown
shutdown -r now # Restart immediately
shutdown -h now # Halt or power off immediately
reboot # Restart system

Resource Exhaustion

# Free memory
sync && echo 3 > /proc/sys/vm/drop_caches # Drop page cache, dentries and inodes (needs a root shell; the redirection is not elevated by sudo)
killall -9 process_name # Kill memory-hungry processes

# Free disk space
find /tmp -type f -atime +7 -delete # Clean old temp files
journalctl --vacuum-time=3d # Clean old journal entries