Skip to main content

System Monitoring & Logging

Log Analysis and Management

System Logs with journalctl

# Basic log viewing
journalctl # View all logs
journalctl -f # Follow logs in real-time
journalctl --since "2023-01-01" --until "2023-01-02" # Date range
journalctl --since "1 hour ago" # Recent logs

# Service-specific logs
journalctl -u nginx.service # Specific service logs
journalctl -u ssh.service -f # Follow SSH logs
journalctl -u mysql.service --since today # Today's MySQL logs

# Log filtering
journalctl -p err # Error level and above (more severe)
journalctl -p warning..err # Warning to error range
journalctl -p debug # Debug level and above, i.e. every priority
journalctl -k # Kernel messages only
journalctl -b # Current boot logs
journalctl -b -1 # Previous boot logs

# Advanced filtering
journalctl _PID=1234 # Logs from specific PID
journalctl _UID=1000 # Logs from specific user
journalctl _COMM=sshd # Logs from specific command
journalctl PRIORITY=3 # Logs with priority 3 (err)

Traditional Syslog Management

# Syslog file locations
tail -f /var/log/syslog # General system log
tail -f /var/log/auth.log # Authentication log
tail -f /var/log/kern.log # Kernel log
tail -f /var/log/mail.log # Mail system log
tail -f /var/log/daemon.log # Daemon log

# Common log files
/var/log/messages # General system messages
/var/log/secure # Security/authentication log (RHEL/CentOS)
/var/log/cron # Cron job logs
/var/log/boot.log # Boot process log
/var/log/dmesg # Hardware/driver messages

# Log analysis with grep
grep "error" /var/log/syslog # Find error messages
grep -i "failed" /var/log/auth.log # Case-insensitive failed logins
# Traditional syslog timestamps space-pad the day of month ("Jan  5"), which is
# what %e prints; %d zero-pads ("Jan 05") and would miss days 1-9.
# LC_ALL=C keeps the English month abbreviation used in the log, and ^ anchors
# the match to the timestamp rather than to message text.
grep "^$(LC_ALL=C date '+%b %e')" /var/log/syslog # Today's logs

rsyslog Configuration

# rsyslog configuration
/etc/rsyslog.conf # Main configuration file
/etc/rsyslog.d/ # Additional configuration directory

# Common rsyslog rules
*.info;mail.none;authpriv.none;cron.none /var/log/messages
authpriv.* /var/log/secure
mail.* /var/log/maillog
cron.* /var/log/cron

# Remote logging setup
*.* @@remote-server:514 # Send all logs to remote server (TCP)
*.* @remote-server:514 # Send all logs to remote server (UDP)

# Restart rsyslog
sudo systemctl restart rsyslog
sudo systemctl reload rsyslog

Log Rotation and Archiving

# logrotate configuration
/etc/logrotate.conf # Main configuration
/etc/logrotate.d/ # Service-specific configurations

# Example logrotate configuration
/var/log/myapp/*.log {
daily # Rotate daily
rotate 30 # Keep 30 days
compress # Compress old logs
delaycompress # Delay compression for one cycle
missingok # Don't error if log is missing
notifempty # Don't rotate empty logs
create 644 root root # Create new log file with permissions
postrotate
/bin/kill -HUP $(cat /var/run/myapp.pid 2>/dev/null) 2>/dev/null || true
endscript
}

# Manual log rotation
sudo logrotate /etc/logrotate.conf # Run logrotate manually
sudo logrotate -d /etc/logrotate.conf # Debug mode
sudo logrotate -f /etc/logrotate.conf # Force rotation

System Monitoring Tools

Nagios Monitoring

# Nagios installation (Ubuntu/Debian)
sudo apt update
sudo apt install nagios3 nagios-plugins

# Configuration files
/etc/nagios3/nagios.cfg # Main configuration
/etc/nagios3/conf.d/ # Service definitions
/etc/nagios3/commands.cfg # Command definitions

# Basic host definition
define host {
use linux-server
host_name webserver1
alias Web Server 1
address 192.168.1.100
contact_groups admins
notification_interval 30
notification_period 24x7
}

# Basic service definition
define service {
use local-service
host_name webserver1
service_description HTTP
check_command check_http
notification_interval 30
notification_period 24x7
}

# Nagios commands
sudo systemctl start nagios3
sudo systemctl enable nagios3
sudo /usr/sbin/nagios3 -v /etc/nagios3/nagios.cfg # Verify configuration

Zabbix Monitoring

# Zabbix agent installation
sudo apt install zabbix-agent

# Zabbix agent configuration
/etc/zabbix/zabbix_agentd.conf

# Key configuration parameters
# Comments are kept on their own lines: the agent configuration only treats '#'
# as a comment at the start of a line, so a trailing comment would be read as
# part of the parameter value.
# Zabbix server IP
Server=192.168.1.50
# Zabbix server for active checks
ServerActive=192.168.1.50
# Hostname for this agent
Hostname=webserver1

# Zabbix agent commands
sudo systemctl start zabbix-agent
sudo systemctl enable zabbix-agent
sudo systemctl status zabbix-agent

# Test Zabbix connectivity
# Item keys are single-quoted so the shell does not treat [...] as a glob pattern
zabbix_get -s 192.168.1.100 -k 'system.cpu.load[all,avg1]'
zabbix_get -s 192.168.1.100 -k 'vm.memory.size[available]'

Prometheus Monitoring

# Prometheus installation
# The .../releases/latest URL is an HTML page, not a tarball, so fetch a concrete
# release asset instead. Set PROM_VERSION to the release you want to install.
PROM_VERSION="2.53.0"
PROM_DIST="prometheus-${PROM_VERSION}.linux-amd64"
wget "https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/${PROM_DIST}.tar.gz"
tar xvfz "${PROM_DIST}.tar.gz"
# Only start the server if the unpacked directory is actually there
cd "${PROM_DIST}" && ./prometheus --config.file=prometheus.yml

# Prometheus configuration (prometheus.yml)
global:
scrape_interval: 15s

scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']

- job_name: 'node'
static_configs:
- targets: ['localhost:9100']

# Node Exporter for system metrics
# The .../releases/latest URL is an HTML page, not a tarball, so fetch a concrete
# release asset instead. Set NODE_EXPORTER_VERSION to the release you want.
NODE_EXPORTER_VERSION="1.8.1"
NODE_EXPORTER_DIST="node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64"
wget "https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/${NODE_EXPORTER_DIST}.tar.gz"
tar xvfz "${NODE_EXPORTER_DIST}.tar.gz"
# Only start the exporter if the unpacked directory is actually there
cd "${NODE_EXPORTER_DIST}" && ./node_exporter

# Common Prometheus queries
up # Check if targets are up
rate(node_cpu_seconds_total[5m]) # CPU usage rate (per mode; node_exporter metric name)
1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes # Memory usage (fraction in use)

Alert Configuration and Notification

Email Alerting Setup

# Configure mail system
sudo apt install mailutils postfix

# Test email sending
echo "Test message" | mail -s "Test Subject" admin@domain.com

# Nagios email notifications
define contact {
contact_name admin
alias System Administrator
email admin@domain.com
service_notification_period 24x7
host_notification_period 24x7
service_notification_commands notify-service-by-email
host_notification_commands notify-host-by-email
}

# Custom alert script
#!/bin/bash
# /usr/local/bin/alert.sh
# Mail admin@domain.com when the root filesystem usage exceeds THRESHOLD percent.
THRESHOLD=80
# -P forces one line per filesystem, so the use% column is always field 5 of line 2
USAGE=$(df -P / | awk 'NR==2 {sub(/%/, "", $5); print $5}')
# Compare only when df produced a number; an empty or odd value would otherwise
# make the test itself fail with a syntax error.
if [[ "$USAGE" =~ ^[0-9]+$ ]] && (( USAGE > THRESHOLD )); then
  echo "Disk usage is $USAGE%" | mail -s "Disk Alert" admin@domain.com
fi

Slack/Teams Integration

# Slack webhook script
#!/bin/bash
# Post the first argument as the text of a message to a Slack incoming webhook.
# Usage: ./slack-alert.sh "message text"
WEBHOOK_URL="https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
MESSAGE="${1:?usage: $0 MESSAGE}"

# Escape the characters that would otherwise terminate or invalidate the JSON
# string: backslash first (so later escapes are not doubled), then double
# quotes and the common control characters.
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# --fail makes curl exit non-zero on an HTTP error so a rejected post is detectable
curl --fail -X POST -H 'Content-type: application/json' \
  --data "{\"text\":\"$(json_escape "$MESSAGE")\"}" \
  "$WEBHOOK_URL"

# Usage
./slack-alert.sh "Server load is high: $(uptime)"

Prometheus Alertmanager

# Alertmanager configuration (alertmanager.yml)
global:
smtp_smarthost: 'localhost:587'
smtp_from: 'alerts@domain.com'

route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'web.hook'

receivers:
- name: 'web.hook'
email_configs:
- to: 'admin@domain.com'
subject: 'Alert: {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
{{ end }}

# Prometheus alert rules
groups:
- name: system
rules:
- alert: HighCPUUsage
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "High CPU usage detected"
description: "CPU usage is above 80% for 5 minutes"

Performance Metrics Collection

System Performance Metrics

# CPU metrics
sar -u 1 60 # CPU utilization
mpstat -P ALL 1 # Per-CPU statistics
pidstat -u 1 # Per-process CPU usage

# Memory metrics
sar -r 1 60 # Memory utilization
sar -S 1 60 # Swap utilization
pidstat -r 1 # Per-process memory usage

# Disk I/O metrics
sar -d 1 60 # Disk activity
iostat -x 1 # Extended disk statistics
pidstat -d 1 # Per-process disk I/O

# Network metrics
sar -n DEV 1 60 # Network device statistics
sar -n EDEV 1 60 # Network error statistics
ss -s # Socket statistics

Custom Metrics Collection

# Create custom metrics script
#!/bin/bash
# /usr/local/bin/collect-metrics.sh
# Append one CSV line with CPU, memory, disk and load figures to /var/log/metrics.csv.
TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')
# Drop everything up to the colon so the user-CPU figure is field 1 even if top
# prints it flush against the label; "+ 0" discards a trailing "%us" style suffix.
# LC_ALL=C keeps "." as the decimal separator in top's output.
CPU_USAGE=$(LC_ALL=C top -bn1 | awk '/Cpu\(s\)/ {sub(/^[^:]*:/, ""); printf "%.1f", $1 + 0; exit}')
# LC_ALL=C keeps the "Mem:" row label untranslated
MEM_USAGE=$(LC_ALL=C free | awk '/^Mem:/ {printf "%.2f", $3/$2 * 100.0}')
# -P forces one line per filesystem, so the use% column is always field 5 of line 2
DISK_USAGE=$(df -P / | awk 'NR==2 {sub(/%/, "", $5); print $5}')
# The first field of /proc/loadavg is the 1-minute load; reading it avoids the
# leading space and locale-dependent punctuation of parsing uptime's output.
read -r LOAD_AVG _ < /proc/loadavg

echo "$TIMESTAMP,CPU:$CPU_USAGE,MEM:$MEM_USAGE,DISK:$DISK_USAGE,LOAD:$LOAD_AVG" >> /var/log/metrics.csv

# Run every minute via cron
# * * * * * /usr/local/bin/collect-metrics.sh

Application Performance Monitoring

# Java application monitoring
jstat -gc PID 1s # Garbage collection statistics
jstack PID # Thread dump
jmap -dump:format=b,file=heap.hprof PID # Heap dump

# Database monitoring
mysqladmin processlist # MySQL process list
mysqladmin status # MySQL status
# pg_stat_activity is a system view, not a command, so it is queried through psql
psql -c "SELECT * FROM pg_stat_activity;" # PostgreSQL activity

# Web server monitoring
curl -w "@curl-format.txt" http://localhost/ # Response time
ab -n 100 -c 10 http://localhost/ # Apache bench

Centralized Logging

ELK Stack Setup

# Elasticsearch installation
wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add -
echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-7.x.list
sudo apt update && sudo apt install elasticsearch

# Elasticsearch configuration
/etc/elasticsearch/elasticsearch.yml
cluster.name: my-cluster
node.name: node-1
network.host: localhost
http.port: 9200

# Logstash configuration
/etc/logstash/conf.d/syslog.conf
input {
file {
path => "/var/log/syslog"
start_position => "beginning"
}
}

filter {
grok {
match => { "message" => "%{SYSLOGTIMESTAMP:timestamp} %{IPORHOST:host} %{WORD:program}: %{GREEDYDATA:message}" }
}
date {
match => [ "timestamp", "MMM d HH:mm:ss", "MMM dd HH:mm:ss" ]
}
}

output {
elasticsearch {
hosts => ["localhost:9200"]
index => "syslog-%{+YYYY.MM.dd}"
}
}

# Kibana configuration
/etc/kibana/kibana.yml
server.port: 5601
server.host: "localhost"
elasticsearch.hosts: ["http://localhost:9200"]

Fluentd Configuration

# Fluentd installation
gem install fluentd

# Fluentd configuration
/etc/fluent/fluent.conf
<source>
@type tail
path /var/log/syslog
pos_file /var/log/fluent/syslog.log.pos
tag syslog
format syslog
</source>

<match syslog>
@type elasticsearch
host localhost
port 9200
index_name syslog
type_name syslog
</match>

# Start Fluentd
fluentd -c /etc/fluent/fluent.conf -d /var/log/fluent/fluentd.pid

Rsyslog to Elasticsearch

# Install rsyslog elasticsearch module
sudo apt install rsyslog-elasticsearch

# Configure rsyslog for Elasticsearch
/etc/rsyslog.d/elasticsearch.conf
module(load="omelasticsearch")
*.* action(type="omelasticsearch"
server="localhost"
serverport="9200"
template="StdJSONFmt"
searchIndex="syslog-index"
dynSearchIndex="on")

# Restart rsyslog
sudo systemctl restart rsyslog

Real-time Monitoring Dashboards

Grafana Setup

# Grafana installation
sudo apt-get install -y adduser libfontconfig1
wget https://dl.grafana.com/oss/release/grafana_latest_amd64.deb
sudo dpkg -i grafana_latest_amd64.deb

# Start Grafana
sudo systemctl start grafana-server
sudo systemctl enable grafana-server

# Access Grafana: http://localhost:3000 (admin/admin)

# Add Prometheus data source
URL: http://localhost:9090

Custom Dashboard Creation

# Example Grafana dashboard JSON
{
"dashboard": {
"title": "System Overview",
"panels": [
{
"title": "CPU Usage",
"type": "graph",
"targets": [
{
"expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "CPU Usage %"
}
]
},
{
"title": "Memory Usage",
"type": "graph",
"targets": [
{
"expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100",
"legendFormat": "Memory Usage %"
}
]
}
]
}
}

Terminal-based Dashboards

# htop with custom configuration
htop -C # Use color
htop -d 1 # Update every second
htop -u username # Show only user processes

# nmon for comprehensive monitoring
nmon # Interactive mode
nmon -f -s 5 -c 720 # Record to file (5 sec intervals, 1 hour)

# tmux monitoring dashboard
tmux new-session -d 'htop'
tmux split-window -h 'watch -n 1 "df -h"'
tmux split-window -v 'tail -f /var/log/syslog'
tmux select-pane -t 0
tmux split-window -v 'watch -n 1 "free -h"'
tmux attach

Log Parsing and Analysis Tools

Advanced Log Analysis

# awk for log analysis
awk '{print $1}' /var/log/access.log | sort | uniq -c | sort -nr # IP frequency
awk '$9 == 404 {print $7}' /var/log/access.log | sort | uniq -c # 404 errors
awk '{sum += $10} END {print "Total bytes:", sum}' /var/log/access.log # Total bytes

# sed for log processing
sed -n '1000,2000p' /var/log/syslog # Lines 1000-2000
sed '/ERROR/!d' /var/log/app.log # Only ERROR lines
# Dots are escaped so they match a literal "." instead of any character
sed 's/192\.168\.1\.100/SERVER1/g' /var/log/syslog # Replace IP with name

# grep advanced patterns
grep -E "(error|ERROR|Error)" /var/log/syslog # Case variations
grep -v "INFO" /var/log/app.log | grep -E "(WARN|ERROR)" # Exclude INFO, show WARN/ERROR
grep -B 5 -A 5 "OutOfMemory" /var/log/app.log # 5 lines before/after
grep -r "connection refused" /var/log/ # Recursive search

GoAccess Web Log Analyzer

# Install GoAccess
sudo apt install goaccess

# Real-time web log analysis
goaccess /var/log/nginx/access.log --log-format=COMBINED
goaccess /var/log/apache2/access.log --log-format=COMBINED

# Generate HTML report
goaccess /var/log/nginx/access.log -o /var/www/html/report.html --log-format=COMBINED --real-time-html

# Custom log format
goaccess /var/log/custom.log --log-format='%h %^[%d:%t %^] "%r" %s %b "%R" "%u"'

Logwatch System

# Install Logwatch
sudo apt install logwatch

# Configuration
/etc/logwatch/conf/logwatch.conf
MailTo = admin@domain.com
MailFrom = logwatch@server.com
Range = yesterday
Detail = Med

# Generate report
logwatch --detail Med --mailto admin@domain.com --range yesterday
logwatch --service sshd --print # Specific service report
logwatch --service http --range today --detail High

System Health Checks

Automated Health Monitoring

# System health check script
#!/bin/bash
# /usr/local/bin/health-check.sh
# Print a WARNING/ERROR line for every check that is outside its limit.

# Check CPU load
# The first field of /proc/loadavg is the 1-minute load average
read -r LOAD _ < /proc/loadavg
# awk performs the floating-point comparison, so bc is not needed
if awk -v load="$LOAD" 'BEGIN {exit !(load + 0 > 4.0)}'; then
  echo "WARNING: High CPU load: $LOAD"
fi

# Check memory usage
MEM_USAGE=$(LC_ALL=C free | awk '/^Mem:/ {printf "%.2f", $3/$2 * 100.0}')
if awk -v mem="$MEM_USAGE" 'BEGIN {exit !(mem + 0 > 90)}'; then
  echo "WARNING: High memory usage: $MEM_USAGE%"
fi

# Check disk usage
# -P keeps each filesystem on a single line; "+ 0" forces a numeric comparison
# (a string comparison would rank "100" below "90").
df -P | awk 'NR>1 {pct = $5; sub(/%/, "", pct); if (pct + 0 > 90) print "WARNING: High disk usage on", $6, ":", pct "%"}'

# Check running services
for service in nginx mysql ssh; do
  if ! systemctl is-active --quiet "$service"; then
    echo "ERROR: Service $service is not running"
  fi
done

# Check log errors
# grep -c exits non-zero when there are no matches or the file is unreadable;
# "|| true" plus the :-0 default keep the comparison valid in both cases.
ERROR_COUNT=$(grep -c "ERROR" /var/log/syslog 2>/dev/null || true)
if (( ${ERROR_COUNT:-0} > 10 )); then
  echo "WARNING: $ERROR_COUNT errors found in syslog"
fi

Service Health Monitoring

# Service monitoring script
#!/bin/bash
# /usr/local/bin/service-monitor.sh
# Report the state of each listed service. A service that is not active gets
# one restart attempt; if it is still down afterwards, admin@domain.com is mailed.

SERVICES=("nginx" "mysql" "redis" "ssh")
for service in "${SERVICES[@]}"; do
  # Healthy service: report it and move on to the next one
  if systemctl is-active --quiet "$service"; then
    echo "$service: OK"
    continue
  fi

  echo "$service: FAILED"
  systemctl restart "$service"
  sleep 5 # pause before re-checking the unit state

  if ! systemctl is-active --quiet "$service"; then
    echo "$service: RESTART FAILED" | mail -s "Service Alert" admin@domain.com
  else
    echo "$service: RECOVERED"
  fi
done

Network Health Checks

# Network connectivity check
#!/bin/bash
# /usr/local/bin/network-check.sh
# Report reachability of a list of hosts and the state of a list of local ports.

HOSTS=("8.8.8.8" "google.com" "internal-server.local")
for host in "${HOSTS[@]}"; do
  # -W 2 bounds the wait for a reply (seconds, Linux iputils ping) so an
  # unreachable host does not stall the whole run
  if ping -c 1 -W 2 "$host" &> /dev/null; then
    echo "$host: OK"
  else
    echo "$host: UNREACHABLE"
  fi
done

# Port connectivity check
PORTS=("80" "443" "22" "3306")
for port in "${PORTS[@]}"; do
  # -w 2 bounds the connection attempt to two seconds
  if nc -z -w 2 localhost "$port"; then
    echo "Port $port: OPEN"
  else
    echo "Port $port: CLOSED"
  fi
done

Monitoring Automation

Cron-based Monitoring

# Crontab for monitoring tasks
# Edit with: crontab -e

# System metrics every minute
* * * * * /usr/local/bin/collect-metrics.sh

# Health check every 5 minutes
*/5 * * * * /usr/local/bin/health-check.sh

# Log rotation check daily
0 2 * * * /usr/sbin/logrotate /etc/logrotate.conf

# Weekly system report
0 8 * * 1 /usr/local/bin/weekly-report.sh | mail -s "Weekly System Report" admin@domain.com

# Disk usage alert daily
0 9 * * * /usr/local/bin/disk-usage-alert.sh

Systemd Service Monitoring

# Create monitoring service
/etc/systemd/system/system-monitor.service
[Unit]
Description=System Monitor
After=network.target

[Service]
Type=simple
User=root
ExecStart=/usr/local/bin/system-monitor.sh
Restart=always
RestartSec=30

[Install]
WantedBy=multi-user.target

# Enable and start service
sudo systemctl enable system-monitor.service
sudo systemctl start system-monitor.service

Monitoring with Ansible

# monitoring-playbook.yml
---
- hosts: all
tasks:
- name: Check system load
shell: uptime | awk -F'load average:' '{print $2}' | cut -d, -f1
register: load_avg

- name: Alert on high load
mail:
to: admin@domain.com
subject: 'High Load Alert'
body: 'System load is {{ load_avg.stdout }}'
when: load_avg.stdout|float > 4.0

- name: Check disk usage
shell: df / | awk 'NR==2 {print $5}' | sed 's/%//'
register: disk_usage

- name: Alert on disk usage
mail:
to: admin@domain.com
subject: 'Disk Usage Alert'
body: 'Disk usage is {{ disk_usage.stdout }}%'
when: disk_usage.stdout|int > 90

Best Practices

Monitoring Strategy

  1. Establish baselines - Understand normal system behavior
  2. Monitor continuously - Use automated tools for 24/7 monitoring
  3. Set meaningful thresholds - Avoid alert fatigue with proper limits
  4. Implement redundancy - Use multiple monitoring systems
  5. Document everything - Maintain runbooks and procedures

Log Management Best Practices

  1. Centralize logs - Use centralized logging solutions
  2. Implement log rotation - Prevent disk space issues
  3. Use structured logging - JSON or similar formats for easier parsing
  4. Set retention policies - Balance storage costs with compliance needs
  5. Monitor log volume - Detect anomalies in log generation

Security Considerations

  1. Secure log files - Proper permissions and access controls
  2. Encrypt log transmission - Use TLS for remote logging
  3. Audit log access - Monitor who accesses sensitive logs
  4. Regular security reviews - Check monitoring system security
  5. Backup monitoring data - Protect against data loss

Performance Optimization

  1. Efficient log parsing - Use appropriate tools for log analysis
  2. Optimize queries - Efficient database queries for metrics
  3. Use compression - Reduce storage requirements
  4. Implement caching - Cache frequently accessed metrics
  5. Regular maintenance - Clean up old data and optimize systems