Text Processing & Scripting
Comprehensive guide to Linux text processing, pattern matching, and shell scripting with practical examples and real-world use cases.
Quick Navigation
- grep - Advanced Pattern Matching
- sed - Stream Editor
- awk - Pattern Scanning and Processing
- Regular Expressions
- Text Manipulation Tools
- Shell Scripting Patterns
- Data Extraction & Formatting
- Log Parsing Techniques
- CSV & JSON Processing
- Stream Processing & Pipes
grep - Advanced Pattern Matching
Basic Usage
grep "pattern" file.txt # Search for pattern in file
grep -i "pattern" file.txt # Case-insensitive search
grep -v "pattern" file.txt # Invert match (exclude pattern)
grep -n "pattern" file.txt # Show line numbers
grep -c "pattern" file.txt # Count matches
grep -l "pattern" *.txt # List files with matches
grep -L "pattern" *.txt # List files without matches
Extended Regex and Options
grep -E "pattern1|pattern2" file.txt # Extended regex (OR)
grep -E "^(start|begin)" file.txt # Match lines starting with words
grep -E "[0-9]{3}-[0-9]{3}-[0-9]{4}" # Phone number pattern
grep -P "\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}" # IP address (Perl regex)
Context and Advanced Features
grep -A 5 "pattern" file.txt # Show 5 lines after match
grep -B 5 "pattern" file.txt # Show 5 lines before match
grep -C 5 "pattern" file.txt # Show 5 lines before and after
grep -r "pattern" /path/to/dir/ # Recursive search
grep -r --include="*.log" "ERROR" /var/log/ # Search specific file types
grep -w "word" file.txt # Match whole words only
grep -x "exact line" file.txt # Match entire line
Real-World Examples
# Find failed login attempts
grep "Failed password" /var/log/auth.log
# Find large files in ls output
ls -la | grep "^-.*[0-9]\{7,\}"
# Find processes consuming high CPU
ps aux | grep -E "^[^ ]+ +[0-9]+ +[5-9][0-9]\.[0-9]"
# Extract email addresses
grep -oE "\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b" file.txt
# Find TODO comments in code
grep -rn "TODO\|FIXME\|XXX" --include="*.py" .
sed - Stream Editor
Basic Substitution
sed 's/old/new/' file.txt # Replace first occurrence per line
sed 's/old/new/g' file.txt # Replace all occurrences
sed 's/old/new/gi' file.txt # Case-insensitive replacement
sed -i 's/old/new/g' file.txt # Edit file in place
sed -i.bak 's/old/new/g' file.txt # Edit with backup
Advanced Substitution
sed 's/\([0-9]\{1,3\}\)\.\([0-9]\{1,3\}\)\.\([0-9]\{1,3\}\)\.\([0-9]\{1,3\}\)/IP: \1.\2.\3.\4/' file.txt
# Use different delimiter
sed 's|/old/path|/new/path|g' file.txt
# Replace only on specific lines
sed '2,5s/old/new/g' file.txt # Lines 2-5
sed '/pattern/s/old/new/g' file.txt # Lines matching pattern
Line Operations
sed -n '2,5p' file.txt # Print lines 2-5
sed '2,5d' file.txt # Delete lines 2-5
sed '/pattern/d' file.txt # Delete lines matching pattern
sed '2i\New line' file.txt # Insert line before line 2
sed '2a\New line' file.txt # Append line after line 2
sed '2c\Replacement line' file.txt # Replace line 2
Advanced sed Scripts
# Remove empty lines
sed '/^$/d' file.txt
# Remove leading/trailing whitespace
sed 's/^[[:space:]]*//;s/[[:space:]]*$//' file.txt
# Number lines
sed = file.txt | sed 'N;s/\n/\t/'
# Convert DOS line endings to Unix
sed 's/\r$//' file.txt
# Extract text between patterns
sed -n '/START/,/END/p' file.txt
# Multiple commands
sed -e 's/old1/new1/g' -e 's/old2/new2/g' file.txt
Real-World Examples
# Clean up log files
sed 's/^[0-9-]* [0-9:]* //' /var/log/messages
# Extract configuration values
sed -n 's/^config_key=\(.*\)/\1/p' config.file
# Format CSV data
sed 's/,/ | /g' data.csv
# Remove comments from config files
sed 's/#.*$//' config.file | sed '/^$/d'
awk - Pattern Scanning and Processing
Basic Syntax
awk '{print $1}' file.txt # Print first field
awk '{print $1, $3}' file.txt # Print first and third fields
awk '{print NF}' file.txt # Print number of fields
awk '{print NR, $0}' file.txt # Print line number and entire line
awk 'END {print NR}' file.txt # Print total number of lines
Field Separators
awk -F',' '{print $1}' file.csv # Use comma as delimiter
awk -F':' '{print $1}' /etc/passwd # Use colon as delimiter
awk -F'[,:;]' '{print $1}' file.txt # Multiple delimiters
awk 'BEGIN {FS="|"} {print $1}' file.txt # Set field separator in BEGIN
Pattern Matching
awk '/pattern/ {print}' file.txt # Print lines matching pattern
awk '$1 == "value" {print}' file.txt # Print lines where first field equals value
awk '$3 > 100 {print}' file.txt # Print lines where third field > 100
awk 'NR > 1 {print}' file.txt # Skip header line
awk 'length($0) > 80 {print}' file.txt # Print long lines
Built-in Variables
awk '{print "Line " NR ": " $0}' file.txt # Line number
awk '{print "Fields: " NF}' file.txt # Number of fields
awk '{print "Length: " length($0)}' file.txt # Line length
awk '{print FILENAME ": " $0}' file.txt # Filename
Mathematical Operations
awk '{sum += $1} END {print sum}' file.txt # Sum first column
awk '{sum += $1} END {print sum/NR}' file.txt # Average
awk '{if ($1 > max) max = $1} END {print max}' file.txt # Maximum
awk '{count++} END {print count}' file.txt # Count lines
String Functions
awk '{print toupper($1)}' file.txt # Convert to uppercase
awk '{print tolower($1)}' file.txt # Convert to lowercase
awk '{print substr($1, 2, 3)}' file.txt # Substring (start=2, length=3)
awk '{gsub(/old/, "new"); print}' file.txt # Global substitution
awk '{print index($0, "pattern")}' file.txt # Find position of pattern
Advanced awk Scripts
# Count word frequency
awk '{for(i=1;i<=NF;i++) count[$i]++} END {for(word in count) print word, count[word]}' file.txt
# Print unique lines
awk '!seen[$0]++' file.txt
# Calculate statistics
awk '{sum+=$1; sumsq+=$1*$1} END {print "Mean:", sum/NR, "StdDev:", sqrt(sumsq/NR - (sum/NR)^2)}' file.txt
# Process CSV with headers
awk -F',' 'NR==1{for(i=1;i<=NF;i++) col[$i]=i} NR>1{print $col["name"], $col["age"]}' file.csv
# Group by field
awk '{group[$1] += $2} END {for (key in group) print key, group[key]}' file.txt
Real-World Examples
# Parse Apache log for IP addresses
awk '{print $1}' /var/log/apache2/access.log | sort | uniq -c | sort -nr
# Extract usernames from /etc/passwd
awk -F':' '{print $1}' /etc/passwd
# List mount points whose usage is above 80%
# ($5 + 0 forces a numeric comparison: after gsub() rewrites the field it may
# be treated as a string, in which case "100" would not compare greater than 80)
df -h | awk 'NR>1 {gsub(/%/, "", $5); if ($5 + 0 > 80) print $6, $5"%"}'
# Process system load
uptime | awk '{print "Load average:", $(NF-2), $(NF-1), $NF}'
# Format process list
ps aux | awk '{if (NR>1) printf "%-10s %5s %5s %s\n", $1, $2, $3, $11}'
Regular Expressions
POSIX Character Classes
[[:alnum:]] # Alphanumeric characters
[[:alpha:]] # Alphabetic characters
[[:digit:]] # Digits 0-9
[[:lower:]] # Lowercase letters
[[:upper:]] # Uppercase letters
[[:space:]] # Whitespace characters
[[:punct:]] # Punctuation characters
[[:xdigit:]] # Hexadecimal digits
Common Patterns
^pattern # Start of line
pattern$ # End of line
. # Any single character
.* # Zero or more characters
.+ # One or more characters
[abc] # Any of: a, b, or c
[^abc] # Not a, b, or c
[a-z] # Any lowercase letter
[0-9] # Any digit
\d # Digit (Perl-compatible)
\w # Word character (Perl-compatible)
\s # Whitespace (Perl-compatible)
Quantifiers
pattern? # Zero or one occurrence
pattern* # Zero or more occurrences
pattern+ # One or more occurrences
pattern{n} # Exactly n occurrences
pattern{n,} # n or more occurrences
pattern{n,m} # Between n and m occurrences
Examples
# Email validation
grep -E "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
# IP address validation
grep -E "^([0-9]{1,3}\.){3}[0-9]{1,3}$"
# Phone number formats
# ([:space:] is used inside the brackets because \s is not a whitespace escape
# within a POSIX bracket expression; there it matches a backslash or "s")
grep -E "^(\+?1[-.[:space:]]?)?\(?[0-9]{3}\)?[-.[:space:]]?[0-9]{3}[-.[:space:]]?[0-9]{4}$"
# URL matching
grep -E "^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(/.*)?$"
# Credit card numbers (16 digits, optionally grouped by "-" or whitespace)
grep -E "^[0-9]{4}[-[:space:]]?[0-9]{4}[-[:space:]]?[0-9]{4}[-[:space:]]?[0-9]{4}$"
Text Manipulation Tools
cut - Extract Columns
cut -c 1-10 file.txt # Extract characters 1-10
cut -c 1,3,5 file.txt # Extract specific characters
cut -d',' -f 1,3 file.csv # Extract fields 1 and 3 (CSV)
cut -d':' -f 1 /etc/passwd # Extract usernames
cut -d' ' -f 2- file.txt # Extract from field 2 to end
sort - Sort Lines
sort file.txt # Sort alphabetically
sort -n file.txt # Numeric sort
sort -r file.txt # Reverse sort
sort -k 2 file.txt # Sort by second field
sort -t',' -k 2,2 file.csv # Sort CSV by second column
sort -u file.txt # Sort and remove duplicates
sort -f file.txt # Case-insensitive sort
uniq - Remove Duplicates
uniq file.txt # Remove consecutive duplicates
uniq -c file.txt # Count occurrences
uniq -d file.txt # Show only duplicates
uniq -u file.txt # Show only unique lines
sort file.txt | uniq # Remove all duplicates
tr - Translate Characters
tr 'a-z' 'A-Z' < file.txt # Convert to uppercase
tr '[:lower:]' '[:upper:]' < file.txt # Convert to uppercase (POSIX)
tr -d '\n' < file.txt # Remove newlines
tr -s ' ' < file.txt # Squeeze multiple spaces
tr ' ' '\n' < file.txt # Convert spaces to newlines
tr -d '[:punct:]' < file.txt # Remove punctuation
head and tail
head -n 20 file.txt # First 20 lines
tail -n 20 file.txt # Last 20 lines
head -c 100 file.txt # First 100 characters
tail -f /var/log/messages # Follow log file
tail -n +10 file.txt # From line 10 to end
head -n -5 file.txt # All but last 5 lines
wc - Word Count
wc file.txt # Lines, words, bytes
wc -l file.txt # Count lines
wc -w file.txt # Count words
wc -c file.txt # Count bytes
wc -m file.txt # Count characters (multibyte-aware)
Shell Scripting Patterns
Variables and Parameter Expansion
#!/bin/bash
# Variable assignment
name="John"
age=25
# Parameter expansion
echo ${name} # Basic expansion
echo ${name:-"default"} # Use default if unset or empty
echo ${name:="default"} # Assign default if unset or empty
echo ${name:+"not empty"} # Use alternate text if set and non-empty
echo ${name:?"error message"} # Error if unset or empty
# String manipulation
echo ${name#J} # Remove shortest match from start
echo ${name##*/} # Remove longest match from start
echo ${name%n} # Remove shortest match from end
echo ${name%%/*} # Remove longest match from end
echo ${name/o/a} # Replace first occurrence
echo ${name//o/a} # Replace all occurrences
# Length and substrings
echo ${#name} # String length
echo ${name:1:2} # Substring (start:length)
Arrays
#!/bin/bash
# Array declaration
fruits=("apple" "banana" "orange")
declare -a numbers=(1 2 3 4 5)
# Array operations
echo ${fruits[0]} # First element
echo ${fruits[@]} # All elements
echo ${#fruits[@]} # Array length
fruits[3]="grape" # Add element
fruits+=("mango") # Append element
# Iterate over array
for fruit in "${fruits[@]}"; do
echo "Fruit: $fruit"
done
# Array slicing
echo ${fruits[@]:1:2} # Elements 1-2
Conditionals
#!/bin/bash
# File tests
if [[ -f "file.txt" ]]; then
echo "File exists"
elif [[ -d "directory" ]]; then
echo "Directory exists"
else
echo "Neither exists"
fi
# String comparisons
if [[ "$string1" == "$string2" ]]; then
echo "Strings are equal"
fi
if [[ "$string" =~ ^[0-9]+$ ]]; then
echo "String is numeric"
fi
# Numeric comparisons
if [[ $num1 -gt $num2 ]]; then
echo "num1 is greater"
fi
# Logical operators
if [[ -f "file.txt" && -r "file.txt" ]]; then
echo "File exists and is readable"
fi
Loops
#!/bin/bash
# For loop with range
for i in {1..10}; do
echo "Number: $i"
done
# For loop with array
for file in *.txt; do
echo "Processing: $file"
done
# While loop: repeats as long as the condition is true
counter=1
while [[ $counter -le 10 ]]; do
  echo "Counter: $counter"
  ((counter++))
done
# Until loop: repeats as long as the condition is false
# (counter is reset first; the while loop above leaves it at 11, which would
# make this loop's exit condition true immediately and skip the body)
counter=1
until [[ $counter -gt 10 ]]; do
  echo "Counter: $counter"
  ((counter++))
done
# Read file line by line
while IFS= read -r line; do
echo "Line: $line"
done < file.txt
Functions
#!/bin/bash
# Function definition
#######################################
# Run a named operation against a regular file.
# Arguments:
#   $1 - path of the file to operate on
#   $2 - operation: "count" (print line count via wc -l) or
#        "backup" (copy the file to <path>.bak)
# Outputs:
#   "count" writes the wc -l result to stdout; errors go to stderr.
# Returns:
#   1 if the file does not exist or the operation is unknown; otherwise the
#   exit status of the wc/cp command that was run.
#######################################
process_file() {
  local target="$1"
  local action="$2"

  # Guard clause: nothing to do without an existing regular file.
  [[ -f "$target" ]] || {
    echo "Error: File not found" >&2
    return 1
  }

  if [[ "$action" == "count" ]]; then
    wc -l "$target"
  elif [[ "$action" == "backup" ]]; then
    cp "$target" "${target}.bak"
  else
    echo "Unknown operation: $action" >&2
    return 1
  fi
}
# Function call
process_file "data.txt" "count"
Error Handling
#!/bin/bash
# Exit on error
set -e
# Exit on undefined variable
set -u
# Exit on pipe failure
set -o pipefail
# Error handling function
# Report a failure location on stderr and terminate the script with status 1.
# Arguments:
#   $1 - line number to include in the message (the ERR trap passes $LINENO)
handle_error() {
  >&2 echo "Error on line $1"
  exit 1
}
# Trap errors
trap 'handle_error $LINENO' ERR
# Check command success
if ! command -v git &> /dev/null; then
echo "Git is not installed" >&2
exit 1
fi
# Validate arguments
if [[ $# -ne 2 ]]; then
echo "Usage: $0 <input> <output>" >&2
exit 1
fi
Data Extraction & Formatting
Log File Processing
# Extract unique IP addresses from Apache logs
awk '{print $1}' /var/log/apache2/access.log | sort | uniq
# Count HTTP status codes
awk '{print $9}' /var/log/apache2/access.log | sort | uniq -c | sort -nr
# Extract failed login attempts with timestamps
grep "Failed password" /var/log/auth.log | awk '{print $1, $2, $3, $9, $11}'
# Find top 10 largest files accessed
awk '{print $10, $7}' /var/log/apache2/access.log | sort -nr | head -10
System Information Extraction
# Extract running processes by memory usage
ps aux | awk '{print $4, $11}' | sort -nr | head -10
# Get disk usage by directory
du -h /var/log/* | sort -hr | head -10
# Extract network connections
netstat -tuln | awk '/^tcp/ {print $1, $4}' | sort | uniq -c
# Monitor system load
uptime | awk -F'load average:' '{print $2}' | awk '{print $1, $2, $3}'
Configuration File Processing
# Extract non-comment lines from config files
grep -v '^#' /etc/ssh/sshd_config | grep -v '^$'
# Parse key-value pairs
awk -F'=' '/^[^#]/ {gsub(/^[ \t]+|[ \t]+$/, "", $2); print $1 "=" $2}' config.file
# Extract specific configuration values
sed -n 's/^Port[[:space:]]*\([0-9]*\)/\1/p' /etc/ssh/sshd_config
Log Parsing Techniques
Apache Log Analysis
# Common Log Format parsing
awk '{print $1, $4, $7, $9}' /var/log/apache2/access.log
# Extract URLs with 404 errors
awk '$9 == "404" {print $7}' /var/log/apache2/access.log | sort | uniq -c | sort -nr
# Parse timestamp and convert to readable format
awk '{gsub(/[\[\]]/, "", $4); print $4, $7}' /var/log/apache2/access.log
# Extract user agents
awk -F'"' '{print $6}' /var/log/apache2/access.log | sort | uniq -c | sort -nr
System Log Analysis
# Parse syslog messages
awk '{print $1, $2, $3, $5}' /var/log/syslog
# Extract kernel messages
grep "kernel:" /var/log/syslog | awk '{print $1, $2, $3, substr($0, index($0, "kernel:"))}'
# Monitor authentication events
grep -E "(authentication|login|ssh)" /var/log/auth.log | awk '{print $1, $2, $3, $9, $11}'
# Extract cron job executions
grep "CRON" /var/log/syslog | awk '{print $1, $2, $3, $6, $7}'
Application Log Parsing
# Parse Java application logs
grep -E "ERROR|WARN|INFO" application.log | awk '{print $1, $2, $3, $4}' | sort | uniq -c
# Extract database connection errors
grep -i "connection" application.log | grep -i "error" | awk '{print $1, $2}'
# Monitor response times
grep "response time" application.log | sed 's/.*response time: \([0-9]*\)ms.*/\1/' | awk '{sum+=$1; count++} END {print "Average:", sum/count "ms"}'
CSV & JSON Processing
CSV Processing
# Extract specific columns
awk -F',' '{print $1, $3}' data.csv
# Calculate sum of numeric column
awk -F',' '{sum += $2} END {print sum}' data.csv
# Filter rows based on criteria
awk -F',' '$3 > 100 {print}' data.csv
# Add header and format output
awk -F',' 'BEGIN {print "Name,Age,Score"} {print $1, $2, $3}' OFS=',' data.csv
# Convert CSV to different format
awk -F',' '{printf "Name: %s, Age: %s\n", $1, $2}' data.csv
JSON Processing with jq
# Pretty print JSON
jq '.' file.json
# Extract specific field
jq '.fieldname' file.json
# Extract array elements
jq '.array[]' file.json
# Filter objects
jq '.[] | select(.age > 25)' file.json
# Group by field
jq 'group_by(.category)' file.json
# Calculate statistics
jq 'map(.price) | add / length' file.json
JSON Processing without jq
# Extract simple values with grep/sed
grep -o '"name":"[^"]*"' file.json | sed 's/"name":"//g;s/"//g'
# Parse JSON with awk
awk -F'"' '/"name":/ {print $4}' file.json
# Extract nested values
sed -n 's/.*"user":{"name":"\([^"]*\)".*/\1/p' file.json
Stream Processing & Pipes
Advanced Pipe Combinations
# Process only a slice of a large file (lines 501-1000)
cat large_file.txt | head -1000 | tail -500 | sort | uniq -c
# Multi-stage filtering
ps aux | grep -v grep | awk '$3 > 5.0' | sort -k3 -nr | head -10
# Complex text processing pipeline
cat /var/log/apache2/access.log | \
awk '{print $1, $7, $9}' | \
grep -E ' (404|500) ' | \
sort | uniq -c | \
sort -nr | \
head -20
# Data transformation pipeline
cat data.csv | \
sed '1d' | \
awk -F',' '{print $1, $2*1.1}' | \
sort -k2 -nr | \
head -10
Named Pipes (FIFOs)
# Create named pipe
mkfifo mypipe
# Write to pipe (in background)
command1 > mypipe &
# Read from pipe
command2 < mypipe
# Example: Real-time log processing
mkfifo logpipe
tail -f /var/log/messages > logpipe &
# --line-buffered makes grep flush every matching line immediately; when its
# output is a pipe it otherwise buffers in blocks and matches arrive late.
# IFS= and -r keep leading whitespace and backslashes in each line intact.
grep --line-buffered "ERROR" < logpipe | while IFS= read -r line; do
  echo "$(date): $line" >> error.log
done
Process Substitution
# Compare output of two commands
diff <(command1) <(command2)
# Use command output as input file
sort <(cat file1.txt file2.txt)
# Multiple input sources
paste <(cut -d',' -f1 file1.csv) <(cut -d',' -f2 file2.csv)
# Real-time monitoring
watch -n 1 'ps aux | grep apache'
Background Processing
# Run command in background
command &
# Run multiple commands in parallel
command1 & command2 & command3 &
wait # Wait for all to complete
# Process files in parallel
for file in *.txt; do
(process_file "$file" > "${file}.out") &
done
wait
# Parallel processing with xargs
# (NUL-delimited names keep paths containing spaces, quotes or newlines intact;
# "--" stops gzip from treating a name that starts with "-" as an option)
find . -name "*.log" -print0 | xargs -0 -P 4 -I {} gzip -- {}
Performance Optimization
# Use appropriate buffer sizes
cat large_file.txt | buffer -s 1M | sort
# Optimize sort operations
export LC_ALL=C # Use C locale for faster sorting
sort -S 1G large_file.txt # Use more memory
# Parallel processing
sort --parallel=4 large_file.txt
# Efficient pattern matching
grep -F "fixed_string" file.txt # Use fixed strings when possible
Real-World Examples
Log Analysis Script
#!/bin/bash
# Analyze Apache access logs
#######################################
# Print a summary report for an Apache access log whose lines follow the
# Common/Combined Log Format field layout ($1 client, $4 timestamp,
# $7 request path, $9 status).
# Arguments:
#   $1 - path to the access log
# Outputs:
#   Writes to stdout: total request count, top 10 client addresses,
#   top 10 requested paths, status-code counts, and request counts per hour.
# Returns:
#   1 (with a message on stderr) if the log cannot be read.
#######################################
analyze_apache_logs() {
  local logfile="$1"

  if [[ ! -r "$logfile" ]]; then
    echo "Error: cannot read log file: $logfile" >&2
    return 1
  fi

  echo "=== Apache Log Analysis ==="
  echo "Total requests: $(wc -l < "$logfile")"
  echo
  echo "Top 10 IP addresses:"
  awk '{print $1}' "$logfile" | sort | uniq -c | sort -nr | head -10
  echo
  echo "Top 10 requested pages:"
  awk '{print $7}' "$logfile" | sort | uniq -c | sort -nr | head -10
  echo
  echo "HTTP status codes:"
  awk '{print $9}' "$logfile" | sort | uniq -c | sort -nr
  echo
  echo "Hourly request distribution:"
  # $4 looks like "[10/Oct/2000:13:55:36"; splitting on ":" makes element 2
  # the hour. A greedy ".*:NN:" pattern would capture the minute instead.
  awk '{split($4, stamp, ":"); print stamp[2]}' "$logfile" | sort -n | uniq -c
}
analyze_apache_logs /var/log/apache2/access.log
Data Processing Pipeline
#!/bin/bash
# Process CSV sales data
#######################################
# Summarise a sales CSV into per-product totals and average prices.
# The first input line is skipped as a header; column 2 is used as the
# product key and column 3 as the price (all characters other than digits
# and "." are stripped from the price before use).
# Arguments:
#   $1 - input CSV path
#   $2 - output CSV path (overwritten)
# Outputs:
#   Writes "Product,Total Sales,Average Price" followed by one row per
#   product, sorted by total sales descending, to the output file; prints a
#   confirmation line to stdout.
# Returns:
#   1 (with a message on stderr) if the input cannot be read.
#######################################
process_sales_data() {
  local input_file="$1"
  local output_file="$2"

  if [[ ! -r "$input_file" ]]; then
    echo "Error: cannot read input file: $input_file" >&2
    return 1
  fi

  {
    # The header is written outside the sort; piping it through a reverse
    # numeric sort would push it to the bottom of the file.
    echo "Product,Total Sales,Average Price"
    sed '1d' "$input_file" |
      awk -F',' '{
        gsub(/[^0-9.]/, "", $3)   # Clean price field
        # "+ 0" forces a numeric test; the rewritten field may be a string.
        if ($3 + 0 > 0) {
          sales[$2] += $3
          count[$2]++
        }
      } END {
        for (product in sales) {
          printf "%s,%.2f,%.2f\n", product, sales[product], sales[product] / count[product]
        }
      }' |
      sort -t',' -k2,2nr
  } > "$output_file"

  echo "Sales data processed and saved to $output_file"
}
process_sales_data sales.csv sales_summary.csv
System Monitoring Script
#!/bin/bash
# System monitoring with text processing
#######################################
# Print a one-shot system snapshot built from top, free, df, ps and netstat.
# Arguments:
#   None
# Outputs:
#   Writes to stdout: a dated banner, CPU usage, memory usage, mount points
#   above 80% usage, the five largest processes by memory, and the number of
#   listening sockets.
#######################################
monitor_system() {
  echo "=== System Monitor $(date) ==="

  # CPU usage: second field of top's "Cpu(s)" summary line
  echo "CPU Usage:"
  top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//'

  # Memory usage
  # NOTE(review): with procps-ng `free`, column 7 is "available" rather than
  # "free" (column 4) - confirm which value the "Free:" label should show.
  echo "Memory Usage:"
  free -h | awk '/^Mem:/ {print "Used:", $3, "Free:", $7}'

  # Disk usage: "+ 0" forces a numeric comparison, since the field rewritten
  # by gsub() may otherwise be compared as a string ("100" vs 80 would fail).
  echo "Disk Usage (>80%):"
  df -h | awk 'NR>1 {gsub(/%/, "", $5); if ($5 + 0 > 80) print $6, $5"%"}'

  # Top processes by memory: the header row is dropped BEFORE sorting. Under a
  # reverse numeric sort it sinks to the bottom, so skipping the first sorted
  # row would discard the largest process instead of the header.
  echo "Top 5 processes by memory:"
  ps aux | awk 'NR>1' | sort -k4,4nr | head -5 |
    awk '{printf "%-10s %5s%% %s\n", $1, $4, $11}'

  # Network connections: count of sockets in LISTEN state
  echo "Network connections:"
  netstat -tuln | grep LISTEN | wc -l
}
monitor_system
This comprehensive cheatsheet covers advanced text processing techniques, shell scripting patterns, and real-world examples for efficient Linux text manipulation and automation.