Skip to main content

Text Processing & Scripting

Comprehensive guide to Linux text processing, pattern matching, and shell scripting with practical examples and real-world use cases.

Quick Navigation

grep - Advanced Pattern Matching

Basic Usage

grep "pattern" file.txt                 # Search for pattern in file
grep -i "pattern" file.txt # Case-insensitive search
grep -v "pattern" file.txt # Invert match (exclude pattern)
grep -n "pattern" file.txt # Show line numbers
grep -c "pattern" file.txt # Count matches
grep -l "pattern" *.txt # List files with matches
grep -L "pattern" *.txt # List files without matches

Extended Regex and Options

grep -E "pattern1|pattern2" file.txt    # Extended regex (OR)
grep -E "^(start|begin)" file.txt # Match lines starting with words
grep -E "[0-9]{3}-[0-9]{3}-[0-9]{4}" # Phone number pattern
grep -P "\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}" # IP address (Perl regex)

Context and Advanced Features

grep -A 5 "pattern" file.txt            # Show 5 lines after match
grep -B 5 "pattern" file.txt # Show 5 lines before match
grep -C 5 "pattern" file.txt # Show 5 lines before and after
grep -r "pattern" /path/to/dir/ # Recursive search
grep -r --include="*.log" "ERROR" /var/log/ # Search specific file types
grep -w "word" file.txt # Match whole words only
grep -x "exact line" file.txt # Match entire line

Real-World Examples

# Find failed login attempts
grep "Failed password" /var/log/auth.log

# Find large files in ls output
ls -la | grep "^-.*[0-9]\{7,\}"

# Find processes consuming high CPU
ps aux | grep -E "^[^ ]+ +[0-9]+ +[5-9][0-9]\.[0-9]"

# Extract email addresses
grep -oE "\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b" file.txt

# Find TODO comments in code
grep -rn "TODO\|FIXME\|XXX" --include="*.py" .

sed - Stream Editor

Basic Substitution

sed 's/old/new/' file.txt               # Replace first occurrence per line
sed 's/old/new/g' file.txt # Replace all occurrences
sed 's/old/new/gi' file.txt # Case-insensitive replacement
sed -i 's/old/new/g' file.txt # Edit file in place
sed -i.bak 's/old/new/g' file.txt # Edit with backup

Advanced Substitution

sed 's/\([0-9]\{1,3\}\)\.\([0-9]\{1,3\}\)\.\([0-9]\{1,3\}\)\.\([0-9]\{1,3\}\)/IP: \1.\2.\3.\4/' file.txt

# Use different delimiter
sed 's|/old/path|/new/path|g' file.txt

# Replace only on specific lines
sed '2,5s/old/new/g' file.txt # Lines 2-5
sed '/pattern/s/old/new/g' file.txt # Lines matching pattern

Line Operations

sed -n '2,5p' file.txt                  # Print lines 2-5
sed '2,5d' file.txt # Delete lines 2-5
sed '/pattern/d' file.txt # Delete lines matching pattern
sed '2i\New line' file.txt # Insert line before line 2
sed '2a\New line' file.txt # Append line after line 2
sed '2c\Replacement line' file.txt # Replace line 2

Advanced sed Scripts

# Remove empty lines
sed '/^$/d' file.txt

# Remove leading/trailing whitespace
sed 's/^[[:space:]]*//;s/[[:space:]]*$//' file.txt

# Number lines
sed = file.txt | sed 'N;s/\n/\t/'

# Convert DOS line endings to Unix
sed 's/\r$//' file.txt

# Extract text between patterns
sed -n '/START/,/END/p' file.txt

# Multiple commands
sed -e 's/old1/new1/g' -e 's/old2/new2/g' file.txt

Real-World Examples

# Clean up log files
sed 's/^[0-9-]* [0-9:]* //' /var/log/messages

# Extract configuration values
sed -n 's/^config_key=\(.*\)/\1/p' config.file

# Format CSV data
sed 's/,/ | /g' data.csv

# Remove comments from config files
sed 's/#.*$//' config.file | sed '/^$/d'

awk - Pattern Scanning and Processing

Basic Syntax

awk '{print $1}' file.txt               # Print first field
awk '{print $1, $3}' file.txt # Print first and third fields
awk '{print NF}' file.txt # Print number of fields
awk '{print NR, $0}' file.txt # Print line number and entire line
awk 'END {print NR}' file.txt # Print total number of lines

Field Separators

awk -F',' '{print $1}' file.csv         # Use comma as delimiter
awk -F':' '{print $1}' /etc/passwd # Use colon as delimiter
awk -F'[,:;]' '{print $1}' file.txt # Multiple delimiters
awk 'BEGIN {FS="|"} {print $1}' file.txt # Set field separator in BEGIN

Pattern Matching

awk '/pattern/ {print}' file.txt        # Print lines matching pattern
awk '$1 == "value" {print}' file.txt # Print lines where first field equals value
awk '$3 > 100 {print}' file.txt # Print lines where third field > 100
awk 'NR > 1 {print}' file.txt # Skip header line
awk 'length($0) > 80 {print}' file.txt # Print long lines

Built-in Variables

awk '{print "Line " NR ": " $0}' file.txt    # Line number
awk '{print "Fields: " NF}' file.txt # Number of fields
awk '{print "Length: " length($0)}' file.txt # Line length
awk '{print FILENAME ": " $0}' file.txt # Filename

Mathematical Operations

# "+ 0" makes empty input print 0 instead of a blank line
awk '{sum += $1} END {print sum + 0}' file.txt           # Sum first column
# Guarded so that empty input does not abort with a division by zero
awk '{sum += $1} END {if (NR > 0) print sum / NR}' file.txt # Average
# Seeded from the first record: an uninitialised max compares as 0, which
# would hide the result when every value is negative
awk 'NR == 1 || $1 + 0 > max {max = $1 + 0} END {if (NR > 0) print max}' file.txt # Maximum
awk '{count++} END {print count + 0}' file.txt # Count lines

String Functions

awk '{print toupper($1)}' file.txt      # Convert to uppercase
awk '{print tolower($1)}' file.txt # Convert to lowercase
awk '{print substr($1, 2, 3)}' file.txt # Substring (start=2, length=3)
awk '{gsub(/old/, "new"); print}' file.txt # Global substitution
awk '{print index($0, "pattern")}' file.txt # Find position of pattern

Advanced awk Scripts

# Count word frequency
awk '{for(i=1;i<=NF;i++) count[$i]++} END {for(word in count) print word, count[word]}' file.txt

# Print unique lines
awk '!seen[$0]++' file.txt

# Calculate statistics
awk '{sum+=$1; sumsq+=$1*$1} END {print "Mean:", sum/NR, "StdDev:", sqrt(sumsq/NR - (sum/NR)^2)}' file.txt

# Process CSV with headers
awk -F',' 'NR==1{for(i=1;i<=NF;i++) col[$i]=i} NR>1{print $col["name"], $col["age"]}' file.csv

# Group by field
awk '{group[$1] += $2} END {for (key in group) print key, group[key]}' file.txt

Real-World Examples

# Parse Apache log for IP addresses
awk '{print $1}' /var/log/apache2/access.log | sort | uniq -c | sort -nr

# Extract usernames from /etc/passwd
awk -F':' '{print $1}' /etc/passwd

# Calculate disk usage summary (filesystems above 80% used)
# gsub() leaves $5 as a plain string, so "$5 > 80" would be a lexical
# comparison ("100" sorts before "80"); "+ 0" forces a numeric one.
df -h | awk 'NR>1 {gsub(/%/, "", $5); if ($5 + 0 > 80) print $6, $5"%"}'

# Process system load
uptime | awk '{print "Load average:", $(NF-2), $(NF-1), $NF}'

# Format process list
ps aux | awk '{if (NR>1) printf "%-10s %5s %5s %s\n", $1, $2, $3, $11}'

Regular Expressions

POSIX Character Classes

[[:alnum:]]      # Alphanumeric characters
[[:alpha:]] # Alphabetic characters
[[:digit:]] # Digits 0-9
[[:lower:]] # Lowercase letters
[[:upper:]] # Uppercase letters
[[:space:]] # Whitespace characters
[[:punct:]] # Punctuation characters
[[:xdigit:]] # Hexadecimal digits

Common Patterns

^pattern         # Start of line
pattern$ # End of line
. # Any single character
.* # Zero or more characters
.+ # One or more characters
[abc] # Any of: a, b, or c
[^abc] # Not a, b, or c
[a-z] # Any lowercase letter
[0-9] # Any digit
\d # Digit (Perl-compatible)
\w # Word character (Perl-compatible)
\s # Whitespace (Perl-compatible)

Quantifiers

pattern?         # Zero or one occurrence
pattern* # Zero or more occurrences
pattern+ # One or more occurrences
pattern{n} # Exactly n occurrences
pattern{n,} # n or more occurrences
pattern{n,m} # Between n and m occurrences

Examples

# Email validation
grep -E "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

# IP address validation
grep -E "^([0-9]{1,3}\.){3}[0-9]{1,3}$"

# Phone number formats
# In a POSIX bracket expression "\s" means a literal backslash or "s", so
# whitespace is written as the [:space:] class instead.
grep -E "^(\+?1[-.[:space:]]?)?\(?[0-9]{3}\)?[-.[:space:]]?[0-9]{3}[-.[:space:]]?[0-9]{4}$"

# URL matching
grep -E "^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(/.*)?$"

# Credit card numbers (four groups of four digits, optionally separated)
# In a POSIX bracket expression "\s" means a literal backslash or "s", so
# whitespace is written as the [:space:] class instead.
grep -E "^[0-9]{4}[-[:space:]]?[0-9]{4}[-[:space:]]?[0-9]{4}[-[:space:]]?[0-9]{4}$"

Text Manipulation Tools

cut - Extract Columns

cut -c 1-10 file.txt                    # Extract characters 1-10
cut -c 1,3,5 file.txt # Extract specific characters
cut -d',' -f 1,3 file.csv # Extract fields 1 and 3 (CSV)
cut -d':' -f 1 /etc/passwd # Extract usernames
cut -d' ' -f 2- file.txt # Extract from field 2 to end

sort - Sort Lines

sort file.txt                           # Sort alphabetically
sort -n file.txt # Numeric sort
sort -r file.txt # Reverse sort
sort -k 2 file.txt # Sort by second field
sort -t',' -k 2,2 file.csv # Sort CSV by second column
sort -u file.txt # Sort and remove duplicates
sort -f file.txt # Case-insensitive sort

uniq - Remove Duplicates

uniq file.txt                           # Remove consecutive duplicates
uniq -c file.txt # Count occurrences
uniq -d file.txt # Show only duplicates
uniq -u file.txt # Show only unique lines
sort file.txt | uniq # Remove all duplicates

tr - Translate Characters

tr 'a-z' 'A-Z' < file.txt               # Convert to uppercase
tr '[:lower:]' '[:upper:]' < file.txt # Convert to uppercase (POSIX)
tr -d '\n' < file.txt # Remove newlines
tr -s ' ' < file.txt # Squeeze multiple spaces
tr ' ' '\n' < file.txt # Convert spaces to newlines
tr -d '[:punct:]' < file.txt # Remove punctuation

head and tail

head -n 20 file.txt                     # First 20 lines
tail -n 20 file.txt # Last 20 lines
head -c 100 file.txt # First 100 bytes (not characters; multibyte text may be cut mid-character)
tail -f /var/log/messages # Follow log file as it grows
tail -n +10 file.txt # From line 10 to end
head -n -5 file.txt # All but last 5 lines (negative counts are a GNU coreutils feature, not POSIX)

wc - Word Count

wc file.txt                             # Lines, words, bytes
wc -l file.txt # Count lines (newline characters)
wc -w file.txt # Count words
wc -c file.txt # Count bytes
wc -m file.txt # Count characters (locale-aware; differs from -c for multibyte text)

Shell Scripting Patterns

Variables and Parameter Expansion

#!/bin/bash

# Variable assignment (no spaces are allowed around "=")
name="John"
age=25

# Parameter expansion
echo ${name} # Basic expansion
echo ${name:-"default"} # Expand to "default" if name is unset or empty (name itself is unchanged)
echo ${name:="default"} # Assign "default" to name if it is unset or empty, then expand
echo ${name:+"not empty"} # Expand to the alternate text only if name is set and non-empty
echo ${name:?"error message"} # Print the message to stderr and abort (non-interactive shell) if name is unset or empty

# String manipulation
echo ${name#J} # Remove shortest match of pattern "J" from the start
echo ${name##*/} # Remove longest match of pattern "*/" from the start (basename-like)
echo ${name%n} # Remove shortest match of pattern "n" from the end
echo ${name%%/*} # Remove longest match of pattern "/*" from the end
echo ${name/o/a} # Replace first occurrence of "o" with "a"
echo ${name//o/a} # Replace all occurrences of "o" with "a"

# Length and substrings
echo ${#name} # String length
echo ${name:1:2} # Substring (zero-based offset 1, length 2)

Arrays

#!/bin/bash

# Array declaration
fruits=("apple" "banana" "orange")
declare -a numbers=(1 2 3 4 5)

# Array operations
echo ${fruits[0]} # First element
echo ${fruits[@]} # All elements
echo ${#fruits[@]} # Array length
fruits[3]="grape" # Add element
fruits+=("mango") # Append element

# Iterate over array
for fruit in "${fruits[@]}"; do
echo "Fruit: $fruit"
done

# Array slicing
echo ${fruits[@]:1:2} # Elements 1-2

Conditionals

#!/bin/bash

# File tests
if [[ -f "file.txt" ]]; then
echo "File exists"
elif [[ -d "directory" ]]; then
echo "Directory exists"
else
echo "Neither exists"
fi

# String comparisons
if [[ "$string1" == "$string2" ]]; then
echo "Strings are equal"
fi

if [[ "$string" =~ ^[0-9]+$ ]]; then
echo "String is numeric"
fi

# Numeric comparisons
if [[ $num1 -gt $num2 ]]; then
echo "num1 is greater"
fi

# Logical operators
if [[ -f "file.txt" && -r "file.txt" ]]; then
echo "File exists and is readable"
fi

Loops

#!/bin/bash

# For loop with range
for i in {1..10}; do
  echo "Number: $i"
done

# For loop over glob matches
for file in *.txt; do
  # An unmatched glob stays as the literal "*.txt"; skip it
  [[ -e "$file" ]] || continue
  echo "Processing: $file"
done

# While loop
counter=1
while [[ $counter -le 10 ]]; do
  echo "Counter: $counter"
  ((counter++))
done

# Until loop
# counter is reset first: the while loop above leaves it at 11, which would
# make this loop finish without a single iteration
counter=1
until [[ $counter -gt 10 ]]; do
  echo "Counter: $counter"
  ((counter++))
done

# Read file line by line
# The "|| [[ -n ... ]]" part also handles a last line without a trailing newline
while IFS= read -r line || [[ -n "$line" ]]; do
  echo "Line: $line"
done < file.txt

Functions

#!/bin/bash

# Run a simple operation on a file.
# Arguments: $1 - path of the file to operate on
#            $2 - operation name: "count" or "backup"
# Outputs:   "count" prints the wc -l result to stdout; errors go to stderr
# Returns:   1 if the file does not exist or the operation is unknown,
#            otherwise the status of the command that was run
process_file() {
  local filename="$1"
  local operation="$2"

  # Guard clause: nothing to do without a regular file
  [[ -f "$filename" ]] || {
    echo "Error: File not found" >&2
    return 1
  }

  if [[ "$operation" == "count" ]]; then
    wc -l "$filename"
  elif [[ "$operation" == "backup" ]]; then
    cp "$filename" "${filename}.bak"
  else
    echo "Unknown operation: $operation" >&2
    return 1
  fi
}

# Function call
process_file "data.txt" "count"

Error Handling

#!/bin/bash

# Exit on error
set -e

# Exit on undefined variable
set -u

# Exit on pipe failure
set -o pipefail

# Report a failure on stderr and terminate the script.
# Arguments: $1 - line number to include in the message
# Exits with status 1; does not return to the caller.
handle_error() {
  printf '%s\n' "Error on line $1" >&2
  exit 1
}

# Trap errors
trap 'handle_error $LINENO' ERR

# Check command success
if ! command -v git &> /dev/null; then
echo "Git is not installed" >&2
exit 1
fi

# Validate arguments
if [[ $# -ne 2 ]]; then
echo "Usage: $0 <input> <output>" >&2
exit 1
fi

Data Extraction & Formatting

Log File Processing

# Extract unique IP addresses from Apache logs
awk '{print $1}' /var/log/apache2/access.log | sort | uniq

# Count HTTP status codes
awk '{print $9}' /var/log/apache2/access.log | sort | uniq -c | sort -nr

# Extract failed login attempts with timestamps
grep "Failed password" /var/log/auth.log | awk '{print $1, $2, $3, $9, $11}'

# Find top 10 largest files accessed
awk '{print $10, $7}' /var/log/apache2/access.log | sort -nr | head -10

System Information Extraction

# Extract running processes by memory usage
ps aux | awk '{print $4, $11}' | sort -nr | head -10

# Get disk usage by directory
du -h /var/log/* | sort -hr | head -10

# Extract network connections
netstat -tuln | awk '/^tcp/ {print $1, $4}' | sort | uniq -c

# Monitor system load
uptime | awk -F'load average:' '{print $2}' | awk '{print $1, $2, $3}'

Configuration File Processing

# Extract non-comment lines from config files
grep -v '^#' /etc/ssh/sshd_config | grep -v '^$'

# Parse key-value pairs
awk -F'=' '/^[^#]/ {gsub(/^[ \t]+|[ \t]+$/, "", $2); print $1 "=" $2}' config.file

# Extract specific configuration values
sed -n 's/^Port[[:space:]]*\([0-9]*\)/\1/p' /etc/ssh/sshd_config

Log Parsing Techniques

Apache Log Analysis

# Common Log Format parsing
awk '{print $1, $4, $7, $9}' /var/log/apache2/access.log

# Extract URLs with 404 errors
awk '$9 == "404" {print $7}' /var/log/apache2/access.log | sort | uniq -c | sort -nr

# Parse timestamp and convert to readable format
awk '{gsub(/[\[\]]/, "", $4); print $4, $7}' /var/log/apache2/access.log

# Extract user agents
awk -F'"' '{print $6}' /var/log/apache2/access.log | sort | uniq -c | sort -nr

System Log Analysis

# Parse syslog messages
awk '{print $1, $2, $3, $5}' /var/log/syslog

# Extract kernel messages
grep "kernel:" /var/log/syslog | awk '{print $1, $2, $3, substr($0, index($0, "kernel:"))}'

# Monitor authentication events
grep -E "(authentication|login|ssh)" /var/log/auth.log | awk '{print $1, $2, $3, $9, $11}'

# Extract cron job executions
grep "CRON" /var/log/syslog | awk '{print $1, $2, $3, $6, $7}'

Application Log Parsing

# Parse Java application logs
grep -E "ERROR|WARN|INFO" application.log | awk '{print $1, $2, $3, $4}' | sort | uniq -c

# Extract database connection errors
grep -i "connection" application.log | grep -i "error" | awk '{print $1, $2}'

# Monitor response times
# sed -n ... p passes on only lines that really carry a "<digits>ms" value,
# and the END guard avoids a division by zero when nothing matched
grep "response time" application.log | sed -n 's/.*response time: \([0-9][0-9]*\)ms.*/\1/p' | awk '{sum+=$1; count++} END {if (count) print "Average:", sum/count "ms"}'

CSV & JSON Processing

CSV Processing

# Extract specific columns
awk -F',' '{print $1, $3}' data.csv

# Calculate sum of numeric column
awk -F',' '{sum += $2} END {print sum}' data.csv

# Filter rows based on criteria
awk -F',' '$3 > 100 {print}' data.csv

# Add header and format output
awk -F',' 'BEGIN {print "Name,Age,Score"} {print $1, $2, $3}' OFS=',' data.csv

# Convert CSV to different format
awk -F',' '{printf "Name: %s, Age: %s\n", $1, $2}' data.csv

JSON Processing with jq

# Pretty print JSON
jq '.' file.json

# Extract specific field
jq '.fieldname' file.json

# Extract array elements
jq '.array[]' file.json

# Filter objects
jq '.[] | select(.age > 25)' file.json

# Group by field
jq 'group_by(.category)' file.json

# Calculate statistics
jq 'map(.price) | add / length' file.json

JSON Processing without jq

# Extract simple values with grep/sed
grep -o '"name":"[^"]*"' file.json | sed 's/"name":"//g;s/"//g'

# Parse JSON with awk
awk -F'"' '/"name":/ {print $4}' file.json

# Extract nested values
sed -n 's/.*"user":{"name":"\([^"]*\)".*/\1/p' file.json

Stream Processing & Pipes

Advanced Pipe Combinations

# Process a slice of a large file (the last 500 of its first 1000 lines)
# head opens the file itself and stops reading early; no cat is needed
head -n 1000 large_file.txt | tail -n 500 | sort | uniq -c

# Multi-stage filtering
ps aux | grep -v grep | awk '$3 > 5.0' | sort -k3 -nr | head -10

# Complex text processing pipeline
# The status filter lives in awk: once only "$1 $7 $9" is printed, the status
# is the last field with no trailing space, so a ' (404|500) ' grep on that
# output could never match it
awk '$9 == 404 || $9 == 500 {print $1, $7, $9}' /var/log/apache2/access.log |
  sort | uniq -c |
  sort -nr |
  head -20

# Data transformation pipeline
# NR > 1 skips the header row, replacing the separate cat and sed '1d' stages
awk -F',' 'NR > 1 {print $1, $2*1.1}' data.csv |
  sort -k2 -nr |
  head -10

Named Pipes (FIFOs)

# Create named pipe
mkfifo mypipe

# Write to pipe (in background)
command1 > mypipe &

# Read from pipe
command2 < mypipe

# Example: Real-time log processing
mkfifo logpipe
tail -f /var/log/messages > logpipe &
# --line-buffered: grep otherwise block-buffers output written to a pipe, so
# matches would arrive in delayed bursts instead of as they happen.
# IFS= and read -r keep leading whitespace and backslashes in each line intact.
grep --line-buffered "ERROR" < logpipe | while IFS= read -r line; do
  echo "$(date): $line" >> error.log
done

Process Substitution

# Compare output of two commands
diff <(command1) <(command2)

# Use command output as input file
sort <(cat file1.txt file2.txt)

# Multiple input sources
paste <(cut -d',' -f1 file1.csv) <(cut -d',' -f2 file2.csv)

# Real-time monitoring
watch -n 1 'ps aux | grep apache'

Background Processing

# Run command in background
command &

# Run multiple commands in parallel
command1 & command2 & command3 &
wait # Wait for all to complete

# Process files in parallel
for file in *.txt; do
(process_file "$file" > "${file}.out") &
done
wait

# Parallel processing with xargs
# NUL-delimited names (-print0 / -0) survive spaces, quotes, backslashes and
# newlines in paths, which plain line-based xargs input does not
find . -name "*.log" -print0 | xargs -0 -P 4 -I {} gzip {}

Performance Optimization

# Use appropriate buffer sizes
cat large_file.txt | buffer -s 1M | sort

# Optimize sort operations
export LC_ALL=C # Use C locale for faster sorting
sort -S 1G large_file.txt # Use more memory

# Parallel processing
sort --parallel=4 large_file.txt

# Efficient pattern matching
grep -F "fixed_string" file.txt # Use fixed strings when possible

Real-World Examples

Log Analysis Script

#!/bin/bash

# Summarize an Apache access log.
# The field positions used ($1 client, $4 timestamp, $7 path, $9 status)
# assume the usual common/combined log layout.
# Arguments: $1 - path to the access log
# Outputs:   request total, top 10 client addresses, top 10 requested paths,
#            status-code counts and per-hour request counts on stdout
# Returns:   1 if the log file cannot be read
analyze_apache_logs() {
  local logfile="$1"

  if [[ ! -r "$logfile" ]]; then
    echo "Error: cannot read log file: $logfile" >&2
    return 1
  fi

  echo "=== Apache Log Analysis ==="
  echo "Total requests: $(wc -l < "$logfile")"
  echo

  echo "Top 10 IP addresses:"
  awk '{print $1}' "$logfile" | sort | uniq -c | sort -nr | head -10
  echo

  echo "Top 10 requested pages:"
  awk '{print $7}' "$logfile" | sort | uniq -c | sort -nr | head -10
  echo

  echo "HTTP status codes:"
  awk '{print $9}' "$logfile" | sort | uniq -c | sort -nr
  echo

  echo "Hourly request distribution:"
  # $4 looks like "[10/Oct/2000:13:55:36"; the hour is the text between the
  # first and second colon. A greedy ".*:NN:" pattern would select the
  # minutes instead, so the timestamp is split on ":" explicitly.
  awk '{split($4, stamp, ":"); print stamp[2]}' "$logfile" | sort -n | uniq -c
}

analyze_apache_logs /var/log/apache2/access.log

Data Processing Pipeline

#!/bin/bash

# Aggregate CSV sales data per product.
# Reads a CSV whose first line is a header, with the product name in
# column 2 and the price in column 3, and writes a summary CSV.
# Arguments: $1 - input CSV path
#            $2 - output CSV path (overwritten)
# Outputs:   a confirmation message on stdout; errors on stderr
# Returns:   1 if the input file cannot be read
process_sales_data() {
  local input_file="$1"
  local output_file="$2"

  if [[ ! -r "$input_file" ]]; then
    echo "Error: cannot read input file: $input_file" >&2
    return 1
  fi

  {
    # The header is written before the sorted rows; if it went through
    # sort -nr with the data it would end up at the bottom of the file.
    echo "Product,Total Sales,Average Price"

    sed '1d' "$input_file" \
      | awk -F',' '
          {
            gsub(/[^0-9.]/, "", $3)   # clean price field
            price = $3 + 0            # force a numeric value after gsub
            if (price > 0) {
              sales[$2] += price
              count[$2]++
            }
          }
          END {
            for (product in sales) {
              printf "%s,%.2f,%.2f\n", product, sales[product], sales[product] / count[product]
            }
          }' \
      | sort -t',' -k2,2 -nr   # key limited to the total-sales column
  } > "$output_file"

  echo "Sales data processed and saved to $output_file"
}

process_sales_data sales.csv sales_summary.csv

System Monitoring Script

#!/bin/bash

# Print a one-shot system report built from top, free, df, ps and netstat.
# Takes no arguments; everything is written to stdout.
monitor_system() {
  echo "=== System Monitor $(date) ==="

  # CPU usage
  echo "CPU Usage:"
  top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//'

  # Memory usage
  # NOTE(review): on recent procps-ng versions of free, column 7 of the Mem:
  # row appears to be "available" rather than "free" - confirm the label.
  echo "Memory Usage:"
  free -h | awk '/^Mem:/ {print "Used:", $3, "Free:", $7}'

  # Disk usage
  # "+ 0" forces a numeric comparison: after gsub() the field is a plain
  # string and "$5 > 80" would compare lexically ("100" sorts before "80").
  echo "Disk Usage (>80%):"
  df -h | awk 'NR>1 {gsub(/%/, "", $5); if ($5 + 0 > 80) print $6, $5"%"}'

  # Top processes by memory
  # The ps header is removed before sorting. A numeric reverse sort puts the
  # header at the bottom, so skipping "line 1" after the sort would drop the
  # largest process instead of the header.
  echo "Top 5 processes by memory:"
  ps aux | awk 'NR>1' | sort -k4,4 -nr | head -n 5 \
    | awk '{printf "%-10s %5s%% %s\n", $1, $4, $11}'

  # Network connections
  echo "Network connections:"
  netstat -tuln | grep LISTEN | wc -l
}

monitor_system

This comprehensive cheatsheet covers advanced text processing techniques, shell scripting patterns, and real-world examples for efficient Linux text manipulation and automation.