Text Processing
Grep - Pattern Searching
Basic Grep Usage
# Search for pattern in file
grep "pattern" file.txt
grep "pattern" file1.txt file2.txt
# Search in all files in directory
grep "pattern" *
grep -r "pattern" directory/
# Case-insensitive search
grep -i "pattern" file.txt
# Show line numbers
grep -n "pattern" file.txt
# Show only matching part
grep -o "pattern" file.txt
# Count matching lines (not individual matches)
grep -c "pattern" file.txt
Grep Options
# Context lines
grep -A 3 "pattern" file.txt # 3 lines after match
grep -B 2 "pattern" file.txt # 2 lines before match
grep -C 5 "pattern" file.txt # 5 lines before and after
# Invert match (show non-matching lines)
grep -v "pattern" file.txt
# Whole word match
grep -w "word" file.txt
# Fixed string (no regex)
grep -F "literal.string" file.txt
# Multiple patterns
grep -E "pattern1|pattern2" file.txt
grep -f patterns.txt file.txt # Patterns from file
# Exclude files/directories
grep -r --exclude="*.log" "pattern" .
grep -r --exclude-dir="node_modules" "pattern" .
Grep with Regular Expressions
# Basic regex
grep "^start" file.txt # Lines starting with "start"
grep "end$" file.txt # Lines ending with "end"
grep "^$" file.txt # Empty lines
grep "[0-9]" file.txt # Lines containing digits
# Extended regex (-E)
grep -E "color|colour" file.txt
grep -E "[0-9]{3}-[0-9]{3}-[0-9]{4}" file.txt # Phone numbers
grep -E "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$" file.txt # Email
# Perl-compatible regex (-P)
grep -P "\d{4}-\d{2}-\d{2}" file.txt # Date format
grep -P "(?<=\w)ing\b" file.txt # Words ending with "ing"
Sed - Stream Editor
Basic Sed Operations
# Substitute (replace)
sed 's/old/new/' file.txt # Replace first occurrence per line
sed 's/old/new/g' file.txt # Replace all occurrences
sed 's/old/new/2' file.txt # Replace second occurrence per line
# Case-insensitive replacement
sed 's/old/new/gi' file.txt
# In-place editing (GNU sed syntax; BSD/macOS sed needs an explicit suffix argument, e.g. -i '')
sed -i 's/old/new/g' file.txt
sed -i.bak 's/old/new/g' file.txt # Create backup
# Use different delimiter
sed 's|/old/path|/new/path|g' file.txt
sed 's#old#new#g' file.txt
Sed Line Operations
# Delete lines
sed '2d' file.txt # Delete line 2
sed '2,5d' file.txt # Delete lines 2-5
sed '/pattern/d' file.txt # Delete lines containing pattern
sed '/^$/d' file.txt # Delete empty lines
# Print specific lines
sed -n '2p' file.txt # Print line 2
sed -n '2,5p' file.txt # Print lines 2-5
sed -n '/pattern/p' file.txt # Print lines containing pattern
# Insert and append
sed '2i\New line' file.txt # Insert before line 2
sed '2a\New line' file.txt # Append after line 2
sed '/pattern/i\New line' file.txt # Insert before pattern
Advanced Sed
# Multiple commands
sed 's/old/new/g; s/foo/bar/g' file.txt
sed -e 's/old/new/g' -e 's/foo/bar/g' file.txt
# Address ranges
sed '1,10s/old/new/g' file.txt # Replace in lines 1-10
sed '/start/,/end/s/old/new/g' file.txt # Replace between patterns
# Capture groups
sed 's/\([0-9]*\)-\([0-9]*\)/\2-\1/' file.txt # Swap numbers
# Hold space operations
sed 'h;G' file.txt # Duplicate every line
sed '1!G;h;$!d' file.txt # Reverse file
Awk - Text Processing Language
Basic Awk Usage
# Print columns
awk '{print $1}' file.txt # First column
awk '{print $1, $3}' file.txt # First and third columns
awk '{print $NF}' file.txt # Last column
awk '{print NF}' file.txt # Number of fields per line
# Print with custom separator
awk '{print $1 ":" $2}' file.txt
awk '{print $1 "\t" $2}' file.txt
# Field separator
awk -F: '{print $1}' /etc/passwd # Use colon as separator
awk -F',' '{print $2}' data.csv # Use comma as separator
Awk Patterns and Conditions
# Pattern matching
awk '/pattern/' file.txt # Lines containing pattern
awk '/^[0-9]/' file.txt # Lines starting with digit
awk '$1 == "value"' file.txt # First field equals "value"
# Conditions
awk '$3 > 100' file.txt # Third field greater than 100
awk 'NF > 5' file.txt # Lines with more than 5 fields
awk 'length($0) > 80' file.txt # Lines longer than 80 characters
# Range patterns
awk '/start/,/end/' file.txt # Lines between start and end patterns
awk 'NR==2,NR==5' file.txt # Lines 2 through 5
Awk Built-in Variables
# Record and field variables
NR # Number of records (line number)
NF # Number of fields in current record
FNR # File record number (resets for each file)
FILENAME # Current filename
FS # Field separator (default: whitespace)
OFS # Output field separator (default: space)
RS # Record separator (default: newline)
ORS # Output record separator (default: newline)
# Examples
awk 'NR==3' file.txt # Third line
awk '{print NR, $0}' file.txt # Line numbers
awk '{OFS=":"} {print $1, $2}' file.txt # Change output separator
Awk Programming Constructs
# BEGIN and END blocks
awk 'BEGIN {print "Start"} {print $1} END {print "End"}' file.txt
awk 'BEGIN {sum=0} {sum+=$1} END {print "Total:", sum}' file.txt
# Variables and arithmetic
awk '{sum += $1} END {print "Average:", sum/NR}' file.txt
awk 'NR == 1 || $1 > max {max = $1} END {print "Max:", max}' file.txt # Seed max from the first record so negative values work
# Loops
awk '{for(i=1; i<=NF; i++) print i, $i}' file.txt
awk '{for(i=NF; i>0; i--) printf "%s ", $i; print ""}' file.txt
# Arrays
awk '{count[$1]++} END {for(word in count) print word, count[word]}' file.txt
Awk Functions
# String functions
length(str) # String length
substr(str, start, len) # Substring
index(str, substr) # Find substring position
split(str, array, sep) # Split string into array
gsub(regex, replacement, str) # Global substitution
toupper(str) # Convert to uppercase
tolower(str) # Convert to lowercase
# Math functions
int(x) # Integer part
sqrt(x) # Square root
sin(x), cos(x) # Trigonometric functions
rand() # Random number in [0, 1)
srand(seed) # Set random seed
# Examples
awk '{print toupper($1)}' file.txt
awk '{print substr($1, 1, 3)}' file.txt
awk '{gsub(/old/, "new"); print}' file.txt
Cut - Column Extraction
Basic Cut Usage
# Extract columns by position
cut -c1-5 file.txt # Characters 1-5
cut -c1,3,5 file.txt # Characters 1, 3, and 5
cut -c5- file.txt # From character 5 to end
# Extract fields
cut -f1 file.txt # First field (tab-separated)
cut -f1,3 file.txt # Fields 1 and 3
cut -f2- file.txt # From field 2 to end
# Custom delimiter
cut -d: -f1 /etc/passwd # First field, colon-separated
cut -d, -f2,4 data.csv # Fields 2 and 4, comma-separated
Cut Options
# Only show lines with delimiter
cut -d: -f1 -s /etc/passwd
# Custom output delimiter
cut -d: -f1,3 --output-delimiter=' ' /etc/passwd
# Complement (everything except specified)
cut -d: -f1 --complement /etc/passwd
Sort - Sorting Text
Basic Sort Usage
# Sort lines alphabetically
sort file.txt
sort file1.txt file2.txt
# Reverse sort
sort -r file.txt
# Numeric sort
sort -n file.txt
sort -nr file.txt # Reverse numeric
# Sort by column
sort -k2 file.txt # Sort by second field through end of line
sort -k2,2 file.txt # Sort by second field only
sort -k2n file.txt # Sort by second field numerically
Advanced Sort Options
# Custom field separator
sort -t: -k3n /etc/passwd # Sort by third field, colon-separated
# Multiple sort keys
sort -k1,1 -k2n file.txt # Sort by first field, then by second numerically
# Unique sort
sort -u file.txt # Remove duplicates
sort file.txt | uniq # Alternative way
# Stable sort
sort -s file.txt # Maintain relative order of equal elements
# Human-readable numeric sort
sort -h file.txt # Sort 1K, 2M, 3G properly
# Random sort
sort -R file.txt # Random order (identical lines stay grouped; use shuf for a true shuffle)
Uniq - Remove Duplicates
Basic Uniq Usage
# Remove consecutive duplicates
uniq file.txt
sort file.txt | uniq # Remove all duplicates
# Count occurrences
uniq -c file.txt
sort file.txt | uniq -c | sort -nr # Most frequent first
# Show only duplicates
uniq -d file.txt
# Show only unique lines
uniq -u file.txt
Uniq Options
# Ignore case
uniq -i file.txt
# Skip fields/characters
uniq -f2 file.txt # Skip first 2 fields
uniq -s5 file.txt # Skip first 5 characters
# Compare only part of line
uniq -w10 file.txt # Compare only first 10 characters
Tr - Character Translation
Basic Tr Usage
# Character substitution
tr 'a' 'b' < file.txt # Replace 'a' with 'b'
tr 'abc' 'xyz' < file.txt # Replace a->x, b->y, c->z
tr 'a-z' 'A-Z' < file.txt # Convert to uppercase
# Delete characters
tr -d 'a' < file.txt # Delete all 'a' characters
tr -d '0-9' < file.txt # Delete all digits
tr -d '\n' < file.txt # Delete newlines (join lines)
# Squeeze repeated characters
tr -s ' ' < file.txt # Squeeze multiple spaces to single space
tr -s '\n' < file.txt # Squeeze repeated newlines (drops empty lines, except one at the very start)
Tr Character Sets
# Predefined character sets
tr '[:lower:]' '[:upper:]' < file.txt # Convert to uppercase
tr '[:upper:]' '[:lower:]' < file.txt # Convert to lowercase
tr -d '[:digit:]' < file.txt # Delete digits
tr -d '[:punct:]' < file.txt # Delete punctuation
tr -s '[:space:]' < file.txt # Squeeze whitespace
# Other character sets
[:alnum:] # Alphanumeric characters
[:alpha:] # Alphabetic characters
[:blank:] # Space and tab
[:cntrl:] # Control characters
[:graph:] # Printable characters except space
[:print:] # Printable characters including space
[:xdigit:] # Hexadecimal digits
String Manipulation
Parameter Expansion
# String length
echo ${#string}
# Substring extraction
echo ${string:position} # From position to end
echo ${string:position:length} # Substring of length
# Pattern removal
echo ${string#pattern} # Remove shortest match from beginning
echo ${string##pattern} # Remove longest match from beginning
echo ${string%pattern} # Remove shortest match from end
echo ${string%%pattern} # Remove longest match from end
# Pattern replacement
echo ${string/pattern/replacement} # Replace first match
echo ${string//pattern/replacement} # Replace all matches
echo ${string/#pattern/replacement} # Replace if at beginning
echo ${string/%pattern/replacement} # Replace if at end
String Comparison
# Test string properties
if [[ -z "$string" ]]; then echo "Empty"; fi
if [[ -n "$string" ]]; then echo "Not empty"; fi
# Pattern matching
if [[ "$string" == pattern* ]]; then echo "Starts with pattern"; fi
if [[ "$string" == *pattern ]]; then echo "Ends with pattern"; fi
if [[ "$string" =~ regex ]]; then echo "Matches regex"; fi
# String comparison
if [[ "$str1" < "$str2" ]]; then echo "str1 comes before str2"; fi
if [[ "$str1" > "$str2" ]]; then echo "str1 comes after str2"; fi
Text Processing Combinations
Common Pipelines
# Word frequency
tr -s ' ' '\n' < file.txt | sort | uniq -c | sort -nr
# Extract email addresses
grep -oE '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' file.txt
# CSV processing
cut -d, -f2 data.csv | sort | uniq -c | sort -nr
# Log analysis
grep "ERROR" /var/log/app.log | awk '{print $1, $2}' | sort | uniq -c
# Find duplicate lines
sort file.txt | uniq -d
# Random sample
shuf -n 10 file.txt # Random 10 lines
sort -R file.txt | head -n 10 # Alternative method (GNU sort; duplicate lines are grouped, so not a uniform sample)
Text Statistics
# Line, word, character count
wc -l file.txt # Line count
wc -w file.txt # Word count
wc -c file.txt # Byte count
wc -m file.txt # Character count (multibyte aware)
# Advanced statistics
awk '{chars += length($0) + 1; words += NF} END {print "Lines:", NR, "Words:", words, "Characters:", chars}' file.txt
Best Practices
Performance Tips
# Use appropriate tools for the job ("A > B" below means "prefer A over B")
# For simple column extraction: cut > awk
# For complex processing: awk > sed
# For pattern matching: grep > sed/awk
# Avoid unnecessary pipes
# Bad: cat file | grep pattern
# Good: grep pattern file
# Use built-in string operations when possible
# Bad: echo "$string" | tr 'a-z' 'A-Z'
# Good: echo "${string^^}" # Bash 4+
Safety Practices
# Always quote variables in text processing
grep "$pattern" "$file"
# Check for empty input
if [[ -s "$file" ]]; then
process_file "$file"
fi
# Use appropriate regex delimiters
sed 's|/old/path|/new/path|g' # Better than s/\/old\/path/\/new\/path/g