Skip to main content

Text Processing

Grep - Pattern Searching

Basic Grep Usage

# Search for pattern in file
grep "pattern" file.txt
grep "pattern" file1.txt file2.txt

# Search in all files in directory
grep "pattern" *
grep -r "pattern" directory/

# Case-insensitive search
grep -i "pattern" file.txt

# Show line numbers
grep -n "pattern" file.txt

# Show only matching part
grep -o "pattern" file.txt

# Count matching lines (not total occurrences)
grep -c "pattern" file.txt

Grep Options

# Context lines
grep -A 3 "pattern" file.txt # 3 lines after match
grep -B 2 "pattern" file.txt # 2 lines before match
grep -C 5 "pattern" file.txt # 5 lines before and after

# Invert match (show non-matching lines)
grep -v "pattern" file.txt

# Whole word match
grep -w "word" file.txt

# Fixed string (no regex)
grep -F "literal.string" file.txt

# Multiple patterns
grep -E "pattern1|pattern2" file.txt
grep -f patterns.txt file.txt # Patterns from file

# Exclude files/directories
grep -r --exclude="*.log" "pattern" .
grep -r --exclude-dir="node_modules" "pattern" .

Grep with Regular Expressions

# Basic regex
grep "^start" file.txt # Lines starting with "start"
grep "end$" file.txt # Lines ending with "end"
grep "^$" file.txt # Empty lines
grep "[0-9]" file.txt # Lines containing digits

# Extended regex (-E)
grep -E "color|colour" file.txt
grep -E "[0-9]{3}-[0-9]{3}-[0-9]{4}" file.txt # Phone numbers
grep -E "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$" file.txt # Email

# Perl-compatible regex (-P)
grep -P "\d{4}-\d{2}-\d{2}" file.txt # Date format
grep -P "(?<=\w)ing\b" file.txt # Words ending with "ing"

Sed - Stream Editor

Basic Sed Operations

# Substitute (replace)
sed 's/old/new/' file.txt # Replace first occurrence per line
sed 's/old/new/g' file.txt # Replace all occurrences
sed 's/old/new/2' file.txt # Replace second occurrence per line

# Case-insensitive replacement
sed 's/old/new/gi' file.txt

# In-place editing (GNU sed syntax; BSD/macOS sed requires a suffix argument, e.g. -i '')
sed -i 's/old/new/g' file.txt
sed -i.bak 's/old/new/g' file.txt # Create backup

# Use different delimiter
sed 's|/old/path|/new/path|g' file.txt
sed 's#old#new#g' file.txt

Sed Line Operations

# Delete lines
sed '2d' file.txt # Delete line 2
sed '2,5d' file.txt # Delete lines 2-5
sed '/pattern/d' file.txt # Delete lines containing pattern
sed '/^$/d' file.txt # Delete empty lines

# Print specific lines
sed -n '2p' file.txt # Print line 2
sed -n '2,5p' file.txt # Print lines 2-5
sed -n '/pattern/p' file.txt # Print lines containing pattern

# Insert and append
sed '2i\New line' file.txt # Insert before line 2
sed '2a\New line' file.txt # Append after line 2
sed '/pattern/i\New line' file.txt # Insert before pattern

Advanced Sed

# Multiple commands
sed 's/old/new/g; s/foo/bar/g' file.txt
sed -e 's/old/new/g' -e 's/foo/bar/g' file.txt

# Address ranges
sed '1,10s/old/new/g' file.txt # Replace in lines 1-10
sed '/start/,/end/s/old/new/g' file.txt # Replace between patterns

# Capture groups
sed 's/\([0-9]*\)-\([0-9]*\)/\2-\1/' file.txt # Swap numbers

# Hold space operations
sed 'h;G' file.txt # Duplicate every line (copy to hold space, append it back)
sed '1!G;h;$!d' file.txt # Reverse file

Awk - Text Processing Language

Basic Awk Usage

# Print columns
awk '{print $1}' file.txt # First column
awk '{print $1, $3}' file.txt # First and third columns
awk '{print $NF}' file.txt # Last column
awk '{print NF}' file.txt # Number of fields per line

# Print with custom separator
awk '{print $1 ":" $2}' file.txt
awk '{print $1 "\t" $2}' file.txt

# Field separator
awk -F: '{print $1}' /etc/passwd # Use colon as separator
awk -F',' '{print $2}' data.csv # Use comma as separator

Awk Patterns and Conditions

# Pattern matching
awk '/pattern/' file.txt # Lines containing pattern
awk '/^[0-9]/' file.txt # Lines starting with digit
awk '$1 == "value"' file.txt # First field equals "value"

# Conditions
awk '$3 > 100' file.txt # Third field greater than 100
awk 'NF > 5' file.txt # Lines with more than 5 fields
awk 'length($0) > 80' file.txt # Lines longer than 80 characters

# Range patterns
awk '/start/,/end/' file.txt # Lines between start and end patterns
awk 'NR==2,NR==5' file.txt # Lines 2 through 5

Awk Built-in Variables

# Record and field variables
NR # Number of records read so far (current line number across all input files)
NF # Number of fields in current record
FNR # File record number (resets for each file)
FILENAME # Current filename
FS # Field separator (default: whitespace)
OFS # Output field separator (default: space)
RS # Record separator (default: newline)
ORS # Output record separator (default: newline)

# Examples
awk 'NR==3' file.txt # Third line
awk '{print NR, $0}' file.txt # Line numbers
awk '{OFS=":"} {print $1, $2}' file.txt # Change output separator

Awk Programming Constructs

# BEGIN and END blocks
awk 'BEGIN {print "Start"} {print $1} END {print "End"}' file.txt
awk 'BEGIN {sum=0} {sum+=$1} END {print "Total:", sum}' file.txt

# Variables and arithmetic
awk '{sum += $1} END {print "Average:", sum/NR}' file.txt
awk '{if ($1 > max) max = $1} END {print "Max:", max}' file.txt

# Loops
awk '{for(i=1; i<=NF; i++) print i, $i}' file.txt
awk '{for(i=NF; i>0; i--) printf "%s ", $i; print ""}' file.txt

# Arrays
awk '{count[$1]++} END {for(word in count) print word, count[word]}' file.txt

Awk Functions

# String functions
length(str) # String length
substr(str, start, len) # Substring
index(str, substr) # Find substring position
split(str, array, sep) # Split string into array
gsub(regex, replacement, str) # Global substitution
toupper(str) # Convert to uppercase
tolower(str) # Convert to lowercase

# Math functions
int(x) # Integer part
sqrt(x) # Square root
sin(x), cos(x) # Trigonometric functions
rand() # Random number x with 0 <= x < 1
srand(seed) # Set random seed

# Examples
awk '{print toupper($1)}' file.txt
awk '{print substr($1, 1, 3)}' file.txt
awk '{gsub(/old/, "new"); print}' file.txt

Cut - Column Extraction

Basic Cut Usage

# Extract columns by position
cut -c1-5 file.txt # Characters 1-5
cut -c1,3,5 file.txt # Characters 1, 3, and 5
cut -c5- file.txt # From character 5 to end

# Extract fields
cut -f1 file.txt # First field (tab-separated)
cut -f1,3 file.txt # Fields 1 and 3
cut -f2- file.txt # From field 2 to end

# Custom delimiter
cut -d: -f1 /etc/passwd # First field, colon-separated
cut -d, -f2,4 data.csv # Fields 2 and 4, comma-separated

Cut Options

# Only show lines with delimiter
cut -d: -f1 -s /etc/passwd

# Custom output delimiter
cut -d: -f1,3 --output-delimiter=' ' /etc/passwd

# Complement (everything except specified)
cut -d: -f1 --complement /etc/passwd

Sort - Sorting Text

Basic Sort Usage

# Sort lines alphabetically
sort file.txt
sort file1.txt file2.txt

# Reverse sort
sort -r file.txt

# Numeric sort
sort -n file.txt
sort -nr file.txt # Reverse numeric

# Sort by column
sort -k2 file.txt # Sort by key running from second field to end of line
sort -k2,2 file.txt # Sort by second field only
sort -k2n file.txt # Sort by second field numerically

Advanced Sort Options

# Custom field separator
sort -t: -k3n /etc/passwd # Sort by third field, colon-separated

# Multiple sort keys
sort -k1,1 -k2n file.txt # Sort by first field, then by second numerically

# Unique sort
sort -u file.txt # Remove duplicates
sort file.txt | uniq # Alternative way

# Stable sort
sort -s file.txt # Maintain relative order of equal elements

# Human-readable numeric sort
sort -h file.txt # Sort 1K, 2M, 3G properly

# Random sort
sort -R file.txt # Random order (identical lines stay grouped; use shuf for a true shuffle)

Uniq - Remove Duplicates

Basic Uniq Usage

# Remove consecutive duplicates
uniq file.txt
sort file.txt | uniq # Remove all duplicates

# Count occurrences
uniq -c file.txt
sort file.txt | uniq -c | sort -nr # Most frequent first

# Show only duplicates
uniq -d file.txt

# Show only unique lines
uniq -u file.txt

Uniq Options

# Ignore case
uniq -i file.txt

# Skip fields/characters
uniq -f2 file.txt # Skip first 2 fields
uniq -s5 file.txt # Skip first 5 characters

# Compare only part of line
uniq -w10 file.txt # Compare only first 10 characters

Tr - Character Translation

Basic Tr Usage

# Character substitution
tr 'a' 'b' < file.txt # Replace 'a' with 'b'
tr 'abc' 'xyz' < file.txt # Replace a->x, b->y, c->z
tr 'a-z' 'A-Z' < file.txt # Convert to uppercase

# Delete characters
tr -d 'a' < file.txt # Delete all 'a' characters
tr -d '0-9' < file.txt # Delete all digits
tr -d '\n' < file.txt # Delete newlines (join lines)

# Squeeze repeated characters
tr -s ' ' < file.txt # Squeeze multiple spaces to single space
tr -s '\n' < file.txt # Squeeze runs of newlines into one (drops empty lines)

Tr Character Sets

# Predefined character sets
tr '[:lower:]' '[:upper:]' < file.txt # Convert to uppercase
tr '[:upper:]' '[:lower:]' < file.txt # Convert to lowercase
tr -d '[:digit:]' < file.txt # Delete digits
tr -d '[:punct:]' < file.txt # Delete punctuation
tr -s '[:space:]' < file.txt # Squeeze whitespace

# Other character sets
[:alnum:] # Alphanumeric characters
[:alpha:] # Alphabetic characters
[:blank:] # Space and tab
[:cntrl:] # Control characters
[:graph:] # Printable characters except space
[:print:] # Printable characters including space
[:xdigit:] # Hexadecimal digits

String Manipulation

Parameter Expansion

# String length
echo ${#string}

# Substring extraction
echo ${string:position} # From position to end
echo ${string:position:length} # Substring of length

# Pattern removal
echo ${string#pattern} # Remove shortest match from beginning
echo ${string##pattern} # Remove longest match from beginning
echo ${string%pattern} # Remove shortest match from end
echo ${string%%pattern} # Remove longest match from end

# Pattern replacement
echo ${string/pattern/replacement} # Replace first match
echo ${string//pattern/replacement} # Replace all matches
echo ${string/#pattern/replacement} # Replace if at beginning
echo ${string/%pattern/replacement} # Replace if at end

String Comparison

# Test string properties
if [[ -z "$string" ]]; then echo "Empty"; fi
if [[ -n "$string" ]]; then echo "Not empty"; fi

# Pattern matching
if [[ "$string" == pattern* ]]; then echo "Starts with pattern"; fi
if [[ "$string" == *pattern ]]; then echo "Ends with pattern"; fi
if [[ "$string" =~ regex ]]; then echo "Matches regex"; fi

# String comparison
if [[ "$str1" < "$str2" ]]; then echo "str1 comes before str2"; fi
if [[ "$str1" > "$str2" ]]; then echo "str1 comes after str2"; fi

Text Processing Combinations

Common Pipelines

# Word frequency
cat file.txt | tr ' ' '\n' | sort | uniq -c | sort -nr

# Extract email addresses
grep -oE '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' file.txt

# CSV processing
cut -d, -f2 data.csv | sort | uniq -c | sort -nr

# Log analysis
grep "ERROR" /var/log/app.log | awk '{print $1, $2}' | sort | uniq -c

# Find duplicate lines
sort file.txt | uniq -d

# Random sample
shuf -n 10 file.txt # Random 10 lines
sort -R file.txt | head -n 10 # Alternative method

Text Statistics

# Line, word, character count
wc -l file.txt # Line count
wc -w file.txt # Word count
wc -c file.txt # Byte count
wc -m file.txt # Character count (multibyte aware)

# Advanced statistics
awk '{chars += length($0) + 1; words += NF} END {print "Lines:", NR, "Words:", words, "Characters:", chars}' file.txt

Best Practices

Performance Tips

# Use appropriate tools for the job
# For simple column extraction: prefer cut over awk
# For complex processing: prefer awk over sed
# For pattern matching: prefer grep over sed/awk

# Avoid unnecessary pipes
# Bad: cat file | grep pattern
# Good: grep pattern file

# Use built-in string operations when possible
# Bad: echo "$string" | tr 'a-z' 'A-Z'
# Good: echo "${string^^}" # Bash 4+

Safety Practices

# Always quote variables in text processing
grep "$pattern" "$file"

# Skip missing or empty input (-s is true only for an existing, non-empty file)
if [[ -s "$file" ]]; then
  process_file "$file"
fi

# Use appropriate regex delimiters
sed 's|/old/path|/new/path|g' # Better than s/\/old\/path/\/new\/path/g