Text Processing

Grep - Pattern Searching

Basic Grep Usage

# Search for pattern in file
grep "pattern" file.txt
grep "pattern" file1.txt file2.txt

# Search in all files in directory
grep "pattern" *
grep -r "pattern" directory/

# Case-insensitive search
grep -i "pattern" file.txt

# Show line numbers
grep -n "pattern" file.txt

# Show only matching part
grep -o "pattern" file.txt

# Count matches
grep -c "pattern" file.txt

Grep Options

# Context lines
grep -A 3 "pattern" file.txt    # 3 lines after match
grep -B 2 "pattern" file.txt    # 2 lines before match
grep -C 5 "pattern" file.txt    # 5 lines before and after

# Invert match (show non-matching lines)
grep -v "pattern" file.txt

# Whole word match
grep -w "word" file.txt

# Fixed string (no regex)
grep -F "literal.string" file.txt

# Multiple patterns
grep -E "pattern1|pattern2" file.txt
grep -f patterns.txt file.txt   # Patterns from file

# Exclude files/directories
grep -r --exclude="*.log" "pattern" .
grep -r --exclude-dir="node_modules" "pattern" .

Grep with Regular Expressions

# Basic regex
grep "^start" file.txt          # Lines starting with "start"
grep "end$" file.txt            # Lines ending with "end"
grep "^$" file.txt              # Empty lines
grep "[0-9]" file.txt           # Lines containing digits

# Extended regex (-E)
grep -E "color|colour" file.txt
grep -E "[0-9]{3}-[0-9]{3}-[0-9]{4}" file.txt  # Phone numbers
grep -E "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$" file.txt  # Email

# Perl-compatible regex (-P)
grep -P "\d{4}-\d{2}-\d{2}" file.txt    # Date format
grep -P "(?<=\w)ing\b" file.txt         # Words ending with "ing"

Sed - Stream Editor

Basic Sed Operations

# Substitute (replace)
sed 's/old/new/' file.txt           # Replace first occurrence per line
sed 's/old/new/g' file.txt          # Replace all occurrences
sed 's/old/new/2' file.txt          # Replace second occurrence per line

# Case-insensitive replacement
sed 's/old/new/gi' file.txt

# In-place editing
sed -i 's/old/new/g' file.txt
sed -i.bak 's/old/new/g' file.txt   # Create backup

# Use different delimiter
sed 's|/old/path|/new/path|g' file.txt
sed 's#old#new#g' file.txt

Sed Line Operations

# Delete lines
sed '2d' file.txt               # Delete line 2
sed '2,5d' file.txt             # Delete lines 2-5
sed '/pattern/d' file.txt       # Delete lines containing pattern
sed '/^$/d' file.txt            # Delete empty lines

# Print specific lines
sed -n '2p' file.txt            # Print line 2
sed -n '2,5p' file.txt          # Print lines 2-5
sed -n '/pattern/p' file.txt    # Print lines containing pattern

# Insert and append
sed '2i\New line' file.txt      # Insert before line 2
sed '2a\New line' file.txt      # Append after line 2
sed '/pattern/i\New line' file.txt  # Insert before pattern

Advanced Sed

# Multiple commands
sed 's/old/new/g; s/foo/bar/g' file.txt
sed -e 's/old/new/g' -e 's/foo/bar/g' file.txt

# Address ranges
sed '1,10s/old/new/g' file.txt      # Replace in lines 1-10
sed '/start/,/end/s/old/new/g' file.txt  # Replace between patterns

# Capture groups
sed 's/\([0-9][0-9]*\)-\([0-9][0-9]*\)/\2-\1/' file.txt  # Swap numbers (at least one digit on each side)

# Hold space operations
sed 'h;G' file.txt              # Duplicate every line
sed '1!G;h;$!d' file.txt        # Reverse file

Awk - Text Processing Language

Basic Awk Usage

# Print columns
awk '{print $1}' file.txt       # First column
awk '{print $1, $3}' file.txt   # First and third columns
awk '{print $NF}' file.txt      # Last column
awk '{print NF}' file.txt       # Number of fields per line

# Print with custom separator
awk '{print $1 ":" $2}' file.txt
awk '{print $1 "\t" $2}' file.txt

# Field separator
awk -F: '{print $1}' /etc/passwd    # Use colon as separator
awk -F',' '{print $2}' data.csv     # Use comma as separator

Awk Patterns and Conditions

# Pattern matching
awk '/pattern/' file.txt        # Lines containing pattern
awk '/^[0-9]/' file.txt         # Lines starting with digit
awk '$1 == "value"' file.txt    # First field equals "value"

# Conditions
awk '$3 > 100' file.txt         # Third field greater than 100
awk 'NF > 5' file.txt           # Lines with more than 5 fields
awk 'length($0) > 80' file.txt  # Lines longer than 80 characters

# Range patterns
awk '/start/,/end/' file.txt    # Lines between start and end patterns
awk 'NR==2,NR==5' file.txt      # Lines 2 through 5

Awk Built-in Variables

# Record and field variables
NR      # Number of records (line number)
NF      # Number of fields in current record
FNR     # File record number (resets for each file)
FILENAME # Current filename
FS      # Field separator (default: whitespace)
OFS     # Output field separator (default: space)
RS      # Record separator (default: newline)
ORS     # Output record separator (default: newline)

# Examples
awk 'NR==3' file.txt            # Third line
awk '{print NR, $0}' file.txt   # Line numbers
awk 'BEGIN {OFS=":"} {print $1, $2}' file.txt  # Change output separator

Awk Programming Constructs

# BEGIN and END blocks
awk 'BEGIN {print "Start"} {print $1} END {print "End"}' file.txt
awk 'BEGIN {sum=0} {sum+=$1} END {print "Total:", sum}' file.txt

# Variables and arithmetic
awk '{sum += $1} END {if (NR > 0) print "Average:", sum/NR}' file.txt  # Guard against empty input
awk 'NR == 1 || $1 > max {max = $1} END {print "Max:", max}' file.txt  # Seed from first line so negative values work

# Loops
awk '{for(i=1; i<=NF; i++) print i, $i}' file.txt
awk '{for(i=NF; i>0; i--) printf "%s ", $i; print ""}' file.txt

# Arrays
awk '{count[$1]++} END {for(word in count) print word, count[word]}' file.txt

Awk Functions

# String functions
length(str)         # String length
substr(str, start, len)  # Substring
index(str, substr)  # Find substring position
split(str, array, sep)  # Split string into array
gsub(regex, replacement, str)  # Global substitution
toupper(str)        # Convert to uppercase
tolower(str)        # Convert to lowercase

# Math functions
int(x)             # Integer part
sqrt(x)            # Square root
sin(x), cos(x)     # Trigonometric functions
rand()             # Random number (0-1)
srand(seed)        # Set random seed

# Examples
awk '{print toupper($1)}' file.txt
awk '{print substr($1, 1, 3)}' file.txt
awk '{gsub(/old/, "new"); print}' file.txt

Cut - Column Extraction

Basic Cut Usage

# Extract columns by position
cut -c1-5 file.txt              # Characters 1-5
cut -c1,3,5 file.txt            # Characters 1, 3, and 5
cut -c5- file.txt               # From character 5 to end

# Extract fields
cut -f1 file.txt                # First field (tab-separated)
cut -f1,3 file.txt              # Fields 1 and 3
cut -f2- file.txt               # From field 2 to end

# Custom delimiter
cut -d: -f1 /etc/passwd         # First field, colon-separated
cut -d, -f2,4 data.csv          # Fields 2 and 4, comma-separated

Cut Options

# Only show lines with delimiter
cut -d: -f1 -s /etc/passwd

# Custom output delimiter
cut -d: -f1,3 --output-delimiter=' ' /etc/passwd

# Complement (everything except specified)
cut -d: -f1 --complement /etc/passwd

Sort - Sorting Text

Basic Sort Usage

# Sort lines alphabetically
sort file.txt
sort file1.txt file2.txt

# Reverse sort
sort -r file.txt

# Numeric sort
sort -n file.txt
sort -nr file.txt               # Reverse numeric

# Sort by column
sort -k2 file.txt               # Sort by key from second field to end of line
sort -k2,2 file.txt             # Sort by second field only
sort -k2n file.txt              # Sort by second field numerically

Advanced Sort Options

# Custom field separator
sort -t: -k3n /etc/passwd       # Sort by third field, colon-separated

# Multiple sort keys
sort -k1,1 -k2n file.txt        # Sort by first field, then by second numerically

# Unique sort
sort -u file.txt                # Remove duplicates
sort file.txt | uniq            # Alternative way

# Stable sort
sort -s file.txt                # Maintain relative order of equal elements

# Human-readable numeric sort
sort -h file.txt                # Sort 1K, 2M, 3G properly

# Random sort
sort -R file.txt                # Random order (identical lines stay grouped together)

Uniq - Remove Duplicates

Basic Uniq Usage

# Remove consecutive duplicates
uniq file.txt
sort file.txt | uniq            # Remove all duplicates

# Count occurrences
uniq -c file.txt
sort file.txt | uniq -c | sort -nr  # Most frequent first

# Show only duplicates
uniq -d file.txt

# Show only unique lines
uniq -u file.txt

Uniq Options

# Ignore case
uniq -i file.txt

# Skip fields/characters
uniq -f2 file.txt               # Skip first 2 fields
uniq -s5 file.txt               # Skip first 5 characters

# Compare only part of line
uniq -w10 file.txt              # Compare only first 10 characters

Tr - Character Translation

Basic Tr Usage

# Character substitution
tr 'a' 'b' < file.txt           # Replace 'a' with 'b'
tr 'abc' 'xyz' < file.txt       # Replace a->x, b->y, c->z
tr 'a-z' 'A-Z' < file.txt       # Convert to uppercase

# Delete characters
tr -d 'a' < file.txt            # Delete all 'a' characters
tr -d '0-9' < file.txt          # Delete all digits
tr -d '\n' < file.txt           # Delete newlines (join lines)

# Squeeze repeated characters
tr -s ' ' < file.txt            # Squeeze multiple spaces to single space
tr -s '\n' < file.txt           # Remove empty lines

Tr Character Sets

# Predefined character sets
tr '[:lower:]' '[:upper:]' < file.txt   # Convert to uppercase
tr '[:upper:]' '[:lower:]' < file.txt   # Convert to lowercase
tr -d '[:digit:]' < file.txt            # Delete digits
tr -d '[:punct:]' < file.txt            # Delete punctuation
tr -s '[:space:]' < file.txt            # Squeeze whitespace

# Other character sets
[:alnum:]   # Alphanumeric characters
[:alpha:]   # Alphabetic characters
[:blank:]   # Space and tab
[:cntrl:]   # Control characters
[:graph:]   # Printable characters except space
[:print:]   # Printable characters including space
[:xdigit:]  # Hexadecimal digits

String Manipulation

Parameter Expansion

# String length
echo ${#string}

# Substring extraction
echo ${string:position}         # From position to end
echo ${string:position:length}  # Substring of length

# Pattern removal
echo ${string#pattern}          # Remove shortest match from beginning
echo ${string##pattern}         # Remove longest match from beginning
echo ${string%pattern}          # Remove shortest match from end
echo ${string%%pattern}         # Remove longest match from end

# Pattern replacement
echo ${string/pattern/replacement}     # Replace first match
echo ${string//pattern/replacement}    # Replace all matches
echo ${string/#pattern/replacement}    # Replace if at beginning
echo ${string/%pattern/replacement}    # Replace if at end

String Comparison

# Test string properties
if [[ -z "$string" ]]; then echo "Empty"; fi
if [[ -n "$string" ]]; then echo "Not empty"; fi

# Pattern matching
if [[ "$string" == pattern* ]]; then echo "Starts with pattern"; fi
if [[ "$string" == *pattern ]]; then echo "Ends with pattern"; fi
if [[ "$string" =~ regex ]]; then echo "Matches regex"; fi

# String comparison
if [[ "$str1" < "$str2" ]]; then echo "str1 comes before str2"; fi
if [[ "$str1" > "$str2" ]]; then echo "str1 comes after str2"; fi

Text Processing Combinations

Common Pipelines

# Word frequency
tr -s '[:space:]' '\n' < file.txt | sort | uniq -c | sort -nr

# Extract email addresses
grep -oE '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' file.txt

# CSV processing
cut -d, -f2 data.csv | sort | uniq -c | sort -nr

# Log analysis
grep "ERROR" /var/log/app.log | awk '{print $1, $2}' | sort | uniq -c

# Find duplicate lines
sort file.txt | uniq -d

# Random sample
shuf -n 10 file.txt             # Random 10 lines
sort -R file.txt | head -n 10   # Alternative method

Text Statistics

# Line, word, character count
wc -l file.txt                  # Line count
wc -w file.txt                  # Word count
wc -c file.txt                  # Byte count
wc -m file.txt                  # Character count (multibyte aware)

# Advanced statistics
awk '{chars += length($0) + 1; words += NF} END {print "Lines:", NR, "Words:", words, "Characters:", chars}' file.txt

Best Practices

Performance Tips

# Use appropriate tools for the job
# For simple column extraction: cut > awk
# For complex processing: awk > sed
# For pattern matching: grep > sed/awk

# Avoid unnecessary pipes
# Bad: cat file | grep pattern
# Good: grep pattern file

# Use built-in string operations when possible
# Bad: echo "$string" | tr 'a-z' 'A-Z'
# Good: echo "${string^^}"  # Bash 4+

Safety Practices

# Always quote variables in text processing
grep -- "$pattern" "$file"      # "--" keeps a pattern starting with "-" from being parsed as an option

# Check for empty input
if [[ -s "$file" ]]; then
    process_file "$file"
fi

# Use appropriate regex delimiters
sed 's|/old/path|/new/path|g'  # Better than s/\/old\/path/\/new\/path/g

Grep - Pattern Searching

Basic Grep Usage

Grep Options

Grep with Regular Expressions

Sed - Stream Editor

Basic Sed Operations

Sed Line Operations

Advanced Sed

Awk - Text Processing Language

Basic Awk Usage

Awk Patterns and Conditions

Awk Built-in Variables

Awk Programming Constructs

Awk Functions

Cut - Column Extraction

Basic Cut Usage

Cut Options

Sort - Sorting Text

Basic Sort Usage

Advanced Sort Options

Uniq - Remove Duplicates

Basic Uniq Usage

Uniq Options

Tr - Character Translation

Basic Tr Usage

Tr Character Sets

String Manipulation

Parameter Expansion

String Comparison

Text Processing Combinations

Common Pipelines

Text Statistics

Best Practices

Performance Tips

Safety Practices

Grep - Pattern Searching

Basic Grep Usage

Grep Options

Grep with Regular Expressions

Sed - Stream Editor

Basic Sed Operations

Sed Line Operations

Advanced Sed

Awk - Text Processing Language

Basic Awk Usage

Awk Patterns and Conditions

Awk Built-in Variables

Awk Programming Constructs

Awk Functions

Cut - Column Extraction

Basic Cut Usage

Cut Options

Sort - Sorting Text

Basic Sort Usage

Advanced Sort Options

Uniq - Remove Duplicates

Basic Uniq Usage

Uniq Options

Tr - Character Translation

Basic Tr Usage

Tr Character Sets

String Manipulation

Parameter Expansion

String Comparison

Text Processing Combinations

Common Pipelines

Text Statistics

Best Practices

Performance Tips

Safety Practices