# Debian/Ubuntu
sudo apt-get update
sudo apt-get install bulk-extractor
# macOS (Homebrew)
brew install bulk-extractor
# Or build the latest release from source
git clone --recurse-submodules https://github.com/simsong/bulk_extractor.git
cd bulk_extractor
./bootstrap.sh
./configure
make
sudo make install
bulk_extractor -V # Version info
which bulk_extractor # Location
# Scan a disk image (the -o directory must not already exist)
bulk_extractor -o output_dir /path/to/image.dd
# Recurse through a directory of individual files
bulk_extractor -o output_dir -R /path/to/directory
# Scan a single file
bulk_extractor -o output_dir /path/to/file.bin
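# A batch sketch (hypothetical paths): scan several images in one pass.
# Each run needs a fresh output directory, since -o must not already exist.
for img in /cases/*.dd; do
  bulk_extractor -o "out_$(basename "$img" .dd)" "$img"
done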
# Limit decompression/decoding recursion depth (-M; default 7)
bulk_extractor -o output_dir -M 10 /path/to/image.dd
# Large image: fewer threads and a smaller page size (-G) limit memory use
bulk_extractor -o output_dir -j 4 -G 8388608 /path/to/large_image.dd
# Only email/URL extraction (both feature types come from the email scanner)
bulk_extractor -o output_dir -x all -e email /path/to/image.dd
# Exclude specific scanners
bulk_extractor -o output_dir -x gzip /path/to/image.dd
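# Per the user manual, -E disables every scanner except the one named, so
# these two invocations should be equivalent (verify with -h on your build):
bulk_extractor -o output_dir -E exif /path/to/image.dd
bulk_extractor -o output_dir -x all -e exif /path/to/image.dd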
| Scanner | Description |
|---|---|
| email | Email addresses, URLs, and domain names |
| accts | Phone numbers and credit card numbers (Luhn-checked) |
| gps | GPS coordinates and location data (see also the kml scanner) |
| net | Network artifacts (IP/Ethernet headers, carved packets) |
| exif | EXIF metadata from images |
| rar | RAR archive data |
| zip | ZIP archive data |
| gzip | GZIP compressed data |
| sqlite | SQLite database fragments |
| json | JSON structures |
| xor | XOR-obfuscated data (disabled by default) |
| windirs | Windows FAT/NTFS directory entries |
| winpe | Windows PE executables |
| hiberfile | Windows hibernation file decompression |
| ntfsmft | NTFS MFT records (newer releases) |
bulk_extractor -H   # detailed per-scanner help for your build
output_dir/
├── report.xml            # DFXML summary report with run statistics
├── email.txt             # Extracted email addresses
├── email_histogram.txt   # Email frequency analysis
├── url.txt               # Extracted URLs
├── url_histogram.txt     # URL frequency analysis
├── url_searches.txt      # Search-engine query URLs
├── url_facebook-id.txt   # Facebook IDs parsed from URLs
├── telephone.txt         # Phone numbers
├── ccn.txt               # Credit card numbers
├── ccn_histogram.txt     # CCN frequency analysis
├── gps.txt               # GPS coordinates
├── domain.txt            # Domain names
└── alerts.txt            # Alert-list (red flag) hits
# The exact file set varies with the release and the scanners enabled;
# all feature files land directly in the output directory.
# Feature files are TAB-separated, one record per line, after a short
# '#' banner header:
# offset \t feature \t context
# Example:
12345    test@example.com    ...surrounding data...
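# A small awk sketch for pulling just the feature column out of any
# feature file, skipping the '#' banner lines:
awk -F'\t' '!/^#/ {print $2}' output_dir/email.txt | sort -u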
# Histograms (*_histogram.txt) are generated automatically for features
# such as emails, URLs, and CCNs; no extra flag is needed
bulk_extractor -o output_dir /path/to/image.dd
cat output_dir/ccn_histogram.txt
cat output_dir/email_histogram.txt
# Entries are already ordered by frequency (n=COUNT, then the feature)
head -20 output_dir/email_histogram.txt
cat output_dir/email.txt
cat output_dir/ccn.txt
cat output_dir/gps.txt
# Find specific domain emails
grep "@company.com" output_dir/email.txt
# Find US country-code numbers (match on the feature column, not the offset)
awk -F'\t' '$2 ~ /^\+?1[- ]/' output_dir/telephone.txt
# Features recovered from base64-encoded regions carry BASE64 in their
# forensic-path offset (first column)
grep "BASE64" output_dir/url.txt
wc -l output_dir/*.txt
grep -c "." output_dir/email.txt
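# Convenience sketch: per-file feature counts across the whole run,
# ignoring banner lines:
for f in output_dir/*.txt; do
  printf '%8d  %s\n' "$(grep -cv '^#' "$f")" "$f"
done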
# Use 8 threads
bulk_extractor -o output_dir -j 8 /path/to/image.dd
# Use number of CPU cores (auto-detect)
bulk_extractor -o output_dir -j $(nproc) /path/to/image.dd
# Large image with memory constraints: fewer threads, smaller page size
bulk_extractor -o output_dir -j 4 -G 8388608 /path/to/huge_image.dd
# Fast SSD with many cores
bulk_extractor -o output_dir -j 16 /path/to/image.dd
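# Rough benchmark sketch (hypothetical paths): time the same image at two
# thread counts to find the sweet spot for your CPU and disk:
time bulk_extractor -o bench_j4 -j 4 /path/to/image.dd
time bulk_extractor -o bench_j16 -j 16 /path/to/image.dd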
# Search for a custom pattern (e.g., employee IDs); matches go to find.txt
bulk_extractor -o output_dir -f "EMP[0-9]{6}" /path/to/image.dd
| Pattern | Description |
|---|---|
| [A-Z]{2}[0-9]{5} | Postal codes |
| [0-9]{9} | Social Security Numbers |
| user[0-9]+ | Username patterns |
| \$[0-9]{1,3},[0-9]{3} | Currency amounts |
| (secret\|password\|key): | Sensitive labels |
# Multiple custom patterns: repeat -f (or collect them in a file; see below)
bulk_extractor -o output_dir \
  -f "regex1" \
  -f "regex2" \
  /path/to/image.dd
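# For longer pattern lists, -F reads one regex per line from a file
# (matches land in find.txt); a sketch with two example patterns:
cat > patterns.txt <<'EOF'
EMP[0-9]{6}
(secret|password|key):
EOF
bulk_extractor -o output_dir -F patterns.txt /path/to/image.dd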
bulk_extractor -o case_output /path/to/evidence.dd
cat case_output/report.xml
# High-value indicators: largest feature files first
ls -lhS case_output/*.txt | head -10
# Extract URLs with context for timeline
grep -E "https?://" case_output/url.txt > urls_timeline.txt
# Find email addresses associated with suspicious domains
grep "@suspicious-domain.com" case_output/email.txt
# Create summary
cp case_output/report.xml /path/to/investigation_report/
tar -czf case_artifacts.tar.gz case_output/
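# Chain-of-custody sketch (standard shell tooling, not a bulk_extractor
# feature): hash the archive so the bundle can be verified later
sha256sum case_artifacts.tar.gz > case_artifacts.tar.gz.sha256
sha256sum -c case_artifacts.tar.gz.sha256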
bulk_diff.py output_dir1 output_dir2 > differences.txt   # ships in the source tree
# Scan image twice to find changes
bulk_extractor -o scan1 /path/to/image.dd
bulk_extractor -o scan2 /path/to/updated_image.dd
bulk_diff.py scan1 scan2
bulk_diff.py scan1 scan2 > change_report.txt
# Extract all emails with frequency (email_histogram.txt has this precomputed)
awk -F'\t' '!/^#/ {print $2}' output_dir/email.txt | sort | uniq -c | sort -rn
# Find emails from specific domain
grep -i "@company.com" output_dir/email.txt | wc -l
# Export unique addresses for contact analysis (column 2 is the feature)
awk -F'\t' '!/^#/ {print $2}' output_dir/email.txt | sort -u > contacts.txt
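# Per-domain breakdown of extracted addresses (sketch; assumes the
# default TAB-separated feature format):
awk -F'\t' '!/^#/ {split($2, a, "@"); print tolower(a[2])}' \
    output_dir/email.txt | sort | uniq -c | sort -rn | head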
# All credit cards found
cat output_dir/ccn.txt
# Credit card frequency (potential fraud patterns)
cat output_dir/ccn_histogram.txt
# The leading digit of the card number indicates the network
awk -F'\t' '$2 ~ /^4/' output_dir/ccn.txt   # Visa
awk -F'\t' '$2 ~ /^5/' output_dir/ccn.txt   # Mastercard
awk -F'\t' '$2 ~ /^3/' output_dir/ccn.txt   # Amex
# Extract GPS coordinates
cat output_dir/gps.txt
# Filter by latitude band (manual parsing; adjust the pattern to your data)
awk -F'\t' '$2 ~ /^40\./' output_dir/gps.txt   # latitude 40.x
# Export coordinates for mapping (column 2 is the feature)
awk -F'\t' '!/^#/ {print $2}' output_dir/gps.txt > coordinates.txt
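# Mapping sketch: emit a lat,lon CSV, assuming the feature column holds
# comma-separated coordinate pairs (inspect your gps.txt first):
{ echo "lat,lon"; awk -F'\t' '!/^#/ {print $2}' output_dir/gps.txt; } > gps.csv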
# All unique domains found (extract the feature column first)
awk -F'\t' '!/^#/ {print $2}' output_dir/domain.txt | sort -u
# IP addresses and Ethernet MACs (written by the net scanner)
cat output_dir/ip.txt
cat output_dir/ether.txt
# Social media and search activity (URL sub-feature files; names vary by release)
cat output_dir/url_facebook-id.txt
cat output_dir/url_searches.txt
# Windows-related hostnames and domains often surface in domain.txt
grep -i "windows" output_dir/domain.txt
# PE executable discovery (run only the winpe scanner)
bulk_extractor -o output_dir -x all -e winpe /path/to/image.dd
# Recover NTFS MFT fragments (newer releases; confirm scanner names with -H)
bulk_extractor -o output_dir -x all -e ntfsmft /path/to/image.dd
# Diagnostic output (debug levels are set with -d; -V only prints the version)
bulk_extractor -o output_dir -d1 /path/to/image.dd
# Continue from the last checkpoint: re-run the same command against the
# same output directory and bulk_extractor resumes where it stopped
bulk_extractor -o output_dir /path/to/image.dd
# Scan only the first 10GB (-Y takes a byte offset or range; K/M/G suffixes)
bulk_extractor -o output_dir -Y 0-10G /path/to/image.dd
# Scan a specific byte range
bulk_extractor -o output_dir -Y 1000000-2000000 /path/to/image.dd
# Custom page size (default 16MiB)
bulk_extractor -o output_dir -G 4194304 /path/to/image.dd
# Suppress routine status output
bulk_extractor -o output_dir -q -1 /path/to/image.dd
# Maximum parallelism
bulk_extractor -j "$(nproc)" -o output_dir /path/to/image.dd
# Single-threaded, memory-efficient
bulk_extractor -o output_dir -j 1 /path/to/image.dd
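# Rule of thumb: each worker thread buffers one page plus margin, so RAM
# for buffers is roughly j x (page size + margin); with the 16MiB/4MiB
# defaults, 8 threads need on the order of:
echo "$(( 8 * (16 + 4) ))MiB"   # ~160MiB, before per-scanner overhead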
# Watch output directory growth
watch -n 5 'wc -l output_dir/*.txt'
# Check resource usage
top -p $(pgrep bulk_extractor)
# Reduce thread count
bulk_extractor -o output_dir -j 2 /path/to/image.dd
# Shrink the page size to lower per-thread memory use
bulk_extractor -o output_dir -G 8388608 /path/to/image.dd
# Disable specific slow scanners
bulk_extractor -o output_dir -x gzip -x zip /path/to/image.dd
# Use fewer threads if disk I/O bottlenecked
bulk_extractor -o output_dir -j 1 /path/to/image.dd
# Run with appropriate privileges
sudo bulk_extractor -o output_dir /path/to/image.dd
# Check output directory permissions
chmod 755 output_dir
# Convert emails to a simple timeline format (offset, type, feature)
awk -F'\t' '!/^#/ {print $1 "\temail\t" $2}' output_dir/email.txt > timeline.txt
# bulk_extractor output plays well with other tools:
# - report.xml is DFXML and can be parsed by standard DFXML tooling
# - feature files are plain tab-separated text, easy to import into
#   EnCase/FTK workflows or timeline tools (parse the offsets)
# Compare extracted emails against known lists
comm -12 <(sort contacts.txt) <(sort knownbad.txt)