Skip to content

bulk_extractor

sudo apt-get update
sudo apt-get install bulk-extractor
brew install bulk-extractor
git clone https://github.com/simsong/bulk_extractor.git
cd bulk_extractor
./bootstrap.sh
./configure
make
sudo make install
bulk_extractor -V    # Version info
which bulk_extractor # Location
bulk_extractor -o output_dir /path/to/image.dd
bulk_extractor -o output_dir /path/to/directory
bulk_extractor -o output_dir /path/to/file.bin
bulk_extractor -o output_dir -r 100 /path/to/image.dd
bulk_extractor -m -o output_dir /path/to/large_image.dd
# Only email and URL extraction (-E disables all scanners except the named one;
# -e re-enables additional scanners)
bulk_extractor -o output_dir -E email -e url /path/to/image.dd

# Exclude specific scanners
bulk_extractor -o output_dir -x gzip /path/to/image.dd
Scanner  | Description
---------|------------------------------------------------
email    | Email addresses and message metadata
url      | URLs (HTTP, HTTPS, FTP)
phone    | Phone numbers (various formats)
ccn      | Credit card numbers (Luhn validation)
gps      | GPS coordinates and location data
net      | Network artifacts (IP addresses, netbios)
exif     | EXIF metadata from images
rar      | RAR archive data
zip      | ZIP archive data
gzip     | GZIP compressed data
sqlite   | SQLite database fragments
json     | JSON structures
xor      | XOR-obfuscated strings
windirs  | Windows directory structures
winpe    | Windows PE executables
hiberfil | Hibernation file parsing
ntfs     | NTFS-specific artifacts
bulk_extractor -h | grep "^  -s"
output_dir/
├── report.xml          # Summary report with statistics
├── email.txt           # Extracted email addresses
├── url.txt             # Extracted URLs
├── telephone.txt       # Phone numbers
├── ccn.txt             # Credit card numbers
├── gps.txt             # GPS coordinates
├── domain.txt          # Domain names
├── url_facebook.txt    # Facebook URLs
├── url_instagram.txt   # Instagram URLs
├── ccn_histogram.txt   # CCN frequency analysis
├── alerts.txt          # Scanner alerts
└── feature_files/      # Detailed feature files
    ├── email_carved.txt
    ├── url_base64.txt
    └── [feature]_base64.txt
# Each line contains:
# offset \t feature \t context
# Example:
12345   test@example.com   ...surrounding data...
bulk_extractor -o output_dir -H /path/to/image.dd
cat output_dir/ccn_histogram.txt
cat output_dir/email_histogram.txt
# Sort by frequency (histogram lines start with a count, so sort numerically)
sort -rn output_dir/*_histogram.txt | head -20
cat output_dir/email.txt
cat output_dir/ccn.txt
cat output_dir/gps.txt
# Find specific domain emails
grep "@company.com" output_dir/email.txt

# Find specific country codes (phone)
grep "^1-" output_dir/telephone.txt

# Find base64-encoded URLs
cat output_dir/url_base64.txt
wc -l output_dir/*.txt
grep -c "." output_dir/email.txt
# Use 8 threads
bulk_extractor -o output_dir -j 8 /path/to/image.dd
# Use number of CPU cores (auto-detect)
bulk_extractor -o output_dir -j $(nproc) /path/to/image.dd
# Large image with memory constraints
bulk_extractor -m -o output_dir -j 4 /path/to/huge_image.dd

# Fast SSD with many cores
bulk_extractor -o output_dir -j 16 /path/to/image.dd
# Search for a custom regex pattern (e.g., employee IDs) with the find scanner
bulk_extractor -o output_dir -f "EMP[0-9]{6}" /path/to/image.dd
Pattern                | Description
-----------------------|-------------------------
[A-Z]{2}[0-9]{5}       | Postal codes
[0-9]{9}               | Social Security Numbers
user[0-9]+             | Username patterns
\$[0-9]{1,3},[0-9]{3}  | Currency amounts
(secret|password|key): | Sensitive labels
# Multiple custom patterns (repeat -f, or put one regex per line in a file and use -F file)
bulk_extractor -o output_dir \
  -f "regex1" \
  -f "regex2" \
  /path/to/image.dd
bulk_extractor -o case_output /path/to/evidence.dd
cat case_output/report.xml
# High-value indicators (-rh sorts the human-readable sizes printed by ls -lh)
ls -lh case_output/*.txt | sort -k5 -rh | head -10
# Extract URLs with context for timeline
grep -E "https?://" case_output/url.txt > urls_timeline.txt
# Find email addresses associated with suspicious domains
grep "@suspicious-domain.com" case_output/email.txt
# Create summary
cp case_output/report.xml /path/to/investigation_report/
tar -czf case_artifacts.tar.gz case_output/
bulk_diff output_dir1 output_dir2 > differences.txt
# Scan image twice to find changes
bulk_extractor -o scan1 /path/to/image.dd
bulk_extractor -o scan2 /path/to/updated_image.dd
bulk_diff scan1 scan2
bulk_diff -r scan1 scan2 > change_report.txt
# Extract all emails with frequency
sort output_dir/email.txt | uniq -c | sort -rn

# Find emails from specific domain
grep -i "@company.com" output_dir/email.txt | wc -l

# Export for contact analysis
awk '{print $2}' output_dir/email.txt | sort -u > contacts.txt
# All credit cards found
cat output_dir/ccn.txt

# Credit card frequency (potential fraud patterns)
cat output_dir/ccn_histogram.txt

# Banks associated with cards (Luhn algorithm applied)
grep "^4" output_dir/ccn.txt  # Visa
grep "^5" output_dir/ccn.txt  # Mastercard
grep "^3" output_dir/ccn.txt  # Amex
# Extract GPS coordinates
cat output_dir/gps.txt

# Filter by location range (manual parsing)
grep "40\.[0-9]" output_dir/gps.txt  # Latitude 40°

# Visualize on map (export coordinates)
awk '{print $2}' output_dir/gps.txt > coordinates.txt
# All domains found
cat output_dir/domain.txt | sort -u

# IP addresses and context
cat output_dir/net.txt

# Social media presence
cat output_dir/url_facebook.txt
cat output_dir/url_instagram.txt
# Recover Windows paths and registry keys
grep -i "windows" output_dir/domain.txt

# PE executable discovery (-e enables a scanner)
bulk_extractor -o output_dir -e winpe /path/to/image.dd

# Recover MFT fragments
bulk_extractor -o output_dir -e ntfs /path/to/image.dd
# Verbose output with statistics
bulk_extractor -o output_dir -V /path/to/image.dd
# Continue from last checkpoint
bulk_extractor -o output_dir -R /path/to/image.dd
# Scan only first 10GB
bulk_extractor -o output_dir -S 10GB /path/to/image.dd

# Scan specific byte range (start-end offsets)
bulk_extractor -o output_dir -Y 1000000-2000000 /path/to/image.dd
# Custom sector size (default: auto-detect)
bulk_extractor -o output_dir -B 4096 /path/to/image.dd
# Suppress progress output
bulk_extractor -o output_dir -q /path/to/image.dd
# Maximum parallelism with memory mapping
bulk_extractor -m -j $(nproc) -o output_dir /path/to/image.dd
# Single-threaded, memory-efficient
bulk_extractor -o output_dir -j 1 /path/to/image.dd
# Watch output directory growth
watch -n 5 'wc -l output_dir/*.txt'

# Check resource usage
top -p $(pgrep bulk_extractor)
# Reduce thread count
bulk_extractor -o output_dir -j 2 /path/to/image.dd

# Enable memory mapping
bulk_extractor -m -o output_dir /path/to/image.dd
# Disable specific slow scanners
bulk_extractor -o output_dir -x gzip -x zip /path/to/image.dd

# Use fewer threads if disk I/O bottlenecked
bulk_extractor -o output_dir -j 1 /path/to/image.dd
# Run with appropriate privileges
sudo bulk_extractor -o output_dir /path/to/image.dd

# Check output directory permissions
chmod 755 output_dir
# Convert emails to timeline format
awk '{print "email\t" $2}' output_dir/email.txt > timeline.txt
# bulk_extractor output compatible with:
# - EnCase (import feature files)
# - FTK (import report.xml)
# - Timeline tools (parse offsets)
# Compare extracted emails against known lists
comm -12 <(sort contacts.txt) <(sort knownbad.txt)