Parsero

Parsero is a specialized tool for parsing and analyzing robots.txt files from web applications. It extracts information about hidden paths, disallowed directories, and restricted endpoints that website administrators intended to hide from search engines, revealing potential attack surface during security assessments.
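As a hedged illustration of the idea Parsero automates (a minimal sketch, not the tool's actual implementation), the Python snippet below fetches a robots.txt file and prints each Disallow entry as a candidate path; the target URL is a placeholder.

# Minimal sketch: fetch robots.txt and list Disallow entries as candidate paths
from urllib.parse import urljoin
from urllib.request import urlopen

target = "http://example.com"  # placeholder target, not a real engagement
robots = urlopen(urljoin(target, "/robots.txt")).read().decode("utf-8", "replace")

for line in robots.splitlines():
    line = line.split("#", 1)[0].strip()         # drop inline comments
    if line.lower().startswith("disallow:"):
        path = line.split(":", 1)[1].strip()
        if path:                                  # a bare "Disallow:" allows everything
            print(urljoin(target, path))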

# Option 1: Clone the repository
git clone https://github.com/behindthefirewalls/Parsero.git
cd Parsero

# Install dependencies
pip3 install -r requirements.txt

# Verify installation
python3 parsero.py --help

# Option 2: Using Homebrew
brew tap behindthefirewalls/parsero
brew install parsero

# Or via pip3
pip3 install parsero

# Verify installation
parsero --help

# Option 3: Install via pip3
pip3 install parsero

# Update to latest version
pip3 install --upgrade parsero

# Verify installation
python3 -m parsero --help

# Option 4: Manual setup - verify Python 3.6+ is available
python3 --version

# Install required packages
pip3 install requests
pip3 install urllib3

# Download and setup
git clone https://github.com/behindthefirewalls/Parsero.git
cd Parsero
chmod +x parsero.py

A typical robots.txt file that Parsero analyzes looks like this:

User-agent: Googlebot
Disallow: /admin
Disallow: /private
Allow: /public

User-agent: *
Disallow: /tmp
Crawl-delay: 5

User-agent: BadBot
Disallow: /
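To make the sample's semantics concrete, here is a hedged sketch using Python's standard urllib.robotparser (not part of Parsero) to evaluate the Googlebot and wildcard groups shown above.

# Sketch: interpret the sample's Googlebot and wildcard groups with the stdlib parser
from urllib.robotparser import RobotFileParser

rules = """\
User-agent: Googlebot
Disallow: /admin
Disallow: /private
Allow: /public

User-agent: *
Disallow: /tmp
Crawl-delay: 5
"""

rp = RobotFileParser()
rp.parse(rules.splitlines())

print(rp.can_fetch("Googlebot", "/admin"))    # False: explicitly disallowed
print(rp.can_fetch("Googlebot", "/public"))   # True: explicitly allowed
print(rp.can_fetch("SomeOtherBot", "/tmp"))   # False: wildcard group applies
print(rp.crawl_delay("SomeOtherBot"))         # 5, from the wildcard group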

Key elements Parsero extracts from a robots.txt file:

  • Disallowed paths: Directories forbidden to search engines
  • Allowed paths: Explicitly allowed despite parent restrictions
  • User-agent rules: Specific bot directives
  • Crawl delays: Rate-limiting hints
  • Sitemaps: Reference to site structure files

During an assessment these entries matter because the file often (a short probe sketch follows this list):

  • Reveals structure of sensitive areas
  • Indicates hidden admin panels
  • Shows private directories
  • May expose test/staging environments
  • Hints at API endpoints
  • Reveals backup locations
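Building on the list above, the following hedged sketch shows the triage step in Python: probe each discovered path and record which ones actually respond. The base URL and paths are illustrative; requests is the package installed in the setup steps.

# Sketch: probe discovered Disallow paths and note which respond
import requests  # installed in the setup steps above

base = "http://example.com"                 # illustrative target
paths = ["/admin", "/private", "/tmp"]      # e.g. taken from the sample robots.txt

for path in paths:
    try:
        r = requests.get(base + path, timeout=10, allow_redirects=False)
        print(f"{r.status_code}  {base}{path}")
    except requests.RequestException as exc:
        print(f"ERR  {base}{path}  ({exc})")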
# Parse robots.txt from website
python3 parsero.py -u http://example.com

# Verbose output
python3 parsero.py -u http://example.com -v

# Save results to file
python3 parsero.py -u http://example.com -o results.txt
# Non-standard HTTP port
python3 parsero.py -u http://example.com:8080

# HTTPS with custom port
python3 parsero.py -u https://example.com:8443

# Localhost testing
python3 parsero.py -u http://localhost:5000
# Parse multiple URLs passed directly on the command line
python3 parsero.py -u http://example1.com http://example2.com http://example3.com

# Read URLs from file
python3 parsero.py -f urls.txt

# Output to directory
python3 parsero.py -f urls.txt -o results_dir/
# Display results in terminal
python3 parsero.py -u http://example.com

# Verbose mode (detailed output)
python3 parsero.py -u http://example.com -v

# Very verbose
python3 parsero.py -u http://example.com -vv
# Save to text file
python3 parsero.py -u http://example.com -o robots_output.txt

# Append to existing file
python3 parsero.py -u http://example.com -o results.txt -a

# Output to specific directory
python3 parsero.py -u http://example.com -o /tmp/parsero_results/
# Export as CSV
python3 parsero.py -u http://example.com -f csv -o results.csv

# Multiple URLs to CSV (one results file per target host)
while read -r url; do
  python3 parsero.py -u "$url" -f csv -o "$(echo "$url" | cut -d'/' -f3).csv"
done < urls.txt
# Export as JSON
python3 parsero.py -u http://example.com -f json -o results.json

# Pretty JSON formatting
python3 parsero.py -u http://example.com -f json -p
# Download actual restricted files (for authorized testing)
python3 parsero.py -u http://example.com -b

# Aggressive scanning
python3 parsero.py -u http://example.com -a

# Deep crawl with discovered paths
python3 parsero.py -u http://example.com -d
# Set connection timeout
python3 parsero.py -u http://example.com -t 30

# Retry failed connections
python3 parsero.py -u http://example.com -r 3

# Adjust crawl delay
python3 parsero.py -u http://example.com --delay 2
# Use HTTP proxy
python3 parsero.py -u http://example.com --proxy http://proxy.example.com:8080

# Use HTTPS proxy
python3 parsero.py -u http://example.com --proxy https://proxy.example.com:8443

# Proxy with authentication
python3 parsero.py -u http://example.com --proxy http://user:pass@proxy.com:8080
# Specify custom user-agent
python3 parsero.py -u http://example.com --user-agent "CustomBot/1.0"

# Impersonate specific bot
python3 parsero.py -u http://example.com --user-agent "Googlebot/2.1"

# Use custom headers file
python3 parsero.py -u http://example.com -H headers.txt
# Extract all disallowed paths
python3 parsero.py -u http://example.com -o paths.txt

# List unique paths only
python3 parsero.py -u http://example.com | sort | uniq

# Count total paths found
python3 parsero.py -u http://example.com | wc -l
# Find paths containing keyword
python3 parsero.py -u http://example.com | grep admin

# Find all API endpoints
python3 parsero.py -u http://example.com | grep /api

# Identify sensitive paths
python3 parsero.py -u http://example.com | grep -E "(admin|private|backup|tmp)"
# Show only directories
python3 parsero.py -u http://example.com | grep '/$'

# Show only files
python3 parsero.py -u http://example.com | grep '\.'

# Exclude certain paths
python3 parsero.py -u http://example.com | grep -v '/search'
# Parse robots.txt from multiple sites
cat targets.txt | while read target; do
  python3 parsero.py -u "$target" -o "results_${target##*/}.txt"
done

# Combine all results
cat results_*.txt > all_discovered_paths.txt
# Analyze competitor sites
python3 parsero.py -u http://competitor1.com -o competitor1.txt
python3 parsero.py -u http://competitor2.com -o competitor2.txt

# Compare discovered structures
diff competitor1.txt competitor2.txt
# Parse robots.txt looking for APIs
python3 parsero.py -u http://api.example.com

# Filter for API paths
python3 parsero.py -u http://example.com | grep -E "/api|/v1|/v2|/rest"

# Extract endpoint patterns
python3 parsero.py -u http://example.com | grep -oP '/api/[^?]*' | sort | uniq
# Check robots.txt on multiple subdomains
for sub in www api staging dev admin; do
  echo "=== $sub.example.com ==="
  python3 parsero.py -u http://$sub.example.com 2>/dev/null
done

# Save results
for sub in www api staging dev; do
  python3 parsero.py -u http://$sub.example.com -o "$sub.txt"
done
# Find sensitive directories
python3 parsero.py -u http://example.com | grep -E "(backup|logs|config|private)"

# Identify admin panels
python3 parsero.py -u http://example.com | grep -i admin

# Find test/debug endpoints
python3 parsero.py -u http://example.com | grep -E "(test|debug|dev|staging)"
# Discover API structure
python3 parsero.py -u http://api.example.com -v

# Map API versions
python3 parsero.py -u http://example.com | grep -oP '/api/v[0-9]+'

# Identify deprecated APIs
python3 parsero.py -u http://example.com | grep -E "(legacy|deprecated|v1)"
# Find login/auth paths
python3 parsero.py -u http://example.com | grep -E "(login|auth|signin|register)"

# Identify account management
python3 parsero.py -u http://example.com | grep -E "(profile|account|user)"

# Find admin interfaces
python3 parsero.py -u http://example.com | grep -E "(admin|panel|dashboard)"
# Use Parsero output for targeted bruting
python3 parsero.py -u http://example.com -o discovered.txt

# Verify discoveries with dirbuster
dirbuster -u http://example.com -l discovered.txt -r report.html
# Parse robots.txt and feed to crawler
python3 parsero.py -u http://example.com -o urls.txt

# Crawl discovered URLs
wget -i urls.txt --no-parent

# Or with curl
cat urls.txt | xargs -I {} curl -I {}
# Compare robots.txt with actual structure
python3 parsero.py -u http://example.com -o from_robots.txt

# Use nmap/nikto to verify access
nikto -h http://example.com -o nikto_results.txt

# Cross-reference findings
comm -12 <(sort from_robots.txt) <(sort nikto_results.txt)
#!/bin/bash
# Batch processing script
mkdir -p results
for url in $(cat sites.txt); do
  echo "Processing: $url"
  python3 parsero.py -u "$url" \
    -o "results/$(echo "$url" | cut -d'/' -f3).txt" \
    -v
done
# Combine results from multiple sites
python3 parsero.py -f urls.txt -o combined.txt

# Create summary statistics
echo "Total unique paths found:"
cat combined.txt | sort | uniq | wc -l

# Find most common path patterns
cat combined.txt | grep -oP '^/[^/]+' | sort | uniq -c | sort -rn
# Create comprehensive report
python3 parsero.py -u http://example.com \
  -o report.txt \
  -f json \
  -v

# Generate formatted output
echo "=== robots.txt Analysis ===" > full_report.txt
echo "Target: example.com" >> full_report.txt
echo "Date: $(date)" >> full_report.txt
cat report.txt >> full_report.txt
# Reduce detection likelihood
python3 parsero.py -u http://example.com --delay 5

# Multiple requests with delays
for url in $(cat urls.txt); do
  python3 parsero.py -u "$url" --delay 10
  sleep 30
done
# Use different user-agents
python3 parsero.py -u http://example.com --user-agent "Googlebot/2.1"
python3 parsero.py -u http://example.com --user-agent "Mozilla/5.0"
python3 parsero.py -u http://example.com --user-agent "bingbot/2.0"
# Use proxy to mask origin
python3 parsero.py -u http://example.com \
  --proxy http://proxy.example.com:8080 \
  --delay 5 \
  --user-agent "Googlebot"
# Find most restricted paths
python3 parsero.py -f urls.txt -o results.txt

# Analyze frequency
cat results.txt | sort | uniq -c | sort -rn

# Export statistics
python3 << 'EOF'
import re
from collections import Counter

with open('results.txt', 'r') as f:
    paths = f.readlines()

# Extract path components
components = []
for path in paths:
    parts = path.strip().split('/')
    components.extend([p for p in parts if p])

counter = Counter(components)
for comp, count in counter.most_common(20):
    print(f"{comp}: {count}")
EOF
# Build hierarchy of paths
python3 parsero.py -u http://example.com -v | \
  sort | \
  sed 's|/[^/]*$||' | \
  sort | uniq -c | sort -rn
# Test basic connectivity
python3 parsero.py -u http://example.com -t 60

# Verify robots.txt exists
curl -I http://example.com/robots.txt

# Check with specific user-agent
python3 parsero.py -u http://example.com --user-agent "Mozilla/5.0" -v
# Verify site has robots.txt
wget -q -O - http://example.com/robots.txt

# Check if site blocks parsing
python3 parsero.py -u http://example.com -vv

# Verify URL format
python3 parsero.py -u http://example.com:80  # Explicit port
# Test proxy connectivity
python3 parsero.py -u http://example.com --proxy http://127.0.0.1:8080 -vv

# Verify credentials
python3 parsero.py -u http://example.com \
  --proxy http://user:password@proxy:8080

Best practices for any engagement:

  • Obtain written authorization before analysis
  • Respect robots.txt directives in production
  • Document all discovered information
  • Follow responsible disclosure practices
  • Maintain ethical standards
# Comprehensive reconnaissance workflow
python3 parsero.py -u http://target.example.com \
  -o robots_analysis.txt \
  -v

# Create summary
echo "=== Robots.txt Analysis Summary ===" > summary.txt
echo "Total entries: $(wc -l < robots_analysis.txt)" >> summary.txt
echo "Unique paths: $(sort robots_analysis.txt | uniq | wc -l)" >> summary.txt

Parsero should be used:

  • For authorized security assessments
  • With written permission from site owners
  • To improve understanding of web security
  • In compliance with applicable laws
  • Respecting privacy and confidentiality

Never:

  • Access or attempt to access restricted paths without authorization
  • Download sensitive files referenced in robots.txt
  • Share discovered information publicly
  • Violate applicable laws or regulations