Waybackpy
Overview
Waybackpy is a Python library that provides a simple interface to query the Wayback Machine (Internet Archive). It allows security researchers, penetration testers, and forensic analysts to programmatically access historical snapshots of websites for reconnaissance, vulnerability research, and content analysis.
Installation
Using pip
pip install waybackpy
From source
git clone https://github.com/akamhy/waybackpy.git
cd waybackpy
python setup.py install
Requirements
pip install requests
Basic Usage
Retrieve Snapshots
from waybackpy import Wayback
url = "https://example.com"
wayback = Wayback(url)
# Get available snapshots
snapshots = wayback.snapshots()
for snapshot in snapshots:
print(snapshot)
Get Specific Snapshot
from waybackpy import Wayback
from datetime import datetime
url = "https://example.com"
wayback = Wayback(url)
# Get closest snapshot to specific date
target_date = datetime(2020, 1, 1)
snapshot = wayback.near(target_date)
print(snapshot.archive_url)
Fetch Snapshot Content
from waybackpy import Wayback
url = "https://example.com"
wayback = Wayback(url)
# Get oldest snapshot
oldest = wayback.oldest()
print(oldest.archive_url)
# Get newest snapshot
newest = wayback.newest()
print(newest.archive_url)
Common Commands
| Command | Description |
|---|---|
| Wayback(url).snapshots() | List all available snapshots |
| Wayback(url).oldest() | Retrieve oldest snapshot |
| Wayback(url).newest() | Retrieve newest snapshot |
| Wayback(url).near(date) | Get snapshot closest to date |
| wayback.timestamp | Get timestamp of snapshot |
| wayback.archive_url | Get full archive.org URL |
| wayback.status_code() | Check HTTP status of snapshot |
Snapshot Analysis
Iterate Through Historical Versions
from waybackpy import Wayback
url = "https://example.com"
wayback = Wayback(url)
# Get all snapshots for a URL
all_snapshots = wayback.snapshots()
print(f"Total snapshots: {len(all_snapshots)}")
for snapshot in all_snapshots[-10:]: # Last 10
print(f"{snapshot.timestamp}: {snapshot.status_code()}")
Filter Snapshots by Date Range
from waybackpy import Wayback
from datetime import datetime, timedelta
url = "https://example.com"
wayback = Wayback(url)
start_date = datetime(2018, 1, 1)
end_date = datetime(2020, 12, 31)
snapshots = wayback.snapshots()
filtered = [s for s in snapshots
if start_date <= s.datetime_timestamp() <= end_date]
for snapshot in filtered:
print(snapshot.timestamp)
Check Snapshot Status
from waybackpy import Wayback
url = "https://example.com"
wayback = Wayback(url)
# Get newest and check status
newest = wayback.newest()
status = newest.status_code()
print(f"Status code: {status}")
# Verify snapshot is accessible
if status == 200:
print("Snapshot is accessible")
Web Forensics
Domain History Tracking
from waybackpy import Wayback
domains = [
"example.com",
"example.org",
"example.net"
]
for domain in domains:
try:
wayback = Wayback(f"https://{domain}")
oldest = wayback.oldest()
newest = wayback.newest()
print(f"{domain}: {oldest.timestamp} -> {newest.timestamp}")
except Exception as e:
print(f"Error for {domain}: {e}")
Analyze Website Evolution
from waybackpy import Wayback
from datetime import datetime
url = "https://example.com"
wayback = Wayback(url)
snapshots = wayback.snapshots()
dates = [s.datetime_timestamp() for s in snapshots]
# Find largest gaps between snapshots
gaps = []
for i in range(len(dates) - 1):
gap = dates[i + 1] - dates[i]
gaps.append((dates[i], gap.days))
# Sort by gap size
gaps.sort(key=lambda x: x[1], reverse=True)
for date, days in gaps[:5]:
print(f"Gap of {days} days after {date}")
Content Comparison
from waybackpy import Wayback
import requests
url = "https://example.com"
wayback = Wayback(url)
# Get two snapshots
oldest = wayback.oldest()
newest = wayback.newest()
# Fetch content
old_content = requests.get(oldest.archive_url).text
new_content = requests.get(newest.archive_url).text
# Compare sizes
print(f"Old version: {len(old_content)} bytes")
print(f"New version: {len(new_content)} bytes")
print(f"Change: {len(new_content) - len(old_content)} bytes")
Security Research
Reconnaissance Script
from waybackpy import Wayback
import re
url = "https://example.com"
wayback = Wayback(url)
snapshots = wayback.snapshots()
# Analyze patterns in snapshots
print(f"Total snapshots: {len(snapshots)}")
print(f"First snapshot: {snapshots[0].timestamp}")
print(f"Latest snapshot: {snapshots[-1].timestamp}")
# Count snapshots per year
from collections import Counter
years = [s.timestamp[:4] for s in snapshots]
year_counts = Counter(years)
for year, count in sorted(year_counts.items()):
print(f"{year}: {count} snapshots")
Find Exposed Credentials
from waybackpy import Wayback
import requests
import re
url = "https://example.com"
wayback = Wayback(url)
# Regex patterns for sensitive data
patterns = {
'api_keys': r'["\']?api[_-]?key["\']?\s*[:=]\s*["\']?[a-zA-Z0-9]{20,}',
'tokens': r'["\']?token["\']?\s*[:=]\s*["\']?[a-zA-Z0-9]{20,}',
'passwords': r'["\']?password["\']?\s*[:=]\s*["\'][^"\']{8,}',
}
snapshots = wayback.snapshots()[-20:] # Check recent
for snapshot in snapshots:
try:
content = requests.get(snapshot.archive_url, timeout=5).text
for pattern_name, pattern in patterns.items():
if re.search(pattern, content, re.IGNORECASE):
print(f"[!] Possible {pattern_name} in {snapshot.timestamp}")
except:
pass
Subdomain Discovery
from waybackpy import Wayback
import re
domains = []
# Check main domain
url = "https://example.com"
wayback = Wayback(url)
snapshots = wayback.snapshots()
for snapshot in snapshots:
# Extract from timestamp
# Look for subdomains in archive URLs
pass
print(f"Discovered {len(set(domains))} unique domains")
Advanced Features
Batch Processing
from waybackpy import Wayback
import concurrent.futures
urls = [
"https://example1.com",
"https://example2.com",
"https://example3.com",
]
def get_snapshot_count(url):
try:
wayback = Wayback(url)
return url, len(wayback.snapshots())
except Exception as e:
return url, 0
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
results = executor.map(get_snapshot_count, urls)
for url, count in results:
print(f"{url}: {count} snapshots")
Error Handling
from waybackpy import Wayback
url = "https://example.com"
try:
wayback = Wayback(url)
snapshots = wayback.snapshots()
print(f"Found {len(snapshots)} snapshots")
except Exception as e:
print(f"Error: {e}")
API Rate Limiting
from waybackpy import Wayback
import time
urls = ["https://example1.com", "https://example2.com"]
for url in urls:
wayback = Wayback(url)
snapshots = wayback.snapshots()
print(f"{url}: {len(snapshots)}")
time.sleep(2) # Respect API rate limits
Practical Examples
Website Backup Recovery
from waybackpy import Wayback
import requests
url = "https://lost-website.com"
wayback = Wayback(url)
newest = wayback.newest()
print(f"Recovering from: {newest.timestamp}")
# Download content
content = requests.get(newest.archive_url).text
# Save locally
with open('recovered_site.html', 'w') as f:
f.write(content)
Monitor Domain Changes
from waybackpy import Wayback
url = "https://example.com"
wayback = Wayback(url)
newest = wayback.newest()
print(f"Latest snapshot: {newest.timestamp}")
print(f"Archive URL: {newest.archive_url}")
# Track changes over time
snapshots = wayback.snapshots()
if len(snapshots) > 1:
prev = snapshots[-2]
curr = snapshots[-1]
print(f"Previous: {prev.timestamp}")
print(f"Current: {curr.timestamp}")
Export Historical Data
from waybackpy import Wayback
import json
url = "https://example.com"
wayback = Wayback(url)
snapshots = wayback.snapshots()
export = {
'url': url,
'total_snapshots': len(snapshots),
'snapshots': [
{
'timestamp': s.timestamp,
'archive_url': s.archive_url,
'status': s.status_code()
}
for s in snapshots
]
}
with open('snapshots.json', 'w') as f:
json.dump(export, f, indent=2)
Tips and Tricks
- Rate Limiting: Add delays between requests to avoid overloading the API
- Timeout Handling: Internet Archive may be slow; use reasonable timeouts
- Snapshot Availability: Not all URLs have snapshots; implement error handling
- Archive URLs: Use wayback.archive_url for direct access to cached content
- Date Filtering: Use datetime objects for precise snapshot targeting
- Batch Processing: Use threading for multiple URL analysis
Troubleshooting
No Snapshots Found
from waybackpy import Wayback
url = "https://example.com"
wayback = Wayback(url)
try:
snapshots = wayback.snapshots()
if not snapshots:
print("No snapshots found - domain too new or blocked")
except Exception as e:
print(f"Error: {e}")
Connection Issues
from waybackpy import Wayback
import requests
url = "https://example.com"
try:
wayback = Wayback(url)
snapshots = wayback.snapshots()
except requests.ConnectionError:
print("Connection error - check internet connection")
except requests.Timeout:
print("Request timeout - try again later")