Waybackpy
Overview
Waybackpy is a Python library that provides a simple interface to query the Wayback Machine (Internet Archive). It allows security researchers, penetration testers, and forensic analysts to programmatically access historical snapshots of websites for reconnaissance, vulnerability research, and content analysis.
Installation
Using pip
pip install waybackpy
From source
git clone https://github.com/akamhy/waybackpy.git
cd waybackpy
# Install through pip; invoking setup.py directly is deprecated by setuptools.
python -m pip install .
Requirements
pip install requests
Basic Usage
Retrieve Snapshots
from waybackpy import Wayback

# NOTE(review): confirm that `Wayback` and `snapshots()` match the public API
# of the waybackpy release you have installed before copying this example.
url = "https://example.com"
wayback = Wayback(url)

# Get available snapshots
snapshots = wayback.snapshots()
for snapshot in snapshots:
    print(snapshot)
Get Specific Snapshot
from waybackpy import Wayback
from datetime import datetime

# Look up the capture closest to 1 January 2020 and show its archive URL.
url = "https://example.com"
target_date = datetime(2020, 1, 1)

wayback = Wayback(url)
closest = wayback.near(target_date)
print(closest.archive_url)
Fetch Snapshot Content
from waybackpy import Wayback

url = "https://example.com"
wayback = Wayback(url)

# Oldest snapshot first, then the newest one; print each archive URL in turn.
for pick_snapshot in (wayback.oldest, wayback.newest):
    print(pick_snapshot().archive_url)
Common Commands
| Command | Description |
|---|---|
| `Wayback(url).snapshots()` | List all available snapshots |
| `Wayback(url).oldest()` | Retrieve oldest snapshot |
| `Wayback(url).newest()` | Retrieve newest snapshot |
| `Wayback(url).near(date)` | Get snapshot closest to date |
| `wayback.timestamp` | Get timestamp of snapshot |
| `wayback.archive_url` | Get full archive.org URL |
| `wayback.status_code()` | Check HTTP status of snapshot |
Snapshot Analysis
Iterate Through Historical Versions
from waybackpy import Wayback

url = "https://example.com"
wayback = Wayback(url)

# Get all snapshots for a URL
all_snapshots = wayback.snapshots()
print(f"Total snapshots: {len(all_snapshots)}")

# Report the timestamp and status of the final ten entries only.
for snapshot in all_snapshots[-10:]:  # Last 10
    print(f"{snapshot.timestamp}: {snapshot.status_code()}")
Filter Snapshots by Date Range
from waybackpy import Wayback
from datetime import datetime, timedelta

url = "https://example.com"
wayback = Wayback(url)

start_date = datetime(2018, 1, 1)
end_date = datetime(2020, 12, 31)
# `end_date` is midnight at the start of that day, so compare against the
# following midnight (exclusive) to keep captures made during the last day.
end_exclusive = end_date + timedelta(days=1)

snapshots = wayback.snapshots()
filtered = [
    s for s in snapshots
    if start_date <= s.datetime_timestamp() < end_exclusive
]
for snapshot in filtered:
    print(snapshot.timestamp)
Check Snapshot Status
from waybackpy import Wayback

url = "https://example.com"
wayback = Wayback(url)

# Get newest and check status
newest = wayback.newest()
status = newest.status_code()
print(f"Status code: {status}")

# Verify snapshot is accessible
if status == 200:
    print("Snapshot is accessible")
Web Forensics
Domain History Tracking
from waybackpy import Wayback

domains = [
    "example.com",
    "example.org",
    "example.net",
]

# Print the first and last capture timestamps for each domain. A failure for
# one domain is reported and the loop moves on to the next one.
for domain in domains:
    try:
        wayback = Wayback(f"https://{domain}")
        oldest = wayback.oldest()
        newest = wayback.newest()
        print(f"{domain}: {oldest.timestamp} -> {newest.timestamp}")
    except Exception as e:
        print(f"Error for {domain}: {e}")
Analyze Website Evolution
from waybackpy import Wayback
from datetime import datetime

url = "https://example.com"
wayback = Wayback(url)
snapshots = wayback.snapshots()

# Sort so that neighbouring entries are consecutive captures; otherwise an
# out-of-order list would produce negative gaps.
dates = sorted(s.datetime_timestamp() for s in snapshots)

# Find largest gaps between snapshots: pair each date with the one after it.
gaps = [
    (earlier, (later - earlier).days)
    for earlier, later in zip(dates, dates[1:])
]

# Sort by gap size
gaps.sort(key=lambda x: x[1], reverse=True)
for date, days in gaps[:5]:
    print(f"Gap of {days} days after {date}")
Content Comparison
from waybackpy import Wayback
import requests

url = "https://example.com"
wayback = Wayback(url)

# Get two snapshots
oldest = wayback.oldest()
newest = wayback.newest()

# Fetch content. `.content` is the raw body as bytes, so the sizes printed
# below really are byte counts; the timeout keeps a slow archive response
# from hanging the script indefinitely.
old_content = requests.get(oldest.archive_url, timeout=10).content
new_content = requests.get(newest.archive_url, timeout=10).content

# Compare sizes
print(f"Old version: {len(old_content)} bytes")
print(f"New version: {len(new_content)} bytes")
print(f"Change: {len(new_content) - len(old_content)} bytes")
Security Research
Reconnaissance Script
from waybackpy import Wayback
from collections import Counter
import re

url = "https://example.com"
wayback = Wayback(url)
snapshots = wayback.snapshots()

# Analyze patterns in snapshots
print(f"Total snapshots: {len(snapshots)}")

# Indexing [0] / [-1] raises IndexError on an empty result, so only report
# the first/latest capture and the per-year breakdown when there is data.
if snapshots:
    print(f"First snapshot: {snapshots[0].timestamp}")
    print(f"Latest snapshot: {snapshots[-1].timestamp}")

    # Count snapshots per year (the first four characters of the timestamp)
    year_counts = Counter(s.timestamp[:4] for s in snapshots)
    for year, count in sorted(year_counts.items()):
        print(f"{year}: {count} snapshots")
Find Exposed Credentials
from waybackpy import Wayback
import requests
import re

url = "https://example.com"
wayback = Wayback(url)

# Regex patterns for sensitive data, compiled once (case-insensitive) so the
# loop below does not re-resolve them for every snapshot.
patterns = {
    'api_keys': re.compile(r'["\']?api[_-]?key["\']?\s*[:=]\s*["\']?[a-zA-Z0-9]{20,}', re.IGNORECASE),
    'tokens': re.compile(r'["\']?token["\']?\s*[:=]\s*["\']?[a-zA-Z0-9]{20,}', re.IGNORECASE),
    'passwords': re.compile(r'["\']?password["\']?\s*[:=]\s*["\'][^"\']{8,}', re.IGNORECASE),
}

snapshots = wayback.snapshots()[-20:]  # Check recent
for snapshot in snapshots:
    try:
        content = requests.get(snapshot.archive_url, timeout=5).text
    except requests.RequestException as exc:
        # Report the failed download instead of silently hiding it, then
        # carry on with the remaining snapshots.
        print(f"[-] Could not fetch {snapshot.timestamp}: {exc}")
        continue

    for pattern_name, pattern in patterns.items():
        if pattern.search(content):
            print(f"[!] Possible {pattern_name} in {snapshot.timestamp}")
Subdomain Discovery
from waybackpy import Wayback
import re

domains = []

# Check main domain
url = "https://example.com"
wayback = Wayback(url)
snapshots = wayback.snapshots()

# Pull the host name out of the archived (original) URL.
# assumes archive URLs look like
# https://web.archive.org/web/<timestamp>/<original URL> — TODO confirm
host_pattern = re.compile(
    r'/web/\d+[a-z_]*/(?:[a-z][a-z0-9+.-]*://)?([^/:?#]+)',
    re.IGNORECASE,
)

for snapshot in snapshots:
    # Look for subdomains in archive URLs
    match = host_pattern.search(snapshot.archive_url)
    if match:
        domains.append(match.group(1).lower())

print(f"Discovered {len(set(domains))} unique domains")
Advanced Features
Batch Processing
from waybackpy import Wayback
import concurrent.futures
# Sites whose snapshot totals are gathered by the batch example.
urls = [
    "https://example1.com",
    "https://example2.com",
    "https://example3.com",
]
def get_snapshot_count(url):
    """Return ``(url, count)`` where count is the number of snapshots for *url*.

    Any exception raised during the lookup is turned into a count of 0, so a
    single failing URL cannot abort a whole batch run.
    """
    try:
        wayback = Wayback(url)
        return url, len(wayback.snapshots())
    except Exception:
        # Deliberate best-effort fallback: the caller only wants a number.
        return url, 0
# Fan the lookups out over a small thread pool and print each result as the
# mapped iterator yields it.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    results = executor.map(get_snapshot_count, urls)
    for url, count in results:
        print(f"{url}: {count} snapshots")
Error Handling
from waybackpy import Wayback

url = "https://example.com"

# Wrap both the construction and the query so any failure is reported
# instead of ending the script with a traceback.
try:
    wayback = Wayback(url)
    snapshots = wayback.snapshots()
    print(f"Found {len(snapshots)} snapshots")
except Exception as e:
    print(f"Error: {e}")
API Rate Limiting
from waybackpy import Wayback
import time

urls = ["https://example1.com", "https://example2.com"]

for url in urls:
    wayback = Wayback(url)
    snapshots = wayback.snapshots()
    print(f"{url}: {len(snapshots)}")
    time.sleep(2)  # Respect API rate limits
Practical Examples
Website Backup Recovery
from waybackpy import Wayback
import requests

url = "https://lost-website.com"
wayback = Wayback(url)
newest = wayback.newest()
print(f"Recovering from: {newest.timestamp}")

# Download content; the timeout stops a stalled response from blocking forever.
content = requests.get(newest.archive_url, timeout=10).text

# Save locally. An explicit encoding avoids UnicodeEncodeError on platforms
# whose default text encoding cannot represent every character in the page.
with open('recovered_site.html', 'w', encoding='utf-8') as f:
    f.write(content)
Monitor Domain Changes
from waybackpy import Wayback

url = "https://example.com"
wayback = Wayback(url)

newest = wayback.newest()
print(f"Latest snapshot: {newest.timestamp}")
print(f"Archive URL: {newest.archive_url}")

# Track changes over time: compare the final two entries when both exist.
snapshots = wayback.snapshots()
if len(snapshots) > 1:
    prev = snapshots[-2]
    curr = snapshots[-1]
    print(f"Previous: {prev.timestamp}")
    print(f"Current: {curr.timestamp}")
Export Historical Data
from waybackpy import Wayback
import json

url = "https://example.com"
wayback = Wayback(url)
snapshots = wayback.snapshots()

# One record per snapshot, plus a summary header.
export = {
    'url': url,
    'total_snapshots': len(snapshots),
    'snapshots': [
        {
            'timestamp': s.timestamp,
            'archive_url': s.archive_url,
            'status': s.status_code(),
        }
        for s in snapshots
    ],
}

# Explicit encoding keeps the output file independent of the platform default.
with open('snapshots.json', 'w', encoding='utf-8') as f:
    json.dump(export, f, indent=2)
Tips and Tricks
- Rate Limiting: Add delays between requests to avoid overloading the API
- Timeout Handling: Internet Archive may be slow; use reasonable timeouts
- Snapshot Availability: Not all URLs have snapshots; implement error handling
- Archive URLs: Use `wayback.archive_url` for direct access to cached content
- Date Filtering: Use `datetime` objects for precise snapshot targeting
- Batch Processing: Use threading for multiple URL analysis
Troubleshooting
No Snapshots Found
from waybackpy import Wayback

url = "https://example.com"
wayback = Wayback(url)

# Distinguish "the query worked but returned nothing" from "the query failed".
try:
    snapshots = wayback.snapshots()
    if not snapshots:
        print("No snapshots found - domain too new or blocked")
except Exception as e:
    print(f"Error: {e}")
Connection Issues
from waybackpy import Wayback
import requests

url = "https://example.com"

# NOTE(review): this assumes the library lets `requests` exceptions propagate
# to the caller — confirm against the installed waybackpy release.
try:
    wayback = Wayback(url)
    snapshots = wayback.snapshots()
except requests.Timeout:
    # Checked first: requests' ConnectTimeout derives from both Timeout and
    # ConnectionError, so the other order would report it as a connection error.
    print("Request timeout - try again later")
except requests.ConnectionError:
    print("Connection error - check internet connection")