Skip to content

Waybackpy

Waybackpy is a Python library that provides a simple interface to query the Wayback Machine (Internet Archive). It allows security researchers, penetration testers, and forensic analysts to programmatically access historical snapshots of websites for reconnaissance, vulnerability research, and content analysis.

# Install the latest release from PyPI
pip install waybackpy
# Or install from source
git clone https://github.com/akamhy/waybackpy.git
cd waybackpy
python setup.py install
# Runtime dependency used by the examples below
pip install requests
from waybackpy import Wayback

target = "https://example.com"
archive = Wayback(target)

# Enumerate every capture the Wayback Machine holds for the URL.
for snap in archive.snapshots():
    print(snap)
from waybackpy import Wayback
from datetime import datetime

site = "https://example.com"
machine = Wayback(site)

# Locate the capture closest to 2020-01-01 and show its archive URL.
when = datetime(2020, 1, 1)
closest = machine.near(when)
print(closest.archive_url)
from waybackpy import Wayback

site = "https://example.com"
machine = Wayback(site)

# Earliest capture on record.
first = machine.oldest()
print(first.archive_url)

# Most recent capture on record.
last = machine.newest()
print(last.archive_url)
| Command | Description |
| --- | --- |
| `Wayback(url).snapshots()` | List all available snapshots |
| `Wayback(url).oldest()` | Retrieve oldest snapshot |
| `Wayback(url).newest()` | Retrieve newest snapshot |
| `Wayback(url).near(date)` | Get snapshot closest to date |
| `wayback.timestamp` | Get timestamp of snapshot |
| `wayback.archive_url` | Get full archive.org URL |
| `wayback.status_code()` | Check HTTP status of snapshot |
from waybackpy import Wayback

target = "https://example.com"
archive = Wayback(target)

# Pull the full capture history for the URL.
history = archive.snapshots()

print(f"Total snapshots: {len(history)}")
# Show only the ten most recent captures with their HTTP status.
for snap in history[-10:]:
    print(f"{snap.timestamp}: {snap.status_code()}")
from waybackpy import Wayback
from datetime import datetime, timedelta

site = "https://example.com"
machine = Wayback(site)

window_start = datetime(2018, 1, 1)
window_end = datetime(2020, 12, 31)

# Keep only captures whose timestamp falls inside the date window.
in_window = [
    snap for snap in machine.snapshots()
    if window_start <= snap.datetime_timestamp() <= window_end
]

for snap in in_window:
    print(snap.timestamp)
from waybackpy import Wayback

site = "https://example.com"
machine = Wayback(site)

# Fetch the newest capture and probe its HTTP status.
latest = machine.newest()
code = latest.status_code()
print(f"Status code: {code}")

# A 200 means the archived copy can be fetched directly.
if code == 200:
    print("Snapshot is accessible")
from waybackpy import Wayback

targets = ["example.com", "example.org", "example.net"]

# Report the first and last capture timestamp for each domain,
# continuing past domains that fail to resolve in the archive.
for name in targets:
    try:
        machine = Wayback(f"https://{name}")
        first = machine.oldest()
        last = machine.newest()
        print(f"{name}: {first.timestamp} -> {last.timestamp}")
    except Exception as err:
        print(f"Error for {name}: {err}")
from waybackpy import Wayback
from datetime import datetime

site = "https://example.com"
machine = Wayback(site)

stamps = [snap.datetime_timestamp() for snap in machine.snapshots()]

# Day-length gap between each consecutive pair of capture dates,
# keyed by the earlier capture's date.
gaps = [
    (earlier, (later - earlier).days)
    for earlier, later in zip(stamps, stamps[1:])
]

# Report the five longest coverage gaps.
gaps.sort(key=lambda pair: pair[1], reverse=True)
for stamp, day_count in gaps[:5]:
    print(f"Gap of {day_count} days after {stamp}")
from waybackpy import Wayback
import requests

url = "https://example.com"
wayback = Wayback(url)

# Get the two endpoints of the capture history
oldest = wayback.oldest()
newest = wayback.newest()

# Fetch raw bytes (.content). The original used .text, whose length is
# a character count after decoding — mislabeled below as "bytes".
# A timeout keeps a slow archive from hanging the script.
old_content = requests.get(oldest.archive_url, timeout=30).content
new_content = requests.get(newest.archive_url, timeout=30).content

# Compare sizes
print(f"Old version: {len(old_content)} bytes")
print(f"New version: {len(new_content)} bytes")
print(f"Change: {len(new_content) - len(old_content)} bytes")
from waybackpy import Wayback
import re
from collections import Counter

site = "https://example.com"
machine = Wayback(site)

history = machine.snapshots()

# Summarize the overall capture history.
print(f"Total snapshots: {len(history)}")
print(f"First snapshot: {history[0].timestamp}")
print(f"Latest snapshot: {history[-1].timestamp}")

# Bucket captures by year — the first four characters of a
# Wayback timestamp (YYYYMMDDhhmmss) are the year.
per_year = Counter(snap.timestamp[:4] for snap in history)
for year, count in sorted(per_year.items()):
    print(f"{year}: {count} snapshots")
from waybackpy import Wayback
import requests
import re

url = "https://example.com"
wayback = Wayback(url)

# Regex patterns for sensitive data
patterns = {
    'api_keys': r'["\']?api[_-]?key["\']?\s*[:=]\s*["\']?[a-zA-Z0-9]{20,}',
    'tokens': r'["\']?token["\']?\s*[:=]\s*["\']?[a-zA-Z0-9]{20,}',
    'passwords': r'["\']?password["\']?\s*[:=]\s*["\'][^"\']{8,}',
}

snapshots = wayback.snapshots()[-20:]  # Check recent

for snapshot in snapshots:
    try:
        content = requests.get(snapshot.archive_url, timeout=5).text
    except requests.RequestException:
        # Skip snapshots that fail to download. The original bare
        # `except: pass` also swallowed KeyboardInterrupt and real bugs.
        continue
    # Flag any snapshot whose body matches a sensitive-data pattern.
    for pattern_name, pattern in patterns.items():
        if re.search(pattern, content, re.IGNORECASE):
            print(f"[!] Possible {pattern_name} in {snapshot.timestamp}")
from waybackpy import Wayback
import re

# Hostnames discovered so far; currently never populated (see NOTE below).
domains = []

# Check main domain
url = "https://example.com"
wayback = Wayback(url)

snapshots = wayback.snapshots()

for snapshot in snapshots:
    # Extract from timestamp
    # Look for subdomains in archive URLs
    # NOTE(review): placeholder loop — no extraction logic is implemented,
    # so `domains` stays empty and the count below always prints 0.
    pass

print(f"Discovered {len(set(domains))} unique domains")
from waybackpy import Wayback
import concurrent.futures

urls = [
    "https://example1.com",
    "https://example2.com",
    "https://example3.com",
]

def get_snapshot_count(url):
    """Return (url, snapshot_count); count is 0 when the lookup fails."""
    try:
        wayback = Wayback(url)
        return url, len(wayback.snapshots())
    except Exception:
        # Original bound the exception to an unused name. Treat any
        # lookup failure as "no snapshots" so one bad URL does not
        # abort the whole batch.
        return url, 0

# Fan the lookups out across at most 5 threads; exiting the `with`
# block waits for all of them to finish.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    results = executor.map(get_snapshot_count, urls)

for url, count in results:
    print(f"{url}: {count} snapshots")
from waybackpy import Wayback

target = "https://example.com"

# Wrap the lookup so a missing or unreachable archive is reported
# instead of crashing the script.
try:
    snapshot_list = Wayback(target).snapshots()
    print(f"Found {len(snapshot_list)} snapshots")
except Exception as err:
    print(f"Error: {err}")
from waybackpy import Wayback
import time

targets = ["https://example1.com", "https://example2.com"]

# One query per domain, pausing between requests to respect
# the Internet Archive's rate limits.
for target in targets:
    count = len(Wayback(target).snapshots())
    print(f"{target}: {count}")
    time.sleep(2)
from waybackpy import Wayback
import requests

url = "https://lost-website.com"
wayback = Wayback(url)

# Use the most recent capture as the recovery source
newest = wayback.newest()
print(f"Recovering from: {newest.timestamp}")

# Download content (timeout keeps a slow archive from hanging us)
content = requests.get(newest.archive_url, timeout=30).text

# Save locally. Write UTF-8 explicitly: without it the file uses the
# platform's locale encoding, which can raise UnicodeEncodeError on
# non-ASCII page content.
with open('recovered_site.html', 'w', encoding='utf-8') as f:
    f.write(content)
from waybackpy import Wayback

site = "https://example.com"
machine = Wayback(site)

# Report the most recent capture.
latest = machine.newest()
print(f"Latest snapshot: {latest.timestamp}")
print(f"Archive URL: {latest.archive_url}")

# Track changes over time: compare the two most recent captures.
history = machine.snapshots()
if len(history) > 1:
    previous, current = history[-2], history[-1]
    print(f"Previous: {previous.timestamp}")
    print(f"Current: {current.timestamp}")
from waybackpy import Wayback
import json

url = "https://example.com"
wayback = Wayback(url)

snapshots = wayback.snapshots()

# Serializable summary: one record per capture.
export = {
    'url': url,
    'total_snapshots': len(snapshots),
    'snapshots': [
        {
            'timestamp': s.timestamp,
            'archive_url': s.archive_url,
            'status': s.status_code()
        }
        for s in snapshots
    ]
}

# Write UTF-8 explicitly so the output file does not depend on the
# platform's locale encoding.
with open('snapshots.json', 'w', encoding='utf-8') as f:
    json.dump(export, f, indent=2)
  • Rate Limiting: Add delays between requests to avoid overloading the API
  • Timeout Handling: Internet Archive may be slow; use reasonable timeouts
  • Snapshot Availability: Not all URLs have snapshots; implement error handling
  • Archive URLs: Use wayback.archive_url for direct access to cached content
  • Date Filtering: Use datetime objects for precise snapshot targeting
  • Batch Processing: Use threading for multiple URL analysis
from waybackpy import Wayback

site = "https://example.com"
machine = Wayback(site)

# Distinguish "no captures on record" from a hard failure.
try:
    history = machine.snapshots()
    if not history:
        print("No snapshots found - domain too new or blocked")
except Exception as err:
    print(f"Error: {err}")
from waybackpy import Wayback
import requests

url = "https://example.com"

try:
    wayback = Wayback(url)
    snapshots = wayback.snapshots()
except requests.ConnectionError:
    print("Connection error - check internet connection")
except requests.Timeout:
    print("Request timeout - try again later")
except requests.RequestException as err:
    # Catch-all for other requests failures (HTTP errors, invalid URLs,
    # too many redirects) that the two clauses above would let escape.
    print(f"Request failed: {err}")