pdfplumber Cheat Sheet
Overview
pdfplumber is a Python library for extracting detailed text, tables, and metadata from PDF files. Built on top of pdfminer.six, it provides precise access to every character, line, rectangle, and curve on each page, enabling fine-grained control over text extraction. The library excels at table extraction with customizable detection settings and offers visual debugging by rendering pages as images with overlays.
Unlike OCR-based tools, pdfplumber works with the underlying PDF text layer, making it fast and accurate for born-digital PDFs. It is widely used for processing invoices, financial reports, government forms, and scientific papers where precise table and layout extraction is critical.
Installation
pip install pdfplumber
# With image rendering support (for visual debugging)
# NOTE(review): confirm that the pdfplumber release you install defines an
# "image" extra; pip only prints a warning and installs the base package when
# a requested extra is unknown.
pip install "pdfplumber[image]"
Core Usage
Open and Read PDF
import pdfplumber

# Open PDF (the context manager closes the underlying file on exit)
with pdfplumber.open("document.pdf") as pdf:
    # PDF metadata
    print(f"Pages: {len(pdf.pages)}")
    print(f"Metadata: {pdf.metadata}")

    # Extract text from first page
    page = pdf.pages[0]
    text = page.extract_text()
    print(text)

    # Extract text from all pages, one trailing newline per page.
    # `or ""` keeps a page without extractable text from breaking the
    # concatenation (some pdfplumber releases return None for such pages),
    # and a single join avoids rebuilding the string once per page.
    full_text = "".join((p.extract_text() or "") + "\n" for p in pdf.pages)
Page Properties
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Page geometry and its number within the document
    print(f"Width: {page.width}")
    print(f"Height: {page.height}")
    print(f"Page number: {page.page_number}")

    # Access raw objects: each attribute is a list, so len() gives a count
    print(f"Characters: {len(page.chars)}")
    print(f"Lines: {len(page.lines)}")
    print(f"Rectangles: {len(page.rects)}")
    print(f"Curves: {len(page.curves)}")
    print(f"Images: {len(page.images)}")
Text Extraction Options
# NOTE: `page` is a pdfplumber page obtained as in the earlier snippets.

# Default text extraction
text = page.extract_text()

# With layout preservation
text = page.extract_text(layout=True)

# Custom extraction settings
text = page.extract_text(
    x_tolerance=3,   # Horizontal tolerance for grouping chars
    y_tolerance=3,   # Vertical tolerance for grouping lines
    layout=False,
    x_density=7.25,  # Characters per point (for layout mode)
    y_density=13,    # Lines per point (for layout mode)
)

# Extract words with positions; extra_attrs copies the listed character
# attributes onto every word dict.
words = page.extract_words(
    x_tolerance=3,
    y_tolerance=3,
    keep_blank_chars=False,
    use_text_flow=False,
    extra_attrs=["fontname", "size"],
)

# Show the first ten words with their top-left position and font
for word in words[:10]:
    print(f"'{word['text']}' at ({word['x0']:.1f}, {word['top']:.1f}) font={word.get('fontname')}")
Table Extraction
Basic Table Extraction
with pdfplumber.open("report.pdf") as pdf:
    page = pdf.pages[0]

    # Find all tables: each table is a list of rows, each row a list of cells
    tables = page.extract_tables()
    for i, table in enumerate(tables):
        print(f"Table {i+1}: {len(table)} rows x {len(table[0])} cols")
        for row in table:
            print(row)

    # Extract a single table. extract_table() returns the LARGEST table on
    # the page (not simply the first one found), or None when no table is
    # detected -- hence the truthiness check below.
    table = page.extract_table()
    if table:
        # Treat the first row as the header and pair it with each data row
        headers = table[0]
        for row in table[1:]:
            print(dict(zip(headers, row)))
Table Settings
# Custom table detection settings
table_settings = {
    # How cell borders are located: lines, lines_strict, text, explicit
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines",
    # Only consulted by the "explicit" strategy
    "explicit_vertical_lines": [],    # List of x-coordinates
    "explicit_horizontal_lines": [],  # List of y-coordinates
    # Snapping of nearly-aligned edges
    "snap_tolerance": 3,
    "snap_x_tolerance": 3,
    "snap_y_tolerance": 3,
    # Joining of nearly-touching edge segments
    "join_tolerance": 3,
    "join_x_tolerance": 3,
    "join_y_tolerance": 3,
    # Edges shorter than this are discarded
    "edge_min_length": 3,
    # Thresholds used by the "text" strategy
    "min_words_vertical": 3,
    "min_words_horizontal": 1,
    # Tolerances applied when grouping text inside the table
    "text_tolerance": 3,
    "text_x_tolerance": 3,
    "text_y_tolerance": 3,
    # How close edges must be to count as intersecting
    "intersection_tolerance": 3,
    "intersection_x_tolerance": 3,
    "intersection_y_tolerance": 3,
}

# `page` is a pdfplumber page obtained as in the earlier snippets.
tables = page.extract_tables(table_settings=table_settings)
Convert Tables to DataFrames
import pandas as pd

with pdfplumber.open("financials.pdf") as pdf:
    for page in pdf.pages:
        tables = page.extract_tables()
        for table in tables:
            # First row supplies the column labels, the rest is data
            column_labels = table[0]
            data_rows = table[1:]
            df = pd.DataFrame(data_rows, columns=column_labels)
            print(df)
            print("---")
Cropping and Filtering
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Crop page to specific region
    # bbox = (x0, top, x1, bottom)
    left_half_bbox = (0, 0, page.width / 2, page.height)
    cropped = page.crop(left_half_bbox)  # Left half
    text = cropped.extract_text()

    # Within bounding box
    header_bbox = (50, 50, 550, 100)
    header_area = page.within_bbox(header_bbox)
    header_text = header_area.extract_text()

    # Filter by properties
    def is_large_or_not_char(obj):
        # Non-character objects always pass; characters must be larger than 14.
        if obj["object_type"] != "char":
            return True
        return obj.get("size", 0) > 14

    large_text = page.filter(is_large_or_not_char)
    titles = large_text.extract_text()

    # Filter out headers/footers by trimming 72 units off the top and bottom
    body_bbox = (0, 72, page.width, page.height - 72)
    body = page.crop(body_bbox)
    body_text = body.extract_text()
Visual Debugging
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Each section below starts from a fresh page.to_image(), so overlays
    # drawn for one output file do not carry over into the next.

    # Render page as image
    img = page.to_image(resolution=150)
    img.save("page_debug.png")

    # Draw detected table lines
    img = page.to_image()
    img.debug_tablefinder()
    img.save("table_debug.png")

    # Draw custom annotations
    img = page.to_image()
    img.draw_rects(page.extract_words(), stroke="blue", stroke_width=1)
    img.draw_lines(page.lines, stroke="red")
    img.draw_circles([(100, 200)], radius=5, fill="green")
    img.save("annotated.png")

    # Highlight specific regions (fill is an RGBA tuple)
    img = page.to_image()
    img.draw_rect((50, 100, 400, 300), fill=(255, 0, 0, 50))
    img.save("highlighted.png")
Configuration
Character-Level Access
from collections import defaultdict

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Access individual characters (first 20 only, to keep the output short)
    for char in page.chars[:20]:
        print(f"Char: '{char['text']}' "
              f"Font: {char['fontname']} "
              f"Size: {char['size']:.1f} "
              f"Position: ({char['x0']:.1f}, {char['top']:.1f})")

    # Group characters by font: font name -> list of character strings
    fonts = defaultdict(list)
    for char in page.chars:
        fonts[char['fontname']].append(char['text'])

    for font, chars in fonts.items():
        print(f"{font}: {len(chars)} characters")
Password-Protected PDFs
# The password is passed as a keyword argument when opening the file
with pdfplumber.open("protected.pdf", password="secret") as pdf:
    first_page = pdf.pages[0]
    text = first_page.extract_text()
Advanced Usage
Multi-Page Processing
import pdfplumber
import pandas as pd
def _unique_headers(header_row, reserved=()):
    """Return one usable, unique column label per cell of *header_row*.

    Blank cells (``None`` or ``""``) become ``col_<position>`` (1-based), and
    a label that repeats an earlier one, or a name in *reserved*, gets a
    numeric suffix (``_2``, ``_3``, ...). Other labels are returned unchanged.
    """
    taken = set(reserved)
    labels = []
    for position, cell in enumerate(header_row, start=1):
        base = cell if cell not in (None, "") else f"col_{position}"
        label = base
        suffix = 2
        while label in taken:
            label = f"{base}_{suffix}"
            suffix += 1
        taken.add(label)
        labels.append(label)
    return labels


def extract_all_tables(pdf_path):
    """Collect every table in a PDF into a single DataFrame.

    The first row of each table is used as its header. Two bookkeeping
    columns are added: ``page`` (1-based page index) and ``table_num``
    (1-based position of the table on its page).

    Args:
        pdf_path: Path of the PDF to read, passed to ``pdfplumber.open``.

    Returns:
        The concatenation of all extracted tables, or an empty DataFrame when
        the PDF contains none.
    """
    all_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            for j, table in enumerate(page.extract_tables()):
                if not table:
                    # No rows at all, so there is no header row to index.
                    continue
                # Repeated or blank header cells would give duplicate column
                # labels, which pd.concat cannot align across frames. A header
                # literally named "page" or "table_num" would otherwise be
                # overwritten by the bookkeeping columns below.
                columns = _unique_headers(
                    table[0], reserved=("page", "table_num")
                )
                df = pd.DataFrame(table[1:], columns=columns)
                df["page"] = i + 1
                df["table_num"] = j + 1
                all_tables.append(df)
    if all_tables:
        return pd.concat(all_tables, ignore_index=True)
    return pd.DataFrame()
# Gather every table from the report, then write the combined frame to CSV
# without the DataFrame index column.
result = extract_all_tables("annual_report.pdf")
result.to_csv("extracted_tables.csv", index=False)
Extract Specific Regions
# Extract data from known form fields
form_fields = {
"name": (100, 150, 400, 170),
"date": (100, 180, 300, 200),
"amount": (100, 210, 300, 230),
}
with pdfplumber.open("form.pdf") as pdf:
page = pdf.pages[0]
for field_name, bbox in form_fields.items():
cropped = page.crop(bbox)
value = cropped.extract_text()
print(f"{field_name}: {value}")
Integration with RAG
def pdf_to_chunks(pdf_path, chunk_size=1000):
    """Split the text of each page of a PDF into fixed-size chunks.

    Chunking is by character count only and is done page by page, so a chunk
    never spans two pages and the last chunk of a page may be shorter than
    ``chunk_size``. Pages with no extractable text are skipped.

    Args:
        pdf_path: Path of the PDF to read, passed to ``pdfplumber.open``.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of dicts with keys ``"text"`` (the chunk), ``"page"`` (the
        page's ``page_number``) and ``"source"`` (``pdf_path``).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Validate before opening the file: a negative size would otherwise
    # silently produce no chunks, and zero fails inside range() with a
    # message that does not mention chunk_size.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue
            # Simple chunking by size
            chunks.extend(
                {
                    "text": text[start:start + chunk_size],
                    "page": page.page_number,
                    "source": pdf_path,
                }
                for start in range(0, len(text), chunk_size)
            )
    return chunks
Troubleshooting
| Issue | Solution |
|---|---|
| Empty text extraction | PDF may be image-based (scanned); use an OCR tool instead |
| Garbled/overlapping text | Adjust x_tolerance and y_tolerance |
| Table not detected | Try horizontal_strategy="text", adjust tolerances |
| Wrong table columns | Use explicit_vertical_lines with x-coordinates |
| Slow on large PDFs | Process specific pages, use multiprocessing |
| Memory issues | Process pages one at a time, close PDF promptly |
| Image rendering fails | Install Pillow: pip install "pdfplumber[image]" |
| Unicode errors | Check PDF encoding, try different x_tolerance |
# Quick test: print the page count and the first 200 characters of page 1.
# The body of the `with` statement must be indented for Python to accept it;
# `or ''` keeps the slice valid if the page yields no text.
python -c "
import pdfplumber
with pdfplumber.open('test.pdf') as pdf:
    print(f'Pages: {len(pdf.pages)}')
    print((pdf.pages[0].extract_text() or '')[:200])
"