pdfplumber Cheat Sheet

Overview

pdfplumber is a Python library for extracting detailed text, tables, and metadata from PDF files. Built on top of pdfminer.six, it provides precise access to every character, line, rectangle, and curve on each page, enabling fine-grained control over text extraction. The library excels at table extraction with customizable detection settings and offers visual debugging by rendering pages as images with overlays.

Unlike OCR-based tools, pdfplumber works with the underlying PDF text layer, making it fast and accurate for born-digital PDFs. It is widely used for processing invoices, financial reports, government forms, and scientific papers where precise table and layout extraction is critical.

Installation

# Base install
pip install pdfplumber

# With image rendering support (for visual debugging)
pip install "pdfplumber[image]"

Core Usage

Open and Read PDF

import pdfplumber

# Open PDF (the context manager closes the file when the block ends)
with pdfplumber.open("document.pdf") as pdf:
    # PDF metadata
    print(f"Pages: {len(pdf.pages)}")
    print(f"Metadata: {pdf.metadata}")

    # Extract text from first page
    page = pdf.pages[0]
    text = page.extract_text()
    print(text)

    # Extract text from all pages, one newline-terminated entry per page.
    # `or ""` keeps the concatenation safe for pages without a text layer
    # (some pdfplumber releases return None instead of an empty string).
    full_text = "".join(
        (page.extract_text() or "") + "\n" for page in pdf.pages
    )

Page Properties

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Page dimensions and the page's number
    for label, value in (
        ("Width", page.width),
        ("Height", page.height),
        ("Page number", page.page_number),
    ):
        print(f"{label}: {value}")

    # How many raw objects of each kind the page exposes
    for label, objects in (
        ("Characters", page.chars),
        ("Lines", page.lines),
        ("Rectangles", page.rects),
        ("Curves", page.curves),
        ("Images", page.images),
    ):
        print(f"{label}: {len(objects)}")

Text Extraction Options

# Simplest call: library defaults
text = page.extract_text()

# Keep the visual layout of the page
text = page.extract_text(layout=True)

# Tuned extraction, with the options gathered in one mapping
text_options = {
    "x_tolerance": 3,     # Horizontal tolerance for grouping chars
    "y_tolerance": 3,     # Vertical tolerance for grouping lines
    "layout": False,
    "x_density": 7.25,    # Characters per point (for layout mode)
    "y_density": 13,      # Lines per point (for layout mode)
}
text = page.extract_text(**text_options)

# Words together with their positions and the extra font attributes
word_options = {
    "x_tolerance": 3,
    "y_tolerance": 3,
    "keep_blank_chars": False,
    "use_text_flow": False,
    "extra_attrs": ["fontname", "size"],
}
words = page.extract_words(**word_options)
for word in words[:10]:
    content = word['text']
    location = f"({word['x0']:.1f}, {word['top']:.1f})"
    font = word.get('fontname')
    print(f"'{content}' at {location} font={font}")

Table Extraction

Basic Table Extraction

with pdfplumber.open("report.pdf") as pdf:
    page = pdf.pages[0]

    # Every table detected on the page, each as a list of rows
    tables = page.extract_tables()
    for number, table in enumerate(tables, start=1):
        row_count, col_count = len(table), len(table[0])
        print(f"Table {number}: {row_count} rows x {col_count} cols")
        for row in table:
            print(row)

    # A single table (pdfplumber picks the largest one on the page);
    # a falsy result means no table was found
    table = page.extract_table()
    if table:
        headers, records = table[0], table[1:]
        for row in records:
            # Pair each cell with the header of its column
            print(dict(zip(headers, row)))

Table Settings

# Table detection settings, spelled out in full and passed explicitly
table_settings = dict(
    # Strategy per axis: "lines", "lines_strict", "text" or "explicit"
    vertical_strategy="lines",
    horizontal_strategy="lines",
    explicit_vertical_lines=[],      # List of x-coordinates
    explicit_horizontal_lines=[],    # List of y-coordinates
    # Snapping tolerances
    snap_tolerance=3,
    snap_x_tolerance=3,
    snap_y_tolerance=3,
    # Joining tolerances
    join_tolerance=3,
    join_x_tolerance=3,
    join_y_tolerance=3,
    # Minimum edge length and word counts
    edge_min_length=3,
    min_words_vertical=3,
    min_words_horizontal=1,
    # Text tolerances
    text_tolerance=3,
    text_x_tolerance=3,
    text_y_tolerance=3,
    # Intersection tolerances
    intersection_tolerance=3,
    intersection_x_tolerance=3,
    intersection_y_tolerance=3,
)

tables = page.extract_tables(table_settings=table_settings)

Convert Tables to DataFrames

import pandas as pd

with pdfplumber.open("financials.pdf") as pdf:
    for page in pdf.pages:
        for table in page.extract_tables():
            # First row supplies the column names, the rest is data
            data_rows = table[1:]
            header_row = table[0]
            df = pd.DataFrame(data_rows, columns=header_row)
            print(df)
            print("---")

Cropping and Filtering

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Bounding boxes are (x0, top, x1, bottom); this one is the left half
    left_half = (0, 0, page.width / 2, page.height)
    cropped = page.crop(left_half)
    text = cropped.extract_text()

    # Keep only objects that lie entirely inside the box
    header_box = (50, 50, 550, 100)
    header_area = page.within_bbox(header_box)
    header_text = header_area.extract_text()

    # Keep characters larger than size 14; non-character objects pass through
    def keep_large_chars(obj):
        if obj["object_type"] != "char":
            return True
        return obj.get("size", 0) > 14

    large_text = page.filter(keep_large_chars)
    titles = large_text.extract_text()

    # Drop a 72-unit band at the top and bottom (headers/footers)
    body_box = (0, 72, page.width, page.height - 72)
    body = page.crop(body_box)
    body_text = body.extract_text()

Visual Debugging

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Plain render of the page at 150 resolution
    img = page.to_image(resolution=150)
    img.save("page_debug.png")

    # Overlay of what the table finder detected
    img = page.to_image()
    img.debug_tablefinder()
    img.save("table_debug.png")

    # Custom annotations: word boxes, page lines and one marker circle
    img = page.to_image()
    word_boxes = page.extract_words()
    img.draw_rects(word_boxes, stroke="blue", stroke_width=1)
    img.draw_lines(page.lines, stroke="red")
    marker_points = [(100, 200)]
    img.draw_circles(marker_points, radius=5, fill="green")
    img.save("annotated.png")

    # Filled rectangle over one specific region
    region = (50, 100, 400, 300)
    highlight_fill = (255, 0, 0, 50)
    img = page.to_image()
    img.draw_rect(region, fill=highlight_fill)
    img.save("highlighted.png")

Configuration

Character-Level Access

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Describe the first 20 characters: glyph, font, size and position
    for char in page.chars[:20]:
        details = (
            f"Char: '{char['text']}' ",
            f"Font: {char['fontname']} ",
            f"Size: {char['size']:.1f} ",
            f"Position: ({char['x0']:.1f}, {char['top']:.1f})",
        )
        print("".join(details))

    # Bucket the text of every character under its font name
    from collections import defaultdict
    fonts = defaultdict(list)
    for char in page.chars:
        bucket = fonts[char['fontname']]
        bucket.append(char['text'])

    # One summary line per font
    for font, texts in fonts.items():
        count = len(texts)
        print(f"{font}: {count} characters")

Password-Protected PDFs

# The password is supplied when opening; pages are then read as usual
with pdfplumber.open("protected.pdf", password="secret") as pdf:
    first_page = pdf.pages[0]
    text = first_page.extract_text()

Advanced Usage

Multi-Page Processing

import pdfplumber
import pandas as pd

def _unique_headers(headers, reserved=("page", "table_num")):
    """Turn a raw header row into unique, non-empty column labels.

    Empty or missing cells become "column_<position>". A label that repeats,
    or that collides with a name in `reserved`, gets a numeric suffix
    ("name_2", "name_3", ...). Labels that are already unique and non-empty
    are returned unchanged.
    """
    taken = set(reserved)
    labels = []
    for position, cell in enumerate(headers, start=1):
        base = cell if cell not in (None, "") else f"column_{position}"
        label, copy = base, 1
        while label in taken:
            copy += 1
            label = f"{base}_{copy}"
        taken.add(label)
        labels.append(label)
    return labels


def extract_all_tables(pdf_path):
    """Collect every table found in a PDF into a single DataFrame.

    The first row of each table is used as its header. Two bookkeeping
    columns are added to every table: "page" (1-based page position) and
    "table_num" (1-based position of the table on its page).

    Args:
        pdf_path: Path of the PDF, passed straight to pdfplumber.open().

    Returns:
        The concatenation of all tables with a fresh index, or an empty
        DataFrame when the document contains no tables.
    """
    all_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            for table_num, table in enumerate(page.extract_tables(), start=1):
                if not table:
                    # No rows at all, so there is nothing to use as a header
                    continue
                # Unique labels keep pd.concat from failing on duplicate
                # columns and stop a header called "page"/"table_num" from
                # being overwritten by the bookkeeping columns below.
                df = pd.DataFrame(table[1:], columns=_unique_headers(table[0]))
                df["page"] = page_num
                df["table_num"] = table_num
                all_tables.append(df)

    if all_tables:
        return pd.concat(all_tables, ignore_index=True)
    return pd.DataFrame()

# Gather every table of the report and write them to one CSV file,
# leaving the DataFrame index out of the output
source_pdf = "annual_report.pdf"
output_csv = "extracted_tables.csv"
result = extract_all_tables(source_pdf)
result.to_csv(output_csv, index=False)

Extract Specific Regions

# Bounding box (x0, top, x1, bottom) of each known form field
form_fields = dict(
    name=(100, 150, 400, 170),
    date=(100, 180, 300, 200),
    amount=(100, 210, 300, 230),
)

with pdfplumber.open("form.pdf") as pdf:
    page = pdf.pages[0]
    # Crop the page to each field's box and print the text found there
    for field_name, bbox in form_fields.items():
        value = page.crop(bbox).extract_text()
        print(f"{field_name}: {value}")

Integration with RAG

def pdf_to_chunks(pdf_path, chunk_size=1000):
    """Split the text of every page of a PDF into fixed-size chunks.

    Args:
        pdf_path: Path of the PDF to read; also stored unchanged under the
            "source" key of every chunk.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of dicts with the keys "text", "page" (the page's
        `page_number`) and "source". Chunks never span pages, and pages
        without extractable text contribute nothing.

    Raises:
        ValueError: If `chunk_size` is zero or negative.
    """
    # Fail fast: a zero step would only blow up inside range() once a page
    # with text is reached, and a negative step would silently yield nothing.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue  # no text on this page
            # Simple chunking by size
            chunks.extend(
                {
                    "text": text[start:start + chunk_size],
                    "page": page.page_number,
                    "source": pdf_path,
                }
                for start in range(0, len(text), chunk_size)
            )
    return chunks

Troubleshooting

Issue	Solution
Empty text extraction	PDF may be image-based; use an OCR tool instead
Garbled/overlapping text	Adjust `x_tolerance` and `y_tolerance`
Table not detected	Try `horizontal_strategy="text"`, adjust tolerances
Wrong table columns	Use `explicit_vertical_lines` with x-coordinates
Slow on large PDFs	Process specific pages, use multiprocessing
Memory issues	Process pages one at a time, close PDF promptly
Image rendering fails	Install Pillow: `pip install "pdfplumber[image]"`
Unicode errors	Check PDF encoding, try different `x_tolerance`

# Quick test: print the page count and the first 200 characters of page 1.
# The "or ''" fallback keeps the slice safe when the page has no extractable text.
python -c "
import pdfplumber
with pdfplumber.open('test.pdf') as pdf:
    print(f'Pages: {len(pdf.pages)}')
    print((pdf.pages[0].extract_text() or '')[:200])
"