Awesome-Agent-Skills-for-Empirical-Research · pdf-extraction-guide

PDF parsing, text extraction, and document format conversion

Install

Source · Clone the upstream repo:
git clone https://github.com/brycewang-stanford/Awesome-Agent-Skills-for-Empirical-Research
Claude Code · Install into ~/.claude/skills/:
T=$(mktemp -d) && git clone --depth=1 https://github.com/brycewang-stanford/Awesome-Agent-Skills-for-Empirical-Research "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/43-wentorai-research-plugins/skills/tools/document/pdf-extraction-guide" ~/.claude/skills/brycewang-stanford-awesome-agent-skills-for-empirical-research-pdf-extraction-gu && rm -rf "$T"
Manifest: skills/43-wentorai-research-plugins/skills/tools/document/pdf-extraction-guide/SKILL.md

Source content

PDF Extraction Guide

Extract text, tables, figures, and metadata from academic PDFs using Python libraries, with strategies for handling multi-column layouts, mathematical content, and scanned documents.

PDF Extraction Tools Comparison

| Tool | Text | Tables | Figures | Layout | OCR | Speed |
|---|---|---|---|---|---|---|
| PyMuPDF (fitz) | Excellent | Manual | Yes | Blocks | No (add with OCR engine) | Fast |
| pdfplumber | Good | Excellent | No | Tables focus | No | Medium |
| PyPDF2 / pypdf | Basic | No | No | No | No | Fast |
| Tabula-py | No | Excellent | No | No | No | Medium |
| GROBID | Structured | Yes | References | Academic layout | No | Slow (ML-based) |
| Nougat (Meta) | Excellent | Yes | Yes | Academic layout | Built-in | Slow (GPU) |
| Marker | Excellent | Yes | Yes | Multi-column | Built-in | Medium |
| pdf2image + Tesseract | Via OCR | Via OCR | Via OCR | No | Yes | Slow |
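
A quick first check that narrows the tool choice: whether the PDF has a text layer at all. The helper below is a minimal sketch (function name and thresholds are illustrative, not from the source) that samples the first few pages with PyMuPDF; empty output suggests a scanned document and an OCR-based tool.

import fitz  # pip install PyMuPDF

def has_text_layer(pdf_path, sample_pages=3, min_chars=50):
    """Heuristic: treat the PDF as text-based if its first pages yield real text."""
    doc = fitz.open(pdf_path)
    chars = 0
    for page_num in range(min(sample_pages, doc.page_count)):
        chars += len(doc[page_num].get_text("text").strip())
    doc.close()
    return chars >= min_chars

# has_text_layer("paper.pdf") -> True: PyMuPDF/pdfplumber; False: OCR route (Tesseract, Marker, Nougat)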

PyMuPDF (fitz) — Fast Text Extraction

Basic Text Extraction

import fitz  # pip install PyMuPDF

def extract_text(pdf_path):
    """Extract all text from a PDF with page numbers."""
    doc = fitz.open(pdf_path)
    full_text = []

    for page_num, page in enumerate(doc, 1):
        text = page.get_text("text")
        full_text.append(f"--- Page {page_num} ---\n{text}")

    doc.close()
    return "\n".join(full_text)

# Usage
text = extract_text("paper.pdf")
print(text[:2000])
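
Beyond the plain "text" mode, get_text("blocks", sort=True) returns layout blocks in reading order, which helps with the two-column layouts common in journals (the best practices at the end of this guide mention the same option). A minimal sketch:

def extract_text_sorted(pdf_path):
    """Extract text block-by-block, sorted for reading order (helps multi-column layouts)."""
    doc = fitz.open(pdf_path)  # fitz imported above
    out = []
    for page in doc:
        # Each block is a tuple: (x0, y0, x1, y1, text, block_no, block_type)
        for block in page.get_text("blocks", sort=True):
            if block[6] == 0:  # 0 = text block, 1 = image block
                out.append(block[4])
    doc.close()
    return "\n".join(out)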

Structured Block-Level Extraction

def extract_structured(pdf_path):
    """Extract text with layout information (blocks, lines, spans)."""
    doc = fitz.open(pdf_path)
    pages = []

    for page_num, page in enumerate(doc):
        blocks = page.get_text("dict")["blocks"]
        page_data = {"page": page_num + 1, "blocks": []}

        for block in blocks:
            if "lines" not in block:
                continue  # Skip image blocks

            block_text = ""
            max_font_size = 0
            is_bold = False

            for line in block["lines"]:
                for span in line["spans"]:
                    block_text += span["text"]
                    max_font_size = max(max_font_size, span["size"])
                    if "Bold" in span.get("font", ""):
                        is_bold = True
                block_text += "\n"

            page_data["blocks"].append({
                "text": block_text.strip(),
                "font_size": max_font_size,
                "is_bold": is_bold,
                "bbox": block["bbox"]  # (x0, y0, x1, y1)
            })

        pages.append(page_data)

    doc.close()
    return pages

# Identify section headings
pages = extract_structured("paper.pdf")
for page in pages:
    for block in page["blocks"]:
        if block["is_bold"] and block["font_size"] > 11:
            print(f"[Heading] {block['text'][:80]}")

Extract Images and Figures

def extract_images(pdf_path, output_dir="./images"):
    """Extract all images from a PDF."""
    import os
    os.makedirs(output_dir, exist_ok=True)

    doc = fitz.open(pdf_path)
    img_count = 0

    for page_num, page in enumerate(doc):
        images = page.get_images(full=True)
        for img_idx, img in enumerate(images):
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)

            if pix.n - pix.alpha > 3:  # CMYK
                pix = fitz.Pixmap(fitz.csRGB, pix)

            filename = f"{output_dir}/page{page_num+1}_img{img_idx+1}.png"
            pix.save(filename)
            img_count += 1

    doc.close()
    print(f"Extracted {img_count} images to {output_dir}")

pdfplumber — Table Extraction

import pdfplumber

def extract_tables(pdf_path):
    """Extract all tables from a PDF."""
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            page_tables = page.extract_tables()
            for table_idx, table in enumerate(page_tables):
                tables.append({
                    "page": page_num + 1,
                    "table_index": table_idx,
                    "data": table
                })
    return tables

# Convert extracted table to pandas DataFrame
import pandas as pd

tables = extract_tables("paper.pdf")
for t in tables:
    if t["data"]:
        df = pd.DataFrame(t["data"][1:], columns=t["data"][0])
        print(f"\nTable on page {t['page']}:")
        print(df.to_string())
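
extract_tables() assumes ruled tables by default. For borderless tables, pdfplumber accepts a table_settings dict that switches detection to text alignment; the sketch below is a starting point to tune per document, not a universal recipe.

import pdfplumber

def extract_borderless_tables(pdf_path):
    """Detect tables by text alignment instead of ruling lines."""
    settings = {
        "vertical_strategy": "text",    # infer columns from word x-positions
        "horizontal_strategy": "text",  # infer rows from word y-positions
    }
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            for table in page.extract_tables(settings):
                tables.append({"page": page_num, "data": table})
    return tables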

GROBID — Structured Academic Paper Parsing

GROBID uses machine learning to parse academic PDFs into structured TEI XML.
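
Note that GROBID runs as a standalone web service, most easily via its Docker image listening on port 8070 (see the GROBID docs for the current image and tag). A quick liveness check in Python before sending PDFs:

import requests

def grobid_is_alive(grobid_url="http://localhost:8070"):
    """Return True if a GROBID service responds at grobid_url."""
    try:
        # GROBID exposes a liveness endpoint at /api/isalive
        return requests.get(f"{grobid_url}/api/isalive", timeout=5).ok
    except requests.RequestException:
        return False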

import requests

def parse_with_grobid(pdf_path, grobid_url="http://localhost:8070"):
    """Parse a paper PDF using GROBID."""
    with open(pdf_path, "rb") as f:
        response = requests.post(
            f"{grobid_url}/api/processFulltextDocument",
            files={"input": f},
            data={"consolidateHeader": 1, "consolidateCitations": 1}
        )

    if response.status_code == 200:
        return response.text  # TEI XML
    else:
        raise Exception(f"GROBID error: {response.status_code}")

# Parse the TEI XML
from lxml import etree

tei_xml = parse_with_grobid("paper.pdf")
root = etree.fromstring(tei_xml.encode())
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

# Extract title
title = root.find(".//tei:titleStmt/tei:title", ns)
print(f"Title: {title.text if title is not None else 'N/A'}")

# Extract abstract
abstract = root.find(".//tei:profileDesc/tei:abstract", ns)
if abstract is not None:
    # The abstract text is nested in child elements, so join all text nodes
    print("Abstract:", " ".join("".join(abstract.itertext()).split()))

# Extract references
refs = root.findall(".//tei:listBibl/tei:biblStruct", ns)
print(f"References found: {len(refs)}")
for ref in refs[:5]:
    title_elem = ref.find(".//tei:title", ns)
    print(f"  - {title_elem.text if title_elem is not None else 'N/A'}")

Document Chunking for RAG

Split documents into semantically meaningful chunks for retrieval-augmented generation:

def chunk_academic_paper(pdf_path, max_chunk_size=1000, overlap=200):
    """Chunk an academic paper by sections with overlap."""
    pages = extract_structured(pdf_path)

    # Identify sections
    sections = []
    current_section = {"heading": "Preamble", "text": ""}

    for page in pages:
        for block in page["blocks"]:
            if block["is_bold"] and block["font_size"] > 11 and len(block["text"]) < 100:
                if current_section["text"].strip():
                    sections.append(current_section)
                current_section = {"heading": block["text"], "text": ""}
            else:
                current_section["text"] += block["text"] + "\n"

    if current_section["text"].strip():
        sections.append(current_section)

    # Split long sections into overlapping chunks
    chunks = []
    for section in sections:
        text = section["text"]
        if len(text) <= max_chunk_size:
            chunks.append({
                "heading": section["heading"],
                "text": text,
                "chunk_index": 0
            })
        else:
            words = text.split()
            start = 0
            chunk_idx = 0
            while start < len(words):
                end = start + max_chunk_size // 5  # Approximate words
                chunk_text = " ".join(words[start:end])
                chunks.append({
                    "heading": section["heading"],
                    "text": chunk_text,
                    "chunk_index": chunk_idx
                })
                start = end - overlap // 5  # Overlap in words
                chunk_idx += 1

    return chunks
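
Usage, reusing extract_structured() from the PyMuPDF section. The size heuristics assume roughly five characters per word, so tune max_chunk_size and overlap to your embedding model's context window:

chunks = chunk_academic_paper("paper.pdf")
print(f"{len(chunks)} chunks")
for chunk in chunks[:3]:
    print(f"[{chunk['heading']}] part {chunk['chunk_index']}: {chunk['text'][:80]}...")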

Format Conversion

Markdown to PDF

# Using Pandoc (most versatile converter)
pandoc paper.md -o paper.pdf --pdf-engine=xelatex

# With template and bibliography
pandoc paper.md -o paper.pdf \
  --pdf-engine=xelatex \
  --template=ieee.tex \
  --bibliography=references.bib \
  --citeproc \
  --number-sections

# Markdown to Word (for collaborators who prefer Word)
pandoc paper.md -o paper.docx --reference-doc=template.docx

PDF to Markdown (Using Marker)

# Install Marker (ML-based PDF to Markdown converter)
pip install marker-pdf

# Convert a single PDF
marker_single paper.pdf output_dir/ --langs English

# Batch convert
marker input_dir/ output_dir/ --workers 4

OCR for Scanned PDFs

from pdf2image import convert_from_path
import pytesseract

def ocr_pdf(pdf_path, lang="eng"):
    """OCR a scanned PDF using Tesseract."""
    images = convert_from_path(pdf_path, dpi=300)
    full_text = []

    for i, image in enumerate(images):
        text = pytesseract.image_to_string(image, lang=lang)
        full_text.append(f"--- Page {i+1} ---\n{text}")

    return "\n".join(full_text)

# For academic papers with math, use specialized OCR:
# - Mathpix API (commercial, excellent math OCR)
# - Nougat (Meta, open source, GPU required)
# - LaTeX-OCR (open source, formula-specific)

Best Practices

  1. Try PyMuPDF first: it is the fastest and handles most modern PDFs well. Fall back to GROBID for academic papers that need structural parsing.
  2. Check PDF type: use page.get_text() to detect whether a PDF is text-based or scanned. If it returns nothing, the PDF is likely a scan and needs OCR.
  3. Handle multi-column layouts: the sort parameter of PyMuPDF's get_text("blocks", sort=True) helps restore reading order. GROBID and Marker handle this natively.
  4. Preserve metadata: extract DOI, authors, and title from the PDF metadata (doc.metadata) when available.
  5. Validate table extraction: always visually verify extracted tables; complex layouts with merged cells often fail.
  6. Cache extracted text: store parsed results alongside PDFs to avoid re-processing (see the sketch after this list).
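
For point 6, a minimal caching sketch keyed on the PDF's content hash, so a changed file invalidates its cache entry (the cache layout here is one reasonable choice, not prescribed by this guide):

import hashlib
import json
from pathlib import Path

def cached_extract(pdf_path, extractor, cache_dir=".pdf_cache"):
    """Run extractor(pdf_path) once per unique PDF; reuse the stored result after."""
    Path(cache_dir).mkdir(exist_ok=True)
    digest = hashlib.sha256(Path(pdf_path).read_bytes()).hexdigest()[:16]
    cache_file = Path(cache_dir) / f"{digest}.json"
    if cache_file.exists():
        return json.loads(cache_file.read_text())
    result = extractor(pdf_path)  # e.g. extract_text from the PyMuPDF section
    cache_file.write_text(json.dumps(result))
    return result

# text = cached_extract("paper.pdf", extract_text)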