Skills extract-article-text

Extract clean article content — title, author, date, and body text — from PDFs, Word docs, and web pages.

install
source · Clone the upstream repo
git clone https://github.com/iterationlayer/skills
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/iterationlayer/skills "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/extract-article-text" ~/.claude/skills/iterationlayer-skills-extract-article-text && rm -rf "$T"
manifest: skills/extract-article-text/SKILL.md
source content

Extract Article Text

Content aggregators and newsletter platforms use this recipe to extract clean article content from PDFs, Word documents, and saved web pages. Define fields for title, author, date, body, and summary — the parser pulls the content and ignores headers, footers, navigation, and sidebars.

APIs Used

Document Extraction (1 credit per page)

Prerequisites

You need an Iteration Layer API key. Get one at platform.iterationlayer.com — free trial credits included, no credit card required.

For full integration guidance (SDKs, auth, MCP, error handling), see the Iteration Layer Integration Guide.

Implementation

# Extract article fields from a document via the Document Extraction endpoint.
# Replace YOUR_API_KEY with your Iteration Layer API key before running.
# The JSON body references one file by URL and declares six schema fields:
# title and body are marked required, and summary is capped at 500 characters.
curl -X POST \
  https://api.iterationlayer.com/document-extraction/v1/extract \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "files": [
      {
        "type": "url",
        "name": "article.pdf",
        "url": "https://example.com/article.pdf"
      }
    ],
    "schema": {
      "fields": [
        {
          "name": "title",
          "type": "TEXT",
          "description": "Article title or headline",
          "is_required": true
        },
        {
          "name": "author",
          "type": "TEXT",
          "description": "Author name"
        },
        {
          "name": "publish_date",
          "type": "DATE",
          "description": "Publication date of the article"
        },
        {
          "name": "body",
          "type": "TEXTAREA",
          "description": "Main article text content, excluding headers, footers, sidebars, and navigation",
          "is_required": true
        },
        {
          "name": "summary",
          "type": "TEXT",
          "description": "Brief summary or abstract",
          "max_length": 500
        },
        {
          "name": "category",
          "type": "TEXT",
          "description": "Article category or section"
        }
      ]
    }
  }'
import { IterationLayer } from "iterationlayer";

// Documents to extract from; each entry points at a file by URL.
const files = [
  { type: "url", name: "article.pdf", url: "https://example.com/article.pdf" },
];

// Fields to pull out of the article. `title` and `body` are required;
// `summary` is capped at 500 characters.
const fields = [
  {
    name: "title",
    type: "TEXT",
    description: "Article title or headline",
    is_required: true,
  },
  { name: "author", type: "TEXT", description: "Author name" },
  {
    name: "publish_date",
    type: "DATE",
    description: "Publication date of the article",
  },
  {
    name: "body",
    type: "TEXTAREA",
    description: "Main article text content, excluding headers, footers, sidebars, and navigation",
    is_required: true,
  },
  {
    name: "summary",
    type: "TEXT",
    description: "Brief summary or abstract",
    max_length: 500,
  },
  { name: "category", type: "TEXT", description: "Article category or section" },
];

// Replace YOUR_API_KEY with your Iteration Layer API key.
const client = new IterationLayer({ apiKey: "YOUR_API_KEY" });

// Submit the extraction request and wait for the result.
const result = await client.extract({ files, schema: { fields } });
from iterationlayer import IterationLayer

# Documents to extract from; each entry points at a file by URL.
article_files = [
    {
        "type": "url",
        "name": "article.pdf",
        "url": "https://example.com/article.pdf",
    },
]

# Fields to pull out of the article. "title" and "body" are required;
# "summary" is capped at 500 characters.
article_fields = [
    {
        "name": "title",
        "type": "TEXT",
        "description": "Article title or headline",
        "is_required": True,
    },
    {"name": "author", "type": "TEXT", "description": "Author name"},
    {
        "name": "publish_date",
        "type": "DATE",
        "description": "Publication date of the article",
    },
    {
        "name": "body",
        "type": "TEXTAREA",
        "description": "Main article text content, excluding headers, footers, sidebars, and navigation",
        "is_required": True,
    },
    {
        "name": "summary",
        "type": "TEXT",
        "description": "Brief summary or abstract",
        "max_length": 500,
    },
    {"name": "category", "type": "TEXT", "description": "Article category or section"},
]

# Replace YOUR_API_KEY with your Iteration Layer API key.
client = IterationLayer(api_key="YOUR_API_KEY")

# Submit the extraction request.
result = client.extract(files=article_files, schema={"fields": article_fields})
package main

import il "github.com/iterationlayer/sdk-go"

func main() {
	client := il.NewClient("YOUR_API_KEY")

	result, err := client.Extract(il.ExtractRequest{
		Files: []il.FileInput{
			il.NewFileFromURL(
				"article.pdf",
				"https://example.com/article.pdf",
			),
		},
		Schema: il.ExtractionSchema{
			"title": il.NewTextFieldConfig(
				"title",
				"Article title or headline",
			),
			"author": il.NewTextFieldConfig(
				"author",
				"Author name",
			),
			"publish_date": il.NewDateFieldConfig(
				"publish_date",
				"Publication date of the article",
			),
			"body": il.NewTextareaFieldConfig(
				"body",
				"Main article text content, excluding headers, footers, sidebars, and navigation",
			),
			"summary": il.NewTextFieldConfig(
				"summary",
				"Brief summary or abstract",
			),
			"category": il.NewTextFieldConfig(
				"category",
				"Article category or section",
			),
		},
	})
	if err != nil {
		panic(err)
	}
}
{
  "name": "Extract Article Text",
  "nodes": [
    {
      "parameters": {
        "content": "## Extract Article Text\n\nContent aggregators and newsletter platforms use this recipe to extract clean article content from PDFs, Word documents, and saved web pages. Define fields for title, author, date, body, and summary \u2014 the parser pulls the content and ignores headers, footers, navigation, and sidebars.\n\n**Note:** This workflow uses the Iteration Layer community node (`n8n-nodes-iterationlayer`). Install it via Settings > Community Nodes before importing. Self-hosted n8n only.",
        "height": 280,
        "width": 500,
        "color": 2
      },
      "type": "n8n-nodes-base.stickyNote",
      "typeVersion": 1,
      "position": [
        200,
        40
      ],
      "id": "97f247db-41d8-4eae-95c9-d65cc3b2124d",
      "name": "Overview"
    },
    {
      "parameters": {
        "content": "### Step 1: Extract Data\nResource: **Document Extraction**\n\nConfigure the Document Extraction parameters below, then connect your credentials.",
        "height": 160,
        "width": 300,
        "color": 6
      },
      "type": "n8n-nodes-base.stickyNote",
      "typeVersion": 1,
      "position": [
        475,
        100
      ],
      "id": "a0acbdab-4287-4466-8b64-95bd9d4e3e49",
      "name": "Step 1 Note"
    },
    {
      "parameters": {},
      "type": "n8n-nodes-base.manualTrigger",
      "typeVersion": 1,
      "position": [
        250,
        300
      ],
      "id": "c3d4e5f6-a7b8-9012-cdef-123456789012",
      "name": "Manual Trigger"
    },
    {
      "parameters": {
        "resource": "documentExtraction",
        "schemaInputMode": "rawJson",
        "schemaJson": "{\"fields\":[{\"name\":\"title\",\"type\":\"TEXT\",\"description\":\"Article title or headline\",\"is_required\":true},{\"name\":\"author\",\"type\":\"TEXT\",\"description\":\"Author name\"},{\"name\":\"publish_date\",\"type\":\"DATE\",\"description\":\"Publication date of the article\"},{\"name\":\"body\",\"type\":\"TEXTAREA\",\"description\":\"Main article text content, excluding headers, footers, sidebars, and navigation\",\"is_required\":true},{\"name\":\"summary\",\"type\":\"TEXT\",\"description\":\"Brief summary or abstract\",\"max_length\":500},{\"name\":\"category\",\"type\":\"TEXT\",\"description\":\"Article category or section\"}]}",
        "files": {
          "fileValues": [
            {
              "fileInputMode": "url",
              "fileName": "article.pdf",
              "fileUrl": "https://example.com/article.pdf"
            }
          ]
        }
      },
      "type": "n8n-nodes-iterationlayer.iterationLayer",
      "typeVersion": 1,
      "position": [
        500,
        300
      ],
      "id": "d4e5f6a7-b8c9-0123-defa-234567890123",
      "name": "Extract Data",
      "credentials": {
        "iterationLayerApi": {
          "id": "1",
          "name": "Iteration Layer API"
        }
      }
    }
  ],
  "connections": {
    "Manual Trigger": {
      "main": [
        [
          {
            "node": "Extract Data",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  },
  "settings": {
    "executionOrder": "v1"
  }
}
Extract article content from the file at [file URL]. Use the extract_document tool with these fields:

- title (TEXT, required): Article title or headline
- author (TEXT): Author name
- publish_date (DATE): Publication date of the article
- body (TEXTAREA, required): Main article text content, excluding headers, footers, sidebars, and navigation
- summary (TEXT, max 500 characters): Brief summary or abstract
- category (TEXT): Article category or section

Response

{
  "success": true,
  "data": {
    "title": {
      "value": "The Quiet Revolution in Battery Chemistry",
      "confidence": 0.97,
      "citations": ["The Quiet Revolution in Battery Chemistry"],
      "source": "article.pdf"
    },
    "author": {
      "value": "James Park",
      "confidence": 0.94,
      "citations": ["James Park"],
      "source": "article.pdf"
    },
    "publish_date": {
      "value": "2026-01-15",
      "confidence": 0.93,
      "citations": ["January 15, 2026"],
      "source": "article.pdf"
    },
    "body": {
      "value": "Solid-state batteries have been five years away for the last twenty years. But the latest generation of prototypes from three independent labs suggests the timeline might finally be real...",
      "confidence": 0.91,
      "citations": ["Solid-state batteries have been five years away"],
      "source": "article.pdf"
    },
    "summary": {
      "value": "Recent advances in solid-state battery technology suggest commercial viability within three years, driven by breakthroughs in solid electrolyte materials.",
      "confidence": 0.88,
      "citations": ["solid-state battery technology"],
      "source": "article.pdf"
    }
  }
}

Links