Claude-code-plugins-plus firecrawl-reliability-patterns
install
source · Clone the upstream repo
git clone https://github.com/jeremylongshore/claude-code-plugins-plus-skills
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/jeremylongshore/claude-code-plugins-plus-skills "$T" && mkdir -p ~/.claude/skills && cp -r "$T/plugins/saas-packs/firecrawl-pack/skills/firecrawl-reliability-patterns" ~/.claude/skills/jeremylongshore-claude-code-plugins-plus-firecrawl-reliability-patterns && rm -rf "$T"
manifest:
plugins/saas-packs/firecrawl-pack/skills/firecrawl-reliability-patterns/SKILL.md — source content
Firecrawl Reliability Patterns
Overview
Production reliability patterns for Firecrawl scraping pipelines. Firecrawl's async crawl model, JS rendering, and credit-based pricing create specific reliability challenges: crawl jobs may time out, scraped content may be empty (bot detection, JS failures), and credits can be burned by runaway crawls. This skill covers battle-tested patterns for each.
Instructions
Step 1: Robust Crawl with Timeout and Backoff
import FirecrawlApp from "@mendable/firecrawl-js"; const firecrawl = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY!, }); async function reliableCrawl( url: string, opts: { limit: number; paths?: string[] }, timeoutMs = 600000 ) { const job = await firecrawl.asyncCrawlUrl(url, { limit: opts.limit, includePaths: opts.paths, scrapeOptions: { formats: ["markdown"], onlyMainContent: true }, }); const deadline = Date.now() + timeoutMs; let pollInterval = 2000; while (Date.now() < deadline) { const status = await firecrawl.checkCrawlStatus(job.id); if (status.status === "completed") return status; if (status.status === "failed") { throw new Error(`Crawl failed: ${status.error}`); } await new Promise(r => setTimeout(r, pollInterval)); pollInterval = Math.min(pollInterval * 1.5, 30000); // back off to 30s max } throw new Error(`Crawl timed out after ${timeoutMs}ms (job: ${job.id})`); }
Step 2: Content Quality Validation
interface ScrapedPage {
  url: string;
  markdown: string;
  metadata: { title?: string; statusCode?: number };
}

// Phrases that indicate we scraped an error/interstitial page (bot wall,
// captcha, JS-required shell, 404 body) rather than real content.
// Hoisted to module scope so the list isn't rebuilt on every call.
const ERROR_PAGE_PATTERNS = [
  "access denied",
  "403 forbidden",
  "page not found",
  "captcha",
  "please verify",
  "enable javascript",
];

/**
 * Heuristic quality gate for a scraped page.
 *
 * @returns `{ valid: true }` for usable content, otherwise
 *          `{ valid: false, reason }` with a human-readable explanation.
 */
function validateContent(page: ScrapedPage): {
  valid: boolean;
  reason?: string;
} {
  const { markdown, metadata } = page;

  // Too little text to be a real article/doc page.
  if (!markdown || markdown.length < 100) {
    return { valid: false, reason: "Content too short" };
  }

  // The scraper surfaced an HTTP error status.
  if (metadata.statusCode && metadata.statusCode >= 400) {
    return { valid: false, reason: `HTTP ${metadata.statusCode}` };
  }

  const lower = markdown.toLowerCase();
  const hit = ERROR_PAGE_PATTERNS.find(pattern => lower.includes(pattern));
  if (hit) {
    return { valid: false, reason: `Error page detected: "${hit}"` };
  }

  return { valid: true };
}
Step 3: Crawl-to-Scrape Fallback
// If a full crawl fails, fall back to scraping critical pages individually async function resilientFetch(urls: string[]): Promise<any[]> { // Try batch scrape first (most efficient) try { const batch = await firecrawl.batchScrapeUrls(urls, { formats: ["markdown"], onlyMainContent: true, }); const results = (batch.data || []).filter(page => { const { valid } = validateContent({ url: page.metadata?.sourceURL || "", markdown: page.markdown || "", metadata: page.metadata || {}, }); return valid; }); if (results.length >= urls.length * 0.5) { return results; // batch succeeded (>50% valid) } } catch (batchError) { console.warn("Batch scrape failed, falling back to individual scrapes"); } // Fallback: scrape individually with retries const results: any[] = []; for (const url of urls) { try { const result = await firecrawl.scrapeUrl(url, { formats: ["markdown"], onlyMainContent: true, waitFor: 5000, }); if (validateContent({ url, markdown: result.markdown || "", metadata: result.metadata || {} }).valid) { results.push(result); } } catch (e) { console.error(`Failed to scrape ${url}: ${(e as Error).message}`); } // Delay between individual scrapes to avoid rate limits await new Promise(r => setTimeout(r, 1000)); } return results; }
Step 4: Circuit Breaker for Firecrawl
class FirecrawlCircuitBreaker { private failures = 0; private lastFailure = 0; private state: "closed" | "open" | "half-open" = "closed"; private threshold: number; private resetTimeMs: number; constructor(threshold = 5, resetTimeMs = 60000) { this.threshold = threshold; this.resetTimeMs = resetTimeMs; } async execute<T>(operation: () => Promise<T>, fallback?: () => T): Promise<T> { // Check if circuit should reset if (this.state === "open" && Date.now() - this.lastFailure > this.resetTimeMs) { this.state = "half-open"; } if (this.state === "open") { console.warn("Circuit breaker OPEN — using fallback"); if (fallback) return fallback(); throw new Error("Firecrawl circuit breaker is open"); } try { const result = await operation(); if (this.state === "half-open") { this.state = "closed"; this.failures = 0; } return result; } catch (error) { this.failures++; this.lastFailure = Date.now(); if (this.failures >= this.threshold) { this.state = "open"; console.error(`Circuit breaker OPENED after ${this.failures} failures`); } throw error; } } } const breaker = new FirecrawlCircuitBreaker(5, 60000); async function protectedScrape(url: string) { return breaker.execute( () => firecrawl.scrapeUrl(url, { formats: ["markdown"] }), () => ({ markdown: getCachedContent(url), metadata: { fromCache: true } }) ); }
Step 5: Credit-Aware Processing
/**
 * In-memory, per-process daily Firecrawl credit budget tracker.
 * Usage is keyed by UTC date (YYYY-MM-DD), so the budget rolls over
 * automatically at midnight UTC.
 */
class CreditGuard {
  private dailyUsage = new Map<string, number>();
  private dailyLimit: number;

  constructor(dailyLimit = 5000) {
    this.dailyLimit = dailyLimit;
  }

  // Today's UTC date key, e.g. "2025-01-31".
  private todayKey(): string {
    return new Date().toISOString().split("T")[0];
  }

  /** True when `credits` more can be spent today without exceeding the limit. */
  canAfford(credits: number): boolean {
    return (this.dailyUsage.get(this.todayKey()) || 0) + credits <= this.dailyLimit;
  }

  /** Records spend against today's budget. */
  record(credits: number) {
    const today = this.todayKey();
    // Prune stale days so the map doesn't grow without bound in a
    // long-lived process (the old version kept every day forever).
    for (const day of this.dailyUsage.keys()) {
      if (day !== today) this.dailyUsage.delete(day);
    }
    this.dailyUsage.set(today, (this.dailyUsage.get(today) || 0) + credits);
  }

  /** Credits still available today (negative if the limit was exceeded). */
  remaining(): number {
    return this.dailyLimit - (this.dailyUsage.get(this.todayKey()) || 0);
  }
}

const creditGuard = new CreditGuard(5000);

/**
 * Crawl only when today's budget can cover the worst case (`limit` pages),
 * then record the credits actually consumed (pages returned).
 * @throws When the requested `limit` would exceed the remaining budget.
 */
async function budgetedCrawl(url: string, limit: number) {
  if (!creditGuard.canAfford(limit)) {
    throw new Error(`Budget exceeded: ${creditGuard.remaining()} credits remaining`);
  }
  const result = await reliableCrawl(url, { limit });
  creditGuard.record(result.data?.length || 0);
  return result;
}
Error Handling
| Issue | Cause | Solution |
|---|---|---|
| Crawl timeout | Large site, slow rendering | Set timeout, reduce limit |
| Empty markdown | Bot detection or JS failure | Increase `waitFor`, use content validation with per-URL fallback |
| Credit overrun | No budget tracking | Implement credit guard |
| Cascade failures | Single scrape failure crashes pipeline | Circuit breaker + fallback |
| Partial crawl results | Some pages blocked | Validate content, retry failed URLs |
Examples
Full Resilient Pipeline
async function resilientPipeline(url: string) { const map = await firecrawl.mapUrl(url); const urls = (map.links || []).filter(u => u.includes("/docs/")).slice(0, 50); if (!creditGuard.canAfford(urls.length)) { console.warn("Budget tight — reducing scope"); urls.splice(20); // trim to 20 } const pages = await resilientFetch(urls); const valid = pages.filter(p => validateContent(p).valid); creditGuard.record(urls.length); return { scraped: urls.length, valid: valid.length, remaining: creditGuard.remaining() }; }
Resources
Next Steps
For policy enforcement, see
firecrawl-policy-guardrails.