Skillshub apify-reference-architecture
install
source · Clone the upstream repo
git clone https://github.com/ComeOnOliver/skillshub
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/ComeOnOliver/skillshub "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/jeremylongshore/claude-code-plugins-plus-skills/apify-reference-architecture" ~/.claude/skills/comeonoliver-skillshub-apify-reference-architecture && rm -rf "$T"
manifest:
skills/jeremylongshore/claude-code-plugins-plus-skills/apify-reference-architecture/SKILL.md — source content
Apify Reference Architecture
Overview
Production-ready architecture patterns for applications built on Apify. Covers standalone Actor projects, multi-Actor pipelines, and full-stack applications that integrate Apify as a data source.
Architecture Pattern 1: Standalone Actor
For a single scraper deployed to Apify platform.
my-scraper/ ├── .actor/ │ ├── actor.json # Actor metadata │ ├── INPUT_SCHEMA.json # Input definition (generates UI) │ └── Dockerfile # Build configuration ├── src/ │ ├── main.ts # Entry point (Actor.main) │ ├── routes/ │ │ ├── listing.ts # Router handler: listing pages │ │ └── detail.ts # Router handler: detail pages │ ├── types.ts # Input/output TypeScript types │ └── utils/ │ ├── extractors.ts # Data extraction functions │ └── validators.ts # Input/output validation ├── tests/ │ ├── extractors.test.ts # Unit tests for extraction logic │ └── integration.test.ts # Integration tests (live API) ├── storage/ # Local storage (git-ignored) ├── package.json ├── tsconfig.json └── .gitignore
Key Files
// src/main.ts — Actor entry point import { Actor } from 'apify'; import { CheerioCrawler } from 'crawlee'; import { router } from './routes/listing'; import { validateInput, ScraperInput } from './types'; await Actor.main(async () => { const rawInput = await Actor.getInput<ScraperInput>(); const input = validateInput(rawInput); const proxyConfiguration = input.proxyConfig?.useApifyProxy ? await Actor.createProxyConfiguration({ groups: input.proxyConfig.groups }) : undefined; const crawler = new CheerioCrawler({ requestHandler: router, proxyConfiguration, maxRequestsPerCrawl: input.maxItems ?? 100, maxConcurrency: input.concurrency ?? 10, }); await crawler.run(input.startUrls.map(s => s.url)); });
// src/types.ts — Shared types and validation import { z } from 'zod'; export const InputSchema = z.object({ startUrls: z.array(z.object({ url: z.string().url() })).min(1), maxItems: z.number().int().positive().optional().default(100), concurrency: z.number().int().min(1).max(50).optional().default(10), proxyConfig: z.object({ useApifyProxy: z.boolean(), groups: z.array(z.string()).optional(), }).optional(), }); export type ScraperInput = z.infer<typeof InputSchema>; export function validateInput(raw: unknown): ScraperInput { return InputSchema.parse(raw); } export interface ProductOutput { url: string; name: string; price: number | null; currency: string; inStock: boolean; scrapedAt: string; }
Architecture Pattern 2: Multi-Actor Pipeline
For complex scraping workflows with multiple stages.
┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ Discover │────▶│ Scrape │────▶│ Transform │ │ Actor │ │ Actor │ │ Actor │ │ │ │ │ │ │ │ Finds URLs │ │ Extracts │ │ Dedup, │ │ to scrape │ │ raw data │ │ clean, │ │ │ │ │ │ enrich │ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ │ │ ▼ ▼ ▼ Request Queue Dataset A Dataset B (URLs to scrape) (raw data) (clean data)
Pipeline Orchestrator
// pipeline/orchestrator.ts import { ApifyClient } from 'apify-client'; const client = new ApifyClient({ token: process.env.APIFY_TOKEN }); interface PipelineConfig { discoverActorId: string; scrapeActorId: string; transformActorId: string; seedUrls: string[]; maxItems: number; } async function runPipeline(config: PipelineConfig) { const results = { discover: { runId: '', items: 0, cost: 0 }, scrape: { runId: '', items: 0, cost: 0 }, transform: { runId: '', items: 0, cost: 0 }, }; // Stage 1: Discover URLs console.log('Stage 1: Discovering URLs...'); const discoverRun = await client.actor(config.discoverActorId).call({ seedUrls: config.seedUrls, maxPages: 50, }); const { items: urls } = await client .dataset(discoverRun.defaultDatasetId) .listItems(); results.discover = { runId: discoverRun.id, items: urls.length, cost: discoverRun.usageTotalUsd ?? 0, }; // Stage 2: Scrape each discovered URL console.log(`Stage 2: Scraping ${urls.length} URLs...`); const scrapeRun = await client.actor(config.scrapeActorId).call({ startUrls: urls.map((u: any) => ({ url: u.url })), maxItems: config.maxItems, }); results.scrape = { runId: scrapeRun.id, items: scrapeRun.stats?.datasetItemCount ?? 0, cost: scrapeRun.usageTotalUsd ?? 0, }; // Stage 3: Transform and deduplicate console.log('Stage 3: Transforming...'); const transformRun = await client.actor(config.transformActorId).call({ sourceDatasetId: scrapeRun.defaultDatasetId, dedupField: 'url', filterEmpty: true, }); results.transform = { runId: transformRun.id, items: transformRun.stats?.datasetItemCount ?? 0, cost: transformRun.usageTotalUsd ?? 
0, }; // Store final results in named dataset const finalDs = await client.datasets().getOrCreate('pipeline-output'); const { items: cleanData } = await client .dataset(transformRun.defaultDatasetId) .listItems(); await client.dataset(finalDs.id).pushItems(cleanData); // Summary const totalCost = Object.values(results).reduce((s, r) => s + r.cost, 0); console.log('\n=== Pipeline Summary ==='); console.log(`Discovered: ${results.discover.items} URLs`); console.log(`Scraped: ${results.scrape.items} items`); console.log(`Clean: ${results.transform.items} items`); console.log(`Total cost: $${totalCost.toFixed(4)}`); return results; }
Architecture Pattern 3: Full-Stack Integration
Application that uses Apify as a data source.
┌─────────────────────────────────────────────────────────┐ │ Your Application │ │ │ │ ┌─────────┐ ┌──────────────┐ ┌─────────────────┐ │ │ │ Frontend │──▶│ API Server │──▶│ Apify Service │ │ │ │ (React) │ │ (Express/ │ │ (apify-client) │ │ │ │ │◀──│ Next.js) │◀──│ │ │ │ └─────────┘ └──────┬───────┘ └────────┬─────────┘ │ │ │ │ │ │ ▼ ▼ │ │ ┌──────────┐ ┌────────────┐ │ │ │ Your DB │ │ Apify │ │ │ │ (results)│ │ Platform │ │ │ └──────────┘ └────────────┘ │ │ │ │ ┌──────────────────────────────────────────────────┐ │ │ │ Webhook Handler │ │ │ │ Receives run completion → saves results to DB │ │ │ └──────────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────────┘
Service Layer
// src/services/apify-service.ts import { ApifyClient } from 'apify-client'; export class ApifyService { private client: ApifyClient; constructor(token: string) { this.client = new ApifyClient({ token }); } async startScrape(urls: string[]): Promise<{ runId: string }> { const run = await this.client.actor('username/scraper').start({ startUrls: urls.map(url => ({ url })), }); return { runId: run.id }; } async getRunStatus(runId: string): Promise<{ status: string; progress?: { finished: number; failed: number }; }> { const run = await this.client.run(runId).get(); return { status: run.status, progress: { finished: run.stats?.requestsFinished ?? 0, failed: run.stats?.requestsFailed ?? 0, }, }; } async getResults<T>(runId: string): Promise<T[]> { const run = await this.client.run(runId).get(); if (run.status !== 'SUCCEEDED') { throw new Error(`Run not ready: ${run.status}`); } const { items } = await this.client .dataset(run.defaultDatasetId) .listItems(); return items as T[]; } async checkHealth(): Promise<boolean> { try { const user = await this.client.user().get(); return !!user.username; } catch { return false; } } }
Configuration Management
// src/config/apify.ts
interface ApifyConfig {
  token: string;
  actorId: string;
  defaultMemory: number;
  defaultTimeout: number;
  webhookUrl?: string;
}

/** Reads a required env var, failing fast with a descriptive error. */
function requireEnv(name: string): string {
  const value = process.env[name];
  if (!value) {
    throw new Error(`Missing required environment variable: ${name}`);
  }
  return value;
}

/**
 * Builds the Apify config for the current NODE_ENV.
 * Fix: validates required env vars at startup — the original used non-null
 * `!` assertions, letting `undefined` flow into the client at runtime
 * (contradicting this doc's own "validate at startup" guidance).
 * Also guards against an unknown NODE_ENV (falls back to base config).
 */
export function loadConfig(): ApifyConfig {
  const env = process.env.NODE_ENV || 'development';
  const base: ApifyConfig = {
    token: requireEnv('APIFY_TOKEN'),
    actorId: requireEnv('APIFY_ACTOR_ID'),
    defaultMemory: 1024,
    defaultTimeout: 3600,
  };
  const overrides: Record<string, Partial<ApifyConfig>> = {
    development: { defaultMemory: 256, defaultTimeout: 300 },
    staging: { defaultMemory: 512 },
    production: { webhookUrl: process.env.APIFY_WEBHOOK_URL },
  };
  return { ...base, ...(overrides[env] ?? {}) };
}
Health Check
// src/health.ts export async function healthCheck(apifyService: ApifyService) { const start = Date.now(); const healthy = await apifyService.checkHealth(); return { service: 'apify', status: healthy ? 'healthy' : 'unhealthy', latencyMs: Date.now() - start, timestamp: new Date().toISOString(), }; }
Error Handling
| Issue | Cause | Solution |
|---|---|---|
| Circular dependencies | Service imports service | Use dependency injection |
| Missing config | Env var not set | Validate at startup with a config loader that throws on missing values |
| Pipeline stage failure | Actor crash mid-pipeline | Add retry logic per stage |
| State management | Tracking run status | Use webhook handler + database |
Resources
Flagship Skills
For multi-environment setup, see
apify-deploy-integration.