Claude-code-plugins langchain-performance-tuning

install
source · Clone the upstream repo
git clone https://github.com/jeremylongshore/claude-code-plugins-plus-skills
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/jeremylongshore/claude-code-plugins-plus-skills "$T" && mkdir -p ~/.claude/skills && cp -r "$T/plugins/saas-packs/langchain-pack/skills/langchain-performance-tuning" ~/.claude/skills/jeremylongshore-claude-code-plugins-langchain-performance-tuning && rm -rf "$T"
manifest: plugins/saas-packs/langchain-pack/skills/langchain-performance-tuning/SKILL.md
source content

LangChain Performance Tuning

Overview

Optimize LangChain apps for production: measure baseline latency, implement caching, batch with concurrency control, stream for perceived speed, optimize prompts for fewer tokens, and select the right model for each task.

Step 1: Benchmark Baseline

/**
 * Measures wall-clock latency of `chain.invoke(input)` over several runs.
 *
 * Runs are awaited one after another, so each sample is the duration of a
 * single call as reported by `performance.now()`.
 *
 * @param chain - Any object exposing an async `invoke` method.
 * @param input - Value passed unchanged to every `invoke` call.
 * @param iterations - Number of timed runs; must be greater than 0. Defaults to 5.
 * @returns `mean`, `median`, `p95`, `min` and `max`, each rounded to whole
 *   milliseconds and formatted as a string with an "ms" suffix.
 * @throws RangeError if `iterations` is not greater than 0 (no samples would
 *   be collected, so no statistic could be computed).
 */
async function benchmark(
  chain: { invoke: (input: any) => Promise<any> },
  input: any,
  iterations = 5,
) {
  // Written as a negated comparison so NaN is rejected as well.
  if (!(iterations > 0)) {
    throw new RangeError(`iterations must be greater than 0, got ${iterations}`);
  }

  const times: number[] = [];

  for (let i = 0; i < iterations; i++) {
    const start = performance.now();
    await chain.invoke(input);
    times.push(performance.now() - start);
  }

  // Numeric comparator: the default sort compares as strings.
  times.sort((a, b) => a - b);

  const formatMs = (ms: number) => ms.toFixed(0) + "ms";

  // For an even number of samples the median is the average of the two middle
  // values; picking only the upper one would bias the result upward.
  const mid = Math.floor(times.length / 2);
  const median =
    times.length % 2 === 0 ? (times[mid - 1] + times[mid]) / 2 : times[mid];

  return {
    mean: formatMs(times.reduce((a, b) => a + b, 0) / times.length),
    median: formatMs(median),
    // floor(n * 0.95) is always < n for n >= 1, so this index is in range.
    p95: formatMs(times[Math.floor(times.length * 0.95)]),
    min: formatMs(times[0]),
    max: formatMs(times[times.length - 1]),
  };
}

// Usage: time 10 back-to-back invocations and print the summary as a table
const timings = await benchmark(chain, { input: "test" }, 10);
console.table(timings);

Step 2: Streaming (Perceived Performance)

import { ChatOpenAI } from "@langchain/openai";
import { ChatPromptTemplate } from "@langchain/core/prompts";
import { StringOutputParser } from "@langchain/core/output_parsers";

// Build the pipeline from named stages: prompt -> chat model -> string parser.
const passthroughPrompt = ChatPromptTemplate.fromTemplate("{input}");
const streamingModel = new ChatOpenAI({ model: "gpt-4o-mini", streaming: true });
const chain = passthroughPrompt
  .pipe(streamingModel)
  .pipe(new StringOutputParser());

// Non-streaming: user waits 2-3s for full response
// Streaming: first token in ~200ms, user sees progress immediately

// Echo each chunk to stdout as soon as the chain yields it.
const tokenStream = await chain.stream({ input: "Explain LCEL" });
for await (const token of tokenStream) {
  process.stdout.write(token);
}

// Express SSE endpoint for web apps.
// Streams each chunk as a `data:` frame and finishes with a `[DONE]` sentinel.
// Failures are reported to the client as an `event: error` frame instead of
// leaving the response open, and streaming stops once the client disconnects.
app.post("/api/chat/stream", async (req, res) => {
  res.setHeader("Content-Type", "text/event-stream");
  res.setHeader("Cache-Control", "no-cache");
  res.setHeader("Connection", "keep-alive");

  // Track disconnects so we stop pulling chunks nobody will read.
  let clientGone = false;
  res.on("close", () => {
    clientGone = true;
  });

  try {
    const stream = await chain.stream({ input: req.body.input });
    for await (const chunk of stream) {
      // Leaving a for-await loop early invokes the iterator's return(),
      // giving the upstream stream a chance to clean up.
      if (clientGone) break;
      res.write(`data: ${JSON.stringify({ text: chunk })}\n\n`);
    }
    if (!clientGone) res.write("data: [DONE]\n\n");
  } catch (err) {
    // An async handler's rejection is not caught by this handler's caller
    // here, so it must be handled locally. Log the details server-side and
    // send the client only a generic message.
    console.error("chat stream failed:", err);
    if (!clientGone) {
      res.write(
        `event: error\ndata: ${JSON.stringify({ error: "stream failed" })}\n\n`,
      );
    }
  } finally {
    // Always close the response so the connection never hangs.
    res.end();
  }
});

Step 3: Batch Processing with Concurrency

import { ChatOpenAI } from "@langchain/openai";
import { ChatPromptTemplate } from "@langchain/core/prompts";
import { StringOutputParser } from "@langchain/core/output_parsers";

// Summarization pipeline built from named stages: prompt -> model -> parser.
const summarizePrompt = ChatPromptTemplate.fromTemplate("Summarize: {text}");
const summarizerModel = new ChatOpenAI({ model: "gpt-4o-mini" });
const chain = summarizePrompt
  .pipe(summarizerModel)
  .pipe(new StringOutputParser());

// Wrap every article in the `{ text }` shape the prompt template expects.
const inputs = articles.map((text) => {
  return { text };
});

// Sequential: ~10s for 10 items (1s each)
// const results = [];
// for (const input of inputs) results.push(await chain.invoke(input));

// Batch: ~2s for 10 items (parallel API calls)
const batchConfig = { maxConcurrency: 10 };
const results = await chain.batch(inputs, batchConfig);

// Benchmark comparison: same five inputs, one call at a time vs. one batch.
const sample = inputs.slice(0, 5);

console.time("sequential");
for (const item of sample) {
  await chain.invoke(item);
}
console.timeEnd("sequential");

console.time("batch");
await chain.batch(sample, { maxConcurrency: 5 });
console.timeEnd("batch");

Step 4: Caching

// In-memory cache (single process, resets on restart)
const cache = new Map<string, string>();

/**
 * Invokes `chain` with `input`, reusing a previously stored result when the
 * same input has been seen before.
 *
 * The cache key is `JSON.stringify(input)`, so two inputs with the same
 * properties in a different order produce different keys. The chain itself is
 * not part of the key: use this with a single chain per process, or results
 * from one chain will be served for another.
 *
 * @param chain - Object exposing an async `invoke(input)` method.
 * @param input - JSON-serializable input passed to `chain.invoke`.
 * @returns The cached result if present, otherwise the fresh result (which is
 *   stored before returning).
 */
async function cachedInvoke(
  chain: any,
  input: Record<string, any>,
): Promise<string> {
  const key = JSON.stringify(input);
  const cached = cache.get(key);
  // Compare against undefined rather than truthiness: an empty string is a
  // legitimate cached result and must not trigger another call.
  if (cached !== undefined) return cached;

  const result = await chain.invoke(input);
  cache.set(key, result);
  return result;
}

// Cache hit: ~0ms (vs ~500-2000ms for API call)
# Python — built-in caching
# The three options below are alternatives. Each one calls set_llm_cache(), so
# running this snippet top to bottom leaves only the last call in effect —
# keep the one option you need. (NOTE(review): assumes set_llm_cache replaces
# the previously configured cache — confirm against the LangChain docs.)
from langchain_core.globals import set_llm_cache
from langchain_community.cache import SQLiteCache, InMemoryCache

# Option 1: In-memory (single process)
set_llm_cache(InMemoryCache())

# Option 2: SQLite (persistent, survives restarts)
set_llm_cache(SQLiteCache(database_path=".langchain_cache.db"))

# Option 3: Redis (distributed, production)
# The Redis imports sit here rather than at the top, presumably so Options 1-2
# can be used without the `redis` package installed.
from langchain_community.cache import RedisCache
import redis
set_llm_cache(RedisCache(redis.Redis.from_url("redis://localhost:6379")))

Step 5: Model Selection by Task

import { ChatOpenAI } from "@langchain/openai";

// Both tiers share the same sampling setting (temperature 0).
const deterministic = { temperature: 0 };

// Fast + cheap: simple tasks, classification, extraction
// gpt-4o-mini: ~200ms TTFT, $0.15/1M input
const fast = new ChatOpenAI({ model: "gpt-4o-mini", ...deterministic });

// Powerful + slower: complex reasoning, code generation
// gpt-4o: ~400ms TTFT, $2.50/1M input
const powerful = new ChatOpenAI({ model: "gpt-4o", ...deterministic });

// Route based on task
import { RunnableBranch } from "@langchain/core/runnables";

// Builds a predicate that matches inputs whose `task` field equals `task`.
const taskIs = (task: string) => (input: any) => input.task === task;

// Each pair is [condition, chain]; the last entry is the chain listed without
// a condition.
const router = RunnableBranch.from([
  [taskIs("classify"), classifyChain],
  [taskIs("reason"), reasoningChain],
  defaultChain,
]);

Step 6: Prompt Optimization

// Shorter prompts = fewer input tokens = lower latency + cost

// BEFORE (150+ tokens):
// The literal below is abbreviated (it ends in "..."); the token count refers
// to the full-length prompt it stands in for.
const verbose = `You are an expert AI assistant specialized in software
engineering. Your task is to carefully analyze the following code and
provide a comprehensive review covering all aspects including...`;

// AFTER (20 tokens, same quality):
// `{code}` is a template placeholder — presumably filled in via
// ChatPromptTemplate.fromTemplate like `{input}` and `{text}` in earlier steps.
const concise = "Review this code. List issues and fixes:\n\n{code}";

// Token counting (Python)
// import tiktoken
// enc = tiktoken.encoding_for_model("gpt-4o-mini")
// print(len(enc.encode(prompt)))  # check before deploying

Performance Impact Summary

| Optimization | Latency Improvement | Cost Impact |
| --- | --- | --- |
| Streaming | First token 80% faster | Neutral |
| Caching | 99% on cache hit | Major savings |
| Batch processing | 50-80% for bulk ops | Neutral |
| gpt-4o-mini vs gpt-4o | ~2x faster TTFT | ~17x cheaper |
| Shorter prompts | 10-30% | 10-50% cheaper |
| maxConcurrency tuning | Linear scaling | Neutral |

Error Handling

| Error | Cause | Fix |
| --- | --- | --- |
| Batch partially fails | Rate limit on some items | Lower `maxConcurrency`, add `maxRetries` |
| Stream hangs | Network timeout | Set `timeout` on model, handle disconnect |
| Cache stale data | Content changed upstream | Add TTL or version key to cache |
| High memory usage | Large cache | Use LRU eviction or Redis |

Resources

Next Steps

Use `langchain-cost-tuning` for cost optimization alongside performance.