Skillsbench audiobook

Create audiobooks from web content or text files. Handles content fetching, text processing, and TTS conversion with automatic fallback between ElevenLabs, OpenAI TTS, and gTTS.

install
source · Clone the upstream repo
git clone https://github.com/benchflow-ai/skillsbench
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/benchflow-ai/skillsbench "$T" && mkdir -p ~/.claude/skills && cp -r "$T/tasks/pg-essay-to-audiobook/environment/skills/audiobook" ~/.claude/skills/benchflow-ai-skillsbench-audiobook && rm -rf "$T"
manifest: tasks/pg-essay-to-audiobook/environment/skills/audiobook/SKILL.md
source content

Audiobook Creation Guide

Create audiobooks from web articles, essays, or text files. This skill covers the full pipeline: content fetching, text processing, and audio generation.

Quick Start

import os

# 1. Check which TTS API is available
def get_tts_provider():
    if os.environ.get("ELEVENLABS_API_KEY"):
        return "elevenlabs"
    elif os.environ.get("OPENAI_API_KEY"):
        return "openai"
    else:
        return "gtts"  # Free, no API key needed

provider = get_tts_provider()
print(f"Using TTS provider: {provider}")

Step 1: Fetching Web Content

IMPORTANT: Verify fetched content is complete

WebFetch and similar tools may return summaries instead of full text. Always verify:

import subprocess

def fetch_article_content(url):
    """Fetch article content using curl for reliability."""
    # Use curl to get raw HTML - more reliable than web fetch tools
    result = subprocess.run(
        ["curl", "-s", url],
        capture_output=True,
        text=True
    )
    html = result.stdout

    # Strip HTML tags (basic approach)
    import re
    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL)
    text = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

Content verification checklist

Before converting to audio, verify:

  • Text length is reasonable for the source (articles typically 1,000-10,000+ words)
  • Content includes actual article text, not just navigation/headers
  • No "summary" or "key points" headers that indicate truncation
def verify_content(text, expected_min_chars=1000):
    """Basic verification that content is complete."""
    if len(text) < expected_min_chars:
        print(f"WARNING: Content may be truncated ({len(text)} chars)")
        return False
    if "summary" in text.lower()[:500] or "key points" in text.lower()[:500]:
        print("WARNING: Content appears to be a summary, not full text")
        return False
    return True

Step 2: Text Processing

Clean and prepare text for TTS

import re

def clean_text_for_tts(text):
    """Clean text for better TTS output."""
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)

    # Remove footnote markers like [1], [2]
    text = re.sub(r'\[\d+\]', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove special characters that confuse TTS
    text = re.sub(r'[^\w\s.,!?;:\'"()-]', '', text)

    return text.strip()

def chunk_text(text, max_chars=4000):
    """Split text into chunks at sentence boundaries."""
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

Step 3: TTS Conversion with Fallback

Automatic provider selection

import os
import subprocess

def create_audiobook(text, output_path):
    """Convert text to audiobook with automatic TTS provider selection."""

    # Check available providers
    has_elevenlabs = bool(os.environ.get("ELEVENLABS_API_KEY"))
    has_openai = bool(os.environ.get("OPENAI_API_KEY"))

    if has_elevenlabs:
        print("Using ElevenLabs TTS (highest quality)")
        return create_with_elevenlabs(text, output_path)
    elif has_openai:
        print("Using OpenAI TTS (high quality)")
        return create_with_openai(text, output_path)
    else:
        print("Using gTTS (free, no API key required)")
        return create_with_gtts(text, output_path)

ElevenLabs implementation

import requests

def create_with_elevenlabs(text, output_path):
    """Generate audiobook using ElevenLabs API."""
    api_key = os.environ.get("ELEVENLABS_API_KEY")
    voice_id = "21m00Tcm4TlvDq8ikWAM"  # Rachel - calm female voice

    chunks = chunk_text(text, max_chars=4500)
    audio_files = []

    for i, chunk in enumerate(chunks):
        chunk_file = f"/tmp/chunk_{i:03d}.mp3"

        response = requests.post(
            f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
            headers={
                "xi-api-key": api_key,
                "Content-Type": "application/json"
            },
            json={
                "text": chunk,
                "model_id": "eleven_turbo_v2_5",
                "voice_settings": {"stability": 0.5, "similarity_boost": 0.75}
            }
        )

        if response.status_code == 200:
            with open(chunk_file, "wb") as f:
                f.write(response.content)
            audio_files.append(chunk_file)
        else:
            print(f"Error: {response.status_code} - {response.text}")
            return False

    return concatenate_audio(audio_files, output_path)

OpenAI TTS implementation

def create_with_openai(text, output_path):
    """Generate audiobook using OpenAI TTS API."""
    api_key = os.environ.get("OPENAI_API_KEY")

    chunks = chunk_text(text, max_chars=4000)
    audio_files = []

    for i, chunk in enumerate(chunks):
        chunk_file = f"/tmp/chunk_{i:03d}.mp3"

        response = requests.post(
            "https://api.openai.com/v1/audio/speech",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": "tts-1",
                "input": chunk,
                "voice": "onyx",  # Deep male voice, good for essays
                "response_format": "mp3"
            }
        )

        if response.status_code == 200:
            with open(chunk_file, "wb") as f:
                f.write(response.content)
            audio_files.append(chunk_file)
        else:
            print(f"Error: {response.status_code} - {response.text}")
            return False

    return concatenate_audio(audio_files, output_path)

gTTS implementation (free fallback)

def create_with_gtts(text, output_path):
    """Generate audiobook using gTTS (free, no API key)."""
    from gtts import gTTS
    from pydub import AudioSegment

    chunks = chunk_text(text, max_chars=4500)
    audio_files = []

    for i, chunk in enumerate(chunks):
        chunk_file = f"/tmp/chunk_{i:03d}.mp3"

        tts = gTTS(text=chunk, lang='en', slow=False)
        tts.save(chunk_file)
        audio_files.append(chunk_file)

    return concatenate_audio(audio_files, output_path)

Audio concatenation

def concatenate_audio(audio_files, output_path):
    """Concatenate multiple audio files using ffmpeg."""
    if not audio_files:
        return False

    # Create file list for ffmpeg
    list_file = "/tmp/audio_list.txt"
    with open(list_file, "w") as f:
        for audio_file in audio_files:
            f.write(f"file '{audio_file}'\n")

    # Concatenate with ffmpeg
    result = subprocess.run([
        "ffmpeg", "-y", "-f", "concat", "-safe", "0",
        "-i", list_file, "-c", "copy", output_path
    ], capture_output=True)

    # Cleanup temp files
    import os
    for f in audio_files:
        os.unlink(f)
    os.unlink(list_file)

    return result.returncode == 0

Complete Example

#!/usr/bin/env python3
"""Create audiobook from web articles."""

import os
import re
import subprocess
import requests

# ... include all helper functions above ...

def main():
    # Fetch articles
    urls = [
        "https://example.com/article1",
        "https://example.com/article2"
    ]

    all_text = ""
    for url in urls:
        print(f"Fetching: {url}")
        text = fetch_article_content(url)

        if not verify_content(text):
            print(f"WARNING: Content from {url} may be incomplete")

        all_text += f"\n\n{text}"

    # Clean and convert
    clean_text = clean_text_for_tts(all_text)
    print(f"Total text: {len(clean_text)} characters")

    # Create audiobook
    success = create_audiobook(clean_text, "/root/audiobook.mp3")

    if success:
        print("Audiobook created successfully!")
    else:
        print("Failed to create audiobook")

if __name__ == "__main__":
    main()

TTS Provider Comparison

ProviderQualityCostAPI Key RequiredBest For
ElevenLabsExcellentPaidYesProfessional audiobooks
OpenAI TTSVery GoodPaidYesGeneral purpose
gTTSGoodFreeNoTesting, budget projects

Troubleshooting

"Content appears to be a summary"

  • Use
    curl
    directly instead of web fetch tools
  • Verify the URL is correct and accessible
  • Check if the site requires JavaScript rendering

"API key not found"

  • Check environment variables:
    echo $OPENAI_API_KEY
  • Ensure keys are exported in the shell
  • Fall back to gTTS if no paid API keys available

"Audio chunks don't sound continuous"

  • Ensure chunking happens at sentence boundaries
  • Consider adding small pauses between sections
  • Use consistent voice settings across all chunks