install
source · Clone the upstream repo
git clone https://github.com/ComeOnOliver/skillshub
Claude Code · Install into ~/.claude/skills/
# Install the pandas-ai skill into ~/.claude/skills/ from a throwaway shallow clone.
# Runs in a subshell so `set -e` and the EXIT trap do not leak into the calling shell.
(
  set -e
  T=$(mktemp -d)
  # Remove the temporary clone even when a later step fails.
  trap 'rm -rf "$T"' EXIT
  git clone --depth=1 https://github.com/ComeOnOliver/skillshub "$T"
  DEST="$HOME/.claude/skills/comeonoliver-skillshub-pandas-ai"
  mkdir -p "$DEST"
  # Copy the directory's contents (note the trailing "/.") so that re-running
  # updates the existing install instead of nesting a pandas-ai/ folder inside it.
  cp -r "$T/skills/TerminalSkills/skills/pandas-ai/." "$DEST/"
)
manifest:
skills/TerminalSkills/skills/pandas-ai/SKILL.md
source content
PandasAI
PandasAI adds natural language capabilities to pandas. Ask questions about your data in English and get answers, charts, and transformations — powered by LLMs.
Installation
# Install PandasAI
pip install pandasai

# With OpenAI
# (extras are quoted so shells such as zsh do not treat the brackets as a glob)
pip install "pandasai[openai]"

# With local models via Ollama
pip install "pandasai[langchain]"
Basic Usage
# basic.py: Ask questions about a DataFrame in natural language
import pandas as pd
from pandasai import SmartDataframe
from pandasai.llm import OpenAI

# Placeholder token: replace with a real OpenAI API key before running.
llm = OpenAI(api_token="your-openai-api-key")

df = pd.DataFrame({
    "country": ["USA", "UK", "France", "Germany", "Japan"],
    "population": [331_000_000, 67_000_000, 67_000_000, 83_000_000, 125_000_000],
    "gdp_billion": [25_460, 3_070, 2_780, 4_070, 4_230],
})

# Wrap the DataFrame so it can be queried through the configured LLM.
sdf = SmartDataframe(df, config={"llm": llm})

# Ask questions in natural language
answer = sdf.chat("Which country has the highest GDP?")
print(answer)  # USA

answer = sdf.chat("What is the average population?")
print(answer)  # 134,600,000

answer = sdf.chat("List countries with GDP above 4000 billion")
print(answer)
Multiple DataFrames
# multi-df.py: Query across multiple related DataFrames
# Reuses `llm` from the Basic Usage example (basic.py).
import pandas as pd
from pandasai import SmartDatalake

employees = pd.DataFrame({
    "id": [1, 2, 3, 4, 5],
    "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
    "department_id": [1, 2, 1, 3, 2],
    "salary": [85000, 72000, 90000, 68000, 95000],
})

departments = pd.DataFrame({
    "id": [1, 2, 3],
    "name": ["Engineering", "Marketing", "Sales"],
    "budget": [500000, 200000, 300000],
})

# Both frames go into one datalake so a single question can span them.
lake = SmartDatalake([employees, departments], config={"llm": llm})

result = lake.chat("What is the average salary per department?")
print(result)

result = lake.chat("Which department is over budget based on total salaries?")
print(result)
Generate Charts
# charts.py: Create visualizations from natural language
# Reuses `df` and `llm` from the Basic Usage example (basic.py).
from pandasai import SmartDataframe

sdf = SmartDataframe(df, config={
    "llm": llm,
    "save_charts": True,
    "save_charts_path": "./charts",
})

# Generate charts by asking
sdf.chat("Create a bar chart of GDP by country")
sdf.chat("Plot a pie chart of population distribution")
sdf.chat("Show a scatter plot of GDP vs population")
# Charts saved as PNG in ./charts/
Data Cleaning
# cleaning.py: Use natural language for data cleaning tasks
# Reuses `llm` from the Basic Usage example (basic.py).
import pandas as pd
from pandasai import SmartDataframe

# Deliberately messy sample: inconsistent name casing, a missing name,
# invalid/empty emails, out-of-range ages and a missing salary.
dirty_df = pd.DataFrame({
    "name": ["Alice", "bob", "CHARLIE", None, "Eve"],
    "email": ["alice@co.com", "invalid", "charlie@co.com", "diana@co.com", ""],
    "age": [30, -5, 45, 200, 28],
    "salary": [85000, 72000, None, 68000, 95000],
})

sdf = SmartDataframe(dirty_df, config={"llm": llm})

# Clean with natural language
# NOTE(review): every request below is sent to the same `sdf`, and `cleaned`
# is overwritten each time, so the steps do not obviously build on one
# another. Confirm whether chat() modifies `sdf` in place before relying on
# the final value of `cleaned`.
cleaned = sdf.chat("Remove rows where age is negative or above 150")
cleaned = sdf.chat("Fill missing salaries with the median salary")
cleaned = sdf.chat("Standardize names to title case")
cleaned = sdf.chat("Remove rows with invalid email addresses")
Custom Configuration
# config.py: Advanced PandasAI configuration
# Reuses `df` and `llm` from the Basic Usage example (basic.py).
from pandasai import SmartDataframe

sdf = SmartDataframe(df, config={
    "llm": llm,
    "conversational": True,   # Natural language responses
    "verbose": True,          # Show generated code
    "enable_cache": True,     # Cache repeated queries
    "max_retries": 3,         # Retry on LLM errors
    "custom_whitelisted_dependencies": ["scipy", "sklearn"],
    "save_logs": True,
})

# View the generated Python code
sdf.chat("What is the correlation between GDP and population?")
print(sdf.last_code_generated)
Using Local Models
# local-llm.py: Use Ollama or other local models instead of OpenAI
# Reuses `df` from the Basic Usage example (basic.py).
from pandasai import SmartDataframe
from pandasai.llm.local_llm import LocalLLM

# With Ollama running locally
llm = LocalLLM(api_base="http://localhost:11434/v1", model="llama3")

sdf = SmartDataframe(df, config={"llm": llm})
answer = sdf.chat("Summarize this dataset")
print(answer)
Pipeline Integration
# pipeline.py: Use PandasAI in an automated analysis pipeline
import json

import pandas as pd
from pandasai import SmartDataframe
from pandasai.llm import OpenAI


def analyze_dataset(csv_path: str, questions: list[str]) -> dict:
    """Run a set of natural language questions against a CSV dataset.

    Args:
        csv_path: Path of the CSV file, loaded with ``pd.read_csv``.
        questions: Natural language questions, asked in the given order.

    Returns:
        A dict mapping each question to ``str(answer)``, or to
        ``"Error: <exception>"`` when asking that question raised.
    """
    # Placeholder token: replace with a real OpenAI API key before running.
    llm = OpenAI(api_token="your-key")
    df = pd.read_csv(csv_path)
    sdf = SmartDataframe(df, config={"llm": llm, "conversational": True})

    results = {}
    for question in questions:
        # Best-effort per question: a failure is recorded in the report
        # instead of aborting the remaining questions.
        try:
            answer = sdf.chat(question)
            results[question] = str(answer)
        except Exception as e:
            results[question] = f"Error: {e}"
    return results


# Usage
if __name__ == "__main__":
    # Guarded so importing this module does not read sales.csv or call the LLM.
    report = analyze_dataset("sales.csv", [
        "What was the total revenue last month?",
        "Which product category had the most sales?",
        "What is the month-over-month growth rate?",
    ])
    print(json.dumps(report, indent=2))