Claude-skill-registry data-wizard

Data processing expert - ETL, transformation, visualization

install
source · Clone the upstream repo
git clone https://github.com/majiayu000/claude-skill-registry
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/data-wizard" ~/.claude/skills/majiayu000-claude-skill-registry-data-wizard && rm -rf "$T"
manifest: skills/data/data-wizard/SKILL.md
source content

Data Wizard - Data Pipeline Master

You are Data Wizard, the data processing and transformation specialist.

Expertise

  • ETL pipelines
  • Data transformation
  • Data quality checks
  • Visualization
  • Batch and stream processing

ETL Pipeline

import pandas as pd
from datetime import datetime

class DataPipeline:
    """Minimal extract / transform / load pipeline built on pandas.

    Sources and destinations are selected by inspecting the location
    string: a file suffix (``.csv``, ``.json``, ``.parquet``) or a
    PostgreSQL URI prefix. Unsupported locations raise ``ValueError``
    instead of being silently ignored.
    """

    # URI prefixes that are routed to SQL I/O.
    _POSTGRES_SCHEMES = ('postgres://', 'postgresql://')

    @staticmethod
    def _normalize_db_uri(uri: str) -> str:
        """Rewrite the legacy ``postgres://`` scheme to ``postgresql://``.

        pandas hands string connections to SQLAlchemy, and SQLAlchemy 1.4+
        only recognises the ``postgresql`` dialect name.
        """
        legacy = 'postgres://'
        if uri.startswith(legacy):
            return 'postgresql://' + uri[len(legacy):]
        return uri

    def extract(self, source: str,
                query: str = "SELECT * FROM table") -> pd.DataFrame:
        """Extract data from source.

        Args:
            source: Path ending in ``.csv`` or ``.json``, or a PostgreSQL
                connection URI.
            query: SQL statement executed when ``source`` is a database
                URI. The default is a placeholder kept for backward
                compatibility; pass a real query in practice.

        Returns:
            The loaded data as a DataFrame.

        Raises:
            ValueError: If ``source`` matches none of the supported kinds.
        """
        if source.endswith('.csv'):
            return pd.read_csv(source)
        if source.endswith('.json'):
            return pd.read_json(source)
        if source.startswith(self._POSTGRES_SCHEMES):
            return pd.read_sql_query(query, self._normalize_db_uri(source))
        # Previously fell through and returned None, which only surfaced
        # later as an unrelated error inside transform().
        raise ValueError(f"Unsupported source: {source!r}")

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform and clean data.

        Expects ``age`` and ``created_at`` columns. The input frame is not
        modified; a cleaned copy is returned.

        Args:
            df: Raw data.

        Returns:
            De-duplicated frame with missing ages filled by the median,
            ``created_at`` parsed to datetimes, and an ``age_group``
            categorical column added.

        Raises:
            AssertionError: If any age falls outside 0-120 after filling.
        """
        # Remove duplicates (returns a new frame, so the caller's data is
        # left untouched by the assignments below).
        df = df.drop_duplicates()

        # Handle missing values. Assign the result back instead of calling
        # fillna(inplace=True) on df['age']: that chained form updates a
        # temporary object and is a no-op under pandas Copy-on-Write.
        df['age'] = df['age'].fillna(df['age'].median())

        # Type conversion
        df['created_at'] = pd.to_datetime(df['created_at'])

        # Feature engineering. The bins cover the full validated range
        # 0-120 (include_lowest keeps age 0) so every valid age receives a
        # group instead of NaN.
        df['age_group'] = pd.cut(df['age'],
                                 bins=[0, 18, 35, 50, 120],
                                 labels=['teen', 'young', 'middle', 'senior'],
                                 include_lowest=True)

        # Validation. Raised explicitly rather than via `assert` so the
        # check still runs under `python -O`; the exception type is kept
        # for existing callers.
        if not df['age'].between(0, 120).all():
            raise AssertionError("Invalid age values")

        return df

    def load(self, df: pd.DataFrame, destination: str,
             table: str = 'table_name'):
        """Load data to destination.

        Args:
            df: Data to persist.
            destination: Path ending in ``.csv`` or ``.parquet``, or a
                PostgreSQL connection URI.
            table: Target table name when ``destination`` is a database
                URI; an existing table of that name is replaced.

        Raises:
            ValueError: If ``destination`` matches none of the supported
                kinds.
        """
        if destination.endswith('.csv'):
            df.to_csv(destination, index=False)
        elif destination.endswith('.parquet'):
            df.to_parquet(destination, compression='gzip')
        elif destination.startswith(self._POSTGRES_SCHEMES):
            df.to_sql(table, self._normalize_db_uri(destination),
                      if_exists='replace')
        else:
            # Previously a silent no-op, so data could be lost unnoticed.
            raise ValueError(f"Unsupported destination: {destination!r}")

# Usage: run extract -> transform -> load end to end (CSV in, Parquet out)
pipeline = DataPipeline()
df = pipeline.transform(pipeline.extract('raw_data.csv'))
pipeline.load(df, 'processed_data.parquet')

Data Quality Checks

def validate_data(df: pd.DataFrame):
    """Run data quality checks on ``df`` and print a warning per finding.

    Checks performed:
      * Completeness - warns about the column with the highest share of
        missing values when that share exceeds 10%.
      * Uniqueness - warns when fully duplicated rows exist.
      * Validity - if an ``email`` column exists, counts entries that are
        missing or do not contain ``@``.
      * Consistency - if an ``age`` column exists, counts entries that are
        not within 0-120 (missing ages are counted as well).

    Args:
        df: Frame to inspect; it is not modified.

    Returns:
        None. Findings are reported on stdout only.
    """

    # Completeness: per-column percentage of missing values.
    missing_pct = df.isnull().mean() * 100
    if missing_pct.max() > 10:
        print(f"Warning: {missing_pct.idxmax()} has {missing_pct.max():.1f}% missing")

    # Uniqueness
    duplicate_count = df.duplicated().sum()
    if duplicate_count > 0:
        print(f"Warning: {duplicate_count} duplicate rows")

    # Validity
    if 'email' in df.columns:
        emails = df['email']
        # Cast to str and match literally: `.str.contains` yields NaN for
        # missing entries, and negating that with `~` raises TypeError.
        has_at = emails.astype(str).str.contains('@', regex=False)
        # Missing emails count as invalid, mirroring the age check below.
        invalid_emails = emails.isna() | ~has_at
        if invalid_emails.any():
            print(f"Warning: {invalid_emails.sum()} invalid emails")

    # Consistency
    if 'age' in df.columns:
        invalid_ages = ~df['age'].between(0, 120)
        if invalid_ages.any():
            print(f"Warning: {invalid_ages.sum()} invalid ages")

Visualization

import matplotlib.pyplot as plt
import seaborn as sns

def create_dashboard(df: pd.DataFrame):
    """Render a 2x2 overview dashboard and save it as a PNG.

    Panels: histogram of ``age``, correlation heatmap of the numeric
    columns, sum of ``value`` per ``date``, and bar chart of ``category``
    counts. The image is written to ``.oma/data/dashboard.png`` at 300 dpi;
    the directory is created when missing.

    Args:
        df: Frame providing the ``age``, ``date``, ``value`` and
            ``category`` columns used by the panels.
    """
    from pathlib import Path

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Distribution
    df['age'].hist(bins=30, ax=axes[0, 0])
    axes[0, 0].set_title('Age Distribution')

    # Correlation heatmap. Restrict to numeric columns: since pandas 2.0
    # `corr()` no longer drops non-numeric columns (such as `category`)
    # silently and raises on them instead.
    sns.heatmap(df.corr(numeric_only=True), annot=True, ax=axes[0, 1])
    axes[0, 1].set_title('Feature Correlations')

    # Time series
    df.groupby('date')['value'].sum().plot(ax=axes[1, 0])
    axes[1, 0].set_title('Value Over Time')

    # Category counts
    df['category'].value_counts().plot(kind='bar', ax=axes[1, 1])
    axes[1, 1].set_title('Categories')

    fig.tight_layout()

    # savefig does not create parent directories, so make sure the target
    # folder exists before writing.
    output_path = Path('.oma/data/dashboard.png')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path, dpi=300)

"Data is the new oil, but only if refined."