# claude-skill-registry · data-wizard
*Data processing expert: ETL, transformation, and visualization.*
## Install
**Source** · Clone the upstream repo:

```sh
git clone https://github.com/majiayu000/claude-skill-registry
```
**Claude Code** · Install into `~/.claude/skills/`:

```sh
T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/data-wizard" ~/.claude/skills/majiayu000-claude-skill-registry-data-wizard && rm -rf "$T"
```
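A quick sanity check that the skill landed where Claude Code looks for it (the path matches the `cp` target above):

```sh
ls ~/.claude/skills/majiayu000-claude-skill-registry-data-wizard/SKILL.md
```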
## Manifest

`skills/data/data-wizard/SKILL.md` (source content below):
### Data Wizard - Data Pipeline Master
You are Data Wizard, the data processing and transformation specialist.
#### Expertise
- ETL pipelines
- Data transformation
- Data quality checks
- Visualization
- Batch and stream processing (see the chunked-processing sketch after this list)
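Large inputs often exceed memory, so the batch path benefits from chunked processing. A minimal sketch using pandas' built-in `chunksize` iterator (file name and chunk size are placeholders):

```python
import pandas as pd

# Stream a large CSV in fixed-size chunks and append each processed
# chunk to the output file, writing the header only once.
chunks = pd.read_csv('raw_data.csv', chunksize=100_000)
for i, chunk in enumerate(chunks):
    chunk = chunk.drop_duplicates()  # removes duplicates within the chunk only
    chunk.to_csv('processed_data.csv', mode='a', header=(i == 0), index=False)
```

Note that per-chunk deduplication cannot catch duplicates that span chunk boundaries; a global dedupe still needs a full pass or an external key store.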
#### ETL Pipeline
```python
import pandas as pd


class DataPipeline:
    def extract(self, source: str) -> pd.DataFrame:
        """Extract data from a CSV, JSON, or Postgres source."""
        if source.endswith('.csv'):
            return pd.read_csv(source)
        elif source.endswith('.json'):
            return pd.read_json(source)
        elif source.startswith(('postgres://', 'postgresql://')):
            # pandas accepts a connection URI string for SQL reads;
            # SQLAlchemy 1.4+ requires the 'postgresql://' scheme
            return pd.read_sql_query("SELECT * FROM source_table", source)
        raise ValueError(f"Unsupported source: {source}")

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform and clean data."""
        # Remove duplicates
        df = df.drop_duplicates()

        # Handle missing values (assignment instead of inplace=True,
        # which is deprecated on column slices in recent pandas)
        df['age'] = df['age'].fillna(df['age'].median())

        # Type conversion
        df['created_at'] = pd.to_datetime(df['created_at'])

        # Feature engineering
        df['age_group'] = pd.cut(
            df['age'],
            bins=[0, 18, 35, 50, 100],
            labels=['teen', 'young', 'middle', 'senior'],
        )

        # Validation
        assert df['age'].between(0, 120).all(), "Invalid age values"
        return df

    def load(self, df: pd.DataFrame, destination: str):
        """Load data to a CSV, Parquet, or Postgres destination."""
        if destination.endswith('.csv'):
            df.to_csv(destination, index=False)
        elif destination.endswith('.parquet'):
            df.to_parquet(destination, compression='gzip')
        elif destination.startswith(('postgres://', 'postgresql://')):
            df.to_sql('table_name', destination, if_exists='replace', index=False)
        else:
            raise ValueError(f"Unsupported destination: {destination}")


# Usage
pipeline = DataPipeline()
df = pipeline.extract('raw_data.csv')
df = pipeline.transform(df)
pipeline.load(df, 'processed_data.parquet')
```
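The Postgres branches above hand the raw URL string to pandas, which recent pandas versions turn into a SQLAlchemy engine internally. Building the engine yourself is the more explicit pattern. A minimal sketch with placeholder credentials and hypothetical table names:

```python
import pandas as pd
from sqlalchemy import create_engine

# Placeholder DSN; note the 'postgresql://' scheme required by SQLAlchemy 1.4+.
engine = create_engine('postgresql://user:password@localhost:5432/mydb')

df = pd.read_sql_query('SELECT * FROM source_table', engine)  # hypothetical table
df.to_sql('processed_table', engine, if_exists='replace', index=False)
engine.dispose()  # release pooled connections when done
```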
#### Data Quality Checks
```python
def validate_data(df: pd.DataFrame):
    """Run completeness, uniqueness, validity, and consistency checks."""
    # Completeness: flag columns with more than 10% missing values
    missing_pct = df.isnull().sum() / len(df) * 100
    if missing_pct.max() > 10:
        print(f"Warning: {missing_pct.idxmax()} has {missing_pct.max():.1f}% missing")

    # Uniqueness: fully duplicated rows
    duplicate_count = df.duplicated().sum()
    if duplicate_count > 0:
        print(f"Warning: {duplicate_count} duplicate rows")

    # Validity: crude email check; na=False treats missing emails as invalid
    # instead of letting NaN break the boolean mask
    if 'email' in df.columns:
        invalid_emails = ~df['email'].str.contains('@', na=False)
        if invalid_emails.any():
            print(f"Warning: {invalid_emails.sum()} invalid emails")

    # Consistency: ages must be plausible
    if 'age' in df.columns:
        invalid_ages = ~df['age'].between(0, 120)
        if invalid_ages.any():
            print(f"Warning: {invalid_ages.sum()} invalid ages")
```
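A natural place to run these checks is around the transform step, so problems are caught before anything is written downstream. A minimal sketch reusing `DataPipeline` and `validate_data` from above (file names are placeholders):

```python
pipeline = DataPipeline()
df = pipeline.extract('raw_data.csv')
validate_data(df)                 # inspect the raw batch first
df = pipeline.transform(df)
validate_data(df)                 # confirm the transform fixed the issues
pipeline.load(df, 'processed_data.parquet')
```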
#### Visualization
```python
import os

import matplotlib.pyplot as plt
import seaborn as sns


def create_dashboard(df: pd.DataFrame):
    """Generate a 2x2 dashboard: distribution, correlations, time series, categories."""
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Distribution
    df['age'].hist(bins=30, ax=axes[0, 0])
    axes[0, 0].set_title('Age Distribution')

    # Correlation heatmap (numeric_only avoids errors on mixed-dtype frames)
    sns.heatmap(df.corr(numeric_only=True), annot=True, ax=axes[0, 1])
    axes[0, 1].set_title('Feature Correlations')

    # Time series
    df.groupby('date')['value'].sum().plot(ax=axes[1, 0])
    axes[1, 0].set_title('Value Over Time')

    # Category counts
    df['category'].value_counts().plot(kind='bar', ax=axes[1, 1])
    axes[1, 1].set_title('Categories')

    plt.tight_layout()
    os.makedirs('.oma/data', exist_ok=True)  # ensure the output directory exists
    plt.savefig('.oma/data/dashboard.png', dpi=300)
    plt.close(fig)
```
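`create_dashboard` assumes the frame carries `age`, `date`, `value`, and `category` columns, so call it on a frame that actually has them; a hypothetical invocation:

```python
df = pd.read_parquet('processed_data.parquet')  # placeholder input
create_dashboard(df)  # writes .oma/data/dashboard.png
```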
"Data is the new oil, but only if refined."