claude-skill-registry · data-ai-skills
Master machine learning, data engineering, AI engineering, LLMs, prompt engineering, and MLOps. Build intelligent systems with Python.
install
source · Clone the upstream repo
```bash
git clone https://github.com/majiayu000/claude-skill-registry
```
Claude Code · Install into ~/.claude/skills/
```bash
T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/data-ai-skills" ~/.claude/skills/majiayu000-claude-skill-registry-data-ai-skills && rm -rf "$T"
```
manifest: skills/data/data-ai-skills/SKILL.md
# Data & AI Engineering Skills

## Python Fundamentals

```python
import numpy as np
import pandas as pd

# NumPy arrays with type hints
def process_array(arr: np.ndarray) -> np.ndarray:
    """Process a NumPy array with validation."""
    if arr.size == 0:
        raise ValueError("Empty array not allowed")
    return np.clip(arr, 0, 1)

# Pandas DataFrames with error handling
def load_data(path: str) -> pd.DataFrame:
    """Load data with validation."""
    try:
        df = pd.read_csv(path)
        if df.empty:
            raise ValueError(f"Empty dataset: {path}")
        return df
    except FileNotFoundError:
        raise FileNotFoundError(f"Data file not found: {path}")
```
Machine Learning with Scikit-Learn
from sklearn.model_selection import train_test_split, cross_val_score from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import classification_report import logging logger = logging.getLogger(__name__) def train_model(X, y, n_estimators=100, random_state=42): """Train model with logging and validation.""" logger.info(f"Training with {len(X)} samples") # Validate input if len(X) != len(y): raise ValueError("X and y must have same length") # Split data X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=random_state, stratify=y ) # Train model = RandomForestClassifier(n_estimators=n_estimators) model.fit(X_train, y_train) # Evaluate y_pred = model.predict(X_test) report = classification_report(y_test, y_pred, output_dict=True) logger.info(f"Accuracy: {report['accuracy']:.4f}") return model, report
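A minimal usage sketch, assuming `train_model` above is in scope; it uses scikit-learn's bundled breast-cancer dataset purely as a smoke test.

```python
# Usage sketch (assumption: train_model from above is importable)
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
model, report = train_model(X, y, n_estimators=200)
print(f"Held-out accuracy: {report['accuracy']:.4f}")
```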
Deep Learning with PyTorch
import torch import torch.nn as nn from torch.utils.data import DataLoader class SimpleNet(nn.Module): def __init__(self, input_dim: int, hidden_dim: int = 64): super().__init__() self.fc1 = nn.Linear(input_dim, hidden_dim) self.fc2 = nn.Linear(hidden_dim, 1) self.relu = nn.ReLU() self.dropout = nn.Dropout(0.2) def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.relu(self.fc1(x)) x = self.dropout(x) return torch.sigmoid(self.fc2(x)) def train_epoch(model, dataloader, optimizer, loss_fn, device): """Train one epoch with gradient clipping.""" model.train() total_loss = 0 for batch_x, batch_y in dataloader: batch_x, batch_y = batch_x.to(device), batch_y.to(device) optimizer.zero_grad() output = model(batch_x) loss = loss_fn(output, batch_y) loss.backward() # Gradient clipping torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) optimizer.step() total_loss += loss.item() return total_loss / len(dataloader)
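A wiring sketch for the pieces above, using synthetic tensors; the shapes, Adam learning rate, and epoch count are illustrative, not recommendations.

```python
# Wiring sketch with synthetic binary-classification data
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X = torch.randn(256, 10)                   # 256 samples, 10 features
y = torch.randint(0, 2, (256, 1)).float()  # binary targets, shape (N, 1)

loader = DataLoader(TensorDataset(X, y), batch_size=32, shuffle=True)
model = SimpleNet(input_dim=10).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCELoss()  # SimpleNet already applies sigmoid

for epoch in range(5):
    avg_loss = train_epoch(model, loader, optimizer, loss_fn, device)
    print(f"epoch {epoch}: loss={avg_loss:.4f}")
```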
## LLM Applications (2025 Best Practices)

```python
import json

import backoff
from anthropic import Anthropic

client = Anthropic()

@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def query_llm(prompt: str, max_tokens: int = 1024) -> str:
    """Query the LLM with exponential-backoff retries."""
    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
    )
    return message.content[0].text

# Structured output
def extract_entities(text: str) -> dict:
    """Extract entities and parse them as JSON."""
    prompt = f"""Extract entities from the text.
Return JSON with keys: persons, organizations, locations.

Text: {text}
"""
    response = query_llm(prompt)
    return json.loads(response)
```
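`json.loads` on a raw reply fails when the model wraps the JSON in prose or code fences; a small hardening sketch (a heuristic of ours, not part of the Anthropic API):

```python
# Heuristic hardening for extract_entities: pull out the first {...}
# span before parsing. Helps when the model adds prose or code fences;
# it is not a guarantee of valid JSON.
import json
import re

def parse_json_reply(response: str) -> dict:
    match = re.search(r"\{.*\}", response, re.DOTALL)
    if match is None:
        raise ValueError(f"No JSON object in reply: {response[:80]!r}")
    return json.loads(match.group(0))
```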
## Data Pipeline with Checkpointing

```python
from pathlib import Path

import pandas as pd

def process_with_checkpoint(
    df: pd.DataFrame,
    checkpoint_dir: str,
    step_name: str,
) -> pd.DataFrame:
    """Process with checkpoint recovery."""
    checkpoint_path = Path(checkpoint_dir) / f"{step_name}.parquet"

    if checkpoint_path.exists():
        print(f"Loading checkpoint: {step_name}")
        return pd.read_parquet(checkpoint_path)

    # Process (expensive_transform is a placeholder for your costly step)
    result = expensive_transform(df)

    # Save checkpoint
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
    result.to_parquet(checkpoint_path)
    return result
```
## MLOps with MLflow

```python
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier

def train_with_tracking(X_train, y_train, X_test, y_test, params):
    """Train with experiment tracking."""
    mlflow.set_experiment("model-training")

    with mlflow.start_run():
        # Log parameters
        mlflow.log_params(params)

        # Train
        model = RandomForestClassifier(**params)
        model.fit(X_train, y_train)

        # Evaluate
        accuracy = model.score(X_test, y_test)
        mlflow.log_metric("accuracy", accuracy)

        # Log model
        mlflow.sklearn.log_model(model, "model")

        return model, accuracy
```
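A follow-up sketch for loading the logged model back for inference; the `runs:/` URI scheme is standard MLflow, while `X_test` and the run-ID lookup are assumptions tied to the snippet above.

```python
# Reload the logged model by run ID. The ID can also be copied from
# the MLflow UI; X_test is assumed from train_with_tracking above.
import mlflow
import mlflow.sklearn

run_id = mlflow.last_active_run().info.run_id
loaded = mlflow.sklearn.load_model(f"runs:/{run_id}/model")
preds = loaded.predict(X_test)
```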
## Model Evaluation Metrics
| Task | Primary Metrics | Secondary Metrics |
|---|---|---|
| Classification | Accuracy, F1 | Precision, Recall, AUC |
| Regression | RMSE, MAE | R², MAPE |
| Ranking | NDCG | MRR, MAP |
| Clustering | Silhouette | Davies-Bouldin |
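For concreteness, a sketch computing a few of these with scikit-learn; the toy arrays are illustrative only.

```python
# Illustrative metric computations for the table above
import numpy as np
from sklearn.metrics import (
    f1_score, mean_absolute_error, mean_squared_error,
    r2_score, roc_auc_score, silhouette_score,
)

# Classification
y_true = np.array([0, 1, 1, 0, 1])
y_prob = np.array([0.2, 0.8, 0.6, 0.3, 0.9])
y_pred = (y_prob >= 0.5).astype(int)
print("F1:", f1_score(y_true, y_pred), "AUC:", roc_auc_score(y_true, y_prob))

# Regression
y_rt = np.array([3.0, 2.5, 4.0])
y_rp = np.array([2.8, 2.7, 3.6])
print("RMSE:", mean_squared_error(y_rt, y_rp) ** 0.5,
      "MAE:", mean_absolute_error(y_rt, y_rp),
      "R2:", r2_score(y_rt, y_rp))

# Clustering (synthetic points and labels)
rng = np.random.default_rng(0)
X_c = rng.standard_normal((50, 2))
labels = (X_c[:, 0] > 0).astype(int)
print("Silhouette:", silhouette_score(X_c, labels))
```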
## Unit Test Template

```python
import numpy as np
import pytest

from your_module import process_array, train_model

class TestMLFunctions:
    @pytest.fixture
    def sample_data(self):
        X = np.random.randn(100, 10)
        y = np.random.randint(0, 2, 100)
        return X, y

    def test_train_model(self, sample_data):
        X, y = sample_data
        model, report = train_model(X, y)
        assert model is not None
        assert 'accuracy' in report
        assert 0 <= report['accuracy'] <= 1

    def test_process_array_empty(self):
        with pytest.raises(ValueError, match="Empty array"):
            process_array(np.array([]))

    def test_process_array_clips(self):
        arr = np.array([-1, 0.5, 2])
        result = process_array(arr)
        assert result.min() >= 0
        assert result.max() <= 1
```
## Troubleshooting Guide
| Symptom | Cause | Solution |
|---|---|---|
| CUDA OOM | Batch size too large | Reduce batch size or use gradient accumulation (sketch below) |
| Loss NaN | Learning rate too high | Reduce LR, add gradient clipping |
| Overfitting | Model too complex | Add regularization, more data |
| Slow training | I/O bottleneck | Use DataLoader workers |
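A minimal sketch of gradient accumulation from the CUDA OOM row: keep micro-batches small but step the optimizer only every `accum_steps` batches, approximating one larger effective batch. Names and the default step count are illustrative.

```python
import torch

def train_epoch_accum(model, dataloader, optimizer, loss_fn, device,
                      accum_steps: int = 4):
    """Like train_epoch above, but with gradient accumulation."""
    model.train()
    optimizer.zero_grad()
    total_loss = 0.0
    for i, (batch_x, batch_y) in enumerate(dataloader):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        # Scale so accumulated gradients match one large-batch step
        loss = loss_fn(model(batch_x), batch_y) / accum_steps
        loss.backward()  # gradients add up across micro-batches
        if (i + 1) % accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad()
        total_loss += loss.item() * accum_steps
    # Flush a trailing partial accumulation window
    if (i + 1) % accum_steps != 0:
        optimizer.step()
        optimizer.zero_grad()
    return total_loss / len(dataloader)
```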
## Key Concepts Checklist
- Python basics and NumPy
- Pandas data manipulation
- Data visualization (Matplotlib, Seaborn)
- Supervised learning (classification, regression)
- Unsupervised learning (clustering)
- Feature engineering
- Model evaluation and metrics
- Hyperparameter tuning (combined with cross-validation and imbalanced-data handling in the sketch after this checklist)
- Cross-validation
- Handling imbalanced data
- Deep learning basics
- LLM integration
- Prompt engineering
- MLOps pipeline setup
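One sketch covering three checklist items at once: `GridSearchCV` tunes hyperparameters via cross-validation, and `class_weight="balanced"` is a simple first lever for imbalanced classes. The parameter grid and scoring choice are illustrative; `X, y` are assumed from the scikit-learn section.

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 300],
    "max_depth": [None, 10, 30],
}
search = GridSearchCV(
    RandomForestClassifier(class_weight="balanced", random_state=42),
    param_grid,
    cv=5,          # 5-fold cross-validation
    scoring="f1",  # more informative than accuracy when classes are skewed
    n_jobs=-1,
)
search.fit(X, y)  # X, y as in the scikit-learn section above
print(search.best_params_, search.best_score_)
```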
Source: https://roadmap.sh · Version: 2.0.0 · Last Updated: 2025-01-01