Vibe-Skills statistics-math
Statistics, probability, linear algebra, and mathematical foundations for data science
install
source · Clone the upstream repo
git clone https://github.com/foryourhealth111-pixel/Vibe-Skills
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/foryourhealth111-pixel/Vibe-Skills "$T" && mkdir -p ~/.claude/skills && cp -r "$T/bundled/skills/statistics-math" ~/.claude/skills/foryourhealth111-pixel-vibe-skills-statistics-math && rm -rf "$T"
manifest:
bundled/skills/statistics-math/SKILL.md — source content
Statistics & Mathematics
Mathematical foundations for data science, machine learning, and statistical analysis.
Quick Start
import numpy as np
import scipy.stats as stats
# Fix: removed unused `from sklearn.linear_model import LinearRegression`
# (LinearRegression is never referenced in this snippet).

# Descriptive statistics on a small example dataset.
data = np.array([23, 45, 67, 32, 45, 67, 89, 12, 34, 56])
print(f"Mean: {np.mean(data):.2f}")
print(f"Median: {np.median(data):.2f}")
# ddof=1 gives the sample (Bessel-corrected) standard deviation.
print(f"Std Dev: {np.std(data, ddof=1):.2f}")
print(f"IQR: {np.percentile(data, 75) - np.percentile(data, 25):.2f}")

# Hypothesis testing: independent two-sample t-test.
sample_a = [23, 45, 67, 32, 45]
sample_b = [56, 78, 45, 67, 89]
t_stat, p_value = stats.ttest_ind(sample_a, sample_b)
print(f"T-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
if p_value < 0.05:
    print("Reject null hypothesis: significant difference")
else:
    print("Fail to reject null hypothesis")
Core Concepts
1. Probability Distributions
import numpy as np
import scipy.stats as stats
# Fix: removed unused `import matplotlib.pyplot as plt` (plt was never used).

# Normal distribution: IQ-style parameters (mean 100, sd 15).
mu, sigma = 100, 15
normal_dist = stats.norm(loc=mu, scale=sigma)
x = np.linspace(50, 150, 100)

# PDF / CDF / quantile calculations.
print(f"P(X < 85): {normal_dist.cdf(85):.4f}")
print(f"P(X > 115): {1 - normal_dist.cdf(115):.4f}")
print(f"95th percentile: {normal_dist.ppf(0.95):.2f}")

# Binomial distribution (discrete): successes in n independent trials.
n, p = 100, 0.3
binom_dist = stats.binom(n=n, p=p)
print(f"P(X = 30): {binom_dist.pmf(30):.4f}")
print(f"P(X <= 30): {binom_dist.cdf(30):.4f}")

# Poisson distribution: count of events per unit of time.
lambda_param = 5
poisson_dist = stats.poisson(mu=lambda_param)
print(f"P(X = 3): {poisson_dist.pmf(3):.4f}")

# Central Limit Theorem demonstration: means of samples drawn from a skewed
# (exponential) population are approximately normal.
# Fix: use a seeded Generator (the original used the unseeded legacy
# np.random API, so the demo printed different numbers on every run).
rng = np.random.default_rng(42)
population = rng.exponential(scale=10, size=100000)
sample_means = [np.mean(rng.choice(population, 30)) for _ in range(1000)]
print(f"Sample means are approximately normal: mean={np.mean(sample_means):.2f}")
2. Hypothesis Testing Framework
from scipy import stats import numpy as np class HypothesisTest: """Framework for statistical hypothesis testing.""" @staticmethod def two_sample_ttest(group_a, group_b, alpha=0.05): """Independent samples t-test.""" t_stat, p_value = stats.ttest_ind(group_a, group_b) effect_size = (np.mean(group_a) - np.mean(group_b)) / np.sqrt( (np.var(group_a) + np.var(group_b)) / 2 ) return { "t_statistic": t_stat, "p_value": p_value, "significant": p_value < alpha, "effect_size_cohens_d": effect_size } @staticmethod def chi_square_test(observed, expected=None, alpha=0.05): """Chi-square test for categorical data.""" if expected is None: chi2, p_value, dof, expected = stats.chi2_contingency(observed) else: chi2, p_value = stats.chisquare(observed, expected) dof = len(observed) - 1 return { "chi2_statistic": chi2, "p_value": p_value, "degrees_of_freedom": dof, "significant": p_value < alpha } @staticmethod def ab_test_proportion(conversions_a, total_a, conversions_b, total_b, alpha=0.05): """Two-proportion z-test for A/B testing.""" p_a = conversions_a / total_a p_b = conversions_b / total_b p_pooled = (conversions_a + conversions_b) / (total_a + total_b) se = np.sqrt(p_pooled * (1 - p_pooled) * (1/total_a + 1/total_b)) z_stat = (p_a - p_b) / se p_value = 2 * (1 - stats.norm.cdf(abs(z_stat))) return { "conversion_a": p_a, "conversion_b": p_b, "lift": (p_b - p_a) / p_a * 100, "z_statistic": z_stat, "p_value": p_value, "significant": p_value < alpha } # Usage result = HypothesisTest.ab_test_proportion( conversions_a=120, total_a=1000, conversions_b=150, total_b=1000 ) print(f"Lift: {result['lift']:.1f}%, p-value: {result['p_value']:.4f}")
3. Linear Algebra Essentials
import numpy as np

# Two small 2x2 matrices used throughout the demo.
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Fundamental matrix operations.
print("Matrix multiplication:", A @ B)
print("Element-wise:", A * B)
print("Transpose:", A.T)
print("Inverse:", np.linalg.inv(A))
print("Determinant:", np.linalg.det(A))

# Eigen-decomposition (the foundation of PCA).
eigenvalues, eigenvectors = np.linalg.eig(A)
print(f"Eigenvalues: {eigenvalues}")

# Singular Value Decomposition (used in dimensionality reduction).
U, S, Vt = np.linalg.svd(A)
print(f"Singular values: {S}")

# Solve the linear system Ax = b.
b = np.array([5, 11])
x = np.linalg.solve(A, b)
print(f"Solution: {x}")


def cosine_similarity(v1, v2):
    """Cosine of the angle between two vectors (NLP, recommendations)."""
    dot_product = np.dot(v1, v2)
    magnitude = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / magnitude


vec1 = np.array([1, 2, 3])
vec2 = np.array([4, 5, 6])
print(f"Cosine similarity: {cosine_similarity(vec1, vec2):.4f}")
4. Regression Analysis
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm

# Fix: seed the random generator — the original drew unseeded random data,
# so the printed coefficients, p-values and scores changed on every run.
rng = np.random.default_rng(0)

# Multiple linear regression with statsmodels: y is a linear combination of
# three features plus Gaussian noise (true coefficients: 2, 3, -1).
X = rng.standard_normal((100, 3))
y = 2 * X[:, 0] + 3 * X[:, 1] - X[:, 2] + rng.standard_normal(100) * 0.5
X_with_const = sm.add_constant(X)  # statsmodels OLS needs an explicit intercept
model = sm.OLS(y, X_with_const).fit()
print(model.summary())
print(f"R-squared: {model.rsquared:.4f}")
print(f"Coefficients: {model.params}")
print(f"P-values: {model.pvalues}")

# Regularization comparison on a simple 80/20 holdout split.
X_train, y_train = X[:80], y[:80]
X_test, y_test = X[80:], y[80:]
models = {
    "OLS": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name}: R²={r2_score(y_test, y_pred):.4f}, RMSE={np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
Tools & Technologies
| Tool | Purpose | Version (2025) |
|---|---|---|
| NumPy | Numerical computing | 1.26+ |
| SciPy | Scientific computing | 1.12+ |
| pandas | Data manipulation | 2.2+ |
| statsmodels | Statistical models | 0.14+ |
| scikit-learn | ML algorithms | 1.4+ |
Troubleshooting Guide
| Issue | Symptoms | Root Cause | Fix |
|---|---|---|---|
| Low p-value, small effect | Significant but meaningless | Large sample size | Check effect size |
| High variance | Unstable estimates | Small sample, outliers | More data, robust methods |
| Multicollinearity | Inflated coefficients | Correlated features | VIF check, remove features |
| Heteroscedasticity | Invalid inference | Non-constant variance | Weighted least squares |
Best Practices
# ✅ DO: Check assumptions before testing
from scipy.stats import shapiro

# Fix: the original called shapiro(data) on an undefined name `data`
# (NameError if run as written); define example data so the snippet runs.
data = [4.9, 5.1, 5.0, 4.8, 5.2, 5.0, 4.7, 5.3, 5.1, 4.9]
stat, p = shapiro(data)
if p > 0.05:
    print("Data is approximately normal")

# ✅ DO: Use effect sizes, not just p-values
# ✅ DO: Correct for multiple comparisons (Bonferroni)
# ✅ DO: Report confidence intervals
# ❌ DON'T: p-hack by trying many tests
# ❌ DON'T: Confuse correlation with causation
# ❌ DON'T: Ignore sample size requirements
Resources
- Khan Academy Statistics
- StatQuest with Josh Starmer
- "Introduction to Statistical Learning" (ISLR)
Skill Certification Checklist:
- Can calculate descriptive statistics
- Can perform hypothesis tests (t-test, chi-square)
- Can implement A/B testing
- Can perform regression analysis
- Can use matrix operations for ML