Babysitter distribution-fitter
Statistical distribution fitting skill for input modeling in simulation and analysis.
install
source · Clone the upstream repo
git clone https://github.com/a5c-ai/babysitter
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/a5c-ai/babysitter "$T" && mkdir -p ~/.claude/skills && cp -r "$T/library/specializations/domains/science/industrial-engineering/skills/distribution-fitter" ~/.claude/skills/a5c-ai-babysitter-distribution-fitter && rm -rf "$T"
manifest:
library/specializations/domains/science/industrial-engineering/skills/distribution-fitter/SKILL.mdsource content
distribution-fitter
You are distribution-fitter - a specialized skill for fitting statistical distributions to data for input modeling in simulation and analysis.
Overview
This skill enables AI-powered distribution fitting including:
- Goodness-of-fit testing (Chi-square, K-S, Anderson-Darling)
- Maximum likelihood estimation
- Distribution parameter estimation
- Inter-arrival time analysis
- Service time distribution fitting
- Empirical distribution construction
- Distribution comparison and selection
Prerequisites
- Python 3.8+ with scipy, fitter installed
- Statistical analysis libraries
- Understanding of probability distributions
Capabilities
1. Automated Distribution Fitting
from fitter import Fitter import numpy as np def fit_distribution(data, distributions=None): """ Fit multiple distributions and select best fit """ if distributions is None: distributions = ['norm', 'expon', 'gamma', 'lognorm', 'weibull_min', 'beta', 'uniform', 'triang'] f = Fitter(data, distributions=distributions) f.fit() # Get summary summary = f.summary() # Best distribution best = f.get_best(method='sumsquare_error') return { "best_distribution": list(best.keys())[0], "parameters": best, "summary": summary.to_dict(), "all_fits": f.fitted_param }
2. Goodness-of-Fit Testing
from scipy import stats import numpy as np def goodness_of_fit_tests(data, distribution, params): """ Perform multiple goodness-of-fit tests """ results = {} # Kolmogorov-Smirnov test ks_stat, ks_pvalue = stats.kstest(data, distribution, args=params) results['kolmogorov_smirnov'] = { 'statistic': ks_stat, 'p_value': ks_pvalue, 'conclusion': 'accept' if ks_pvalue > 0.05 else 'reject' } # Chi-square test observed, bins = np.histogram(data, bins='auto') dist = getattr(stats, distribution) expected = len(data) * np.diff(dist.cdf(bins, *params)) # Combine bins with low expected counts mask = expected >= 5 chi2_stat, chi2_pvalue = stats.chisquare( observed[mask], expected[mask] ) results['chi_square'] = { 'statistic': chi2_stat, 'p_value': chi2_pvalue, 'degrees_of_freedom': sum(mask) - len(params) - 1 } # Anderson-Darling test (for specific distributions) if distribution in ['norm', 'expon', 'gumbel', 'logistic']: ad_result = stats.anderson(data, dist=distribution) results['anderson_darling'] = { 'statistic': ad_result.statistic, 'critical_values': dict(zip( ['15%', '10%', '5%', '2.5%', '1%'], ad_result.critical_values )) } return results
3. Maximum Likelihood Estimation
from scipy.optimize import minimize from scipy import stats def mle_fit(data, distribution): """ Fit distribution using maximum likelihood """ dist = getattr(stats, distribution) # Get parameter bounds bounds = get_parameter_bounds(distribution) # Negative log-likelihood function def neg_log_likelihood(params): return -np.sum(dist.logpdf(data, *params)) # Initial guess x0 = get_initial_params(data, distribution) # Optimize result = minimize(neg_log_likelihood, x0, bounds=bounds, method='L-BFGS-B') # Standard errors via Hessian from scipy.optimize import approx_fprime hessian = np.zeros((len(result.x), len(result.x))) epsilon = 1e-5 for i in range(len(result.x)): hessian[i] = approx_fprime(result.x, lambda p: approx_fprime(p, neg_log_likelihood, epsilon)[i], epsilon) se = np.sqrt(np.diag(np.linalg.inv(hessian))) return { "distribution": distribution, "parameters": result.x.tolist(), "standard_errors": se.tolist(), "log_likelihood": -result.fun, "aic": 2 * len(result.x) + 2 * result.fun, "bic": len(result.x) * np.log(len(data)) + 2 * result.fun }
4. Inter-arrival Time Analysis
def analyze_interarrival_times(timestamps): """ Analyze inter-arrival times from timestamp data """ # Calculate inter-arrival times timestamps = np.array(timestamps) interarrivals = np.diff(timestamps) # Basic statistics stats_summary = { "count": len(interarrivals), "mean": np.mean(interarrivals), "std": np.std(interarrivals), "cv": np.std(interarrivals) / np.mean(interarrivals), "min": np.min(interarrivals), "max": np.max(interarrivals), "median": np.median(interarrivals) } # Fit exponential (Poisson process test) exp_params = stats.expon.fit(interarrivals, floc=0) ks_stat, ks_pvalue = stats.kstest(interarrivals, 'expon', args=exp_params) is_poisson = ks_pvalue > 0.05 and 0.8 < stats_summary['cv'] < 1.2 # Fit other distributions fit_result = fit_distribution(interarrivals) return { "statistics": stats_summary, "poisson_process_test": { "ks_statistic": ks_stat, "p_value": ks_pvalue, "cv_test": stats_summary['cv'], "is_poisson": is_poisson }, "best_fit": fit_result, "arrival_rate": 1 / stats_summary['mean'] }
5. Empirical Distribution
class EmpiricalDistribution: """ Create empirical distribution from data """ def __init__(self, data): self.data = np.sort(data) self.n = len(data) self.ecdf = np.arange(1, self.n + 1) / self.n def cdf(self, x): """Cumulative distribution function""" return np.searchsorted(self.data, x, side='right') / self.n def ppf(self, q): """Percent point function (inverse CDF)""" idx = int(q * self.n) return self.data[min(idx, self.n - 1)] def sample(self, size=1): """Generate random samples""" u = np.random.uniform(0, 1, size) return np.array([self.ppf(ui) for ui in u]) def to_dict(self): """Export for storage""" return { "type": "empirical", "values": self.data.tolist(), "probabilities": self.ecdf.tolist() }
6. Distribution Comparison
def compare_distributions(data, candidates): """ Compare multiple distribution fits """ results = [] for dist_name in candidates: try: dist = getattr(stats, dist_name) params = dist.fit(data) # Log-likelihood ll = np.sum(dist.logpdf(data, *params)) # Information criteria k = len(params) n = len(data) aic = 2 * k - 2 * ll bic = k * np.log(n) - 2 * ll # KS test ks_stat, ks_pvalue = stats.kstest(data, dist_name, args=params) results.append({ "distribution": dist_name, "parameters": params, "log_likelihood": ll, "aic": aic, "bic": bic, "ks_statistic": ks_stat, "ks_pvalue": ks_pvalue }) except Exception as e: continue # Sort by AIC results.sort(key=lambda x: x['aic']) return { "rankings": results, "best_by_aic": results[0]['distribution'], "best_by_bic": min(results, key=lambda x: x['bic'])['distribution'] }
Process Integration
This skill integrates with the following processes:
discrete-event-simulation-modeling.jsqueuing-system-analysis.jsdemand-forecasting-model-development.js
Output Format
{ "data_summary": { "n": 500, "mean": 5.2, "std": 2.1, "cv": 0.40 }, "best_fit": { "distribution": "gamma", "parameters": {"shape": 6.1, "scale": 0.85}, "goodness_of_fit": { "ks_statistic": 0.032, "ks_pvalue": 0.67, "aic": 1523.4 } }, "alternative_fits": [ {"distribution": "lognorm", "aic": 1528.1}, {"distribution": "weibull", "aic": 1531.2} ], "recommendation": "Use gamma(6.1, 0.85) for simulation input" }
Tools/Libraries
| Library | Description | Use Case |
|---|---|---|
| scipy.stats | Statistical functions | Core fitting |
| fitter | Auto fitting | Quick analysis |
| statsmodels | Advanced stats | Detailed tests |
| R fitdistrplus | R package | Complex fitting |
Best Practices
- Visualize first - Always plot histograms and Q-Q plots
- Consider theory - Choose distributions based on process
- Test multiple - Compare several candidate distributions
- Check tails - Extreme values matter for simulation
- Document choice - Record rationale for selected distribution
- Update periodically - Re-fit as new data becomes available
Constraints
- Report goodness-of-fit statistics, not just parameters
- Document data collection methodology
- Consider censored or truncated data
- Test for time-varying parameters