git clone https://github.com/vibeforge1111/vibeship-spawner-skills
ai/reinforcement-learning/skill.yaml

id: reinforcement-learning
name: Reinforcement Learning
category: ai
description: Use when implementing RL algorithms, training agents with rewards, or aligning LLMs with human feedback - covers policy gradients, PPO, Q-learning, RLHF, and GRPO
patterns:
  golden_rules:
    - rule: "Reward shaping is critical"
      reason: "Sparse rewards make learning nearly impossible (see the shaping sketch below)"
    - rule: "Start simple, scale up"
      reason: "Debug on toy environments before complex ones"
    - rule: "Monitor training metrics obsessively"
      reason: "RL training is notoriously unstable"
    - rule: "Use appropriate baselines"
      reason: "Reduces variance in policy gradients"
    - rule: "Clip/constrain policy updates"
      reason: "Prevents catastrophic policy collapse"
    - rule: "Separate exploration from exploitation"
      reason: "Ensures sufficient state-space coverage"
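To make the first rule concrete, here is a minimal sketch of potential-based reward shaping; the `State` type and `potential` function are hypothetical stand-ins for whatever goal heuristic the task provides, and the policy-invariance guarantee is from Ng et al. (1999):

```python
from dataclasses import dataclass

GAMMA = 0.99  # should match the agent's discount factor

@dataclass
class State:
    position: float  # hypothetical 1-D navigation state
    goal: float

def potential(s: State) -> float:
    # Hypothetical potential: negative distance to the goal.
    return -abs(s.position - s.goal)

def shaped_reward(env_reward: float, s: State, s_next: State) -> float:
    # Potential-based shaping F(s, s') = gamma * phi(s') - phi(s) adds a
    # dense signal while provably preserving the optimal policy
    # (Ng et al., 1999).
    return env_reward + GAMMA * potential(s_next) - potential(s)
```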
algorithm_taxonomy:
  value_based:
    algorithms: ["Q-Learning", "DQN", "Double DQN", "Dueling DQN"]
    learns: "Q(s,a) - value of state-action pairs"
    best_for: ["Discrete actions", "Atari games"]
  policy_based:
    algorithms: ["REINFORCE", "Policy Gradient"]
    learns: "pi(a|s) - policy directly"
    best_for: ["Continuous actions", "Robotics"]
  actor_critic:
    algorithms: ["A2C/A3C", "PPO", "SAC", "TRPO"]
    learns: "Both V(s) and pi(a|s)"
    best_for: ["Most tasks", "LLM alignment"]
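For the value-based row of the taxonomy, a minimal tabular Q-learning sketch (hyperparameters are illustrative defaults, not tuned values):

```python
import numpy as np

ALPHA, GAMMA, EPSILON = 0.1, 0.99, 0.1  # illustrative, not tuned

def epsilon_greedy(Q: np.ndarray, s: int, rng: np.random.Generator) -> int:
    # Separate exploration (random action) from exploitation (greedy).
    if rng.random() < EPSILON:
        return int(rng.integers(Q.shape[1]))
    return int(Q[s].argmax())

def q_update(Q: np.ndarray, s: int, a: int, r: float, s_next: int, done: bool) -> None:
    # Off-policy TD target: bootstrap from the greedy next-state value.
    target = r if done else r + GAMMA * Q[s_next].max()
    Q[s, a] += ALPHA * (target - Q[s, a])
```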
on_vs_off_policy:
  on_policy:
    algorithms: ["PPO", "A2C"]
    property: "Learn from samples drawn from the current policy"
    pros: "More stable"
    cons: "Requires fresh data after every update"
  off_policy:
    algorithms: ["DQN", "SAC"]
    property: "Learn from samples drawn from any policy"
    pros: "More sample efficient"
    cons: "Requires a replay buffer (sketched below)"
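Off-policy methods reuse old transitions through the replay buffer just mentioned; a minimal sketch, with capacity and batch size as illustrative choices:

```python
import random
from collections import deque

class ReplayBuffer:
    """Minimal experience replay for off-policy methods (DQN, SAC)."""

    def __init__(self, capacity: int = 100_000):
        self.buffer = deque(maxlen=capacity)  # oldest transitions evicted FIFO

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size: int = 64):
        # Uniform sampling breaks temporal correlation between updates.
        batch = random.sample(self.buffer, batch_size)
        return tuple(zip(*batch))  # (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.buffer)
```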
discount_factor:
  short_horizon: 0.9
  medium_horizon: 0.95
  long_horizon: 0.99
  infinite_horizon: 0.999
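These values matter because gamma compounds per step: gamma = 0.99 gives an effective horizon of roughly 1/(1-gamma) = 100 steps. A minimal reward-to-go computation makes the effect visible:

```python
def discounted_returns(rewards: list[float], gamma: float = 0.99) -> list[float]:
    # G_t = r_t + gamma * G_{t+1}, computed backwards in one pass.
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    return returns[::-1]

# A reward 50 steps away is scaled by ~0.005 at gamma = 0.9,
# but by ~0.6 at gamma = 0.99.
```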
ppo_config:
  clip_epsilon: "0.1-0.3 (typically 0.2)"
  entropy_coef: "0.01 (encourages exploration)"
  value_coef: "0.5"
  max_grad_norm: "0.5"
  n_epochs: "3-10 per batch"
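A sketch of how these hyperparameters enter the PPO objective, in PyTorch; it assumes advantages, returns, and old log-probs were computed during rollout collection:

```python
import torch

def ppo_loss(new_logp, old_logp, advantages, values, returns, entropy,
             clip_eps=0.2, value_coef=0.5, entropy_coef=0.01):
    # Probability ratio pi_new(a|s) / pi_old(a|s), via log-prob difference.
    ratio = torch.exp(new_logp - old_logp)
    # Clipped surrogate: constrains the policy update (golden rule above).
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
    policy_loss = -torch.min(unclipped, clipped).mean()
    # Value regression plus entropy bonus for exploration.
    value_loss = (values - returns).pow(2).mean()
    return policy_loss + value_coef * value_loss - entropy_coef * entropy.mean()
```

max_grad_norm applies after the backward pass, via gradient clipping (see the helpers after the implementation checklist below).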
rlhf_pipeline:
  step1_sft:
    description: "Supervised Fine-Tuning"
    purpose: "Establish baseline helpful behavior"
  step2_reward_model:
    description: "Train on human preference comparisons"
    output: "Reward(prompt, response) = scalar"
    loss: "Bradley-Terry: -log(sigmoid(r_chosen - r_rejected))"
  step3_ppo:
    description: "Optimize policy with KL penalty"
    formula: "reward = r(x,y) - beta * KL(pi || pi_ref)"
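A sketch of the step-2 loss and the step-3 reward. It assumes the reward model emits scalar scores and that per-token log-probs come from the policy and a frozen reference (SFT) model; beta = 0.1 is an illustrative value:

```python
import torch
import torch.nn.functional as F

def bradley_terry_loss(r_chosen: torch.Tensor, r_rejected: torch.Tensor) -> torch.Tensor:
    # -log sigmoid(r_chosen - r_rejected): pushes the reward model to
    # score the human-preferred response higher.
    return -F.logsigmoid(r_chosen - r_rejected).mean()

def rlhf_reward(r: torch.Tensor, logp_policy: torch.Tensor,
                logp_ref: torch.Tensor, beta: float = 0.1) -> torch.Tensor:
    # Per-token estimate logp_policy - logp_ref penalizes drift from the
    # reference model: reward = r(x, y) - beta * KL(pi || pi_ref).
    kl = logp_policy - logp_ref
    return r - beta * kl
```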
anti_patterns:
- pattern: "Sparse rewards" problem: "Agent learns nothing" solution: "Reward shaping, dense rewards"
- pattern: "No baseline/advantage" problem: "High variance gradients" solution: "Use GAE, value baseline"
- pattern: "Large policy updates" problem: "Training collapse" solution: "PPO clipping, KL penalty"
- pattern: "No replay buffer (off-policy)" problem: "Sample inefficiency" solution: "Experience replay"
- pattern: "Same network for Q and target" problem: "Unstable learning" solution: "Separate target network"
- pattern: "Ignoring KL in RLHF" problem: "Model drift, reward hacking" solution: "KL penalty to reference model"
implementation_checklist:
  general_rl:
    - "Environment returns normalized observations"
    - "Reward function is dense (or shaped appropriately)"
    - "Discount factor gamma matches the task horizon"
    - "Logging training metrics (reward, loss, entropy)"
  ppo_specific:
    - "Advantage normalization (see helpers below)"
    - "Clip epsilon typically 0.1-0.3"
    - "Multiple epochs per batch (3-10)"
    - "Entropy bonus for exploration"
    - "Gradient clipping"
  rlhf_specific:
    - "Good SFT model as starting point"
    - "High-quality human preference data"
    - "KL penalty to prevent drift"
    - "Monitor for reward hacking"
    - "Evaluate on held-out prompts"
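Two checklist items not covered by earlier sketches, advantage normalization and gradient clipping, as minimal PyTorch helpers (the 0.5 default mirrors max_grad_norm above):

```python
import torch

def normalize_advantages(adv: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    # Zero-mean, unit-variance advantages stabilize the policy gradient.
    return (adv - adv.mean()) / (adv.std() + eps)

def clipped_step(optimizer, model, loss, max_grad_norm: float = 0.5):
    # Backprop, clip the global gradient norm, then step.
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()
```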
handoffs:
  - skill: transformer-architecture
    trigger: "building the policy network"
  - skill: llm-fine-tuning
    trigger: "SFT phase of RLHF"
  - skill: distributed-training
    trigger: "scaling RL training"
ecosystem:
  frameworks:
    - "OpenRLHF - RLHF framework"
    - "TRL - Transformer Reinforcement Learning"
    - "Stable Baselines3 - Classic RL algorithms"
    - "CleanRL - Single-file implementations"
  environments:
    - "Gymnasium (formerly OpenAI Gym)"
    - "PettingZoo - Multi-agent"
    - "Brax - GPU-accelerated physics"
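A minimal Gymnasium rollout loop using the current five-value step API; CartPole-v1 stands in for the "start simple" toy environment, and the random policy is a placeholder for a trained agent:

```python
import gymnasium as gym

env = gym.make("CartPole-v1")
obs, info = env.reset(seed=0)
episode_return = 0.0
for _ in range(500):
    action = env.action_space.sample()  # random-policy placeholder
    obs, reward, terminated, truncated, info = env.step(action)
    episode_return += reward
    if terminated or truncated:  # episode ends either way
        print(f"episode return: {episode_return}")
        obs, info = env.reset()
        episode_return = 0.0
env.close()
```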
sources:
  papers:
    - "Schulman et al. (2017). Proximal Policy Optimization Algorithms"
    - "Shao et al. (2024). DeepSeekMath - introduces GRPO (Group Relative Policy Optimization)"
  tutorials:
    - "CMU RLHF 101 Tutorial"
    - "Hugging Face RLHF Blog"
    - "PPO for LLM Alignment - Cameron Wolfe"