Claude-skill-registry fine-tuning
LLM fine-tuning with LoRA, QLoRA, and instruction tuning for domain adaptation.
Install

Source · Clone the upstream repo:

```bash
git clone https://github.com/majiayu000/claude-skill-registry
```

Claude Code · Install into ~/.claude/skills/:

```bash
T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/fine-tuning" ~/.claude/skills/majiayu000-claude-skill-registry-fine-tuning && rm -rf "$T"
```

Manifest: skills/data/fine-tuning/SKILL.md
Fine-Tuning
Adapt LLMs to specific tasks and domains efficiently.
Quick Start
LoRA Fine-Tuning with PEFT
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from trl import SFTTrainer

# Load base model
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load an instruction dataset with a "text" column (placeholder path;
# see Dataset Preparation below for building one)
dataset = load_dataset("json", data_files="train.json", split="train")

# Configure LoRA
lora_config = LoraConfig(
    r=16,              # Rank
    lora_alpha=32,     # Alpha scaling
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06%

# Training arguments
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    logging_steps=10,
    save_strategy="epoch"
)

# Train
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=512
)
trainer.train()
```
QLoRA (4-bit Quantized LoRA)
```python
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import torch

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

# Load quantized model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare the quantized model for training (casts layer norms, enables input grads)
model = prepare_model_for_kbit_training(model)

# Apply LoRA on top of quantized model
model = get_peft_model(model, lora_config)
```
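QLoRA runs are usually paired with a paged 8-bit optimizer and small per-device batches so optimizer state does not undo the 4-bit savings. A minimal sketch of matching TrainingArguments (values are illustrative, not from the original skill):

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./qlora-output",
    num_train_epochs=3,
    per_device_train_batch_size=2,      # small batch to fit consumer GPUs...
    gradient_accumulation_steps=8,      # ...effective batch size of 16
    learning_rate=2e-4,
    bf16=True,
    optim="paged_adamw_8bit",           # paged 8-bit AdamW from bitsandbytes
    logging_steps=10,
    save_strategy="epoch"
)
```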
Dataset Preparation
Instruction Dataset Format
```python
# Alpaca format
instruction_format = {
    "instruction": "Summarize the following text.",
    "input": "The quick brown fox jumps over the lazy dog...",
    "output": "A fox jumps over a dog."
}

# ChatML format
chat_format = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize this text: ..."},
    {"role": "assistant", "content": "Summary: ..."}
]

# Formatting function
def format_instruction(sample):
    return f"""### Instruction:
{sample['instruction']}

### Input:
{sample['input']}

### Response:
{sample['output']}"""
```
Data Preparation Pipeline
```python
from datasets import Dataset
import json

class DatasetPreparer:
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def prepare(self, data_path: str) -> Dataset:
        # Load raw data
        with open(data_path) as f:
            raw_data = json.load(f)

        # Format samples
        formatted = [self._format_sample(s) for s in raw_data]

        # Create dataset
        dataset = Dataset.from_dict({"text": formatted})

        # Tokenize
        return dataset.map(
            self._tokenize,
            batched=True,
            remove_columns=["text"]
        )

    def _format_sample(self, sample):
        return f"""<s>[INST] {sample['instruction']}
{sample['input']} [/INST] {sample['output']}</s>"""

    def _tokenize(self, examples):
        return self.tokenizer(
            examples["text"],
            truncation=True,
            max_length=self.max_length,
            padding="max_length"
        )
```
Fine-Tuning Methods Comparison
| Method | VRAM | Speed | Quality | Use Case |
|---|---|---|---|---|
| Full Fine-Tune | 60GB+ | Slow | Best | Unlimited resources |
| LoRA | 16GB | Fast | Very Good | Most applications |
| QLoRA | 8GB | Medium | Good | Consumer GPUs |
| Prefix Tuning | 8GB | Fast | Good | Fixed tasks |
| Prompt Tuning | 4GB | Very Fast | Moderate | Simple adaptation |
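The prefix- and prompt-tuning rows in the table can also be driven through PEFT. A hedged sketch (not part of the original skill), reusing `model` and `model_name` from the Quick Start; parameter values are illustrative:

```python
from peft import (PromptTuningConfig, PromptTuningInit, PrefixTuningConfig,
                  TaskType, get_peft_model)

# Prompt tuning: learn a handful of virtual tokens prepended to every input
prompt_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=20,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Summarize the following text:",
    tokenizer_name_or_path=model_name
)

# Prefix tuning: learn key/value prefixes injected into every attention layer
prefix_config = PrefixTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=30
)

model = get_peft_model(model, prompt_config)  # or prefix_config
model.print_trainable_parameters()
```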
LoRA Hyperparameters
```yaml
rank (r):
  small (4-8): Simple tasks, less capacity
  medium (16-32): General fine-tuning
  large (64-128): Complex domain adaptation

alpha:
  rule: Usually 2x rank
  effect: Higher = more influence from LoRA weights

target_modules:
  attention: [q_proj, k_proj, v_proj, o_proj]
  mlp: [gate_proj, up_proj, down_proj]
  all: Maximum adaptation, more VRAM

dropout:
  typical: 0.05-0.1
  effect: Regularization, prevents overfitting
```
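Concretely, these knobs map onto `LoraConfig`. A sketch of two illustrative presets (values are examples, not tuned recommendations from the original skill):

```python
from peft import LoraConfig, TaskType

# General-purpose preset: attention-only adapters, rank 16, alpha = 2 * r
lora_general = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM
)

# Heavier domain-adaptation preset: attention + MLP modules, rank 64
lora_domain = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM
)
```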
Training Best Practices
Learning Rate Schedule
```python
import torch
from transformers import get_cosine_schedule_with_warmup

# total_steps = number of optimizer updates over the whole run
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_steps
)
```
Gradient Checkpointing
```python
# Save memory by recomputing activations
model.gradient_checkpointing_enable()

# Also enable for LoRA
model.enable_input_require_grads()
```
Evaluation During Training
```python
import torch

def compute_metrics(eval_pred):
    # The Trainer passes numpy arrays of logits and label ids
    predictions, labels = eval_pred
    logits = torch.tensor(predictions)
    labels = torch.tensor(labels)

    # Shift for causal LM: position t predicts token t+1
    logits = logits[:, :-1, :]
    labels = labels[:, 1:]

    # Calculate perplexity from mean cross-entropy (ignore padding labels of -100)
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction="mean")
    loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
    perplexity = torch.exp(loss)
    return {"perplexity": perplexity.item()}
```
Merging and Deploying
Merge LoRA Weights
```python
# After training, merge LoRA into base model
merged_model = model.merge_and_unload()

# Save merged model
merged_model.save_pretrained("./merged_model")
tokenizer.save_pretrained("./merged_model")
```
Multiple LoRA Adapters
```python
from peft import PeftModel

# Load base model
base_model = AutoModelForCausalLM.from_pretrained("base_model")

# Load and switch between adapters
model = PeftModel.from_pretrained(base_model, "adapter_1")
model.load_adapter("adapter_2", adapter_name="code")

# Switch adapters at runtime
model.set_adapter("code")     # Use code adapter
model.set_adapter("default")  # Use default adapter
```
Common Issues
| Issue | Cause | Solution |
|---|---|---|
| Loss not decreasing | LR too low/high | Adjust learning rate |
| OOM errors | Batch too large | Reduce batch, use gradient accumulation |
| Overfitting | Too many epochs | Early stopping, more data |
| Catastrophic forgetting | Too aggressive LR | Lower LR, shorter training |
| Poor quality | Data issues | Clean and validate dataset |
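Two of the table's fixes (gradient accumulation for OOM, early stopping for overfitting) are a few lines in the trainer setup. A hedged sketch, assuming the model, tokenizer, and dataset from the Quick Start plus an `eval_dataset` split you have prepared:

```python
from transformers import TrainingArguments, EarlyStoppingCallback
from trl import SFTTrainer

training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=1,   # smaller batch to avoid OOM...
    gradient_accumulation_steps=16,  # ...same effective batch size of 16
    num_train_epochs=5,
    learning_rate=2e-4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=512,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
trainer.train()
```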
Error Handling & Recovery
```python
from transformers import TrainerCallback

class CheckpointCallback(TrainerCallback):
    # Keeping only the last 3 checkpoints is handled by
    # TrainingArguments(save_total_limit=3) rather than a callback.

    def on_epoch_end(self, args, state, control, **kwargs):
        if state.best_metric is None:
            # No best metric tracked yet: force a checkpoint each epoch
            control.should_save = True
        return control
```
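For actual recovery after a crash or OOM, the Trainer can resume from a saved checkpoint. A minimal sketch, assuming the trainer from the Quick Start and checkpoints written under ./output (the checkpoint name is illustrative):

```python
# Resume from the most recent checkpoint in output_dir
trainer.train(resume_from_checkpoint=True)

# Or resume from a specific checkpoint directory
trainer.train(resume_from_checkpoint="./output/checkpoint-500")
```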
Troubleshooting
| Symptom | Cause | Solution |
|---|---|---|
| NaN loss | LR too high | Lower to 1e-5 |
| No improvement | LR too low | Increase 10x |
| OOM mid-training | Batch too large | Enable gradient checkpointing |
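For the NaN-loss case, it can help to stop a run automatically instead of burning GPU hours. A hedged sketch of a custom callback (not part of the original skill) using the Trainer's logging hook:

```python
import math
from transformers import TrainerCallback

class NanLossGuard(TrainerCallback):
    """Stop training as soon as a logged loss becomes NaN or inf."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        loss = (logs or {}).get("loss")
        if loss is not None and not math.isfinite(loss):
            print(f"Non-finite loss at step {state.global_step}; stopping.")
            control.should_training_stop = True
        return control

# Usage: pass callbacks=[NanLossGuard()] when constructing the trainer
```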
Unit Test Template
```python
def test_lora_config():
    config = LoraConfig(r=16, lora_alpha=32, task_type=TaskType.CAUSAL_LM)
    model = get_peft_model(base_model, config)

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())

    # LoRA should train well under 1% of the parameters
    assert trainable / total < 0.01
```