Skills · onnx
install
source · Clone the upstream repo
git clone https://github.com/TerminalSkills/skills
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/TerminalSkills/skills "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/onnx" ~/.claude/skills/terminalskills-skills-onnx && rm -rf "$T"
manifest:
skills/onnx/SKILL.md
safety · automated scan (medium risk)
This is a pattern-based risk scan, not a security review. Our crawler flagged:
- pip install
- eval/exec/Function constructor
Always read a skill's source content before installing. Patterns alone don't mean the skill is malicious — but they warrant attention.
source content
ONNX
Installation
```bash
# Install ONNX and ONNX Runtime
pip install onnx onnxruntime

# For GPU inference
pip install onnxruntime-gpu

# For model optimization
pip install onnxoptimizer onnxsim
```
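A quick import check confirms the packages are installed and shows which execution providers this onnxruntime build exposes; a minimal sketch (the filename is just illustrative):

```python
# check_install.py — sanity-check that onnx and onnxruntime import correctly
import onnx
import onnxruntime as ort

print(f"onnx version: {onnx.__version__}")
print(f"onnxruntime version: {ort.__version__}")

# Lists execution providers available in this build (e.g. CPU, CUDA)
print(f"Available providers: {ort.get_available_providers()}")
```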
Export PyTorch Model to ONNX
```python
# export_pytorch.py — Convert a PyTorch model to ONNX format
import torch
import torch.nn as nn

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 3)

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

model = SimpleModel()
model.eval()

dummy_input = torch.randn(1, 10)
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    export_params=True,
    opset_version=17,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input": {0: "batch_size"},
        "output": {0: "batch_size"},
    },
)
print("Exported model.onnx")
```
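After exporting, it is worth confirming that the ONNX graph reproduces the PyTorch outputs. The sketch below assumes it is appended to export_pytorch.py, so that model and dummy_input are still in scope; the tolerances are illustrative, not prescribed by the skill.

```python
# Appended to export_pytorch.py — verify the exported graph matches PyTorch numerically
import numpy as np
import onnxruntime as ort

with torch.no_grad():
    torch_out = model(dummy_input).numpy()

session = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
onnx_out = session.run(None, {"input": dummy_input.numpy()})[0]

# The two runtimes should agree to within floating-point tolerance
np.testing.assert_allclose(torch_out, onnx_out, rtol=1e-4, atol=1e-5)
print("PyTorch and ONNX Runtime outputs match")
```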
Export Hugging Face Transformers
```python
# export_transformers.py — Export a Hugging Face model to ONNX using optimum
# pip install optimum[onnxruntime]
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Export and load in one step
model = ORTModelForSequenceClassification.from_pretrained(model_name, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Save the ONNX model
model.save_pretrained("./onnx_model")
tokenizer.save_pretrained("./onnx_model")

# Run inference
inputs = tokenizer("This movie was fantastic!", return_tensors="pt")
outputs = model(**inputs)
print(f"Logits: {outputs.logits}")
```
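The saved directory can be loaded back later without re-exporting. A small sketch, assuming ./onnx_model was written by the script above; the softmax/label step is an illustration, not part of the export itself:

```python
# reload_onnx_model.py — Load the exported model back from disk and get a label
import torch
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

model = ORTModelForSequenceClassification.from_pretrained("./onnx_model")
tokenizer = AutoTokenizer.from_pretrained("./onnx_model")

inputs = tokenizer("This movie was fantastic!", return_tensors="pt")
outputs = model(**inputs)

# Convert logits to a predicted label via the config's id2label mapping
probs = torch.softmax(outputs.logits, dim=-1)
label_id = int(probs.argmax(dim=-1))
print(model.config.id2label[label_id], float(probs.max()))
```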
ONNX Runtime Inference
```python
# inference.py — Run inference with ONNX Runtime for optimized performance
import onnxruntime as ort
import numpy as np

# Create session with optimization
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session_options.intra_op_num_threads = 4

# Use CPU or GPU provider
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
session = ort.InferenceSession("model.onnx", session_options, providers=providers)

# Get input/output details
print(f"Inputs: {[i.name for i in session.get_inputs()]}")
print(f"Outputs: {[o.name for o in session.get_outputs()]}")

# Run inference
input_data = np.random.randn(1, 10).astype(np.float32)
results = session.run(None, {"input": input_data})
print(f"Output shape: {results[0].shape}")
print(f"Predictions: {results[0]}")
```
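If a requested provider (e.g. CUDA) is unavailable in the installed build, ONNX Runtime typically warns and falls back to the next provider in the list rather than failing. A short continuation of inference.py makes the active provider visible; the device_id value is just an example of passing per-provider settings as a (name, options) tuple:

```python
# Continuing from inference.py — confirm which execution provider is actually in use
print(f"Active providers: {session.get_providers()}")

# Provider-specific options can be passed as (name, options) tuples
providers = [
    ("CUDAExecutionProvider", {"device_id": 0}),  # device_id shown for illustration
    "CPUExecutionProvider",
]
gpu_session = ort.InferenceSession("model.onnx", session_options, providers=providers)
print(f"gpu_session providers: {gpu_session.get_providers()}")
```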
Batch Inference
```python
# batch_inference.py — Efficient batch processing with ONNX Runtime
import onnxruntime as ort
import numpy as np
import time

session = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])

# Batch of 1000 samples
batch_data = np.random.randn(1000, 10).astype(np.float32)

start = time.time()
results = session.run(None, {"input": batch_data})
elapsed = time.time() - start

print(f"Processed 1000 samples in {elapsed:.3f}s ({1000/elapsed:.0f} samples/sec)")
```
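For inputs too large for a single call, the same session can be reused over fixed-size chunks (possible here because the export declared a dynamic batch axis). A minimal sketch; the chunk size is arbitrary:

```python
# Continuing from batch_inference.py — process a larger array in fixed-size chunks
def run_in_chunks(session, data, chunk_size=256):
    outputs = []
    for start in range(0, len(data), chunk_size):
        chunk = data[start:start + chunk_size]
        outputs.append(session.run(None, {"input": chunk})[0])
    return np.concatenate(outputs, axis=0)

all_data = np.random.randn(10_000, 10).astype(np.float32)
predictions = run_in_chunks(session, all_data)
print(f"Predictions shape: {predictions.shape}")
```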
Model Optimization
```python
# optimize.py — Optimize an ONNX model for faster inference
import onnx
import onnxsim  # pip install onnxsim

# Basic optimization with the ONNX simplifier
model = onnx.load("model.onnx")
optimized, check = onnxsim.simplify(model)
onnx.save(optimized, "model_simplified.onnx")
print(f"Simplified: {check}")
```
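For transformer graphs, onnxruntime.transformers offers fusion passes (attention, layer normalization, GELU) beyond generic simplification. A sketch assuming the DistilBERT export from earlier; the path and the num_heads/hidden_size values (12 and 768 for distilbert-base) are assumptions to adjust for your model:

```python
# optimize_transformer.py — Fuse transformer subgraphs with onnxruntime.transformers
from onnxruntime.transformers import optimizer

opt_model = optimizer.optimize_model(
    "./onnx_model/model.onnx",   # assumed output path from export_transformers.py
    model_type="bert",           # DistilBERT reuses the BERT optimization passes
    num_heads=12,
    hidden_size=768,
)
opt_model.save_model_to_file("model_transformer_opt.onnx")
print("Saved model_transformer_opt.onnx")
```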
Quantization
```python
# quantize.py — Reduce model size and speed up inference with quantization
import os

from onnxruntime.quantization import quantize_dynamic, QuantType

quantize_dynamic(
    model_input="model.onnx",
    model_output="model_quantized.onnx",
    weight_type=QuantType.QInt8,
)

original = os.path.getsize("model.onnx")
quantized = os.path.getsize("model_quantized.onnx")
print(f"Original: {original/1024:.1f} KB")
print(f"Quantized: {quantized/1024:.1f} KB ({quantized/original*100:.1f}%)")
```
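Size is only half the story; a quick numerical spot-check against the original model helps confirm the accuracy impact is acceptable. A minimal sketch reusing model.onnx from the PyTorch export above; the random input and batch size are arbitrary:

```python
# compare_quantized.py — Spot-check quantized vs. original outputs
import numpy as np
import onnxruntime as ort

x = np.random.randn(8, 10).astype(np.float32)

original = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
quantized = ort.InferenceSession("model_quantized.onnx", providers=["CPUExecutionProvider"])

out_fp32 = original.run(None, {"input": x})[0]
out_int8 = quantized.run(None, {"input": x})[0]

# Quantization introduces small numerical differences; report the worst case
print(f"Max abs difference: {np.abs(out_fp32 - out_int8).max():.5f}")
```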
Validate ONNX Model
```python
# validate.py — Check model validity and inspect structure
import onnx

model = onnx.load("model.onnx")
onnx.checker.check_model(model)
print("Model is valid!")

# Print model info
print(f"IR version: {model.ir_version}")
print(f"Opset: {model.opset_import[0].version}")
print(f"Graph inputs: {[i.name for i in model.graph.input]}")
print(f"Graph outputs: {[o.name for o in model.graph.output]}")
print(f"Nodes: {len(model.graph.node)}")
```
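To see what the graph actually contains, the node list can be summarized by operator type. A small continuation of validate.py:

```python
# Continuing from validate.py — summarize the graph by operator type
from collections import Counter

op_counts = Counter(node.op_type for node in model.graph.node)
for op, count in op_counts.most_common():
    print(f"{op}: {count}")

# For a full human-readable dump of the graph:
# print(onnx.helper.printable_graph(model.graph))
```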
Edge Deployment (ONNX Runtime Mobile)
```python
# mobile_export.py — Prepare a model for mobile/edge deployment
from onnxruntime.tools import ort_format_model

# Convert to ORT format for mobile
ort_format_model.convert_onnx_models_to_ort(
    "model.onnx",
    output_dir="./mobile_model",
    optimization_level="all",
)
# Use the .ort file with ONNX Runtime Mobile SDK on iOS/Android
```
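The converted model can also be loaded with the regular Python API, which is a convenient smoke test before wiring up the mobile SDK. A sketch; the .ort filename is an assumption, so check what the converter actually wrote into ./mobile_model:

```python
# Continuing from mobile_export.py — smoke-test the converted .ort model on desktop
import numpy as np
import onnxruntime as ort

# Path is assumed; adjust to the file the converter produced
session = ort.InferenceSession("./mobile_model/model.ort", providers=["CPUExecutionProvider"])
out = session.run(None, {"input": np.random.randn(1, 10).astype(np.float32)})
print(f"Output shape: {out[0].shape}")
```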
Key Concepts
- ONNX format: Framework-agnostic model representation — export from PyTorch/TF, run anywhere
- ONNX Runtime: High-performance inference engine with CPU, GPU, TensorRT, and DirectML support
- Dynamic axes: Allow variable batch sizes and sequence lengths in exported models
- Quantization: INT8 quantization reduces model size 2-4x with minimal accuracy loss
- Execution providers: Plug in hardware-specific backends (CUDA, TensorRT, OpenVINO, CoreML); a provider-selection sketch follows this list
- Opset versions: Higher opset = more supported operations; use opset 17+ for modern models
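As a concrete example of provider selection, the snippet below builds the provider list from whatever this onnxruntime build actually exposes; the preference order is just one reasonable choice:

```python
# providers_check.py — Pick the best available execution provider at runtime
import onnxruntime as ort

available = ort.get_available_providers()
print(f"Available: {available}")

# Prefer TensorRT, then CUDA, then fall back to CPU; adjust to your hardware
preference = ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]
providers = [p for p in preference if p in available]

session = ort.InferenceSession("model.onnx", providers=providers)
print(f"Using: {session.get_providers()}")
```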