Skillshub victoriametrics

VictoriaMetrics

install
source · Clone the upstream repo
git clone https://github.com/ComeOnOliver/skillshub
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/ComeOnOliver/skillshub "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/TerminalSkills/skills/victoriametrics" ~/.claude/skills/comeonoliver-skillshub-victoriametrics && rm -rf "$T"
manifest: skills/TerminalSkills/skills/victoriametrics/SKILL.md
source content

VictoriaMetrics

Overview

Deploy VictoriaMetrics as a fast, cost-effective Prometheus-compatible time-series database. Covers single-node and cluster deployment, vmagent configuration, MetricsQL querying, and integration with Grafana and Prometheus.

Instructions

Task A: Deploy Single-Node VictoriaMetrics

# docker-compose.yml — Single-node VictoriaMetrics with vmagent and vmalert
services:
  victoriametrics:
    image: victoriametrics/victoria-metrics:v1.96.0
    command:
      - '-storageDataPath=/victoria-metrics-data'
      - '-retentionPeriod=90d'
      - '-httpListenAddr=:8428'
      - '-search.maxUniqueTimeseries=1000000'
      - '-dedup.minScrapeInterval=15s'
    ports:
      - "8428:8428"
    volumes:
      - vm_data:/victoria-metrics-data

  vmagent:
    image: victoriametrics/vmagent:v1.96.0
    command:
      - '-promscrape.config=/etc/vmagent/scrape.yml'
      - '-remoteWrite.url=http://victoriametrics:8428/api/v1/write'
      - '-remoteWrite.tmpDataPath=/vmagent-remotewrite-data'
    volumes:
      - ./vmagent-scrape.yml:/etc/vmagent/scrape.yml:ro
      - vmagent_data:/vmagent-remotewrite-data

  vmalert:
    image: victoriametrics/vmalert:v1.96.0
    command:
      - '-datasource.url=http://victoriametrics:8428'
      - '-remoteRead.url=http://victoriametrics:8428'
      - '-remoteWrite.url=http://victoriametrics:8428'
      - '-notifier.url=http://alertmanager:9093'
      - '-rule=/etc/vmalert/rules/*.yml'
    volumes:
      - ./alert-rules:/etc/vmalert/rules:ro

volumes:
  vm_data:
  vmagent_data:

Task B: Configure vmagent Scraping

# vmagent-scrape.yml — Scrape configuration (Prometheus-compatible)
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          environment: production

  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: namespace
      - source_labels: [__meta_kubernetes_pod_name]
        target_label: pod

  - job_name: 'app-services'
    static_configs:
      - targets:
          - 'api-gateway:8080'
          - 'payment-service:8080'
          - 'order-service:8080'
    metrics_path: /metrics
    relabel_configs:
      - source_labels: [__address__]
        regex: '([^:]+):.*'
        target_label: service

Task C: MetricsQL Queries

# Request rate per service (MetricsQL extends PromQL)
sum(rate(http_requests_total[5m])) by (service)
# P99 latency with MetricsQL's rollup functions
quantile_over_time(0.99, http_request_duration_seconds[5m]) by (service)
# Top 5 services by error rate using MetricsQL's topk_avg
topk_avg(5, sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
  / sum(rate(http_requests_total[5m])) by (service))
# MetricsQL: running total over 24h (not available in standard PromQL)
running_sum(increase(http_requests_total{service="api-gateway"}[1h]))
# MetricsQL: range_median for smoother visualization
range_median(cpu_usage_percent{instance=~"web-.*"})
# Query via API
curl -s "http://localhost:8428/api/v1/query_range" \
  --data-urlencode 'query=sum(rate(http_requests_total[5m])) by (service)' \
  --data-urlencode 'start=-1h' \
  --data-urlencode 'step=60s' | jq '.data.result[] | {service: .metric.service, values: (.values | length)}'

Task D: Alert Rules for vmalert

# alert-rules/service-alerts.yml — Alert rules for vmalert
groups:
  - name: service-health
    interval: 30s
    rules:
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
          / sum(rate(http_requests_total[5m])) by (service) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Error rate above 5% on {{ $labels.service }}"

      - alert: HighMemoryUsage
        expr: |
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes > 0.9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Memory usage above 90% on {{ $labels.instance }}"

      # Recording rule for dashboard performance
      - record: service:request_rate:5m
        expr: sum(rate(http_requests_total[5m])) by (service)

Task E: Cluster Mode Deployment

# docker-compose-cluster.yml — VictoriaMetrics cluster for horizontal scaling
services:
  vmstorage-1:
    image: victoriametrics/vmstorage:v1.96.0-cluster
    command:
      - '-storageDataPath=/storage'
      - '-retentionPeriod=90d'
    volumes:
      - vmstorage1:/storage

  vmstorage-2:
    image: victoriametrics/vmstorage:v1.96.0-cluster
    command:
      - '-storageDataPath=/storage'
      - '-retentionPeriod=90d'
    volumes:
      - vmstorage2:/storage

  vminsert:
    image: victoriametrics/vminsert:v1.96.0-cluster
    command:
      - '-storageNode=vmstorage-1:8400'
      - '-storageNode=vmstorage-2:8400'
      - '-replicationFactor=2'
    ports:
      - "8480:8480"

  vmselect:
    image: victoriametrics/vmselect:v1.96.0-cluster
    command:
      - '-storageNode=vmstorage-1:8401'
      - '-storageNode=vmstorage-2:8401'
      - '-dedup.minScrapeInterval=15s'
    ports:
      - "8481:8481"

volumes:
  vmstorage1:
  vmstorage2:

Best Practices

  • Use vmagent instead of Prometheus for scraping — it handles remote write more efficiently
  • Set
    -dedup.minScrapeInterval
    equal to your scrape interval to handle HA duplicates
  • Use MetricsQL extensions (topk_avg, range_median, running_sum) for cleaner queries
  • Set
    -retentionPeriod
    based on actual needs — VM is storage-efficient but plan capacity
  • Use recording rules for dashboard queries to reduce query load
  • In cluster mode, set
    -replicationFactor=2
    for data durability across storage nodes