Awesome-omni-skill grafana-prometheus
Observability and monitoring with Prometheus metrics and Grafana dashboards
install
source · Clone the upstream repo
git clone https://github.com/diegosouzapw/awesome-omni-skill
Claude Code · Install into ~/.claude/skills/
# Install the skill into ~/.claude/skills/ from a throwaway shallow clone.
# Everything runs in a subshell so that `set -e` and the EXIT trap stay local:
# the trap removes the temporary clone even when the clone or copy step fails
# (a plain `a && b && rm -rf "$T"` chain leaves the directory behind on error).
(
  set -e
  T=$(mktemp -d)
  trap 'rm -rf -- "$T"' EXIT
  git clone --depth=1 https://github.com/diegosouzapw/awesome-omni-skill "$T"
  mkdir -p ~/.claude/skills
  cp -r "$T/skills/devops/grafana-prometheus" ~/.claude/skills/diegosouzapw-awesome-omni-skill-grafana-prometheus
)
manifest:
skills/devops/grafana-prometheus/SKILL.md
source content
Grafana + Prometheus Monitoring
Complete observability stack with Prometheus metrics collection, PromQL queries, and Grafana visualization dashboards.
Direct Control (CLI / API / Scripting)
Prometheus Configuration
# prometheus.yml -- server-wide settings, alert routing target, rule files
# and the list of scrape jobs.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  # Labels attached to everything this server sends to external systems
  external_labels:
    cluster: 'production'
    environment: 'prod'

# Alertmanager instance(s) that receive firing alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Alerting and recording rules, loaded by glob
rule_files:
  - "alerts/*.yml"
  - "recording_rules/*.yml"

scrape_configs:
  # Prometheus scraping its own metrics
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'localhost:9090'

  # Host-level metrics from node_exporter
  - job_name: 'node'
    static_configs:
      - targets: ['node1:9100', 'node2:9100', 'node3:9100']
        labels:
          env: 'production'

  # Pods discovered through the Kubernetes API, driven by pod annotations
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Scrape only pods annotated prometheus.io/scrape: "true"
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Honour a custom path from the prometheus.io/path annotation
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # Swap the discovered port for the one in the prometheus.io/port annotation
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__

  # Application instances exposing /metrics
  - job_name: 'app'
    static_configs:
      - targets: ['app1:8080', 'app2:8080']
    metrics_path: '/metrics'
    scrape_interval: 10s

  # Endpoint probing through the blackbox exporter
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://example.com
          - https://api.example.com/health
    relabel_configs:
      # The listed URL becomes the ?target= probe parameter ...
      - source_labels: [__address__]
        target_label: __param_target
      # ... and the instance label, so series are named after the probed URL
      - source_labels: [__param_target]
        target_label: instance
      # The scrape itself is sent to the exporter
      - target_label: __address__
        replacement: blackbox-exporter:9115
PromQL Queries
# CPU Usage (percent busy per instance, derived from the idle counter)
100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

# Memory Usage Percentage
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

# Disk Space Available (percent free on the root filesystem)
node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100

# HTTP Request Rate
rate(http_requests_total[5m])

# HTTP Request Rate by Status Code
sum by (status) (rate(http_requests_total[5m]))

# 95th Percentile Request Duration
histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))

# Error Rate (percent of requests answered with a 5xx status)
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100

# Pod Restarts (Kubernetes)
sum by (namespace, pod) (kube_pod_container_status_restarts_total)

# Network Traffic (two separate queries: receive, then transmit)
rate(node_network_receive_bytes_total[5m])
rate(node_network_transmit_bytes_total[5m])

# Active Alerts
sum by (alertname, severity) (ALERTS{alertstate="firing"})

# Query per second by database
sum by (database) (rate(mysql_global_status_queries[5m]))

# Container CPU Usage
# cAdvisor exposes the pod in the `pod` label; the older `pod_name` label was
# removed in Kubernetes 1.16, so grouping by it collapses every pod into one
# series. container!="" drops the pod-level cgroup series, which would
# otherwise be counted on top of the per-container series.
sum by (pod) (rate(container_cpu_usage_seconds_total{container!=""}[5m]))

# Top 10 endpoints by request count
topk(10, sum by (endpoint) (rate(http_requests_total[1h])))

# SLA calculation (uptime percentage over the last 30 days)
avg_over_time(up{job="api"}[30d]) * 100
Prometheus HTTP API (cURL)
# Base address shared by every request in this section.
prom_api="http://localhost:9090"

# Instant query, evaluated at a fixed timestamp
curl --get "$prom_api/api/v1/query" \
  --data-urlencode 'query=up' \
  --data-urlencode 'time=2024-01-01T20:10:30.781Z'

# Range query: one sample every 15s across a whole day
curl --get "$prom_api/api/v1/query_range" \
  --data-urlencode 'query=rate(http_requests_total[5m])' \
  --data-urlencode 'start=2024-01-01T00:00:00Z' \
  --data-urlencode 'end=2024-01-01T23:59:59Z' \
  --data-urlencode 'step=15s'

# Series (label sets) matching a selector within a time window
curl --get "$prom_api/api/v1/series" \
  --data-urlencode 'match[]=up' \
  --data-urlencode 'start=2024-01-01T00:00:00Z' \
  --data-urlencode 'end=2024-01-01T23:59:59Z'

# All values of the `job` label
curl "$prom_api/api/v1/label/job/values"

# Scrape targets
curl "$prom_api/api/v1/targets"

# Alerts
curl "$prom_api/api/v1/alerts"

# Rules
curl "$prom_api/api/v1/rules"

# Health check
curl "$prom_api/-/healthy"

# Reload configuration
curl --request POST "$prom_api/-/reload"
Grafana HTTP API
# Authentication: every call below sends this token as a Bearer header.
# Replace the placeholder with a real token; do not commit real tokens.
export GRAFANA_TOKEN="your-api-token"
export GRAFANA_URL="http://localhost:3000"

# Create dashboard
curl -X POST "$GRAFANA_URL/api/dashboards/db" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d @dashboard.json

# Get dashboard by UID
curl "$GRAFANA_URL/api/dashboards/uid/my-dashboard" \
  -H "Authorization: Bearer $GRAFANA_TOKEN"

# Search dashboards
curl "$GRAFANA_URL/api/search?query=production&type=dash-db" \
  -H "Authorization: Bearer $GRAFANA_TOKEN"

# Delete dashboard
curl -X DELETE "$GRAFANA_URL/api/dashboards/uid/my-dashboard" \
  -H "Authorization: Bearer $GRAFANA_TOKEN"

# Create data source
curl -X POST "$GRAFANA_URL/api/datasources" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "Prometheus",
    "type": "prometheus",
    "url": "http://prometheus:9090",
    "access": "proxy",
    "isDefault": true
  }'

# List data sources
curl "$GRAFANA_URL/api/datasources" \
  -H "Authorization: Bearer $GRAFANA_TOKEN"

# Create organization
# NOTE(review): organization management is a server-admin operation and may
# require basic auth instead of an org-scoped token -- confirm for your
# Grafana version.
curl -X POST "$GRAFANA_URL/api/orgs" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"name": "Production Org"}'

# Add user to organization
curl -X POST "$GRAFANA_URL/api/orgs/1/users" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"loginOrEmail": "user@example.com", "role": "Viewer"}'

# Create an automation token.
# API keys (POST /api/auth/keys) are deprecated in favour of service accounts,
# so create a service account first and then mint a token for it.
SA_ID=$(curl -s -X POST "$GRAFANA_URL/api/serviceaccounts" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"name": "automation-key", "role": "Admin"}' | jq -r '.id')
curl -X POST "$GRAFANA_URL/api/serviceaccounts/$SA_ID/tokens" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"name": "automation-key", "secondsToLive": 86400}'

# Create a Slack contact point.
# Legacy notification channels (POST /api/alert-notifications) belong to the
# old alerting engine; unified alerting manages receivers as contact points.
# Contact points have no "isDefault" flag -- the default receiver is chosen in
# the notification policy instead.
curl -X POST "$GRAFANA_URL/api/v1/provisioning/contact-points" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "Slack Alerts",
    "type": "slack",
    "settings": {
      "url": "https://hooks.slack.com/services/xxx/yyy/zzz",
      "recipient": "#alerts"
    }
  }'

# Create folder
curl -X POST "$GRAFANA_URL/api/folders" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"title": "Production Dashboards"}'

# Snapshot dashboard
curl -X POST "$GRAFANA_URL/api/snapshots" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d @snapshot.json

# Health check
curl "$GRAFANA_URL/api/health"
Alert Rules Configuration
# alerts/app_alerts.yml -- alerting rules in two groups:
# `application` (HTTP error rate, latency, availability) and
# `infrastructure` (CPU, disk, memory from node_exporter).
groups:
  - name: application
    interval: 30s
    rules:
      # High error rate: more than 5% of all requests return a 5xx status.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "High error rate detected"
          # sum() without a `by` clause drops every label, so there is no
          # instance label to report here; the alert is fleet-wide.
          description: "Error rate is {{ $value | humanizePercentage }}"

      # API latency: p95 request duration above one second.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95,
            sum by (le) (rate(http_request_duration_seconds_bucket[5m]))
          ) > 1
        for: 10m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "API latency is high"
          description: "95th percentile latency is {{ $value }}s"

      # Service down: a target of the `app` job fails its scrape.
      - alert: ServiceDown
        expr: up{job="app"} == 0
        for: 2m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Service {{ $labels.instance }} is down"
          description: "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes"

  - name: infrastructure
    interval: 1m
    rules:
      # High CPU: more than 80% busy.
      # rate() rather than irate(): irate only looks at the last two samples,
      # so a brief dip can reset the `for` timer.
      - alert: HighCPU
        expr: |
          100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      # Low disk space: less than 15% free on the root filesystem.
      - alert: LowDiskSpace
        expr: |
          (node_filesystem_avail_bytes{mountpoint="/"}
            / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 15
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Only {{ $value }}% disk space remaining"

      # High memory: more than 90% of memory in use.
      - alert: HighMemory
        expr: |
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}%"
Python Client (Prometheus & Grafana)
import requests
from datetime import datetime, timedelta

# Seconds to wait for a server response. Without a timeout, `requests` can
# block indefinitely on an unresponsive host.
DEFAULT_TIMEOUT = 10


class PrometheusClient:
    """Small wrapper around the Prometheus HTTP API.

    Every method returns the decoded JSON body of the response.
    """

    def __init__(self, url="http://localhost:9090", timeout=DEFAULT_TIMEOUT):
        self.url = url
        self.timeout = timeout

    @staticmethod
    def _to_api_time(value):
        """Convert a datetime to a Unix timestamp; pass anything else through.

        A naive datetime's isoformat() carries no UTC offset, so it is not a
        valid RFC 3339 timestamp for the API. A Unix timestamp is accepted in
        either case.
        """
        if isinstance(value, datetime):
            return value.timestamp()
        return value

    def query(self, promql):
        """Execute instant query"""
        response = requests.get(
            f"{self.url}/api/v1/query",
            params={"query": promql},
            timeout=self.timeout,
        )
        return response.json()

    def query_range(self, promql, start, end, step="15s"):
        """Execute range query.

        start/end may be datetime objects, Unix timestamps or RFC 3339 strings.
        """
        response = requests.get(
            f"{self.url}/api/v1/query_range",
            params={
                "query": promql,
                "start": self._to_api_time(start),
                "end": self._to_api_time(end),
                "step": step,
            },
            timeout=self.timeout,
        )
        return response.json()

    def get_targets(self):
        """Get scrape targets"""
        response = requests.get(
            f"{self.url}/api/v1/targets", timeout=self.timeout
        )
        return response.json()


class GrafanaClient:
    """Small wrapper around the Grafana HTTP API using Bearer-token auth.

    Every method returns the decoded JSON body of the response.
    """

    def __init__(self, url, token, timeout=DEFAULT_TIMEOUT):
        self.url = url
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        }

    def create_dashboard(self, dashboard_json):
        """Create or update dashboard (always sent with overwrite=True)"""
        response = requests.post(
            f"{self.url}/api/dashboards/db",
            headers=self.headers,
            json={"dashboard": dashboard_json, "overwrite": True},
            timeout=self.timeout,
        )
        return response.json()

    def get_dashboard(self, uid):
        """Get dashboard by UID"""
        response = requests.get(
            f"{self.url}/api/dashboards/uid/{uid}",
            headers=self.headers,
            timeout=self.timeout,
        )
        return response.json()

    def search_dashboards(self, query="", tags=None):
        """Search dashboards by free-text query and, optionally, tags"""
        params = {"query": query}
        if tags:
            params["tag"] = tags
        response = requests.get(
            f"{self.url}/api/search",
            headers=self.headers,
            params=params,
            timeout=self.timeout,
        )
        return response.json()

    def create_annotation(self, dashboard_uid, time, text, tags=None):
        """Create annotation; `time` is a datetime, sent as epoch milliseconds"""
        data = {
            "dashboardUID": dashboard_uid,
            "time": int(time.timestamp() * 1000),
            "text": text,
            "tags": tags or [],
        }
        response = requests.post(
            f"{self.url}/api/annotations",
            headers=self.headers,
            json=data,
            timeout=self.timeout,
        )
        return response.json()


# Usage example (guarded so that importing this module performs no network I/O)
if __name__ == "__main__":
    prom = PrometheusClient("http://prometheus:9090")
    grafana = GrafanaClient("http://grafana:3000", "your-api-token")

    # Query CPU usage
    result = prom.query('100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)')
    # The result list is empty when no series match, and "data" is absent on
    # an error response, so check before indexing.
    series = result.get("data", {}).get("result", [])
    if series:
        print(f"CPU Usage: {series[0]['value'][1]}%")
    else:
        print("CPU Usage: no data")

    # Get dashboard
    dashboard = grafana.get_dashboard("my-dashboard-uid")
MCP Server Integration
Configuration (.codebuddy/mcp.json)
{
  "mcpServers": {
    "grafana": {
      "command": "npx",
      "args": [
        "-y",
        "@grafana/mcp-server"
      ],
      "env": {
        "GRAFANA_URL": "http://localhost:3000",
        "GRAFANA_API_TOKEN": "your-api-token-here"
      }
    }
  }
}
Available MCP Tools
grafana_create_dashboard
- Creates or updates a Grafana dashboard
- Parameters: `dashboard` (JSON object), `folder_id` (optional), `overwrite` (boolean)
- Returns: Dashboard UID, URL, version
grafana_get_dashboard
- Retrieves dashboard by UID
- Parameters: `uid` (string)
- Returns: Full dashboard JSON with metadata
grafana_search_dashboards
- Searches dashboards by query and tags
- Parameters: `query` (string), `tags` (array), `folder_ids` (array)
- Returns: List of matching dashboards
grafana_delete_dashboard
- Deletes a dashboard
- Parameters: `uid` (string)
- Returns: Deletion confirmation
grafana_create_datasource
- Creates a new data source
- Parameters: `name` (string), `type` (string), `url` (string), `settings` (object)
- Returns: Data source ID and details
grafana_query_datasource
- Executes query against data source
- Parameters: `datasource_uid` (string), `query` (string), `time_range` (object)
- Returns: Query results
grafana_create_alert
- Creates alert rule
- Parameters: `rule_name` (string), `folder_uid` (string), `condition` (object), `notifications` (array)
- Returns: Alert rule ID
grafana_list_alerts
- Lists all alert rules
- Parameters: `folder_uid` (optional), `state` (optional)
- Returns: Array of alert rules
grafana_create_annotation
- Creates dashboard annotation
- Parameters: `dashboard_uid` (string), `time` (timestamp), `text` (string), `tags` (array)
- Returns: Annotation ID
prometheus_query
- Executes PromQL instant query
- Parameters: `query` (string), `time` (optional timestamp)
- Returns: Query result values
prometheus_query_range
- Executes PromQL range query
- Parameters: `query` (string), `start` (timestamp), `end` (timestamp), `step` (string)
- Returns: Time series data
prometheus_get_metrics
- Lists available metrics
- Parameters: `filter` (optional string)
- Returns: Array of metric names
prometheus_get_targets
- Lists scrape targets and their status
- Parameters: None
- Returns: Target health and labels
Common Workflows
1. Complete Monitoring Stack Setup
# Bring up Prometheus, Grafana, node-exporter and Alertmanager with Docker
# Compose, then register Prometheus as Grafana's default data source.
# Requires: docker (compose plugin or docker-compose), curl, jq.
# Leaves GRAFANA_TOKEN set in the current shell for the later workflows.
# admin/admin is the demo password set in the compose file below -- change it
# for anything that is not a local sandbox.
GRAFANA_URL="http://localhost:3000"

# Docker Compose setup
cat > docker-compose.yml <<'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alerts:/etc/prometheus/alerts
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.enable-lifecycle'

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning

  node-exporter:
    image: prom/node-exporter:latest
    ports:
      - "9100:9100"
    command:
      - '--path.rootfs=/host'
    volumes:
      - '/:/host:ro,rslave'

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml

volumes:
  prometheus-data:
  grafana-data:
EOF

grafana_ready=false
if [[ ! -f prometheus.yml || ! -f alertmanager.yml ]]; then
  # Docker bind-mounts a missing host file as an empty *directory*, which
  # stops the container from starting, so refuse to start without both files.
  echo "error: prometheus.yml and alertmanager.yml must exist in $(pwd) before starting the stack" >&2
else
  # Start stack (prefer the compose plugin, fall back to the standalone binary)
  if docker compose version >/dev/null 2>&1; then
    docker compose up -d
  else
    docker-compose up -d
  fi

  # Wait for Grafana: poll the health endpoint (up to ~60s) instead of
  # sleeping for a fixed time and hoping the service is up.
  for _ in {1..30}; do
    if curl -fsS "$GRAFANA_URL/api/health" >/dev/null 2>&1; then
      grafana_ready=true
      break
    fi
    sleep 2
  done
  if [[ "$grafana_ready" != true ]]; then
    echo "error: Grafana did not become healthy at $GRAFANA_URL" >&2
  fi
fi

if [[ "$grafana_ready" == true ]]; then
  # Create Grafana API token.
  # API keys (POST /api/auth/keys) are deprecated in favour of service
  # accounts: create the account, then mint a token for it.
  sa_id=$(curl -fsS -X POST "$GRAFANA_URL/api/serviceaccounts" \
    -u admin:admin \
    -H "Content-Type: application/json" \
    -d '{"name": "automation", "role": "Admin"}' | jq -r '.id')
  GRAFANA_TOKEN=$(curl -fsS -X POST "$GRAFANA_URL/api/serviceaccounts/$sa_id/tokens" \
    -u admin:admin \
    -H "Content-Type: application/json" \
    -d '{"name": "automation"}' | jq -r '.key')

  if [[ -z "$GRAFANA_TOKEN" || "$GRAFANA_TOKEN" == "null" ]]; then
    # jq prints the literal string "null" when the response has no .key field
    echo "error: could not create a Grafana token" >&2
  else
    # Add Prometheus data source
    curl -fsS -X POST "$GRAFANA_URL/api/datasources" \
      -H "Authorization: Bearer $GRAFANA_TOKEN" \
      -H "Content-Type: application/json" \
      -d '{
        "name": "Prometheus",
        "type": "prometheus",
        "url": "http://prometheus:9090",
        "access": "proxy",
        "isDefault": true
      }'

    echo "Monitoring stack ready!"
    echo "Grafana: http://localhost:3000 (admin/admin)"
    echo "Prometheus: http://localhost:9090"
  fi
fi
2. Create Custom Application Dashboard
# Write the dashboard definition: four panels (request rate, error rate,
# p95 response time, active connections) wrapped in the payload shape the
# dashboards API expects ({"dashboard": ..., "overwrite": ...}).
cat > app-dashboard.json <<'EOF'
{
  "dashboard": {
    "title": "Application Metrics",
    "tags": ["application", "production"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Request Rate",
        "type": "graph",
        "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "sum(rate(http_requests_total[5m])) by (status)",
            "legendFormat": "{{status}}",
            "refId": "A"
          }
        ],
        "yaxes": [
          {"format": "reqps", "label": "Requests/sec"},
          {"format": "short"}
        ]
      },
      {
        "id": 2,
        "title": "Error Rate",
        "type": "singlestat",
        "gridPos": {"x": 12, "y": 0, "w": 6, "h": 8},
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
            "refId": "A"
          }
        ],
        "format": "percent",
        "thresholds": "1,5",
        "colors": ["#299c46", "#e5ac0e", "#bf1b00"]
      },
      {
        "id": 3,
        "title": "Response Time (p95)",
        "type": "graph",
        "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m])))",
            "legendFormat": "{{endpoint}}",
            "refId": "A"
          }
        ],
        "yaxes": [
          {"format": "s", "label": "Duration"},
          {"format": "short"}
        ]
      },
      {
        "id": 4,
        "title": "Active Connections",
        "type": "graph",
        "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "sum(app_active_connections) by (instance)",
            "legendFormat": "{{instance}}",
            "refId": "A"
          }
        ]
      }
    ],
    "refresh": "30s",
    "time": {"from": "now-6h", "to": "now"}
  },
  "overwrite": true
}
EOF

# Push the file to Grafana; GRAFANA_TOKEN must already be set in this shell.
curl --request POST http://localhost:3000/api/dashboards/db \
  --header "Authorization: Bearer $GRAFANA_TOKEN" \
  --header "Content-Type: application/json" \
  --data @app-dashboard.json
3. Set Up Alerting Pipeline
# Configure Alertmanager: critical alerts go to PagerDuty, warnings to Slack,
# everything else to the default Slack receiver. A firing critical alert
# suppresses the warning with the same alertname and instance.
cat > alertmanager.yml <<'EOF'
global:
  resolve_timeout: 5m
  slack_api_url: 'https://hooks.slack.com/services/xxx/yyy/zzz'

route:
  group_by: ['alertname', 'cluster']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default'
  routes:
    # `matchers` replaces the deprecated `match` / `match_re` fields
    - matchers:
        - severity="critical"
      receiver: 'pagerduty'
      continue: true
    - matchers:
        - severity="warning"
      receiver: 'slack'

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#alerts'
        title: 'Alert: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'slack'
    slack_configs:
      - channel: '#alerts-warning'
        title: 'Warning: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        send_resolved: true

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'your-pagerduty-key'
        description: '{{ .GroupLabels.alertname }}'

inhibit_rules:
  # `source_matchers` / `target_matchers` replace the deprecated
  # `source_match` / `target_match` fields
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: ['alertname', 'instance']
EOF

# Reload Alertmanager
curl -X POST http://localhost:9093/-/reload

# Test alert
# Uses the v2 API: the v1 API (/api/v1/alerts) was removed in Alertmanager 0.27.
curl -X POST http://localhost:9093/api/v2/alerts \
  -H "Content-Type: application/json" \
  -d '[
    {
      "labels": {
        "alertname": "TestAlert",
        "severity": "warning"
      },
      "annotations": {
        "summary": "This is a test alert"
      }
    }
  ]'

# Verify alert rules (-s keeps curl's progress meter out of the jq output)
curl -s http://localhost:9090/api/v1/rules \
  | jq '.data.groups[].rules[] | select(.type=="alerting")'

# Check active alerts
curl -s http://localhost:9090/api/v1/alerts \
  | jq '.data.alerts[] | select(.state=="firing")'
4. Instrument Application with Metrics
# Python Flask application with Prometheus metrics
from flask import Flask, g, request
from prometheus_client import (
    CONTENT_TYPE_LATEST,
    Counter,
    Histogram,
    Gauge,
    generate_latest,
    REGISTRY,
)
import time

app = Flask(__name__)

# Metrics
REQUEST_COUNT = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'endpoint', 'status']
)

REQUEST_DURATION = Histogram(
    'http_request_duration_seconds',
    'HTTP request duration',
    ['method', 'endpoint']
)

ACTIVE_REQUESTS = Gauge(
    'http_requests_active',
    'Active HTTP requests'
)


# Middleware
@app.before_request
def before_request():
    """Start the request timer and count the request as in flight."""
    # perf_counter() is monotonic, so the measured duration cannot go negative
    # when the wall clock is adjusted mid-request.
    g.start_time = time.perf_counter()
    ACTIVE_REQUESTS.inc()


@app.after_request
def after_request(response):
    """Record the request count and duration for the finished request."""
    endpoint = request.endpoint or 'unknown'
    REQUEST_COUNT.labels(
        method=request.method,
        endpoint=endpoint,
        status=response.status_code
    ).inc()

    # start_time is missing if before_request never ran for this request.
    start_time = getattr(g, 'start_time', None)
    if start_time is not None:
        REQUEST_DURATION.labels(
            method=request.method,
            endpoint=endpoint
        ).observe(time.perf_counter() - start_time)

    return response


@app.teardown_request
def teardown_request(exc=None):
    """Release the in-flight slot taken in before_request.

    Done in a teardown handler because Flask documents that after_request
    functions may be skipped when an unhandled exception occurs, which would
    leave the gauge permanently too high.
    """
    if hasattr(g, 'start_time'):
        ACTIVE_REQUESTS.dec()


# Metrics endpoint
@app.route('/metrics')
def metrics():
    """Serve all registered metrics in the Prometheus exposition format."""
    # Returning the bare payload would be served as text/html (Flask's
    # default); send the exposition-format Content-Type explicitly.
    return generate_latest(REGISTRY), 200, {'Content-Type': CONTENT_TYPE_LATEST}


# Application endpoints
@app.route('/api/users')
def users():
    return {'users': []}


@app.route('/health')
def health():
    return {'status': 'healthy'}


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)
#######################################
# Append the `my-app` scrape job to a Prometheus config file, at most once.
# Arguments: $1 - path to the config file (default: prometheus.yml)
# Outputs:   an error message on stderr when the file does not exist
# Returns:   0 if the job is present afterwards, 1 if the file is missing
#######################################
add_my_app_scrape_job() {
  local config=${1:-prometheus.yml}

  if [[ ! -f "$config" ]]; then
    printf 'error: %s not found\n' "$config" >&2
    return 1
  fi

  # Prometheus rejects a config that defines the same job_name twice, so a
  # second run must not append another copy.
  if grep -qF -- "job_name: 'my-app'" "$config"; then
    return 0
  fi

  # NOTE(review): the entry is appended at the end of the file, so it only
  # lands under `scrape_configs:` when that is the last top-level section and
  # its entries use the same two-space indent -- confirm for your config.
  cat >> "$config" <<'EOF'
  - job_name: 'my-app'
    static_configs:
      - targets: ['app:8080']
    metrics_path: '/metrics'
    scrape_interval: 10s
EOF
}

# Add to Prometheus scrape config, then reload Prometheus
# (the reload endpoint is only available with --web.enable-lifecycle).
add_my_app_scrape_job prometheus.yml &&
  curl -X POST http://localhost:9090/-/reload
5. Performance Analysis and Troubleshooting
# Check if metrics are being scraped
# The selector goes through --data-urlencode: placed raw in the URL, curl's
# URL globbing would consume the {...} braces and the quotes would be sent
# unencoded.
curl -G 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=up{job="app"}'

# Analyze request patterns
curl -G 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=topk(10, sum by (endpoint) (rate(http_requests_total[1h])))'

# Find slow endpoints (p99 above one second)
curl -G 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=histogram_quantile(0.99, sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m]))) > 1'

# Memory leak detection (hourly resident memory of the first matching series)
# -s keeps curl's progress meter out of the piped output.
curl -s -G 'http://localhost:9090/api/v1/query_range' \
  --data-urlencode 'query=process_resident_memory_bytes{job="app"}' \
  --data-urlencode 'start=2024-01-01T00:00:00Z' \
  --data-urlencode 'end=2024-01-01T23:59:59Z' \
  --data-urlencode 'step=1h' \
  | jq '.data.result[0].values'

# CPU spike investigation
curl -G 'http://localhost:9090/api/v1/query_range' \
  --data-urlencode 'query=rate(process_cpu_seconds_total{job="app"}[5m])' \
  --data-urlencode 'start=2024-01-01T10:00:00Z' \
  --data-urlencode 'end=2024-01-01T11:00:00Z' \
  --data-urlencode 'step=30s'

# Correlate errors with deployments (using annotations)
# `date +%s` is in seconds; appending 000 turns it into the millisecond
# timestamp sent as "time".
curl -X POST http://localhost:3000/api/annotations \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d "{
    \"time\": $(date +%s)000,
    \"text\": \"Deployed v2.1.0\",
    \"tags\": [\"deployment\", \"v2.1.0\"]
  }"

# Export metrics for analysis (one CSV row per series: instance, then values)
curl -s -G 'http://localhost:9090/api/v1/query_range' \
  --data-urlencode 'query=http_requests_total' \
  --data-urlencode 'start=2024-01-01T00:00:00Z' \
  --data-urlencode 'end=2024-01-01T23:59:59Z' \
  --data-urlencode 'step=5m' \
  | jq -r '.data.result[] | [.metric.instance, .values[][1]] | @csv' > metrics.csv