Awesome-omni-skill grafana-prometheus

Observability and monitoring with Prometheus metrics and Grafana dashboards

install
source · Clone the upstream repo
git clone https://github.com/diegosouzapw/awesome-omni-skill
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/diegosouzapw/awesome-omni-skill "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/devops/grafana-prometheus" ~/.claude/skills/diegosouzapw-awesome-omni-skill-grafana-prometheus && rm -rf "$T"
manifest: skills/devops/grafana-prometheus/SKILL.md
source content

Grafana + Prometheus Monitoring

Complete observability stack with Prometheus metrics collection, PromQL queries, and Grafana visualization dashboards.

Direct Control (CLI / API / Scripting)

Prometheus Configuration

# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    environment: 'prod'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load rules
rule_files:
  - "alerts/*.yml"
  - "recording_rules/*.yml"

# Scrape configurations
scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node exporter (system metrics)
  - job_name: 'node'
    static_configs:
      - targets:
          - 'node1:9100'
          - 'node2:9100'
          - 'node3:9100'
        labels:
          env: 'production'

  # Kubernetes service discovery
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__

  # Application metrics
  - job_name: 'app'
    static_configs:
      - targets:
          - 'app1:8080'
          - 'app2:8080'
    metrics_path: '/metrics'
    scrape_interval: 10s

  # Blackbox exporter (endpoint monitoring)
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://example.com
          - https://api.example.com/health
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

PromQL Queries

# CPU Usage
100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

# Memory Usage Percentage
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

# Disk Space Available
node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100

# HTTP Request Rate
rate(http_requests_total[5m])

# HTTP Request Rate by Status Code
sum by (status) (rate(http_requests_total[5m]))

# 95th Percentile Request Duration
histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))

# Error Rate
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100

# Pod Restarts (Kubernetes)
sum by (namespace, pod) (kube_pod_container_status_restarts_total)

# Network Traffic
rate(node_network_receive_bytes_total[5m])
rate(node_network_transmit_bytes_total[5m])

# Active Alerts
sum by (alertname, severity) (ALERTS{alertstate="firing"})

# Query per second by database
sum by (database) (rate(mysql_global_status_queries[5m]))

# Container CPU Usage
sum by (pod_name) (rate(container_cpu_usage_seconds_total[5m]))

# Top 10 endpoints by request count
topk(10, sum by (endpoint) (rate(http_requests_total[1h])))

# SLA calculation (uptime percentage)
avg_over_time((up{job="api"}[30d])) * 100

Prometheus HTTP API (cURL)

# Query instant value
curl -G http://localhost:9090/api/v1/query \
  --data-urlencode 'query=up' \
  --data-urlencode 'time=2024-01-01T20:10:30.781Z'

# Query range
curl -G http://localhost:9090/api/v1/query_range \
  --data-urlencode 'query=rate(http_requests_total[5m])' \
  --data-urlencode 'start=2024-01-01T00:00:00Z' \
  --data-urlencode 'end=2024-01-01T23:59:59Z' \
  --data-urlencode 'step=15s'

# Get series labels
curl -G http://localhost:9090/api/v1/series \
  --data-urlencode 'match[]=up' \
  --data-urlencode 'start=2024-01-01T00:00:00Z' \
  --data-urlencode 'end=2024-01-01T23:59:59Z'

# Get label values
curl http://localhost:9090/api/v1/label/job/values

# Get targets
curl http://localhost:9090/api/v1/targets

# Get alerts
curl http://localhost:9090/api/v1/alerts

# Get rules
curl http://localhost:9090/api/v1/rules

# Health check
curl http://localhost:9090/-/healthy

# Reload configuration
curl -X POST http://localhost:9090/-/reload

Grafana HTTP API

# Authentication
export GRAFANA_TOKEN="your-api-token"
export GRAFANA_URL="http://localhost:3000"

# Create dashboard
curl -X POST "$GRAFANA_URL/api/dashboards/db" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d @dashboard.json

# Get dashboard by UID
curl "$GRAFANA_URL/api/dashboards/uid/my-dashboard" \
  -H "Authorization: Bearer $GRAFANA_TOKEN"

# Search dashboards
curl "$GRAFANA_URL/api/search?query=production&type=dash-db" \
  -H "Authorization: Bearer $GRAFANA_TOKEN"

# Delete dashboard
curl -X DELETE "$GRAFANA_URL/api/dashboards/uid/my-dashboard" \
  -H "Authorization: Bearer $GRAFANA_TOKEN"

# Create data source
curl -X POST "$GRAFANA_URL/api/datasources" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "Prometheus",
    "type": "prometheus",
    "url": "http://prometheus:9090",
    "access": "proxy",
    "isDefault": true
  }'

# List data sources
curl "$GRAFANA_URL/api/datasources" \
  -H "Authorization: Bearer $GRAFANA_TOKEN"

# Create organization
curl -X POST "$GRAFANA_URL/api/orgs" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"name": "Production Org"}'

# Add user to organization
curl -X POST "$GRAFANA_URL/api/orgs/1/users" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"loginOrEmail": "user@example.com", "role": "Viewer"}'

# Create API key
curl -X POST "$GRAFANA_URL/api/auth/keys" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"name": "automation-key", "role": "Admin", "secondsToLive": 86400}'

# Create alert notification channel
curl -X POST "$GRAFANA_URL/api/alert-notifications" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "Slack Alerts",
    "type": "slack",
    "isDefault": true,
    "settings": {
      "url": "https://hooks.slack.com/services/xxx/yyy/zzz",
      "recipient": "#alerts"
    }
  }'

# Create folder
curl -X POST "$GRAFANA_URL/api/folders" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"title": "Production Dashboards"}'

# Snapshot dashboard
curl -X POST "$GRAFANA_URL/api/snapshots" \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d @snapshot.json

# Health check
curl "$GRAFANA_URL/api/health"

Alert Rules Configuration

# alerts/app_alerts.yml
groups:
  - name: application
    interval: 30s
    rules:
      # High error rate
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}"

      # API latency
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95,
            sum by (le) (rate(http_request_duration_seconds_bucket[5m]))
          ) > 1
        for: 10m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "API latency is high"
          description: "95th percentile latency is {{ $value }}s"

      # Service down
      - alert: ServiceDown
        expr: up{job="app"} == 0
        for: 2m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Service {{ $labels.instance }} is down"
          description: "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes"

  - name: infrastructure
    interval: 1m
    rules:
      # High CPU
      - alert: HighCPU
        expr: |
          100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      # Low disk space
      - alert: LowDiskSpace
        expr: |
          (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 15
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Only {{ $value }}% disk space remaining"

      # High memory
      - alert: HighMemory
        expr: |
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}%"

Python Client (Prometheus & Grafana)

import requests
from datetime import datetime, timedelta

class PrometheusClient:
    def __init__(self, url="http://localhost:9090"):
        self.url = url

    def query(self, promql):
        """Execute instant query"""
        response = requests.get(
            f"{self.url}/api/v1/query",
            params={"query": promql}
        )
        return response.json()

    def query_range(self, promql, start, end, step="15s"):
        """Execute range query"""
        response = requests.get(
            f"{self.url}/api/v1/query_range",
            params={
                "query": promql,
                "start": start.isoformat(),
                "end": end.isoformat(),
                "step": step
            }
        )
        return response.json()

    def get_targets(self):
        """Get scrape targets"""
        response = requests.get(f"{self.url}/api/v1/targets")
        return response.json()

class GrafanaClient:
    def __init__(self, url, token):
        self.url = url
        self.headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }

    def create_dashboard(self, dashboard_json):
        """Create or update dashboard"""
        response = requests.post(
            f"{self.url}/api/dashboards/db",
            headers=self.headers,
            json={"dashboard": dashboard_json, "overwrite": True}
        )
        return response.json()

    def get_dashboard(self, uid):
        """Get dashboard by UID"""
        response = requests.get(
            f"{self.url}/api/dashboards/uid/{uid}",
            headers=self.headers
        )
        return response.json()

    def search_dashboards(self, query="", tags=None):
        """Search dashboards"""
        params = {"query": query}
        if tags:
            params["tag"] = tags
        response = requests.get(
            f"{self.url}/api/search",
            headers=self.headers,
            params=params
        )
        return response.json()

    def create_annotation(self, dashboard_uid, time, text, tags=None):
        """Create annotation"""
        data = {
            "dashboardUID": dashboard_uid,
            "time": int(time.timestamp() * 1000),
            "text": text,
            "tags": tags or []
        }
        response = requests.post(
            f"{self.url}/api/annotations",
            headers=self.headers,
            json=data
        )
        return response.json()

# Usage example
prom = PrometheusClient("http://prometheus:9090")
grafana = GrafanaClient("http://grafana:3000", "your-api-token")

# Query CPU usage
result = prom.query('100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)')
print(f"CPU Usage: {result['data']['result'][0]['value'][1]}%")

# Get dashboard
dashboard = grafana.get_dashboard("my-dashboard-uid")

MCP Server Integration

Configuration (.codebuddy/mcp.json)

{
  "mcpServers": {
    "grafana": {
      "command": "npx",
      "args": ["-y", "@grafana/mcp-server"],
      "env": {
        "GRAFANA_URL": "http://localhost:3000",
        "GRAFANA_API_TOKEN": "your-api-token-here"
      }
    }
  }
}

Available MCP Tools

grafana_create_dashboard

  • Creates or updates a Grafana dashboard
  • Parameters:
    dashboard
    (JSON object),
    folder_id
    (optional),
    overwrite
    (boolean)
  • Returns: Dashboard UID, URL, version

grafana_get_dashboard

  • Retrieves dashboard by UID
  • Parameters:
    uid
    (string)
  • Returns: Full dashboard JSON with metadata

grafana_search_dashboards

  • Searches dashboards by query and tags
  • Parameters:
    query
    (string),
    tags
    (array),
    folder_ids
    (array)
  • Returns: List of matching dashboards

grafana_delete_dashboard

  • Deletes a dashboard
  • Parameters:
    uid
    (string)
  • Returns: Deletion confirmation

grafana_create_datasource

  • Creates a new data source
  • Parameters:
    name
    (string),
    type
    (string),
    url
    (string),
    settings
    (object)
  • Returns: Data source ID and details

grafana_query_datasource

  • Executes query against data source
  • Parameters:
    datasource_uid
    (string),
    query
    (string),
    time_range
    (object)
  • Returns: Query results

grafana_create_alert

  • Creates alert rule
  • Parameters:
    rule_name
    (string),
    folder_uid
    (string),
    condition
    (object),
    notifications
    (array)
  • Returns: Alert rule ID

grafana_list_alerts

  • Lists all alert rules
  • Parameters:
    folder_uid
    (optional),
    state
    (optional)
  • Returns: Array of alert rules

grafana_create_annotation

  • Creates dashboard annotation
  • Parameters:
    dashboard_uid
    (string),
    time
    (timestamp),
    text
    (string),
    tags
    (array)
  • Returns: Annotation ID

prometheus_query

  • Executes PromQL instant query
  • Parameters:
    query
    (string),
    time
    (optional timestamp)
  • Returns: Query result values

prometheus_query_range

  • Executes PromQL range query
  • Parameters:
    query
    (string),
    start
    (timestamp),
    end
    (timestamp),
    step
    (string)
  • Returns: Time series data

prometheus_get_metrics

  • Lists available metrics
  • Parameters:
    filter
    (optional string)
  • Returns: Array of metric names

prometheus_get_targets

  • Lists scrape targets and their status
  • Parameters: None
  • Returns: Target health and labels

Common Workflows

1. Complete Monitoring Stack Setup

# Docker Compose setup
cat > docker-compose.yml <<'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alerts:/etc/prometheus/alerts
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.enable-lifecycle'

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning

  node-exporter:
    image: prom/node-exporter:latest
    ports:
      - "9100:9100"
    command:
      - '--path.rootfs=/host'
    volumes:
      - '/:/host:ro,rslave'

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml

volumes:
  prometheus-data:
  grafana-data:
EOF

# Start stack
docker-compose up -d

# Wait for services
sleep 10

# Create Grafana API token
GRAFANA_TOKEN=$(curl -X POST http://admin:admin@localhost:3000/api/auth/keys \
  -H "Content-Type: application/json" \
  -d '{"name": "automation", "role": "Admin"}' | jq -r '.key')

# Add Prometheus data source
curl -X POST http://localhost:3000/api/datasources \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "Prometheus",
    "type": "prometheus",
    "url": "http://prometheus:9090",
    "access": "proxy",
    "isDefault": true
  }'

echo "Monitoring stack ready!"
echo "Grafana: http://localhost:3000 (admin/admin)"
echo "Prometheus: http://localhost:9090"

2. Create Custom Application Dashboard

# Generate dashboard JSON
cat > app-dashboard.json <<'EOF'
{
  "dashboard": {
    "title": "Application Metrics",
    "tags": ["application", "production"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Request Rate",
        "type": "graph",
        "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "sum(rate(http_requests_total[5m])) by (status)",
            "legendFormat": "{{status}}",
            "refId": "A"
          }
        ],
        "yaxes": [
          {"format": "reqps", "label": "Requests/sec"},
          {"format": "short"}
        ]
      },
      {
        "id": 2,
        "title": "Error Rate",
        "type": "singlestat",
        "gridPos": {"x": 12, "y": 0, "w": 6, "h": 8},
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
            "refId": "A"
          }
        ],
        "format": "percent",
        "thresholds": "1,5",
        "colors": ["#299c46", "#e5ac0e", "#bf1b00"]
      },
      {
        "id": 3,
        "title": "Response Time (p95)",
        "type": "graph",
        "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m])))",
            "legendFormat": "{{endpoint}}",
            "refId": "A"
          }
        ],
        "yaxes": [
          {"format": "s", "label": "Duration"},
          {"format": "short"}
        ]
      },
      {
        "id": 4,
        "title": "Active Connections",
        "type": "graph",
        "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "sum(app_active_connections) by (instance)",
            "legendFormat": "{{instance}}",
            "refId": "A"
          }
        ]
      }
    ],
    "refresh": "30s",
    "time": {"from": "now-6h", "to": "now"}
  },
  "overwrite": true
}
EOF

# Upload dashboard
curl -X POST http://localhost:3000/api/dashboards/db \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d @app-dashboard.json

3. Set Up Alerting Pipeline

# Configure Alertmanager
cat > alertmanager.yml <<'EOF'
global:
  resolve_timeout: 5m
  slack_api_url: 'https://hooks.slack.com/services/xxx/yyy/zzz'

route:
  group_by: ['alertname', 'cluster']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'pagerduty'
      continue: true
    - match:
        severity: warning
      receiver: 'slack'

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#alerts'
        title: 'Alert: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'slack'
    slack_configs:
      - channel: '#alerts-warning'
        title: 'Warning: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        send_resolved: true

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'your-pagerduty-key'
        description: '{{ .GroupLabels.alertname }}'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF

# Reload Alertmanager
curl -X POST http://localhost:9093/-/reload

# Test alert
curl -X POST http://localhost:9093/api/v1/alerts \
  -H "Content-Type: application/json" \
  -d '[
    {
      "labels": {
        "alertname": "TestAlert",
        "severity": "warning"
      },
      "annotations": {
        "summary": "This is a test alert"
      }
    }
  ]'

# Verify alert rules
curl http://localhost:9090/api/v1/rules | jq '.data.groups[].rules[] | select(.type=="alerting")'

# Check active alerts
curl http://localhost:9090/api/v1/alerts | jq '.data.alerts[] | select(.state=="firing")'

4. Instrument Application with Metrics

# Python Flask application with Prometheus metrics
from flask import Flask, request
from prometheus_client import Counter, Histogram, Gauge, generate_latest, REGISTRY
import time

app = Flask(__name__)

# Metrics
REQUEST_COUNT = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'endpoint', 'status']
)

REQUEST_DURATION = Histogram(
    'http_request_duration_seconds',
    'HTTP request duration',
    ['method', 'endpoint']
)

ACTIVE_REQUESTS = Gauge(
    'http_requests_active',
    'Active HTTP requests'
)

# Middleware
@app.before_request
def before_request():
    request.start_time = time.time()
    ACTIVE_REQUESTS.inc()

@app.after_request
def after_request(response):
    duration = time.time() - request.start_time

    REQUEST_COUNT.labels(
        method=request.method,
        endpoint=request.endpoint or 'unknown',
        status=response.status_code
    ).inc()

    REQUEST_DURATION.labels(
        method=request.method,
        endpoint=request.endpoint or 'unknown'
    ).observe(duration)

    ACTIVE_REQUESTS.dec()
    return response

# Metrics endpoint
@app.route('/metrics')
def metrics():
    return generate_latest(REGISTRY)

# Application endpoints
@app.route('/api/users')
def users():
    return {'users': []}

@app.route('/health')
def health():
    return {'status': 'healthy'}

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)
# Add to Prometheus scrape config
cat >> prometheus.yml <<'EOF'
  - job_name: 'my-app'
    static_configs:
      - targets: ['app:8080']
    metrics_path: '/metrics'
    scrape_interval: 10s
EOF

# Reload Prometheus
curl -X POST http://localhost:9090/-/reload

5. Performance Analysis and Troubleshooting

# Check if metrics are being scraped
curl 'http://localhost:9090/api/v1/query?query=up{job="app"}'

# Analyze request patterns
curl -G 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=topk(10, sum by (endpoint) (rate(http_requests_total[1h])))'

# Find slow endpoints
curl -G 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=histogram_quantile(0.99, sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m]))) > 1'

# Memory leak detection
curl -G 'http://localhost:9090/api/v1/query_range' \
  --data-urlencode 'query=process_resident_memory_bytes{job="app"}' \
  --data-urlencode 'start=2024-01-01T00:00:00Z' \
  --data-urlencode 'end=2024-01-01T23:59:59Z' \
  --data-urlencode 'step=1h' | jq '.data.result[0].values'

# CPU spike investigation
curl -G 'http://localhost:9090/api/v1/query_range' \
  --data-urlencode 'query=rate(process_cpu_seconds_total{job="app"}[5m])' \
  --data-urlencode 'start=2024-01-01T10:00:00Z' \
  --data-urlencode 'end=2024-01-01T11:00:00Z' \
  --data-urlencode 'step=30s'

# Correlate errors with deployments (using annotations)
curl -X POST http://localhost:3000/api/annotations \
  -H "Authorization: Bearer $GRAFANA_TOKEN" \
  -H "Content-Type: application/json" \
  -d "{
    \"time\": $(date +%s)000,
    \"text\": \"Deployed v2.1.0\",
    \"tags\": [\"deployment\", \"v2.1.0\"]
  }"

# Export metrics for analysis
curl -G 'http://localhost:9090/api/v1/query_range' \
  --data-urlencode 'query=http_requests_total' \
  --data-urlencode 'start=2024-01-01T00:00:00Z' \
  --data-urlencode 'end=2024-01-01T23:59:59Z' \
  --data-urlencode 'step=5m' | jq -r '.data.result[] | [.metric.instance, .values[][1]] | @csv' > metrics.csv