# littleshop/docker/alert_rules.yml

groups:
  - name: littleshop_alerts
    rules:
      # Availability of the application scrape target.
      - alert: LittleShopDown
        # `up` is 0 when the latest scrape of the "littleshop" job failed;
        # the condition must hold for a full minute before the alert fires.
        expr: 'up{job="littleshop"} == 0'
        for: 1m
        annotations:
          summary: 'LittleShop application is down'
          description: 'LittleShop application has been down for more than 1 minute.'
        labels:
          severity: critical

      - alert: LittleShopHealthCheckFailing
        # Watches the separate "littleshop-health" scrape job; a failed scrape
        # sustained for two minutes raises a warning.
        expr: 'up{job="littleshop-health"} == 0'
        for: 2m
        annotations:
          summary: 'LittleShop health check is failing'
          description: 'LittleShop health check has been failing for more than 2 minutes.'
        labels:
          severity: warning

      # Performance alerts
      - alert: HighCpuUsage
        # CPU seconds consumed per second, as a percentage of one core
        # (100 == one full core). The rate is summed per container so that
        # exporters which publish one series per CPU core are compared against
        # the threshold as a whole instead of core-by-core.
        expr: >-
          sum by (instance, name) (
          rate(container_cpu_usage_seconds_total{name="littleshop_prod"}[5m])
          ) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "LittleShop container CPU usage is above 80% for more than 5 minutes."

      - alert: HighMemoryUsage
        # Memory usage as a percentage of the container's configured limit.
        # The `> 0` filter drops the limit series when no limit is configured
        # (reported as 0), which would otherwise turn the ratio into +Inf and
        # keep this alert firing permanently.
        expr: >-
          (container_memory_usage_bytes{name="littleshop_prod"}
          / (container_spec_memory_limit_bytes{name="littleshop_prod"} > 0))
          * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "LittleShop container memory usage is above 85% for more than 5 minutes."

      # Host-level alerts
      - alert: DiskSpaceLow
        # Percentage of the root filesystem still available; critical once it
        # has stayed below 10% for five minutes.
        expr: >-
          (node_filesystem_avail_bytes{mountpoint="/"}
          / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 10
        for: 5m
        annotations:
          summary: 'Disk space is running low'
          description: 'Available disk space is less than 10%.'
        labels:
          severity: critical

      - alert: HighErrorRate
        # Share of requests answered with a 5xx status over the last 5 minutes.
        # Both sides are aggregated before dividing: without the sums the
        # division matches series on identical label sets, so every 5xx series
        # is divided by itself and the ratio is always 100%.
        expr: >-
          sum by (job) (rate(littleshop_http_requests_total{status=~"5.."}[5m]))
          / sum by (job) (rate(littleshop_http_requests_total[5m]))
          * 100 > 5
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "HTTP 5xx error rate is above 5% for more than 3 minutes."

  - name: infrastructure_alerts
    rules:
      # Container monitoring
      - alert: ContainerRestarted
        # Fires while the container's start time is less than one hour old.
        # container_last_seen cannot be used for this: it is a timestamp that
        # advances on every scrape, so increase() over it is positive whenever
        # the container is simply running.
        # No `for:` clause, so the alert fires on the first matching evaluation.
        expr: time() - container_start_time_seconds{name="littleshop_prod"} < 3600
        labels:
          severity: warning
        annotations:
          summary: "Container has been restarted"
          description: "LittleShop container has been restarted within the last hour."

      # Database alerts
      - alert: DatabaseConnectionFailed
        # Alerts on new failures within the last 5 minutes rather than on the
        # raw value. NOTE(review): the `_total` suffix suggests this metric is
        # a monotonically increasing counter - confirm; if so, comparing its
        # absolute value to 0 would keep the alert firing forever after the
        # first failure.
        expr: increase(littleshop_database_connections_failed_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Database connection failures detected"
          description: "LittleShop is experiencing database connection failures."