groups:
# Alerts for the LittleShop application itself: scrape-target availability,
# container resource usage, host disk space, and HTTP error rate.
- name: littleshop_alerts
  rules:
  # Application health alerts
  # Fires when the "littleshop" scrape target has reported up == 0 for 1m.
  - alert: LittleShopDown
    expr: up{job="littleshop"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "LittleShop application is down"
      description: "LittleShop application has been down for more than 1 minute."

  # Fires when the "littleshop-health" scrape target has reported up == 0
  # for 2m.
  - alert: LittleShopHealthCheckFailing
    expr: up{job="littleshop-health"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "LittleShop health check is failing"
      description: "LittleShop health check has been failing for more than 2 minutes."

  # Performance alerts
  # rate() of CPU-seconds is "cores in use"; * 100 expresses it as a
  # percentage of one core, evaluated per matching series.
  - alert: HighCpuUsage
    expr: rate(container_cpu_usage_seconds_total{name="littleshop_prod"}[5m]) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage detected"
      description: "LittleShop container CPU usage is above 80% for more than 5 minutes."

  # Memory usage as a percentage of the configured limit. The "> 0" filter
  # drops the divisor when no limit is configured (limit reported as 0);
  # without it the division yields +Inf and the alert would fire forever.
  - alert: HighMemoryUsage
    expr: >-
      container_memory_usage_bytes{name="littleshop_prod"}
      / (container_spec_memory_limit_bytes{name="littleshop_prod"} > 0)
      * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage detected"
      description: "LittleShop container memory usage is above 85% for more than 5 minutes."

  # System alerts
  # Percentage of free space on the root filesystem.
  - alert: DiskSpaceLow
    expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Disk space is running low"
      description: "Available disk space is less than 10%."

  # Application error-rate alert
  # Both sides are aggregated with sum() so the ratio is "all 5xx requests
  # over all requests". Without aggregation, one-to-one vector matching
  # pairs each 5xx series only with itself (identical label sets), which
  # makes the ratio a constant 100% whenever any 5xx traffic exists.
  - alert: HighErrorRate
    expr: >-
      sum(rate(littleshop_http_requests_total{status=~"5.."}[5m]))
      / sum(rate(littleshop_http_requests_total[5m]))
      * 100 > 5
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "High error rate detected"
      description: "HTTP 5xx error rate is above 5% for more than 3 minutes."
# Alerts for the infrastructure around LittleShop: container restarts and
# database connectivity.
- name: infrastructure_alerts
  rules:
  # Container monitoring
  # Detects a restart as a change in the container's start timestamp within
  # the last hour. The previous expression applied increase() to
  # container_last_seen, a timestamp that advances on every scrape, so it
  # was > 0 for as long as the container was running.
  # NOTE(review): assumes the exporter publishes container_start_time_seconds
  # with the same "name" label - confirm. A container that is recreated
  # (new series) rather than restarted in place will not trigger this rule.
  - alert: ContainerRestarted
    expr: changes(container_start_time_seconds{name="littleshop_prod"}[1h]) > 0
    labels:
      severity: warning
    annotations:
      summary: "Container has been restarted"
      description: "LittleShop container has been restarted within the last hour."

  # Database alerts
  # Alerts on *new* failures in the last 5 minutes. Comparing the raw
  # cumulative counter to 0 would keep the alert firing indefinitely after
  # the first failure, until the process restarts and the counter resets.
  - alert: DatabaseConnectionFailed
    expr: increase(littleshop_database_connections_failed_total[5m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Database connection failures detected"
      description: "LittleShop is experiencing database connection failures."