🚀 Docker Production Optimizations: - Chiseled Ubuntu base image for minimal attack surface - Non-root user execution with security hardening - Read-only filesystem with targeted writable volumes - Resource limits (1GB RAM, 1 CPU) with health checks - Multi-stage builds optimized for caching - Zero-downtime deployment automation 🔍 Comprehensive Monitoring Stack: - Prometheus metrics collection with custom rules - Grafana dashboards for application visualization - AlertManager with email notifications for critical events - Fluentd centralized logging with retention policies - Node Exporter + cAdvisor for system/container metrics - Health check endpoint (/health) for container orchestration 📋 Production Deployment Ready: - Complete deployment scripts with backup strategy - Environment templates for secure configuration - Performance monitoring and alerting rules - Enterprise-grade security and observability 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
80 lines
2.8 KiB
YAML
80 lines
2.8 KiB
YAML
groups:
# Alerts about the LittleShop application and the host/container it runs in.
- name: littleshop_alerts
  rules:
  # Application health alerts

  # Fires when the scrape target for job "littleshop" has reported up == 0
  # continuously for 1 minute.
  - alert: LittleShopDown
    expr: up{job="littleshop"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "LittleShop application is down"
      description: "LittleShop application has been down for more than 1 minute."

  # Same check against the separate "littleshop-health" scrape job; lower
  # severity and a longer hold time than LittleShopDown.
  - alert: LittleShopHealthCheckFailing
    expr: up{job="littleshop-health"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "LittleShop health check is failing"
      description: "LittleShop health check has been failing for more than 2 minutes."

  # Performance alerts

  # CPU seconds consumed per second, as a percentage of one core.
  # The rate is summed per container name so that cAdvisor versions exporting
  # one series per core are evaluated as a container total instead of each
  # core being compared with the threshold on its own.
  # NOTE(review): 80 means 80% of a single core; this equals 80% of the
  # container limit only if the limit is 1 CPU -- confirm against the
  # deployment's resource limits.
  - alert: HighCpuUsage
    expr: >-
      sum by (name) (rate(container_cpu_usage_seconds_total{name="littleshop_prod"}[5m]))
      * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage detected"
      description: "LittleShop container CPU usage is above 80% for more than 5 minutes."

  # Memory usage as a percentage of the container's configured memory limit.
  # NOTE(review): if the container has no memory limit the divisor is 0 and the
  # ratio becomes +Inf, which satisfies "> 85" -- confirm a limit is always set.
  - alert: HighMemoryUsage
    expr: >-
      (container_memory_usage_bytes{name="littleshop_prod"}
      / container_spec_memory_limit_bytes{name="littleshop_prod"}) * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage detected"
      description: "LittleShop container memory usage is above 85% for more than 5 minutes."

  # System alerts

  # Percentage of free space on the filesystem mounted at "/".
  - alert: DiskSpaceLow
    expr: >-
      (node_filesystem_avail_bytes{mountpoint="/"}
      / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Disk space is running low"
      description: "Available disk space is less than 10%."

  # Share of requests answered with a 5xx status over the last 5 minutes.
  # Both sides are aggregated with sum() before dividing: without it, the
  # division matches each 5xx series only with the identically-labelled series
  # on the right-hand side (itself), so the ratio is always 100% and the alert
  # fires on any 5xx traffic at all.
  - alert: HighErrorRate
    expr: >-
      sum(rate(littleshop_http_requests_total{status=~"5.."}[5m]))
      / sum(rate(littleshop_http_requests_total[5m])) * 100 > 5
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "High error rate detected"
      description: "HTTP 5xx error rate is above 5% for more than 3 minutes."

# Alerts about the container runtime and the database connection.
- name: infrastructure_alerts
  rules:
  # Container monitoring

  # Fires while the container's start time is less than one hour in the past.
  # The previous expression, increase(container_last_seen[1h]) > 0, was true
  # for any running container because container_last_seen is a timestamp that
  # keeps advancing while the container is being scraped.
  # No "for:" clause: the alert becomes active on the first evaluation after a
  # (re)start. A planned redeploy will also trigger it for up to an hour.
  - alert: ContainerRestarted
    expr: time() - container_start_time_seconds{name="littleshop_prod"} < 3600
    labels:
      severity: warning
    annotations:
      summary: "Container has been restarted"
      description: "LittleShop container has been restarted within the last hour."

  # Database alerts

  # Fires when new connection failures were recorded during the last 5 minutes
  # and that stays true for 1 minute. increase() is used because comparing the
  # raw cumulative value with 0 would keep the alert firing forever after the
  # first failure.
  # NOTE(review): assumes the metric is a monotonically increasing counter, as
  # the _total suffix suggests -- confirm in the application's instrumentation.
  - alert: DatabaseConnectionFailed
    expr: increase(littleshop_database_connections_failed_total[5m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Database connection failures detected"
      description: "LittleShop is experiencing database connection failures."