# fetch_ml/monitoring/security_rules.yml

# Alerting rule groups.
groups:
  # Single group holding the authentication, traffic, certificate,
  # host-resource, availability and error-rate alerts defined below.
  - name: security.rules
    rules:
      # High rate of failed authentication attempts.
      # rate() returns a per-second average, so the result is scaled by 60 to
      # compare against the per-minute threshold stated in the description.
      # The condition must hold for 2 minutes before the alert fires.
      - alert: HighFailedAuthRate
        expr: rate(failed_auth_total[5m]) * 60 > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High rate of failed authentication attempts"
          description: "More than 10 failed auth attempts per minute for the last 2 minutes"

      # Potential brute force attack: a short, intense burst of failed logins.
      # rate() returns a per-second average, so the result is scaled by 60 to
      # compare against the per-minute threshold stated in the description.
      # NOTE(review): a 1m range only yields a value when at least two scrapes
      # fall inside it — confirm the scrape interval is well below 1m.
      - alert: BruteForceAttack
        expr: rate(failed_auth_total[1m]) * 60 > 30
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Potential brute force attack detected"
          description: "More than 30 failed auth attempts per minute"

      # Warns when the per-second rate of websocket_connections_total,
      # averaged over 5 minutes, stays above 100 for 3 minutes.
      - alert: UnusualWebSocketActivity
        expr: >-
          rate(websocket_connections_total[5m]) > 100
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: 'Unusual WebSocket connection activity'
          description: 'WebSocket connection rate is unusually high'

      # Rate limit breaches.
      # rate() returns a per-second average, so the result is scaled by 60 to
      # compare against the per-minute threshold stated in the description.
      - alert: RateLimitBreached
        expr: rate(rate_limit_exceeded_total[5m]) * 60 > 5
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Rate limits being exceeded"
          description: "Rate limit exceeded more than 5 times per minute"

      # Warns once ssl_certificate_expiry_days has been below 30 for a full
      # hour.
      - alert: SSLCertificateExpiring
        expr: >-
          ssl_certificate_expiry_days < 30
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: 'SSL certificate expiring soon'
          description: 'SSL certificate will expire in less than 30 days'

      # Warns when the used fraction of memory (total minus available, over
      # total) stays above 0.9 for 5 minutes.
      - alert: HighMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High memory usage detected'
          description: 'Memory usage is above 90%'

      # High CPU usage: 100 minus the per-instance average idle percentage.
      # rate() is used instead of irate(): irate() only looks at the last two
      # samples in the range, so a single brief dip can reset the 5m "for"
      # timer and keep a genuinely busy host from ever alerting.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80%"

      # Critical when the available share of a filesystem (available bytes
      # over size bytes, as a percentage) stays under 10 for 5 minutes.
      - alert: LowDiskSpace
        expr: >-
          (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Low disk space'
          description: 'Disk space is below 10%'

      # Critical when a target's `up` series has been 0 for 1 minute; the
      # description names the affected instance label.
      - alert: ServiceDown
        expr: >-
          up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Service is down'
          description: '{{ $labels.instance }} service has been down for more than 1 minute'

      # Unexpected error rates: share of 5xx responses among all responses.
      # Both sides are summed without the `status` label before dividing.
      # A binary operator pairs series with identical label sets, so without
      # this aggregation each 5xx series would be divided by itself and the
      # ratio would be 1 whenever any 5xx response occurred.
      - alert: HighErrorRate
        expr: >-
          sum without (status) (rate(http_requests_total{status=~"5.."}[5m]))
          / sum without (status) (rate(http_requests_total[5m])) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 10%"

      # Warns when the per-second rate of requests_by_ip, averaged over
      # 5 minutes, stays above 1000 for 2 minutes.
      # NOTE(review): requests_by_ip has no _total suffix — confirm it is a
      # counter, since rate() is only meaningful on counters.
      - alert: SuspiciousIPActivity
        expr: >-
          rate(requests_by_ip[5m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Suspicious IP activity'
          description: 'IP address making unusually many requests'