fetch_ml/monitoring/security_rules.yml
Jeremie Fraeys 4aecd469a1 feat: implement comprehensive monitoring and container orchestration
- Add Prometheus, Grafana, and Loki monitoring stack
- Include pre-configured dashboards for ML metrics and logs
- Add Podman container support with security policies
- Implement ML runtime environments for multiple frameworks
- Add containerized ML project templates (PyTorch, TensorFlow, etc.)
- Include secure runner with isolation and resource limits
- Add comprehensive log aggregation and alerting
2025-12-04 16:54:49 -05:00

112 lines
3.6 KiB
YAML

groups:
  # Prometheus alerting rules: authentication/abuse signals, TLS certificate
  # expiry, host resource pressure, target availability and HTTP error ratio.
  #
  # NOTE: rate() returns a per-SECOND average. Rules whose description states a
  # per-minute threshold multiply the rate by 60 so the comparison matches the
  # documented unit.
  - name: security.rules
    rules:
      # High rate of failed authentication attempts
      # (per-minute threshold, sustained for 2 minutes).
      - alert: HighFailedAuthRate
        expr: rate(failed_auth_total[5m]) * 60 > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High rate of failed authentication attempts"
          description: "More than 10 failed auth attempts per minute for the last 2 minutes"

      # Potential brute force attack (per-minute threshold over a short window).
      # NOTE(review): a 1m range needs at least two samples inside the window;
      # confirm the scrape interval is comfortably below 1m.
      - alert: BruteForceAttack
        expr: rate(failed_auth_total[1m]) * 60 > 30
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Potential brute force attack detected"
          description: "More than 30 failed auth attempts per minute"

      # Unusual WebSocket connection patterns.
      # The threshold is in connections per second (no unit is documented in
      # the annotations, so the raw rate() value is compared as-is).
      - alert: UnusualWebSocketActivity
        expr: rate(websocket_connections_total[5m]) > 100
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Unusual WebSocket connection activity"
          description: "WebSocket connection rate is unusually high"

      # Rate limit breaches (per-minute threshold).
      - alert: RateLimitBreached
        expr: rate(rate_limit_exceeded_total[5m]) * 60 > 5
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Rate limits being exceeded"
          description: "Rate limit exceeded more than 5 times per minute"

      # SSL certificate expiration warning.
      # Assumes ssl_certificate_expiry_days is a gauge holding the number of
      # days until expiry - TODO confirm against the exporter that emits it.
      - alert: SSLCertificateExpiring
        expr: ssl_certificate_expiry_days < 30
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate expiring soon"
          description: "SSL certificate will expire in less than 30 days"

      # High memory usage: fraction of total memory that is not available.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 90%"

      # High CPU usage: 100 minus the average idle percentage per instance.
      # rate() is used instead of irate(): irate() only looks at the last two
      # samples, so a single brief dip would reset the `for` clause.
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80%"

      # Disk space running low: available bytes as a percentage of size.
      # NOTE(review): evaluated for every filesystem series, including any
      # read-only or pseudo filesystems the exporter reports - consider an
      # fstype/mountpoint filter if those produce noise.
      - alert: LowDiskSpace
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space"
          description: "Disk space is below 10%"

      # Service down: the scrape target has been unreachable for 1 minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "{{ $labels.instance }} service has been down for more than 1 minute"

      # Unexpected error rates: share of 5xx responses among all responses.
      # Both sides are aggregated to (job, instance) before dividing. Without
      # the aggregation the numerator keeps its `status` label and is matched
      # one-to-one against the identical 5xx series in the denominator, so the
      # ratio is always 1 and the alert fires on any 5xx traffic at all.
      - alert: HighErrorRate
        expr: |
          sum by (job, instance) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job, instance) (rate(http_requests_total[5m]))
            > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 10%"

      # Suspicious IP activity.
      # The threshold is in requests per second for each series.
      # Assumes requests_by_ip is a counter (rate() is only meaningful on
      # counters) - TODO confirm; the name lacks the usual _total suffix.
      - alert: SuspiciousIPActivity
        expr: rate(requests_by_ip[5m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious IP activity"
          description: "IP address making unusually many requests"