# Change summary:
#   - Add Prometheus, Grafana, and Loki monitoring stack
#   - Include pre-configured dashboards for ML metrics and logs
#   - Add Podman container support with security policies
#   - Implement ML runtime environments for multiple frameworks
#   - Add containerized ML project templates (PyTorch, TensorFlow, etc.)
#   - Include secure runner with isolation and resource limits
#   - Add comprehensive log aggregation and alerting
#
# Listing metadata: 112 lines, 3.6 KiB, YAML
---
# Prometheus alerting rules: authentication/abuse signals, host resource
# pressure, service availability, and HTTP error ratio.
#
# Reminder for anyone tuning thresholds: rate() returns a PER-SECOND average
# over the range window. Rules whose description states a per-minute
# threshold multiply the rate by 60 so the expression and the text agree.
groups:
  - name: security.rules
    rules:
      # High rate of failed authentication attempts.
      # rate() is per-second; * 60 converts it to the per-minute figure the
      # description promises.
      - alert: HighFailedAuthRate
        expr: rate(failed_auth_total[5m]) * 60 > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High rate of failed authentication attempts"
          description: "More than 10 failed auth attempts per minute for the last 2 minutes"

      # Potential brute force attack.
      # Same per-minute conversion as above, over a shorter 1m window.
      # NOTE(review): a 1m range needs at least two samples inside it, so this
      # assumes a scrape interval of 30s or less - confirm.
      - alert: BruteForceAttack
        expr: rate(failed_auth_total[1m]) * 60 > 30
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Potential brute force attack detected"
          description: "More than 30 failed auth attempts per minute"

      # Unusual WebSocket connection patterns.
      # The threshold is in connections per second (raw rate() output); the
      # description does not state a unit, so the expression is unchanged.
      - alert: UnusualWebSocketActivity
        expr: rate(websocket_connections_total[5m]) > 100
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Unusual WebSocket connection activity"
          description: "WebSocket connection rate is unusually high"

      # Rate limit breaches.
      # * 60 converts the per-second rate to the per-minute figure in the
      # description.
      - alert: RateLimitBreached
        expr: rate(rate_limit_exceeded_total[5m]) * 60 > 5
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Rate limits being exceeded"
          description: "Rate limit exceeded more than 5 times per minute"

      # SSL certificate expiration warning.
      # Fires once the reported days-to-expiry has stayed below 30 for 1h.
      - alert: SSLCertificateExpiring
        expr: ssl_certificate_expiry_days < 30
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate expiring soon"
          description: "SSL certificate will expire in less than 30 days"

      # High memory usage.
      # Used fraction = (total - available) / total.
      - alert: HighMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 90%"

      # High CPU usage.
      # Busy percentage = 100 - idle percentage, averaged per instance.
      # rate() (not irate()) is used so the value reflects the whole 5m
      # window rather than only the last two samples, which keeps the `for`
      # clause from being reset by momentary dips.
      - alert: HighCPUUsage
        expr: >-
          100 - (avg by(instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80%"

      # Disk space running low.
      # Evaluated per filesystem series: available bytes as a percentage of
      # filesystem size.
      - alert: LowDiskSpace
        expr: >-
          (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100
          < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space"
          description: "Disk space is below 10%"

      # Service down.
      # `up` is 0 when the most recent scrape of a target failed.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "{{ $labels.instance }} service has been down for more than 1 minute"

      # Unexpected error rates.
      # Both sides are aggregated to (job, instance) before dividing. Without
      # the aggregation the division matches series on identical label sets,
      # so every 5xx series is divided by itself and the ratio is always 1.
      - alert: HighErrorRate
        expr: >-
          sum by (job, instance) (rate(http_requests_total{status=~"5.."}[5m]))
          / sum by (job, instance) (rate(http_requests_total[5m])) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 10%"

      # Suspicious IP activity.
      # The threshold is in requests per second per series.
      # NOTE(review): assumes requests_by_ip is a counter (rate() is only
      # meaningful on counters) - confirm against the exporter.
      - alert: SuspiciousIPActivity
        expr: rate(requests_by_ip[5m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious IP activity"
          description: "IP address making unusually many requests"