fetch_ml/monitoring/prometheus.yml
Jeremie Fraeys 4aecd469a1 feat: implement comprehensive monitoring and container orchestration
- Add Prometheus, Grafana, and Loki monitoring stack
- Include pre-configured dashboards for ML metrics and logs
- Add Podman container support with security policies
- Implement ML runtime environments for multiple frameworks
- Add containerized ML project templates (PyTorch, TensorFlow, etc.)
- Include secure runner with isolation and resource limits
- Add comprehensive log aggregation and alerting
2025-12-04 16:54:49 -05:00

31 lines
775 B
YAML

# Prometheus server configuration for monitoring ML experiments.
global:
  # Server-wide defaults; an individual scrape job may override the interval.
  scrape_interval: 15s      # how often each target is scraped
  evaluation_interval: 15s  # how often rules are evaluated
scrape_configs:
# API server: one statically configured target; every series scraped
# from it is tagged with the label service="api-server".
- job_name: 'api-server'
  static_configs:
  - targets:
    - 'api-server:9100'
    labels:
      service: 'api-server'
# Worker metrics (only present when the worker runs in docker).
# Every series scraped from it is tagged with service="worker".
#
# No relabel_configs are required here:
#   * A worker that is not running needs no special handling -- the scrape
#     fails, the target is reported as down (up == 0), and the other jobs
#     are unaffected.
#   * The `instance` label already defaults to the target address
#     ("worker:9100"), so copying __address__ into it is redundant.
#   * Writing __address__ into __param_target only makes sense for
#     exporters that probe a remote target (blackbox-style), where
#     __address__ is then rewritten to the exporter; the worker is scraped
#     directly, so that rule would merely append "?target=worker:9100" to
#     every scrape request.
- job_name: 'worker'
  static_configs:
  - targets:
    - 'worker:9100'
    labels:
      service: 'worker'
# Self-monitoring: Prometheus scrapes its own endpoint on localhost:9090.
- job_name: 'prometheus'
  static_configs:
  - targets:
    - 'localhost:9090'