# fetch_ml/monitoring/prometheus/prometheus.yml
#
# (file-viewer metadata: 44 lines, 1.1 KiB, YAML)

# Prometheus configuration for ML experiments monitoring
#
# Structure: `global` holds the defaults; each list item under
# `scrape_configs` is one scrape job. Per-target `labels` belong to the
# `static_configs` entry, while `scheme`, `metrics_path` and
# `scrape_interval` belong to the job itself.

global:
  # Defaults for every job that does not set its own interval.
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # API Server metrics and health
  - job_name: 'api-server'
    scheme: http
    metrics_path: /metrics  # Future: Prometheus metrics endpoint
    static_configs:
      - targets: ['api-server:9101']
        labels:
          service: 'api-server'

  # Benchmark metrics from Pushgateway
  # NOTE(review): the target list is empty, so this job currently scrapes
  # nothing. When the Pushgateway address is added here, consider setting
  # `honor_labels: true` as well so the job/instance labels attached to
  # pushed metrics are kept -- confirm against the Pushgateway deployment.
  - job_name: 'benchmark'
    static_configs: []

  # Worker metrics (ResourceManager + task execution)
  # The configured target is labelled as a container (`worker:9100`).
  # For docker-compose dev on macOS/Windows, Prometheus can reach a locally
  # running worker via host.docker.internal instead.
  - job_name: 'worker'
    scrape_interval: 15s  # same value as the global default, kept explicit
    metrics_path: /metrics
    static_configs:
      - targets: ['worker:9100']
        labels:
          service: 'worker'
          target_type: 'container'

  # Loki metrics
  - job_name: 'loki'
    metrics_path: /metrics
    static_configs:
      - targets: ['loki:3100']
        labels:
          service: 'loki'

  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']