# fetch_ml/monitoring/prometheus/prometheus.yml

# Prometheus configuration for ML experiments monitoring

# Defaults that apply to every scrape job unless a job overrides them.
global:
  scrape_interval: 15s  # default interval between scrapes of each target
  evaluation_interval: 15s  # interval between rule evaluations

scrape_configs:
  # API server metrics and health.
  # NOTE(review): the earlier inline comment marked /metrics as "Future", which
  # suggests the API server may not expose this endpoint yet — confirm.
  - job_name: 'api-server'
    metrics_path: /metrics
    scheme: http
    static_configs:
      - labels:
          service: 'api-server'
        targets:
          - 'api-server:9101'

  # Benchmark metrics from Pushgateway
  # NOTE(review): static_configs is an empty list, so this job currently has no
  # targets and scrapes nothing. To collect pushed benchmark metrics, the
  # Pushgateway address must be listed here (typically together with
  # `honor_labels: true`) — the address is not visible in this file; confirm.
  - job_name: 'benchmark'
    static_configs: []

  # Worker metrics (ResourceManager + task execution).
  # The only target listed is worker:9100, labelled as a container.
  # For docker-compose dev on macOS/Windows, a worker running directly on the
  # host can instead be reached from Prometheus via host.docker.internal; no
  # such target is configured in this job.
  - job_name: 'worker'
    metrics_path: /metrics
    # Explicit per-job interval (same value as the global default).
    scrape_interval: 15s
    static_configs:
      - labels:
          service: 'worker'
          target_type: 'container'
        targets:
          - 'worker:9100'

  # Metrics exposed by Loki itself
  - job_name: 'loki'
    metrics_path: /metrics
    static_configs:
      - labels:
          service: 'loki'
        targets:
          - 'loki:3100'

  # Prometheus scraping its own metrics
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'localhost:9090'