# Monitoring stack notes (from the introducing change):
# - Add Prometheus, Grafana, and Loki monitoring stack
# - Include pre-configured dashboards for ML metrics and logs
# - Add Podman container support with security policies
# - Implement ML runtime environments for multiple frameworks
# - Add containerized ML project templates (PyTorch, TensorFlow, etc.)
# - Include secure runner with isolation and resource limits
# - Add comprehensive log aggregation and alerting
# (file: 31 lines, 775 B, YAML)
---
# Prometheus configuration for ML experiments monitoring.
# Scrapes node-exporter-style metrics (port 9100) from the API server and
# worker containers, plus Prometheus's own metrics on 9090.
global:
  # How often targets are scraped and rules are evaluated.
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # API Server metrics
  - job_name: 'api-server'
    static_configs:
      - targets: ['api-server:9100']
        labels:
          service: 'api-server'

  # Worker metrics (if running in docker)
  - job_name: 'worker'
    static_configs:
      - targets: ['worker:9100']
        labels:
          service: 'worker'
    # NOTE(review): these relabel rules only copy the scrape address into the
    # `instance` label; they do NOT suppress failures when the worker is down.
    # If the worker target is optional, the job will simply report `up == 0`.
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance

  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']