# Prometheus configuration for ML experiments monitoring.
#
# Sets the global scrape/evaluation cadence and declares one scrape job per
# monitored component. Jobs attach a static `service` label to their targets
# so series can be grouped by component.

global:
  # Scrape every target and evaluate rules every 15 seconds unless a job
  # overrides the interval.
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # API server metrics
  - job_name: 'api-server'
    static_configs:
      - targets: ['api-server:9100']
        labels:
          service: 'api-server'

  # Worker metrics (if running in docker).
  # No extra configuration is needed to tolerate a stopped worker: a failed
  # scrape is recorded as `up == 0` for this target and does not affect the
  # other jobs. The `instance` label defaults to the target address, so no
  # relabeling is required to populate it.
  - job_name: 'worker'
    static_configs:
      - targets: ['worker:9100']
        labels:
          service: 'worker'

  # Benchmark metrics from Pushgateway.
  # honor_labels keeps the labels carried by the pushed metrics when they
  # conflict with labels Prometheus would attach at scrape time.
  # NOTE(review): every other service is addressed by its container name, but
  # this target is `localhost`, which resolves to the Prometheus host itself.
  # Confirm the Pushgateway really listens there.
  - job_name: 'benchmark'
    static_configs:
      - targets: ['localhost:9091']
        labels:
          service: 'benchmark'
    metrics_path: /metrics
    honor_labels: true

  # Loki metrics
  - job_name: 'loki'
    static_configs:
      - targets: ['ml-experiments-loki:3100']
        labels:
          service: 'loki'
    metrics_path: /metrics

  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']