# Prometheus configuration for ML experiments monitoring

# Server-wide defaults; a scrape job may override scrape_interval locally.
global:
  # How often recording/alerting rules are evaluated.
  evaluation_interval: 15s
  # How often targets are scraped unless a job sets its own interval.
  scrape_interval: 15s

# One list entry per scrape job.
scrape_configs:
# API Server metrics and health.
# The original note marks the /metrics endpoint as "Future", i.e. it may not
# be served by the API server yet -- confirm before alerting on this job.
- job_name: 'api-server'
  scheme: http
  metrics_path: /metrics
  static_configs:
  - targets:
    - 'api-server:9101'
    labels:
      service: 'api-server'

# Benchmark metrics from Pushgateway.
# honor_labels keeps the job/instance labels attached by the pushing client
# instead of replacing them with this scrape job's own labels; this is the
# setting the Prometheus docs recommend when scraping a Pushgateway.
# No targets are configured yet, so this job currently scrapes nothing.
- job_name: 'benchmark'
  honor_labels: true
  static_configs: []

# Worker metrics (ResourceManager + task execution).
# The target configured here is the `worker` host on port 9100.
# For docker-compose dev on macOS/Windows, a worker running locally on the
# host machine can instead be reached by Prometheus via host.docker.internal.
- job_name: 'worker'
  # Explicit per-job interval (currently the same value as the global one).
  scrape_interval: 15s
  metrics_path: /metrics
  static_configs:
  - targets:
    - 'worker:9100'
    labels:
      service: 'worker'
      target_type: 'container'

# Loki metrics, scraped from the `loki` host on port 3100.
- job_name: 'loki'
  metrics_path: /metrics
  static_configs:
  - targets:
    - 'loki:3100'
    labels:
      service: 'loki'

# Prometheus self-monitoring: the server scrapes its own endpoint.
- job_name: 'prometheus'
  static_configs:
  - targets:
    - 'localhost:9090'