# Monitoring stack notes (from the introducing change):
# - Add Prometheus, Grafana, and Loki monitoring stack
# - Include pre-configured dashboards for ML metrics and logs
# - Add Podman container support with security policies
# - Implement ML runtime environments for multiple frameworks
# - Add containerized ML project templates (PyTorch, TensorFlow, etc.)
# - Include secure runner with isolation and resource limits
# - Add comprehensive log aggregation and alerting
# (file: 31 lines, 775 B, YAML)
---
# Prometheus configuration for ML experiments monitoring.
# Scrapes node-exporter-style metrics (port 9100) from the API server and
# worker containers, plus Prometheus's own metrics on 9090.
global:
  # How often targets are scraped and rules are evaluated.
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # API Server metrics
  - job_name: 'api-server'
    static_configs:
      - targets: ['api-server:9100']
        labels:
          service: 'api-server'

  # Worker metrics (if running in docker)
  - job_name: 'worker'
    static_configs:
      - targets: ['worker:9100']
        labels:
          service: 'worker'
    # NOTE(review): these relabel rules only copy the scrape address into the
    # `instance` label; they do NOT suppress failures when the worker is down.
    # If the worker target is optional, the job will simply report `up == 0`.
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance

  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']