- Fix YAML tags in auth config struct (json -> yaml) - Update CLI configs to use pre-hashed API keys - Remove double hashing in WebSocket client - Fix port mapping (9102 -> 9103) in CLI commands - Update permission keys to use jobs:read, jobs:create, etc. - Clean up all debug logging from CLI and server - All user roles now authenticate correctly: * Admin: Can queue jobs and see all jobs * Researcher: Can queue jobs and see own jobs * Analyst: Can see status (read-only access) Multi-user authentication is now fully functional.
48 lines
1.1 KiB
YAML
48 lines
1.1 KiB
YAML
---
# Prometheus configuration for ML experiments monitoring

# Global defaults: how often targets are scraped and how often rules are
# evaluated. Individual scrape jobs below do not override these.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# One entry per scrape job. Sequences in this file use the flush style
# (list items at the same column as their parent key).
scrape_configs:
# API Server metrics
- job_name: 'api-server'
  static_configs:
  - targets: ['api-server:9100']
    labels:
      service: 'api-server'

# Worker metrics (if running in docker)
- job_name: 'worker'
  static_configs:
  - targets: ['worker:9100']
    labels:
      service: 'worker'
  # NOTE(review): the original comment here read "Allow failures if worker
  # not running", but these rules do not suppress scrape failures. They only
  # copy the scrape address into the `target` URL parameter and then into the
  # `instance` label. If the worker is not running, the target is simply
  # reported as down. Confirm whether this relabeling is still needed.
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: instance

# Benchmark metrics from Pushgateway
- job_name: 'benchmark'
  static_configs:
  # NOTE(review): the other jobs address their targets by service name, while
  # this one uses localhost. Confirm the Pushgateway is reachable on
  # localhost:9091 from wherever Prometheus runs.
  - targets: ['localhost:9091']
    labels:
      service: 'benchmark'
  metrics_path: /metrics
  # Keep the labels carried by the pushed metrics instead of overwriting
  # them with the labels of this scrape job.
  honor_labels: true

# Loki metrics
- job_name: 'loki'
  static_configs:
  - targets: ['ml-experiments-loki:3100']
    labels:
      service: 'loki'
  metrics_path: /metrics

# Prometheus self-monitoring
- job_name: 'prometheus'
  static_configs:
  - targets: ['localhost:9090']