fetch_ml/monitoring/prometheus.yml
Jeremie Fraeys ea15af1833 Fix multi-user authentication and clean up debug code
- Fix YAML tags in auth config struct (json -> yaml)
- Update CLI configs to use pre-hashed API keys
- Remove double hashing in WebSocket client
- Fix port mapping (9102 -> 9103) in CLI commands
- Update permission keys to use jobs:read, jobs:create, etc.
- Clean up all debug logging from CLI and server
- All user roles now authenticate correctly:
  * Admin: Can queue jobs and see all jobs
  * Researcher: Can queue jobs and see own jobs
  * Analyst: Can see status (read-only access)

Multi-user authentication is now fully functional.
2025-12-06 12:35:32 -05:00

48 lines
1.1 KiB
YAML

# Prometheus configuration for ML experiments monitoring.
#
# Layout: `global` sets the defaults applied to every scrape job, and
# `scrape_configs` lists one job per monitored component. Each job attaches a
# static `service` label so series can be grouped by component.
global:
  # Default interval between scrapes of each target.
  scrape_interval: 15s
  # Default interval between rule evaluations.
  evaluation_interval: 15s

scrape_configs:
  # API Server metrics
  - job_name: 'api-server'
    static_configs:
      - targets: ['api-server:9100']
        labels:
          service: 'api-server'

  # Worker metrics (if running in docker)
  - job_name: 'worker'
    static_configs:
      - targets: ['worker:9100']
        labels:
          service: 'worker'
    # Copies the scrape address into the `target` URL parameter and then into
    # the `instance` label.
    # NOTE(review): relabeling does not change how scrape failures are handled;
    # a worker that is not running is simply reported as down for this job.
    # Confirm whether these rules are still needed.
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance

  # Benchmark metrics from Pushgateway
  - job_name: 'benchmark'
    static_configs:
      # NOTE(review): `localhost` here is the host Prometheus itself runs on,
      # while the other jobs address their targets by service name. Confirm the
      # Pushgateway is actually reachable at this address in the deployment.
      - targets: ['localhost:9091']
        labels:
          service: 'benchmark'
    metrics_path: /metrics
    # Keep the job/instance labels supplied by the pushed metrics instead of
    # overwriting them with this scrape job's labels.
    honor_labels: true

  # Loki metrics
  - job_name: 'loki'
    static_configs:
      - targets: ['ml-experiments-loki:3100']
        labels:
          service: 'loki'
    metrics_path: /metrics

  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']