fetch_ml/internal/jupyter/health_monitor.go
Jeremie Fraeys cd5640ebd2 Slim and secure: move scripts, clean configs, remove secrets
- Move ci-test.sh and setup.sh to scripts/
- Trim docs/src/zig-cli.md to current structure
- Replace hardcoded secrets with placeholders in configs
- Update .gitignore to block .env*, secrets/, keys, build artifacts
- Slim README.md to reflect current CLI/TUI split
- Add cleanup trap to ci-test.sh
- Ensure no secrets are committed
2025-12-07 13:57:51 -05:00

415 lines
12 KiB
Go

package jupyter
import (
"context"
"encoding/json"
"fmt"
"net/http"
"strings"
"sync"
"time"
"github.com/jfraeys/fetch_ml/internal/logging"
)
// Status values recorded in HealthStatus.Status.
const (
// statusUnhealthy marks a service whose HTTP probe failed or answered with
// a status code other than 200.
statusUnhealthy = "unhealthy"
)
// HealthMonitor monitors the health of Jupyter services by issuing HTTP
// requests against each registered service URL.
type HealthMonitor struct {
// logger receives monitoring events (services added/removed, check results).
logger *logging.Logger
// services holds the monitored services keyed by service ID.
services map[string]*JupyterService
// servicesMutex is the lock intended to guard services; GetMonitoringStatus
// also reads interval while holding it.
servicesMutex sync.RWMutex
// interval is the ticker period used by StartMonitoring.
interval time.Duration
// client is the HTTP client used by CheckServiceHealth.
client *http.Client
}
// HealthStatus represents the health status of a single service as observed
// by one health check.
type HealthStatus struct {
// ServiceID is the ID the service is registered under in the monitor.
ServiceID string `json:"service_id"`
// ServiceName is copied from the monitored service's Name.
ServiceName string `json:"service_name"`
// Status is "healthy" or "unhealthy" as set by CheckServiceHealth.
Status string `json:"status"`
// LastCheck is the time at which the check was started.
LastCheck time.Time `json:"last_check"`
// ResponseTime is the elapsed time of the HTTP request. As a time.Duration
// it is encoded in JSON as an integer number of nanoseconds.
ResponseTime time.Duration `json:"response_time"`
// URL is the service URL that was probed.
URL string `json:"url"`
// ContainerID is copied from the monitored service's ContainerID.
ContainerID string `json:"container_id"`
// Errors lists human-readable descriptions of what failed; empty when healthy.
Errors []string `json:"errors"`
// Metrics carries optional details such as "response_time_ms",
// "status_code" and "server".
Metrics map[string]interface{} `json:"metrics"`
}
// HealthReport contains a comprehensive health report aggregated over all
// services that could be checked.
type HealthReport struct {
// Timestamp is the time at which report generation started.
Timestamp time.Time `json:"timestamp"`
// TotalServices counts the services for which a status was obtained;
// services whose check returned an error are not counted.
TotalServices int `json:"total_services"`
// Healthy counts services with status "healthy".
Healthy int `json:"healthy"`
// Unhealthy counts services with status "unhealthy".
Unhealthy int `json:"unhealthy"`
// Unknown counts services with any other status value.
Unknown int `json:"unknown"`
// Services maps service ID to the status obtained for that service.
Services map[string]*HealthStatus `json:"services"`
// Summary is a human-readable one-line description of the counters.
Summary string `json:"summary"`
}
// NewHealthMonitor builds a HealthMonitor that logs through logger and uses
// interval as its monitoring period. The monitor starts with no registered
// services and an HTTP client limited to a 10 second timeout per request.
func NewHealthMonitor(logger *logging.Logger, interval time.Duration) *HealthMonitor {
	httpClient := &http.Client{Timeout: 10 * time.Second}

	monitor := &HealthMonitor{
		logger:   logger,
		interval: interval,
		client:   httpClient,
	}
	monitor.services = make(map[string]*JupyterService)
	return monitor
}
// AddService registers service with the monitor under service.ID, replacing
// any service previously registered under the same ID.
//
// The map write is performed under servicesMutex so it cannot race with the
// methods that read or modify the service map while holding that lock.
func (hm *HealthMonitor) AddService(service *JupyterService) {
	hm.servicesMutex.Lock()
	hm.services[service.ID] = service
	hm.servicesMutex.Unlock()

	// Log after releasing the lock to keep the critical section minimal.
	hm.logger.Info("service added to health monitor", "service_id", service.ID, "name", service.Name)
}
// RemoveService stops monitoring the service registered under serviceID.
// Removing an ID that is not registered is a no-op apart from the log entry.
//
// The map delete is performed under servicesMutex so it cannot race with the
// methods that read or modify the service map while holding that lock.
func (hm *HealthMonitor) RemoveService(serviceID string) {
	hm.servicesMutex.Lock()
	delete(hm.services, serviceID)
	hm.servicesMutex.Unlock()

	// Log after releasing the lock to keep the critical section minimal.
	hm.logger.Info("service removed from health monitor", "service_id", serviceID)
}
// CheckServiceHealth probes the service registered under serviceID with a
// single HTTP GET against its URL and reports the outcome.
//
// A non-nil error is returned only when the service is not registered or the
// request cannot be constructed. A failed request or a non-200 response is
// not an error: it is reported as an "unhealthy" HealthStatus whose Errors
// describe the failure.
func (hm *HealthMonitor) CheckServiceHealth(ctx context.Context, serviceID string) (*HealthStatus, error) {
	// Hold the read lock only for the map lookup so the network round trip
	// below never blocks writers of the service map.
	hm.servicesMutex.RLock()
	service, exists := hm.services[serviceID]
	hm.servicesMutex.RUnlock()
	if !exists {
		return nil, fmt.Errorf("service %s not found", serviceID)
	}

	healthStatus := &HealthStatus{
		ServiceID:   serviceID,
		ServiceName: service.Name,
		LastCheck:   time.Now(),
		URL:         service.URL,
		ContainerID: service.ContainerID,
		Metrics:     make(map[string]interface{}),
		// Non-nil so the field encodes as [] rather than null in JSON.
		Errors: []string{},
	}

	// Check HTTP connectivity.
	start := time.Now()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, service.URL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	resp, err := hm.client.Do(req)
	responseTime := time.Since(start)
	healthStatus.ResponseTime = responseTime
	if err != nil {
		healthStatus.Status = statusUnhealthy
		healthStatus.Errors = append(healthStatus.Errors, fmt.Sprintf("HTTP request failed: %v", err))
		return healthStatus, nil
	}
	defer func() {
		if err := resp.Body.Close(); err != nil {
			hm.logger.Warn("failed to close response body", "error", err)
		}
	}()

	healthStatus.Metrics["response_time_ms"] = responseTime.Milliseconds()
	healthStatus.Metrics["status_code"] = resp.StatusCode

	// Only an exact 200 counts as healthy; any other status is recorded as
	// an error.
	if resp.StatusCode == http.StatusOK {
		healthStatus.Status = "healthy"
	} else {
		healthStatus.Status = statusUnhealthy
		healthStatus.Errors = append(healthStatus.Errors, fmt.Sprintf("HTTP status %d", resp.StatusCode))
	}

	// Record the Server header, when present, as an extra metric.
	if server := resp.Header.Get("Server"); server != "" {
		healthStatus.Metrics["server"] = server
	}
	return healthStatus, nil
}
// CheckAllServices runs CheckServiceHealth for every registered service and
// aggregates the results into a HealthReport.
//
// Services whose check returns an error are logged and left out of the
// report, so TotalServices counts only services with a recorded status. The
// returned error is currently always nil.
func (hm *HealthMonitor) CheckAllServices(ctx context.Context) (*HealthReport, error) {
	report := &HealthReport{
		Timestamp: time.Now(),
		Services:  make(map[string]*HealthStatus),
	}

	// Snapshot the IDs under the read lock, then probe with the lock
	// released: the map is never iterated while a lock holder such as
	// Cleanup mutates it, and slow HTTP probes do not block writers.
	hm.servicesMutex.RLock()
	serviceIDs := make([]string, 0, len(hm.services))
	for serviceID := range hm.services {
		serviceIDs = append(serviceIDs, serviceID)
	}
	hm.servicesMutex.RUnlock()

	for _, serviceID := range serviceIDs {
		healthStatus, err := hm.CheckServiceHealth(ctx, serviceID)
		if err != nil {
			// Includes services removed between the snapshot and the check.
			hm.logger.Warn("failed to check service health", "service_id", serviceID, "error", err)
			continue
		}
		report.Services[serviceID] = healthStatus

		// Update counters.
		switch healthStatus.Status {
		case "healthy":
			report.Healthy++
		case statusUnhealthy:
			report.Unhealthy++
		default:
			report.Unknown++
		}
		report.TotalServices++
	}

	report.Summary = hm.generateSummary(report)
	return report, nil
}
// generateSummary renders the counters of report as a one-line,
// human-readable summary.
//
// The "all healthy" wording is used only when no service is unhealthy and no
// service has an unknown status; otherwise the full breakdown is returned.
func (hm *HealthMonitor) generateSummary(report *HealthReport) string {
	switch {
	case report.TotalServices == 0:
		return "No services to monitor"
	case report.Unhealthy == 0 && report.Unknown == 0:
		return fmt.Sprintf("All %d services are healthy", report.Healthy)
	default:
		return fmt.Sprintf("%d healthy, %d unhealthy, %d unknown out of %d total services",
			report.Healthy, report.Unhealthy, report.Unknown, report.TotalServices)
	}
}
// StartMonitoring runs a full health check of all services every monitoring
// interval and logs the summary plus a warning per unhealthy service. It
// blocks until ctx is cancelled, so callers normally run it in a goroutine.
//
// The interval is read once at start; later changes to it do not affect a
// loop that is already running. If the interval is not positive the loop is
// not started, because time.NewTicker panics on such durations.
func (hm *HealthMonitor) StartMonitoring(ctx context.Context) {
	// GetMonitoringStatus reads interval under this same lock.
	hm.servicesMutex.RLock()
	interval := hm.interval
	hm.servicesMutex.RUnlock()

	if interval <= 0 {
		hm.logger.Warn("health monitoring not started: interval must be positive", "interval", interval)
		return
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	hm.logger.Info("health monitoring started", "interval", interval)

	for {
		select {
		case <-ctx.Done():
			hm.logger.Info("health monitoring stopped")
			return
		case <-ticker.C:
			report, err := hm.CheckAllServices(ctx)
			if err != nil {
				hm.logger.Warn("health check failed", "error", err)
				continue
			}
			hm.logger.Info("health check completed", "summary", report.Summary)

			// Alert on unhealthy services.
			for serviceID, health := range report.Services {
				if health.Status == statusUnhealthy {
					hm.logger.Warn("service unhealthy",
						"service_id", serviceID,
						"name", health.ServiceName,
						"errors", health.Errors)
				}
			}
		}
	}
}
// GetServiceMetrics returns a flat map describing the service registered
// under serviceID: its identity fields, the result of a fresh health check,
// and container metrics prefixed with "container_".
//
// An error is returned only when the service is not registered. A failing
// health check is reported inside the map ("health_status" = "error" and
// "health_error") rather than as an error.
func (hm *HealthMonitor) GetServiceMetrics(ctx context.Context, serviceID string) (map[string]interface{}, error) {
	// Release the read lock before the health check below so this method
	// never holds the lock across a network call or a nested acquisition.
	hm.servicesMutex.RLock()
	service, exists := hm.services[serviceID]
	hm.servicesMutex.RUnlock()
	if !exists {
		return nil, fmt.Errorf("service %s not found", serviceID)
	}

	// Basic service info.
	metrics := map[string]interface{}{
		"service_id":   service.ID,
		"service_name": service.Name,
		"container_id": service.ContainerID,
		"url":          service.URL,
		"created_at":   service.CreatedAt,
		"last_access":  service.LastAccess,
	}

	// Health check.
	healthStatus, err := hm.CheckServiceHealth(ctx, serviceID)
	if err != nil {
		metrics["health_status"] = "error"
		metrics["health_error"] = err.Error()
	} else {
		metrics["health_status"] = healthStatus.Status
		metrics["response_time_ms"] = healthStatus.ResponseTime.Milliseconds()
		metrics["last_health_check"] = healthStatus.LastCheck
		if len(healthStatus.Errors) > 0 {
			metrics["health_errors"] = healthStatus.Errors
		}
	}

	// Container metrics, namespaced so they cannot clash with the keys above.
	for k, v := range hm.getContainerMetrics(ctx, service.ContainerID) {
		metrics["container_"+k] = v
	}
	return metrics, nil
}
// getContainerMetrics returns placeholder container metrics. Both arguments
// are ignored: the status is hard-coded to "running" rather than queried
// from the container, and "last_check" is the current Unix time in seconds.
func (hm *HealthMonitor) getContainerMetrics(_ context.Context, _ string) map[string]interface{} {
	return map[string]interface{}{
		"status":     "running",
		"last_check": time.Now().Unix(),
	}
}
// ValidateService inspects service for configuration problems and returns
// one message per problem found, or a nil slice when there are none.
//
// It checks that the ID, container ID and URL are set, that a non-empty URL
// passes isValidURL, and that the service was created within the last
// 24 hours.
func (hm *HealthMonitor) ValidateService(service *JupyterService) []string {
	var problems []string
	report := func(msg string) { problems = append(problems, msg) }

	if service.ID == "" {
		report("service ID is required")
	}
	if service.ContainerID == "" {
		report("container ID is required")
	}

	// A URL is either missing or checked for format, never both.
	switch {
	case service.URL == "":
		report("service URL is required")
	case !isValidURL(service.URL):
		report("invalid service URL format")
	}

	// Services older than a day are flagged as potential zombies.
	staleBefore := time.Now().Add(-24 * time.Hour)
	if service.CreatedAt.Before(staleBefore) {
		report("service is older than 24 hours")
	}
	return problems
}
// StartContinuousMonitoring runs the lightweight check of all services once
// per interval. It blocks until ctx is cancelled, so callers normally run it
// in a goroutine.
//
// If interval is not positive the loop is not started, because
// time.NewTicker panics on such durations.
func (hm *HealthMonitor) StartContinuousMonitoring(ctx context.Context, interval time.Duration) {
	if interval <= 0 {
		hm.logger.Warn("continuous monitoring not started: interval must be positive", "interval", interval)
		return
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			hm.logger.Info("continuous monitoring stopped")
			return
		case <-ticker.C:
			// Lightweight monitoring - just check service status.
			hm.checkAllServices(ctx)
		}
	}
}
// checkAllServices launches one quickHealthCheck goroutine per registered
// service and returns without waiting for them. The read lock is held only
// while the goroutines are being started.
func (hm *HealthMonitor) checkAllServices(ctx context.Context) {
	hm.servicesMutex.RLock()
	defer hm.servicesMutex.RUnlock()

	for id := range hm.services {
		target := hm.services[id]
		// Quick HTTP check only - no heavy metrics.
		go hm.quickHealthCheck(ctx, target)
	}
}
// quickHealthCheck issues one HTTP GET against service.URL with a 3 second
// client timeout and logs the outcome: a warning when the request cannot be
// built or sent or the status is not 200, a debug entry otherwise. It
// returns nothing; the log is its only output.
func (hm *HealthMonitor) quickHealthCheck(ctx context.Context, service *JupyterService) {
	const failureMsg = "service health check failed"

	probeClient := &http.Client{Timeout: 3 * time.Second}

	request, reqErr := http.NewRequestWithContext(ctx, http.MethodGet, service.URL, nil)
	if reqErr != nil {
		hm.logger.Warn(failureMsg, "service", service.ID, "error", reqErr)
		return
	}

	response, doErr := probeClient.Do(request)
	if doErr != nil {
		hm.logger.Warn(failureMsg, "service", service.ID, "error", doErr)
		return
	}
	defer func() {
		if closeErr := response.Body.Close(); closeErr != nil {
			hm.logger.Warn("failed to close response body", "error", closeErr)
		}
	}()

	if response.StatusCode != http.StatusOK {
		hm.logger.Warn("service unhealthy", "service", service.ID, "status", response.StatusCode)
		return
	}
	hm.logger.Debug("service healthy", "service", service.ID)
}
// isValidURL reports whether url is an http:// or https:// URL with a
// non-empty host part.
//
// A bare scheme prefix is not enough: strings such as "http://" or
// "https:///path" are rejected because they name no host to connect to, as
// are hosts containing whitespace. The scheme match is case-sensitive.
func isValidURL(url string) bool {
	var rest string
	switch {
	case strings.HasPrefix(url, "http://"):
		rest = url[len("http://"):]
	case strings.HasPrefix(url, "https://"):
		rest = url[len("https://"):]
	default:
		return false
	}

	// The host part ends at the first path, query or fragment delimiter.
	if end := strings.IndexAny(rest, "/?#"); end >= 0 {
		rest = rest[:end]
	}
	return rest != "" && !strings.ContainsAny(rest, " \t\r\n")
}
// GetHealthHistory is a placeholder for per-service health history. No
// history is recorded, so both arguments are ignored and the result is
// always an empty, non-nil slice with a nil error.
func (hm *HealthMonitor) GetHealthHistory(_ string, duration time.Duration) ([]*HealthStatus, error) {
	history := make([]*HealthStatus, 0)
	return history, nil
}
// SetInterval updates the stored monitoring interval.
//
// The write is performed under servicesMutex because GetMonitoringStatus
// reads the interval while holding that lock. StartMonitoring creates its
// ticker when it is called, so a monitoring loop that is already running
// keeps its previous period.
func (hm *HealthMonitor) SetInterval(interval time.Duration) {
	hm.servicesMutex.Lock()
	defer hm.servicesMutex.Unlock()
	hm.interval = interval
}
// GetMonitoringStatus returns a snapshot of the monitor's configuration: the
// number of registered services, the check interval and HTTP client timeout
// as duration strings, and an "enabled" flag that is always true.
func (hm *HealthMonitor) GetMonitoringStatus() map[string]interface{} {
	hm.servicesMutex.RLock()
	defer hm.servicesMutex.RUnlock()

	status := make(map[string]interface{}, 4)
	status["monitored_services"] = len(hm.services)
	status["check_interval"] = hm.interval.String()
	status["timeout"] = hm.client.Timeout.String()
	status["enabled"] = true
	return status
}
// ExportHealthReport runs CheckAllServices and returns the resulting report
// encoded as JSON. A failure to build the report is wrapped and returned; a
// marshalling error is returned as-is.
func (hm *HealthMonitor) ExportHealthReport(ctx context.Context) ([]byte, error) {
	report, err := hm.CheckAllServices(ctx)
	if err == nil {
		return json.Marshal(report)
	}
	return nil, fmt.Errorf("failed to generate health report: %w", err)
}
// Cleanup drops every service whose LastAccess is older than maxAge from the
// monitor and returns how many were dropped. The whole sweep runs under the
// write lock, and each removal is logged.
func (hm *HealthMonitor) Cleanup(maxAge time.Duration) int {
	staleBefore := time.Now().Add(-maxAge)

	hm.servicesMutex.Lock()
	defer hm.servicesMutex.Unlock()

	evicted := 0
	for id, svc := range hm.services {
		if !svc.LastAccess.Before(staleBefore) {
			continue
		}
		// Deleting the current key while ranging over a map is safe in Go.
		delete(hm.services, id)
		evicted++
		hm.logger.Info("removed stale service from monitoring", "service", id)
	}
	return evicted
}
// Stop logs that the monitor is stopping and forgets all registered
// services.
//
// It does not cancel running monitoring loops; those exit when the context
// passed to StartMonitoring or StartContinuousMonitoring is cancelled. The
// map is replaced under servicesMutex so the swap cannot race with the
// methods that access the service map while holding that lock.
func (hm *HealthMonitor) Stop() {
	hm.logger.Info("health monitor stopping")

	hm.servicesMutex.Lock()
	defer hm.servicesMutex.Unlock()
	// Replace the map with a fresh, empty one.
	hm.services = make(map[string]*JupyterService)
}