// fetch_ml/internal/jupyter/health_monitor.go

package jupyter

import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
	"sync"
	"time"

	"github.com/jfraeys/fetch_ml/internal/logging"
)

// Status strings stored in HealthStatus.Status.
const (
	// statusUnhealthy is assigned by CheckServiceHealth when the HTTP request
	// fails or the response status code is not 200.
	statusUnhealthy = "unhealthy"
)

// HealthMonitor monitors the health of Jupyter services by issuing HTTP GET
// requests against each registered service's URL.
//
// Fields:
//   - logger: receives the monitor's log output.
//   - services: the monitored services, keyed by service ID.
//   - servicesMutex: taken by Cleanup, GetMonitoringStatus and checkAllServices
//     while they access services.
//   - interval: tick period used by StartMonitoring.
//   - client: HTTP client used by CheckServiceHealth; NewHealthMonitor gives it
//     a 10 second timeout.
type HealthMonitor struct {
	logger        *logging.Logger
	services      map[string]*JupyterService
	servicesMutex sync.RWMutex
	interval      time.Duration
	client        *http.Client
}

// HealthStatus represents the health status of a service as produced by a
// single CheckServiceHealth call.
//
// Fields:
//   - ServiceID, ServiceName, URL, ContainerID: copied from the monitored
//     JupyterService.
//   - Status: "healthy" when the HTTP response status is 200, otherwise
//     "unhealthy".
//   - LastCheck: time at which the check was started.
//   - ResponseTime: elapsed time of the HTTP request. It is a time.Duration,
//     so encoding/json writes it as an integer number of nanoseconds.
//   - Errors: human-readable failure descriptions; empty for a healthy service.
//   - Metrics: "response_time_ms" and "status_code" when a response was
//     received, plus "server" when the response carries a Server header.
type HealthStatus struct {
	ServiceID    string                 `json:"service_id"`
	ServiceName  string                 `json:"service_name"`
	Status       string                 `json:"status"`
	LastCheck    time.Time              `json:"last_check"`
	ResponseTime time.Duration          `json:"response_time"`
	URL          string                 `json:"url"`
	ContainerID  string                 `json:"container_id"`
	Errors       []string               `json:"errors"`
	Metrics      map[string]interface{} `json:"metrics"`
}

// HealthReport contains a comprehensive health report assembled by
// CheckAllServices.
//
// Fields:
//   - Timestamp: time at which the report was started.
//   - TotalServices: number of services for which a HealthStatus was obtained;
//     services whose check returned an error are not counted.
//   - Healthy, Unhealthy, Unknown: per-status counters. Unknown counts any
//     status other than "healthy" and "unhealthy".
//   - Services: the individual results keyed by service ID.
//   - Summary: human-readable one-line summary built by generateSummary.
type HealthReport struct {
	Timestamp     time.Time                `json:"timestamp"`
	TotalServices int                      `json:"total_services"`
	Healthy       int                      `json:"healthy"`
	Unhealthy     int                      `json:"unhealthy"`
	Unknown       int                      `json:"unknown"`
	Services      map[string]*HealthStatus `json:"services"`
	Summary       string                   `json:"summary"`
}

// NewHealthMonitor builds a HealthMonitor that logs through logger and uses
// interval as the tick period for StartMonitoring. The monitor starts with an
// empty service set and an HTTP client limited to a 10 second timeout.
func NewHealthMonitor(logger *logging.Logger, interval time.Duration) *HealthMonitor {
	monitor := new(HealthMonitor)
	monitor.logger = logger
	monitor.interval = interval
	monitor.services = map[string]*JupyterService{}
	monitor.client = &http.Client{Timeout: 10 * time.Second}
	return monitor
}

// AddService registers service with the monitor, keyed by its ID. A service
// whose ID is already registered replaces the earlier entry. A nil service is
// ignored with a warning instead of panicking.
//
// The services map is modified under servicesMutex because Cleanup,
// checkAllServices and GetMonitoringStatus access it under that same lock and
// may run concurrently with this call.
func (hm *HealthMonitor) AddService(service *JupyterService) {
	if service == nil {
		hm.logger.Warn("ignoring nil service in health monitor")
		return
	}

	hm.servicesMutex.Lock()
	if hm.services == nil {
		// Writing to a nil map panics; allocate lazily for monitors that were
		// not built through NewHealthMonitor.
		hm.services = make(map[string]*JupyterService)
	}
	hm.services[service.ID] = service
	hm.servicesMutex.Unlock()

	hm.logger.Info("service added to health monitor", "service_id", service.ID, "name", service.Name)
}

// RemoveService stops monitoring the service registered under serviceID.
// Removing an ID that is not registered is a no-op apart from the log line.
//
// The deletion happens under servicesMutex because Cleanup, checkAllServices
// and GetMonitoringStatus access the services map under that same lock and may
// run concurrently with this call.
func (hm *HealthMonitor) RemoveService(serviceID string) {
	hm.servicesMutex.Lock()
	delete(hm.services, serviceID)
	hm.servicesMutex.Unlock()

	hm.logger.Info("service removed from health monitor", "service_id", serviceID)
}

// CheckServiceHealth performs one HTTP GET against the URL of the service
// registered under serviceID and reports the outcome.
//
// It returns an error only when the service is not registered or the request
// cannot be constructed. A failed request or a non-200 response is not an
// error: it yields a HealthStatus with Status "unhealthy" and the reason
// appended to Errors. A 200 response yields Status "healthy".
//
// The request is bound to ctx and is additionally limited by the timeout of
// hm.client.
func (hm *HealthMonitor) CheckServiceHealth(ctx context.Context, serviceID string) (*HealthStatus, error) {
	// Hold the read lock for the map lookup only. The HTTP round trip below can
	// take as long as the client timeout and must not keep Cleanup (which takes
	// the write lock) waiting.
	hm.servicesMutex.RLock()
	service, exists := hm.services[serviceID]
	hm.servicesMutex.RUnlock()
	if !exists || service == nil {
		return nil, fmt.Errorf("service %s not found", serviceID)
	}

	healthStatus := &HealthStatus{
		ServiceID:   serviceID,
		ServiceName: service.Name,
		LastCheck:   time.Now(),
		URL:         service.URL,
		ContainerID: service.ContainerID,
		Metrics:     make(map[string]interface{}),
		Errors:      []string{}, // non-nil so the JSON form is [] rather than null
	}

	// Check HTTP connectivity
	start := time.Now()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, service.URL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	resp, err := hm.client.Do(req)
	responseTime := time.Since(start)
	healthStatus.ResponseTime = responseTime
	if err != nil {
		healthStatus.Status = statusUnhealthy
		healthStatus.Errors = append(healthStatus.Errors, fmt.Sprintf("HTTP request failed: %v", err))
		return healthStatus, nil
	}
	defer func() {
		if err := resp.Body.Close(); err != nil {
			hm.logger.Warn("failed to close response body", "error", err)
		}
	}()

	healthStatus.Metrics["response_time_ms"] = responseTime.Milliseconds()
	healthStatus.Metrics["status_code"] = resp.StatusCode

	// Only an exact 200 counts as healthy.
	if resp.StatusCode == http.StatusOK {
		healthStatus.Status = "healthy"
	} else {
		healthStatus.Status = statusUnhealthy
		healthStatus.Errors = append(healthStatus.Errors, fmt.Sprintf("HTTP status %d", resp.StatusCode))
	}

	// Record the Server header, when present, as an extra metric.
	if server := resp.Header.Get("Server"); server != "" {
		healthStatus.Metrics["server"] = server
	}

	return healthStatus, nil
}

// CheckAllServices checks every monitored service in turn and aggregates the
// results into a HealthReport.
//
// A service whose check returns an error (for example because it was removed
// while the report was being built) is logged and left out of the report and
// of TotalServices. If ctx is cancelled before all services have been checked,
// the partial report is discarded and the context error is returned, so that a
// shutdown is not reported as a wave of unhealthy services.
func (hm *HealthMonitor) CheckAllServices(ctx context.Context) (*HealthReport, error) {
	// Snapshot the IDs under the read lock and release it before doing any
	// network I/O: ranging over the map while another goroutine writes to it is
	// a fatal error in Go, and holding the lock across HTTP calls would stall
	// Cleanup for the duration of the whole sweep.
	hm.servicesMutex.RLock()
	serviceIDs := make([]string, 0, len(hm.services))
	for serviceID := range hm.services {
		serviceIDs = append(serviceIDs, serviceID)
	}
	hm.servicesMutex.RUnlock()

	report := &HealthReport{
		Timestamp: time.Now(),
		Services:  make(map[string]*HealthStatus, len(serviceIDs)),
	}

	for _, serviceID := range serviceIDs {
		if err := ctx.Err(); err != nil {
			return nil, fmt.Errorf("health check interrupted: %w", err)
		}

		healthStatus, err := hm.CheckServiceHealth(ctx, serviceID)
		if err != nil {
			hm.logger.Warn("failed to check service health", "service_id", serviceID, "error", err)
			continue
		}
		report.Services[serviceID] = healthStatus

		// Update counters
		switch healthStatus.Status {
		case "healthy":
			report.Healthy++
		case statusUnhealthy:
			report.Unhealthy++
		default:
			report.Unknown++
		}
		report.TotalServices++
	}

	report.Summary = hm.generateSummary(report)

	return report, nil
}

// generateSummary renders a one-line, human-readable summary of report.
//
// The "all healthy" wording is used only when every counted service is
// healthy. Any unhealthy or unknown service produces the full breakdown, so a
// report with unknown services is never described as fully healthy.
func (hm *HealthMonitor) generateSummary(report *HealthReport) string {
	switch {
	case report.TotalServices == 0:
		return "No services to monitor"
	case report.Healthy == report.TotalServices:
		return fmt.Sprintf("All %d services are healthy", report.Healthy)
	default:
		return fmt.Sprintf("%d healthy, %d unhealthy, %d unknown out of %d total services",
			report.Healthy, report.Unhealthy, report.Unknown, report.TotalServices)
	}
}

// StartMonitoring runs CheckAllServices once per configured interval and logs
// the resulting summary, plus one warning per unhealthy service. It blocks
// until ctx is cancelled, so callers normally run it in its own goroutine.
//
// The interval is read once when monitoring starts; a later SetInterval call
// does not affect a loop that is already running. If the configured interval
// is not positive the method logs a warning and returns without monitoring,
// because time.NewTicker panics on a non-positive duration.
func (hm *HealthMonitor) StartMonitoring(ctx context.Context) {
	// GetMonitoringStatus reads interval under this lock; do the same here.
	hm.servicesMutex.RLock()
	interval := hm.interval
	hm.servicesMutex.RUnlock()

	if interval <= 0 {
		hm.logger.Warn("health monitoring not started: interval must be positive", "interval", interval)
		return
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	hm.logger.Info("health monitoring started", "interval", interval)

	for {
		select {
		case <-ctx.Done():
			hm.logger.Info("health monitoring stopped")
			return
		case <-ticker.C:
			report, err := hm.CheckAllServices(ctx)
			if err != nil {
				hm.logger.Warn("health check failed", "error", err)
				continue
			}

			// Log summary
			hm.logger.Info("health check completed", "summary", report.Summary)

			// Alert on unhealthy services
			for serviceID, health := range report.Services {
				if health.Status == statusUnhealthy {
					hm.logger.Warn("service unhealthy",
						"service_id", serviceID,
						"name", health.ServiceName,
						"errors", health.Errors)
				}
			}
		}
	}
}

// GetServiceMetrics returns a flat map describing the service registered under
// serviceID: its static attributes, the outcome of a fresh health check, and
// the container metrics prefixed with "container_".
//
// An error is returned only when the service is not registered. A failing
// health check is reported inside the map ("health_status" = "error" plus
// "health_error") rather than as an error.
func (hm *HealthMonitor) GetServiceMetrics(ctx context.Context, serviceID string) (map[string]interface{}, error) {
	// Hold the read lock for the lookup only; the health check below performs
	// network I/O and must not keep Cleanup (a writer) waiting.
	hm.servicesMutex.RLock()
	service, exists := hm.services[serviceID]
	hm.servicesMutex.RUnlock()
	if !exists || service == nil {
		return nil, fmt.Errorf("service %s not found", serviceID)
	}

	metrics := make(map[string]interface{})

	// Basic service info
	metrics["service_id"] = service.ID
	metrics["service_name"] = service.Name
	metrics["container_id"] = service.ContainerID
	metrics["url"] = service.URL
	metrics["created_at"] = service.CreatedAt
	metrics["last_access"] = service.LastAccess

	// Health check
	healthStatus, err := hm.CheckServiceHealth(ctx, serviceID)
	if err != nil {
		metrics["health_status"] = "error"
		metrics["health_error"] = err.Error()
	} else {
		metrics["health_status"] = healthStatus.Status
		metrics["response_time_ms"] = healthStatus.ResponseTime.Milliseconds()
		metrics["last_health_check"] = healthStatus.LastCheck
		if len(healthStatus.Errors) > 0 {
			metrics["health_errors"] = healthStatus.Errors
		}
	}

	// Container metrics (if available)
	for k, v := range hm.getContainerMetrics(ctx, service.ContainerID) {
		metrics["container_"+k] = v
	}

	return metrics, nil
}

// getContainerMetrics returns placeholder container metrics. Both arguments
// are ignored: the result always reports "status" as "running" and stamps
// "last_check" with the current Unix time in seconds, without inspecting the
// container.
func (hm *HealthMonitor) getContainerMetrics(_ context.Context, _ string) map[string]interface{} {
	return map[string]interface{}{
		"status":     "running",
		"last_check": time.Now().Unix(),
	}
}

// ValidateService inspects service and returns one message per problem found,
// in a fixed order: missing ID, missing container ID, missing or malformed
// URL, and creation time more than 24 hours in the past. The result is nil
// when no problem is found.
func (hm *HealthMonitor) ValidateService(service *JupyterService) []string {
	var problems []string
	note := func(msg string) { problems = append(problems, msg) }

	if service.ID == "" {
		note("service ID is required")
	}
	if service.ContainerID == "" {
		note("container ID is required")
	}

	// The URL is either missing or, when present, must pass the format check.
	switch {
	case service.URL == "":
		note("service URL is required")
	case !isValidURL(service.URL):
		note("invalid service URL format")
	}

	// Services created more than a day ago are flagged as potential zombies.
	oldestAllowed := time.Now().Add(-24 * time.Hour)
	if service.CreatedAt.Before(oldestAllowed) {
		note("service is older than 24 hours")
	}

	return problems
}

// StartContinuousMonitoring runs the lightweight checkAllServices sweep once
// per interval. It blocks until ctx is cancelled, so callers normally run it
// in its own goroutine. Unlike StartMonitoring it uses the interval passed in
// rather than the monitor's configured one, and it produces no report.
//
// If interval is not positive the method logs a warning and returns without
// monitoring, because time.NewTicker panics on a non-positive duration.
func (hm *HealthMonitor) StartContinuousMonitoring(ctx context.Context, interval time.Duration) {
	if interval <= 0 {
		hm.logger.Warn("continuous monitoring not started: interval must be positive", "interval", interval)
		return
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			hm.logger.Info("continuous monitoring stopped")
			return
		case <-ticker.C:
			// Lightweight monitoring - just check service status
			hm.checkAllServices(ctx)
		}
	}
}

// checkAllServices runs quickHealthCheck for every monitored service in
// parallel and returns once all of them have finished.
//
// Waiting for the probes bounds their lifetime: no probe goroutine outlives
// this call, and a sweep cannot overlap the next one when the monitoring
// interval is shorter than a probe's timeout. The services are snapshotted
// under the read lock, which is released before any network I/O starts.
func (hm *HealthMonitor) checkAllServices(ctx context.Context) {
	hm.servicesMutex.RLock()
	targets := make([]*JupyterService, 0, len(hm.services))
	for _, service := range hm.services {
		if service != nil {
			targets = append(targets, service)
		}
	}
	hm.servicesMutex.RUnlock()

	var wg sync.WaitGroup
	for _, service := range targets {
		wg.Add(1)
		// Pass the service as an argument so each goroutine gets its own copy
		// regardless of the Go version's loop-variable semantics.
		go func(svc *JupyterService) {
			defer wg.Done()
			// Quick HTTP check only - no heavy metrics
			hm.quickHealthCheck(ctx, svc)
		}(service)
	}
	wg.Wait()
}

// quickHealthCheck issues a single GET against service.URL with a 3 second
// client timeout and logs the outcome: a debug line for a 200 response, a
// warning for any other status or for a request that could not be built or
// sent. It returns nothing and records no state.
func (hm *HealthMonitor) quickHealthCheck(ctx context.Context, service *JupyterService) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, service.URL, nil)
	if err != nil {
		hm.logger.Warn("service health check failed", "service", service.ID, "error", err)
		return
	}

	// Dedicated short-timeout client for the lightweight probe.
	probe := &http.Client{Timeout: 3 * time.Second}
	resp, err := probe.Do(req)
	if err != nil {
		hm.logger.Warn("service health check failed", "service", service.ID, "error", err)
		return
	}
	defer func() {
		if closeErr := resp.Body.Close(); closeErr != nil {
			hm.logger.Warn("failed to close response body", "error", closeErr)
		}
	}()

	if resp.StatusCode != http.StatusOK {
		hm.logger.Warn("service unhealthy", "service", service.ID, "status", resp.StatusCode)
		return
	}
	hm.logger.Debug("service healthy", "service", service.ID)
}

// isValidURL reports whether url starts with one of the accepted scheme
// prefixes, "http://" or "https://". The match is case-sensitive and nothing
// after the prefix is inspected.
func isValidURL(url string) bool {
	for _, prefix := range [...]string{"http://", "https://"} {
		if strings.HasPrefix(url, prefix) {
			return true
		}
	}
	return false
}

// GetHealthHistory is a placeholder for per-service health history. No history
// is recorded, so both arguments are ignored and the result is always an
// empty, non-nil slice with a nil error.
func (hm *HealthMonitor) GetHealthHistory(_ string, duration time.Duration) ([]*HealthStatus, error) {
	history := make([]*HealthStatus, 0)
	return history, nil
}

// SetInterval updates the monitoring interval used by StartMonitoring.
//
// StartMonitoring reads the interval once, when it creates its ticker, so the
// new value applies to the next StartMonitoring call and does not retime a
// loop that is already running.
//
// The write is done under servicesMutex because GetMonitoringStatus reads the
// interval under that same lock and may run concurrently with this call.
func (hm *HealthMonitor) SetInterval(interval time.Duration) {
	hm.servicesMutex.Lock()
	hm.interval = interval
	hm.servicesMutex.Unlock()
}

// GetMonitoringStatus reports the monitor's current configuration: the number
// of monitored services, the check interval and the HTTP client timeout (both
// formatted as duration strings), and an "enabled" flag that is always true.
// The values are read while holding the read lock.
func (hm *HealthMonitor) GetMonitoringStatus() map[string]interface{} {
	hm.servicesMutex.RLock()
	defer hm.servicesMutex.RUnlock()

	status := make(map[string]interface{}, 4)
	status["monitored_services"] = len(hm.services)
	status["check_interval"] = hm.interval.String()
	status["timeout"] = hm.client.Timeout.String()
	status["enabled"] = true
	return status
}

// ExportHealthReport runs CheckAllServices and returns the resulting report
// encoded as JSON. A failure to build the report is wrapped and returned; a
// marshalling failure is returned as-is.
func (hm *HealthMonitor) ExportHealthReport(ctx context.Context) ([]byte, error) {
	report, checkErr := hm.CheckAllServices(ctx)
	if checkErr != nil {
		return nil, fmt.Errorf("failed to generate health report: %w", checkErr)
	}

	encoded, marshalErr := json.Marshal(report)
	return encoded, marshalErr
}

// Cleanup drops every service whose LastAccess is older than maxAge, measured
// from the moment Cleanup is called, and returns how many were dropped. The
// sweep runs under the write lock and logs one line per removed service.
func (hm *HealthMonitor) Cleanup(maxAge time.Duration) int {
	threshold := time.Now().Add(-maxAge)

	hm.servicesMutex.Lock()
	defer hm.servicesMutex.Unlock()

	evicted := 0
	for id, svc := range hm.services {
		if !svc.LastAccess.Before(threshold) {
			continue
		}
		// Deleting the current key while ranging over a map is permitted in Go.
		delete(hm.services, id)
		evicted++
		hm.logger.Info("removed stale service from monitoring", "service", id)
	}

	return evicted
}

// Stop logs that the monitor is stopping and forgets every monitored service.
//
// It does not terminate loops started with StartMonitoring or
// StartContinuousMonitoring; those run until their context is cancelled. After
// Stop they simply find no services to check.
//
// The services map is replaced under servicesMutex because Cleanup,
// checkAllServices and GetMonitoringStatus access it under that same lock and
// may run concurrently with this call.
func (hm *HealthMonitor) Stop() {
	hm.logger.Info("health monitor stopping")

	hm.servicesMutex.Lock()
	hm.services = make(map[string]*JupyterService)
	hm.servicesMutex.Unlock()
}