- Move ci-test.sh and setup.sh to scripts/ - Trim docs/src/zig-cli.md to current structure - Replace hardcoded secrets with placeholders in configs - Update .gitignore to block .env*, secrets/, keys, build artifacts - Slim README.md to reflect current CLI/TUI split - Add cleanup trap to ci-test.sh - Ensure no secrets are committed
415 lines
12 KiB
Go
415 lines
12 KiB
Go
package jupyter
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"net/http"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/logging"
|
|
)
|
|
|
|
// statusUnhealthy is the HealthStatus.Status value recorded when a probe fails
// or the service answers with a non-200 status code.
const statusUnhealthy = "unhealthy"
|
|
|
|
// HealthMonitor monitors the health of Jupyter services
|
|
type HealthMonitor struct {
|
|
logger *logging.Logger
|
|
services map[string]*JupyterService
|
|
servicesMutex sync.RWMutex
|
|
interval time.Duration
|
|
client *http.Client
|
|
}
|
|
|
|
// HealthStatus is the outcome of a single health probe against one service.
type HealthStatus struct {
	// ServiceID and ServiceName identify the probed service.
	ServiceID   string `json:"service_id"`
	ServiceName string `json:"service_name"`

	// Status is "healthy" or "unhealthy" as set by CheckServiceHealth.
	Status string `json:"status"`

	// LastCheck is when the probe was started.
	LastCheck time.Time `json:"last_check"`

	// ResponseTime is the elapsed time of the HTTP probe.
	ResponseTime time.Duration `json:"response_time"`

	// URL and ContainerID are copied from the monitored service.
	URL         string `json:"url"`
	ContainerID string `json:"container_id"`

	// Errors lists human-readable reasons the service was marked unhealthy.
	Errors []string `json:"errors"`

	// Metrics carries extra probe data such as "response_time_ms",
	// "status_code" and "server".
	Metrics map[string]interface{} `json:"metrics"`
}
|
|
|
|
// HealthReport contains a comprehensive health report
|
|
type HealthReport struct {
|
|
Timestamp time.Time `json:"timestamp"`
|
|
TotalServices int `json:"total_services"`
|
|
Healthy int `json:"healthy"`
|
|
Unhealthy int `json:"unhealthy"`
|
|
Unknown int `json:"unknown"`
|
|
Services map[string]*HealthStatus `json:"services"`
|
|
Summary string `json:"summary"`
|
|
}
|
|
|
|
// NewHealthMonitor creates a new health monitor
|
|
func NewHealthMonitor(logger *logging.Logger, interval time.Duration) *HealthMonitor {
|
|
return &HealthMonitor{
|
|
logger: logger,
|
|
services: make(map[string]*JupyterService),
|
|
interval: interval,
|
|
client: &http.Client{
|
|
Timeout: 10 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
// AddService adds a service to monitor
|
|
func (hm *HealthMonitor) AddService(service *JupyterService) {
|
|
hm.services[service.ID] = service
|
|
hm.logger.Info("service added to health monitor", "service_id", service.ID, "name", service.Name)
|
|
}
|
|
|
|
// RemoveService removes a service from monitoring
|
|
func (hm *HealthMonitor) RemoveService(serviceID string) {
|
|
delete(hm.services, serviceID)
|
|
hm.logger.Info("service removed from health monitor", "service_id", serviceID)
|
|
}
|
|
|
|
// CheckServiceHealth checks the health of a specific service
|
|
func (hm *HealthMonitor) CheckServiceHealth(ctx context.Context, serviceID string) (*HealthStatus, error) {
|
|
service, exists := hm.services[serviceID]
|
|
if !exists {
|
|
return nil, fmt.Errorf("service %s not found", serviceID)
|
|
}
|
|
|
|
healthStatus := &HealthStatus{
|
|
ServiceID: serviceID,
|
|
ServiceName: service.Name,
|
|
LastCheck: time.Now(),
|
|
URL: service.URL,
|
|
ContainerID: service.ContainerID,
|
|
Metrics: make(map[string]interface{}),
|
|
Errors: []string{},
|
|
}
|
|
|
|
// Check HTTP connectivity
|
|
start := time.Now()
|
|
req, err := http.NewRequestWithContext(ctx, "GET", service.URL, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
resp, err := hm.client.Do(req)
|
|
responseTime := time.Since(start)
|
|
if err != nil {
|
|
healthStatus.Status = statusUnhealthy
|
|
healthStatus.Errors = append(healthStatus.Errors, fmt.Sprintf("HTTP request failed: %v", err))
|
|
healthStatus.ResponseTime = responseTime
|
|
return healthStatus, nil
|
|
}
|
|
defer func() {
|
|
if err := resp.Body.Close(); err != nil {
|
|
hm.logger.Warn("failed to close response body", "error", err)
|
|
}
|
|
}()
|
|
|
|
healthStatus.ResponseTime = responseTime
|
|
healthStatus.Metrics["response_time_ms"] = responseTime.Milliseconds()
|
|
healthStatus.Metrics["status_code"] = resp.StatusCode
|
|
|
|
// Check response status
|
|
if resp.StatusCode == 200 {
|
|
healthStatus.Status = "healthy"
|
|
} else {
|
|
healthStatus.Status = statusUnhealthy
|
|
healthStatus.Errors = append(healthStatus.Errors, fmt.Sprintf("HTTP status %d", resp.StatusCode))
|
|
}
|
|
|
|
// Check response headers for Jupyter-specific indicators
|
|
if server := resp.Header.Get("Server"); server != "" {
|
|
healthStatus.Metrics["server"] = server
|
|
}
|
|
|
|
return healthStatus, nil
|
|
}
|
|
|
|
// CheckAllServices checks the health of all monitored services
|
|
func (hm *HealthMonitor) CheckAllServices(ctx context.Context) (*HealthReport, error) {
|
|
report := &HealthReport{
|
|
Timestamp: time.Now(),
|
|
Services: make(map[string]*HealthStatus),
|
|
}
|
|
|
|
for serviceID := range hm.services {
|
|
healthStatus, err := hm.CheckServiceHealth(ctx, serviceID)
|
|
if err != nil {
|
|
hm.logger.Warn("failed to check service health", "service_id", serviceID, "error", err)
|
|
continue
|
|
}
|
|
report.Services[serviceID] = healthStatus
|
|
|
|
// Update counters
|
|
switch healthStatus.Status {
|
|
case "healthy":
|
|
report.Healthy++
|
|
case statusUnhealthy:
|
|
report.Unhealthy++
|
|
default:
|
|
report.Unknown++
|
|
}
|
|
report.TotalServices++
|
|
}
|
|
|
|
// Generate summary
|
|
report.Summary = hm.generateSummary(report)
|
|
|
|
return report, nil
|
|
}
|
|
|
|
// generateSummary generates a human-readable summary
|
|
func (hm *HealthMonitor) generateSummary(report *HealthReport) string {
|
|
if report.TotalServices == 0 {
|
|
return "No services to monitor"
|
|
}
|
|
|
|
if report.Unhealthy == 0 {
|
|
return fmt.Sprintf("All %d services are healthy", report.Healthy)
|
|
}
|
|
|
|
return fmt.Sprintf("%d healthy, %d unhealthy, %d unknown out of %d total services",
|
|
report.Healthy, report.Unhealthy, report.Unknown, report.TotalServices)
|
|
}
|
|
|
|
// StartMonitoring starts continuous health monitoring
|
|
func (hm *HealthMonitor) StartMonitoring(ctx context.Context) {
|
|
ticker := time.NewTicker(hm.interval)
|
|
defer ticker.Stop()
|
|
|
|
hm.logger.Info("health monitoring started", "interval", hm.interval)
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
hm.logger.Info("health monitoring stopped")
|
|
return
|
|
case <-ticker.C:
|
|
report, err := hm.CheckAllServices(ctx)
|
|
if err != nil {
|
|
hm.logger.Warn("health check failed", "error", err)
|
|
continue
|
|
}
|
|
|
|
// Log summary
|
|
hm.logger.Info("health check completed", "summary", report.Summary)
|
|
|
|
// Alert on unhealthy services
|
|
for serviceID, health := range report.Services {
|
|
if health.Status == statusUnhealthy {
|
|
hm.logger.Warn("service unhealthy",
|
|
"service_id", serviceID,
|
|
"name", health.ServiceName,
|
|
"errors", health.Errors)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// GetServiceMetrics returns detailed metrics for a service
|
|
func (hm *HealthMonitor) GetServiceMetrics(ctx context.Context, serviceID string) (map[string]interface{}, error) {
|
|
service, exists := hm.services[serviceID]
|
|
if !exists {
|
|
return nil, fmt.Errorf("service %s not found", serviceID)
|
|
}
|
|
|
|
metrics := make(map[string]interface{})
|
|
|
|
// Basic service info
|
|
metrics["service_id"] = service.ID
|
|
metrics["service_name"] = service.Name
|
|
metrics["container_id"] = service.ContainerID
|
|
metrics["url"] = service.URL
|
|
metrics["created_at"] = service.CreatedAt
|
|
metrics["last_access"] = service.LastAccess
|
|
|
|
// Health check
|
|
healthStatus, err := hm.CheckServiceHealth(ctx, serviceID)
|
|
if err != nil {
|
|
metrics["health_status"] = "error"
|
|
metrics["health_error"] = err.Error()
|
|
} else {
|
|
metrics["health_status"] = healthStatus.Status
|
|
metrics["response_time_ms"] = healthStatus.ResponseTime.Milliseconds()
|
|
metrics["last_health_check"] = healthStatus.LastCheck
|
|
if len(healthStatus.Errors) > 0 {
|
|
metrics["health_errors"] = healthStatus.Errors
|
|
}
|
|
}
|
|
|
|
// Container metrics (if available)
|
|
containerMetrics := hm.getContainerMetrics(ctx, service.ContainerID)
|
|
for k, v := range containerMetrics {
|
|
metrics["container_"+k] = v
|
|
}
|
|
|
|
return metrics, nil
|
|
}
|
|
|
|
// getContainerMetrics gets container-specific metrics
|
|
func (hm *HealthMonitor) getContainerMetrics(_ context.Context, _ string) map[string]interface{} {
|
|
// Lightweight container metrics - avoid heavy system calls
|
|
metrics := make(map[string]interface{})
|
|
|
|
// Basic status check only - keep it minimal
|
|
metrics["status"] = "running"
|
|
metrics["last_check"] = time.Now().Unix()
|
|
|
|
return metrics
|
|
}
|
|
|
|
// ValidateService checks if a service is properly configured
|
|
func (hm *HealthMonitor) ValidateService(service *JupyterService) []string {
|
|
var errors []string
|
|
|
|
// Minimal validation - keep it lightweight
|
|
if service.ID == "" {
|
|
errors = append(errors, "service ID is required")
|
|
}
|
|
if service.ContainerID == "" {
|
|
errors = append(errors, "container ID is required")
|
|
}
|
|
if service.URL == "" {
|
|
errors = append(errors, "service URL is required")
|
|
}
|
|
|
|
// Validate URL format
|
|
if service.URL != "" {
|
|
if !isValidURL(service.URL) {
|
|
errors = append(errors, "invalid service URL format")
|
|
}
|
|
}
|
|
|
|
// Check if service is too old (potential zombie)
|
|
if service.CreatedAt.Before(time.Now().Add(-24 * time.Hour)) {
|
|
errors = append(errors, "service is older than 24 hours")
|
|
}
|
|
|
|
return errors
|
|
}
|
|
|
|
// StartContinuousMonitoring starts continuous health monitoring
|
|
func (hm *HealthMonitor) StartContinuousMonitoring(ctx context.Context, interval time.Duration) {
|
|
ticker := time.NewTicker(interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
hm.logger.Info("continuous monitoring stopped")
|
|
return
|
|
case <-ticker.C:
|
|
// Lightweight monitoring - just check service status
|
|
hm.checkAllServices(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
// checkAllServices performs lightweight health checks on all services
|
|
func (hm *HealthMonitor) checkAllServices(ctx context.Context) {
|
|
hm.servicesMutex.RLock()
|
|
defer hm.servicesMutex.RUnlock()
|
|
|
|
for _, service := range hm.services {
|
|
// Quick HTTP check only - no heavy metrics
|
|
go hm.quickHealthCheck(ctx, service)
|
|
}
|
|
}
|
|
|
|
// quickHealthCheck performs a minimal health check
|
|
func (hm *HealthMonitor) quickHealthCheck(ctx context.Context, service *JupyterService) {
|
|
// Simple HTTP check with short timeout
|
|
client := &http.Client{Timeout: 3 * time.Second}
|
|
req, err := http.NewRequestWithContext(ctx, "GET", service.URL, nil)
|
|
if err != nil {
|
|
hm.logger.Warn("service health check failed", "service", service.ID, "error", err)
|
|
return
|
|
}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
hm.logger.Warn("service health check failed", "service", service.ID, "error", err)
|
|
return
|
|
}
|
|
defer func() {
|
|
if err := resp.Body.Close(); err != nil {
|
|
hm.logger.Warn("failed to close response body", "error", err)
|
|
}
|
|
}()
|
|
|
|
if resp.StatusCode == 200 {
|
|
hm.logger.Debug("service healthy", "service", service.ID)
|
|
} else {
|
|
hm.logger.Warn("service unhealthy", "service", service.ID, "status", resp.StatusCode)
|
|
}
|
|
}
|
|
|
|
// isValidURL reports whether url begins with the "http://" or "https://"
// scheme prefix. The match is case-sensitive and nothing after the prefix is
// inspected.
func isValidURL(url string) bool {
	for _, scheme := range []string{"http://", "https://"} {
		if strings.HasPrefix(url, scheme) {
			return true
		}
	}
	return false
}
|
|
|
|
// GetHealthHistory returns health check history for a service (lightweight version)
|
|
func (hm *HealthMonitor) GetHealthHistory(_ string, duration time.Duration) ([]*HealthStatus, error) {
|
|
// Return empty for now - keep it lightweight
|
|
return []*HealthStatus{}, nil
|
|
}
|
|
|
|
// SetInterval updates the monitoring interval
|
|
func (hm *HealthMonitor) SetInterval(interval time.Duration) {
|
|
hm.interval = interval
|
|
}
|
|
|
|
// GetMonitoringStatus returns the current monitoring status
|
|
func (hm *HealthMonitor) GetMonitoringStatus() map[string]interface{} {
|
|
hm.servicesMutex.RLock()
|
|
defer hm.servicesMutex.RUnlock()
|
|
|
|
return map[string]interface{}{
|
|
"monitored_services": len(hm.services),
|
|
"check_interval": hm.interval.String(),
|
|
"timeout": hm.client.Timeout.String(),
|
|
"enabled": true,
|
|
}
|
|
}
|
|
|
|
// ExportHealthReport exports a health report to JSON (lightweight version)
|
|
func (hm *HealthMonitor) ExportHealthReport(ctx context.Context) ([]byte, error) {
|
|
report, err := hm.CheckAllServices(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to generate health report: %w", err)
|
|
}
|
|
|
|
return json.Marshal(report)
|
|
}
|
|
|
|
// Cleanup removes old or stale services from monitoring (lightweight version)
|
|
func (hm *HealthMonitor) Cleanup(maxAge time.Duration) int {
|
|
var removed int
|
|
cutoff := time.Now().Add(-maxAge)
|
|
|
|
hm.servicesMutex.Lock()
|
|
defer hm.servicesMutex.Unlock()
|
|
|
|
for serviceID, service := range hm.services {
|
|
if service.LastAccess.Before(cutoff) {
|
|
delete(hm.services, serviceID)
|
|
removed++
|
|
hm.logger.Info("removed stale service from monitoring", "service", serviceID)
|
|
}
|
|
}
|
|
|
|
return removed
|
|
}
|
|
|
|
// Stop gracefully stops the health monitor
|
|
func (hm *HealthMonitor) Stop() {
|
|
hm.logger.Info("health monitor stopping")
|
|
// Clear services
|
|
hm.services = make(map[string]*JupyterService)
|
|
}
|