fetch_ml/internal/worker/worker.go
Jeremie Fraeys a5c1a9fc0b
refactor: Phase 4 - split worker package into focused files
Split 551-line worker/core.go into single-concern files:

- worker/config.go (+44 lines)
  - Added config parsing: envInt(), parseCPUFromConfig(), parseGPUCountFromConfig()
  - parseGPUSlotsPerGPUFromConfig()
  - Now has all config logic in one place (440 lines total)

- worker/metrics.go (new file, 172 lines)
  - Extracted setupMetricsExporter() with ~30 Prometheus metric registrations
  - Isolated metrics logic for easy modification

- worker/factory.go (new file, 183 lines)
  - Extracted NewWorker() factory function
  - Moved prePullImages(), pullImage() from core.go
  - Centralized worker instantiation

- worker/worker.go (renamed from core.go, ~100 lines)
  - Now just defines Worker struct, MLServer, JupyterManager
  - Clean, focused file without mixed concerns

Lines redistributed: ~350 lines moved from monolithic core.go
Build status: Compiles successfully
2026-02-17 12:57:02 -05:00

95 lines
2.9 KiB
Go

package worker
import (
"context"
"net/http"
"sync"
"time"
"github.com/jfraeys/fetch_ml/internal/container"
"github.com/jfraeys/fetch_ml/internal/envpool"
"github.com/jfraeys/fetch_ml/internal/jupyter"
"github.com/jfraeys/fetch_ml/internal/logging"
"github.com/jfraeys/fetch_ml/internal/metrics"
"github.com/jfraeys/fetch_ml/internal/network"
"github.com/jfraeys/fetch_ml/internal/queue"
"github.com/jfraeys/fetch_ml/internal/resources"
"github.com/jfraeys/fetch_ml/internal/tracking"
)
// MLServer is a thin wrapper that embeds *network.SSHClient, so the client's
// exported methods are promoted onto MLServer. It is retained for backward
// compatibility with code written against the MLServer name.
type MLServer struct {
	*network.SSHClient
}
// JupyterManager lists the Jupyter service manager operations the worker
// depends on. Declaring the dependency as an interface keeps task execution
// testable, since a substitute implementation can be supplied.
//
// Only the signatures are fixed here; the precise semantics of each method
// are defined by the concrete manager (presumably in the jupyter package —
// confirm there).
type JupyterManager interface {
	// Service lifecycle operations.
	StartService(ctx context.Context, req *jupyter.StartRequest) (*jupyter.JupyterService, error)
	StopService(ctx context.Context, serviceID string) error
	RemoveService(ctx context.Context, serviceID string, purge bool) error

	// Workspace restoration by name; the meaning of the returned string is
	// not visible from this file.
	RestoreWorkspace(ctx context.Context, name string) (string, error)

	// Inspection operations.
	ListServices() []*jupyter.JupyterService
	ListInstalledPackages(ctx context.Context, serviceName string) ([]jupyter.InstalledPackage, error)
}
// isValidName reports whether input is acceptable as a name.
//
// The check is purely length-based: input must be non-empty and at most 255
// bytes long. No character-level filtering is performed here, so callers that
// need to reject specific characters must do so themselves.
func isValidName(input string) bool {
	const maxNameLen = 255 // longest accepted name, in bytes

	if input == "" {
		return false
	}
	return len(input) <= maxNameLen
}
// NewMLServer builds an MLServer from cfg.
//
// When cfg.LocalMode is set, the server wraps the client returned by
// network.NewLocalClient for cfg.BasePath. Otherwise a client is created with
// network.NewSSHClient using cfg.Host, cfg.User, cfg.SSHKey, cfg.Port and
// cfg.KnownHosts; an error from that call is returned unchanged together with
// a nil server.
//
// cfg is dereferenced without a nil check, so it must be non-nil.
func NewMLServer(cfg *Config) (*MLServer, error) {
	if cfg.LocalMode {
		localClient := network.NewLocalClient(cfg.BasePath)
		return &MLServer{SSHClient: localClient}, nil
	}

	sshClient, err := network.NewSSHClient(
		cfg.Host,
		cfg.User,
		cfg.SSHKey,
		cfg.Port,
		cfg.KnownHosts,
	)
	if err != nil {
		return nil, err
	}

	srv := &MLServer{SSHClient: sshClient}
	return srv, nil
}
// Worker represents an ML task worker.
//
// The grouping comments below are based on field names and types only; how
// each field is used is defined by the methods in the rest of the package.
// The struct contains mutexes, a sync.Map and a sync.WaitGroup, so it must
// not be copied by value.
type Worker struct {
	// Identity, configuration and core collaborators.
	id        string
	config    *Config
	server    *MLServer
	queue     queue.Backend
	resources *resources.Manager

	// running stores cancellation functions for graceful shutdown.
	// runningMu is presumably the lock protecting it — confirm in the
	// methods that touch running.
	running   map[string]context.CancelFunc
	runningMu sync.RWMutex

	// Worker-level context and its cancel function.
	// NOTE(review): storing a context.Context in a struct is generally
	// discouraged in Go; left as-is because other files depend on it.
	ctx    context.Context
	cancel context.CancelFunc

	// Logging and metrics.
	logger     *logging.Logger
	metrics    *metrics.Metrics
	metricsSrv *http.Server

	// Dataset cache: timestamps keyed by string, with a TTL.
	// datasetCacheMu is presumably the lock for datasetCache — confirm.
	datasetCache    map[string]time.Time
	datasetCacheMu  sync.RWMutex
	datasetCacheTTL time.Duration

	// Graceful shutdown fields.
	shutdownCh   chan struct{}
	activeTasks  sync.Map // map[string]*queue.Task - track active tasks
	gracefulWait sync.WaitGroup

	// External integrations.
	podman           *container.PodmanManager
	jupyter          JupyterManager
	trackingRegistry *tracking.Registry
	envPool          *envpool.Pool

	// Prewarm state. prewarmMu is presumably the lock for the three
	// prewarm* fields below — confirm in the prewarm code.
	prewarmMu        sync.Mutex
	prewarmTargetID  string
	prewarmCancel    context.CancelFunc
	prewarmStartedAt time.Time
}
// getGPUDetector returns the GPUDetector produced by a freshly constructed
// GPUDetectorFactory for this worker's configuration. A new factory is
// created on every call; nothing is cached on the Worker.
func (w *Worker) getGPUDetector() GPUDetector {
	return (&GPUDetectorFactory{}).CreateDetector(w.config)
}