fetch_ml/tests/unit/scheduler/service_templates_test.go
Jeremie Fraeys 43e6446587
feat(scheduler): implement multi-tenant job scheduler with gang scheduling
Add new scheduler component for distributed ML workload orchestration:
- Hub-based coordination for multi-worker clusters
- Pacing controller for rate limiting job submissions
- Priority queue with preemption support
- Port allocator for dynamic service discovery
- Protocol handlers for worker-scheduler communication
- Service manager with OS-specific implementations
- Connection management and state persistence
- Template system for service deployment

Includes comprehensive test suite:
- Unit tests for all core components
- Integration tests for distributed scenarios
- Benchmark tests for performance validation
- Mock fixtures for isolated testing

Refs: scheduler-architecture.md
2026-02-26 12:03:23 -05:00

264 lines
7.8 KiB
Go

package scheduler_test
import (
"testing"
"time"
"github.com/jfraeys/fetch_ml/internal/scheduler"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestJupyterLabTemplate checks the fields of scheduler.JupyterLabTemplate:
// job type, slot pool, GPU count, command arguments, health-check settings,
// and the single workspace mount.
func TestJupyterLabTemplate(t *testing.T) {
	tmpl := scheduler.JupyterLabTemplate

	assert.Equal(t, "service", tmpl.JobType)
	assert.Equal(t, "service", tmpl.SlotPool)
	assert.Equal(t, 0, tmpl.GPUCount)

	// Abort on an empty command; the per-argument checks below would only
	// repeat the same failure.
	require.NotEmpty(t, tmpl.Command)
	wantArgs := []string{
		"jupyter",
		"lab",
		"--ip=0.0.0.0",
		"--port={{SERVICE_PORT}}",
		"--no-browser",
	}
	for _, arg := range wantArgs {
		assert.Contains(t, tmpl.Command, arg)
	}

	// Health-check endpoints and timing values.
	hc := tmpl.HealthCheck
	assert.Equal(t, "http://localhost:{{SERVICE_PORT}}/api", hc.Liveness)
	assert.Equal(t, "http://localhost:{{SERVICE_PORT}}/api/kernels", hc.Readiness)
	assert.Equal(t, 15, hc.Interval)
	assert.Equal(t, 5, hc.Timeout)

	// Exactly one mount is expected; require.Len guards the index below.
	require.Len(t, tmpl.Mounts, 1)
	workspace := tmpl.Mounts[0]
	assert.Equal(t, "{{WORKSPACE}}", workspace.Source)
	assert.Equal(t, "/workspace", workspace.Destination)
}
// TestJupyterNotebookTemplate checks scheduler.JupyterNotebookTemplate: it is
// a GPU-less service job in the service slot pool whose command contains the
// "notebook" argument.
func TestJupyterNotebookTemplate(t *testing.T) {
	tmpl := scheduler.JupyterNotebookTemplate

	// The command must be non-empty and include the notebook subcommand.
	require.NotEmpty(t, tmpl.Command)
	assert.Contains(t, tmpl.Command, "notebook")

	// Scheduling attributes.
	assert.Equal(t, "service", tmpl.JobType)
	assert.Equal(t, "service", tmpl.SlotPool)
	assert.Equal(t, 0, tmpl.GPUCount)
}
// TestVLLMTemplate checks scheduler.VLLMTemplate: a service job in the service
// slot pool that requests one GPU and whose command carries the vLLM
// entrypoint plus the model-name and service-port placeholders.
func TestVLLMTemplate(t *testing.T) {
	tmpl := scheduler.VLLMTemplate

	assert.Equal(t, "service", tmpl.JobType)
	assert.Equal(t, "service", tmpl.SlotPool)
	assert.Equal(t, 1, tmpl.GPUCount) // the template asks for a single GPU

	// Each entry must appear as an element of the command.
	require.NotEmpty(t, tmpl.Command)
	for _, want := range []string{
		"vllm.entrypoints.openai.api_server",
		"{{MODEL_NAME}}",
		"{{SERVICE_PORT}}",
	} {
		assert.Contains(t, tmpl.Command, want)
	}
}
// TestPortAllocatorForServices walks a port allocator through allocate,
// lookup, release, and re-allocate for service-style task IDs.
func TestPortAllocatorForServices(t *testing.T) {
	const (
		rangeStart = 10000
		rangeEnd   = 10010
	)
	allocator := scheduler.NewPortAllocator(rangeStart, rangeEnd)

	// First allocation must land inside the configured bounds.
	first, err := allocator.Allocate("jupyter-task-1")
	require.NoError(t, err)
	assert.True(t, !(first < rangeStart || first > rangeEnd))

	// The allocator reports which task holds the port.
	assert.Equal(t, "jupyter-task-1", allocator.GetAllocation(first))

	// A second task must receive a distinct port.
	second, err := allocator.Allocate("jupyter-task-2")
	require.NoError(t, err)
	assert.NotEqual(t, first, second)

	// After release, the first port no longer maps to any task.
	allocator.Release(first)
	assert.Equal(t, "", allocator.GetAllocation(first))

	// A further allocation succeeds and stays in range; whether it reuses the
	// released port is deliberately not asserted.
	third, err := allocator.Allocate("jupyter-task-3")
	require.NoError(t, err)
	assert.True(t, !(third < rangeStart || third > rangeEnd))
}
// TestPortAllocatorExhaustion verifies that allocation fails with a
// "no ports available" error once every port in the range is taken.
func TestPortAllocatorExhaustion(t *testing.T) {
	// Small range for testing: the test expects it to hold exactly three ports.
	pa := scheduler.NewPortAllocator(20000, 20002)

	// Consume every port in the range.
	for _, taskID := range []string{"task-1", "task-2", "task-3"} {
		_, err := pa.Allocate(taskID)
		require.NoError(t, err)
	}

	// The fourth allocation must fail. require (not assert) stops the test
	// here so that err.Error() below is never called on a nil error.
	_, err := pa.Allocate("task-4")
	require.Error(t, err)
	assert.Contains(t, err.Error(), "no ports available")
}
// TestPortAllocatorTTL exercises allocate/release/allocate on an allocator
// configured with a short TTL.
//
// NOTE(review): the test never waits for the TTL to elapse, so it only pins
// that a second allocation succeeds and stays in range right after a release.
// Whether the released port is reused is not asserted.
func TestPortAllocatorTTL(t *testing.T) {
	const (
		rangeStart = 30000
		rangeEnd   = 30010
	)
	allocator := scheduler.NewPortAllocator(rangeStart, rangeEnd)
	allocator.SetTTL(50 * time.Millisecond) // short TTL to keep the test fast

	// Allocate one port and hand it straight back.
	released, err := allocator.Allocate("test-task")
	require.NoError(t, err)
	allocator.Release(released)

	// Allocating again immediately must still succeed within the range; the
	// result may or may not equal the port just released.
	next, err := allocator.Allocate("test-task-2")
	require.NoError(t, err)
	inRange := next >= rangeStart && next <= rangeEnd
	assert.True(t, inRange)
}
// TestServiceSlotPoolSeparation checks that a service template and a batch job
// spec name different slot pools, and that the service template carries a
// non-zero health-check interval.
//
// This compares field values only; it does not drive the scheduler's queues.
func TestServiceSlotPoolSeparation(t *testing.T) {
	// A minimal batch job placed in the "batch" pool.
	batch := scheduler.JobSpec{
		ID:       "batch-1",
		SlotPool: "batch",
		GPUCount: 1,
	}
	// JupyterLabTemplate is used because it has health checks configured.
	service := scheduler.JupyterLabTemplate

	// The two jobs must name distinct pools.
	assert.Equal(t, "service", service.SlotPool)
	assert.Equal(t, "batch", batch.SlotPool)

	// Services are health-checked; nothing is asserted about health checks
	// for the batch job.
	assert.NotZero(t, service.HealthCheck.Interval)
}
// TestHealthCheckValidation checks the health-check validity rule used by this
// test: a configuration is valid only when it has a liveness URL, a positive
// interval, and a positive timeout.
//
// Each invalid case breaks exactly one of those three conditions, so a
// regression in any single clause of the rule is caught by its own case.
//
// NOTE(review): the rule is evaluated inline here rather than through a
// function in the scheduler package — confirm it mirrors the production check.
func TestHealthCheckValidation(t *testing.T) {
	tests := []struct {
		name     string
		template scheduler.ServiceTemplate
		valid    bool
	}{
		{
			name: "JupyterLab - valid",
			template: scheduler.ServiceTemplate{
				JobType:  "service",
				SlotPool: "service",
				HealthCheck: scheduler.ServiceHealthCheck{
					Liveness:  "http://localhost:8888/api",
					Readiness: "http://localhost:8888/api/kernels",
					Interval:  15,
					Timeout:   5,
				},
			},
			valid: true,
		},
		{
			// Only the liveness URL is missing; interval and timeout are set.
			name: "Missing liveness - invalid",
			template: scheduler.ServiceTemplate{
				JobType:  "service",
				SlotPool: "service",
				HealthCheck: scheduler.ServiceHealthCheck{
					Readiness: "http://localhost:8888/api",
					Interval:  15,
					Timeout:   5,
				},
			},
			valid: false,
		},
		{
			// Only the interval is zero; liveness and timeout are set.
			name: "Zero interval - invalid",
			template: scheduler.ServiceTemplate{
				JobType:  "service",
				SlotPool: "service",
				HealthCheck: scheduler.ServiceHealthCheck{
					Liveness:  "http://localhost:8888/api",
					Readiness: "http://localhost:8888/api",
					Interval:  0,
					Timeout:   5,
				},
			},
			valid: false,
		},
		{
			// Only the timeout is zero; liveness and interval are set.
			name: "Zero timeout - invalid",
			template: scheduler.ServiceTemplate{
				JobType:  "service",
				SlotPool: "service",
				HealthCheck: scheduler.ServiceHealthCheck{
					Liveness:  "http://localhost:8888/api",
					Readiness: "http://localhost:8888/api",
					Interval:  15,
					Timeout:   0,
				},
			},
			valid: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			hc := tt.template.HealthCheck
			isValid := hc.Liveness != "" && hc.Interval > 0 && hc.Timeout > 0
			assert.Equal(t, tt.valid, isValid)
		})
	}
}
// TestDefaultPortRange pins the default service port bounds to 8000 and 9000
// and checks that the span between them is at least 1000.
func TestDefaultPortRange(t *testing.T) {
	start, end := scheduler.DefaultServicePortStart, scheduler.DefaultServicePortEnd

	// The span (end minus start) must leave room for typical deployments.
	span := end - start
	assert.True(t, span >= 1000, "Default port range should be at least 1000 ports")

	// Exact bounds.
	assert.Equal(t, 8000, start)
	assert.Equal(t, 9000, end)
}
// TestTemplateVariableExpansion checks that the JupyterLab template still
// carries its unexpanded placeholders: the service-port flag in the command
// and a secret reference in the JUPYTER_TOKEN environment variable.
func TestTemplateVariableExpansion(t *testing.T) {
	tmpl := scheduler.JupyterLabTemplate

	// The command must include the port flag with its placeholder as one element.
	assert.Contains(t, tmpl.Command, "--port={{SERVICE_PORT}}",
		"Command should contain {{SERVICE_PORT}} template variable")

	// The token must be present and refer to a secret placeholder.
	token, found := tmpl.Env["JUPYTER_TOKEN"]
	assert.True(t, found, "Should have JUPYTER_TOKEN env var")
	assert.Contains(t, token, "{{SECRET:", "Should use secret template")
}
// BenchmarkPortAllocation measures one Allocate followed by one Release per
// iteration against an allocator covering ports 40000 to 41000.
func BenchmarkPortAllocation(b *testing.B) {
	allocator := scheduler.NewPortAllocator(40000, 41000)

	// Keep allocator construction out of the timed region.
	b.ResetTimer()
	for remaining := b.N; remaining > 0; remaining-- {
		port, allocErr := allocator.Allocate("bench-task")
		if allocErr != nil {
			b.Fatal(allocErr)
		}
		// Return the port right away so each iteration is one allocate/release pair.
		allocator.Release(port)
	}
}