Add new scheduler component for distributed ML workload orchestration:
- Hub-based coordination for multi-worker clusters
- Pacing controller for rate limiting job submissions
- Priority queue with preemption support
- Port allocator for dynamic service discovery
- Protocol handlers for worker-scheduler communication
- Service manager with OS-specific implementations
- Connection management and state persistence
- Template system for service deployment

Includes comprehensive test suite:
- Unit tests for all core components
- Integration tests for distributed scenarios
- Benchmark tests for performance validation
- Mock fixtures for isolated testing

Refs: scheduler-architecture.md
264 lines
7.8 KiB
Go
264 lines
7.8 KiB
Go
package scheduler_test
|
|
|
|
import (
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/scheduler"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// TestJupyterLabTemplate validates the JupyterLab template configuration
|
|
func TestJupyterLabTemplate(t *testing.T) {
|
|
template := scheduler.JupyterLabTemplate
|
|
|
|
assert.Equal(t, "service", template.JobType)
|
|
assert.Equal(t, "service", template.SlotPool)
|
|
assert.Equal(t, 0, template.GPUCount)
|
|
|
|
// Verify command includes required flags
|
|
require.NotEmpty(t, template.Command)
|
|
assert.Contains(t, template.Command, "jupyter")
|
|
assert.Contains(t, template.Command, "lab")
|
|
assert.Contains(t, template.Command, "--ip=0.0.0.0")
|
|
assert.Contains(t, template.Command, "--port={{SERVICE_PORT}}")
|
|
assert.Contains(t, template.Command, "--no-browser")
|
|
|
|
// Verify health checks
|
|
assert.Equal(t, "http://localhost:{{SERVICE_PORT}}/api", template.HealthCheck.Liveness)
|
|
assert.Equal(t, "http://localhost:{{SERVICE_PORT}}/api/kernels", template.HealthCheck.Readiness)
|
|
assert.Equal(t, 15, template.HealthCheck.Interval)
|
|
assert.Equal(t, 5, template.HealthCheck.Timeout)
|
|
|
|
// Verify mounts
|
|
require.Len(t, template.Mounts, 1)
|
|
assert.Equal(t, "{{WORKSPACE}}", template.Mounts[0].Source)
|
|
assert.Equal(t, "/workspace", template.Mounts[0].Destination)
|
|
}
|
|
|
|
// TestJupyterNotebookTemplate validates the classic notebook template
|
|
func TestJupyterNotebookTemplate(t *testing.T) {
|
|
template := scheduler.JupyterNotebookTemplate
|
|
|
|
assert.Equal(t, "service", template.JobType)
|
|
assert.Equal(t, "service", template.SlotPool)
|
|
assert.Equal(t, 0, template.GPUCount)
|
|
|
|
// Verify uses notebook subcommand
|
|
require.NotEmpty(t, template.Command)
|
|
assert.Contains(t, template.Command, "notebook")
|
|
}
|
|
|
|
// TestVLLMTemplate validates the vLLM inference template
|
|
func TestVLLMTemplate(t *testing.T) {
|
|
template := scheduler.VLLMTemplate
|
|
|
|
assert.Equal(t, "service", template.JobType)
|
|
assert.Equal(t, "service", template.SlotPool)
|
|
assert.Equal(t, 1, template.GPUCount) // Requires GPU
|
|
|
|
// Verify command
|
|
require.NotEmpty(t, template.Command)
|
|
assert.Contains(t, template.Command, "vllm.entrypoints.openai.api_server")
|
|
assert.Contains(t, template.Command, "{{MODEL_NAME}}")
|
|
assert.Contains(t, template.Command, "{{SERVICE_PORT}}")
|
|
}
|
|
|
|
// TestPortAllocatorForServices validates port allocation for service jobs
|
|
func TestPortAllocatorForServices(t *testing.T) {
|
|
pa := scheduler.NewPortAllocator(10000, 10010)
|
|
|
|
// Allocate a port for Jupyter service
|
|
port1, err := pa.Allocate("jupyter-task-1")
|
|
require.NoError(t, err)
|
|
assert.True(t, port1 >= 10000 && port1 <= 10010)
|
|
|
|
// Verify we can get the task for this port
|
|
taskID := pa.GetAllocation(port1)
|
|
assert.Equal(t, "jupyter-task-1", taskID)
|
|
|
|
// Allocate another port
|
|
port2, err := pa.Allocate("jupyter-task-2")
|
|
require.NoError(t, err)
|
|
assert.NotEqual(t, port1, port2)
|
|
|
|
// Release first port
|
|
pa.Release(port1)
|
|
|
|
// Verify port is now available
|
|
taskID = pa.GetAllocation(port1)
|
|
assert.Equal(t, "", taskID)
|
|
|
|
// Can reallocate the same port
|
|
port3, err := pa.Allocate("jupyter-task-3")
|
|
require.NoError(t, err)
|
|
// Should get first available (which might be port1)
|
|
assert.True(t, port3 >= 10000 && port3 <= 10010)
|
|
}
|
|
|
|
// TestPortAllocatorExhaustion validates behavior when no ports available
|
|
func TestPortAllocatorExhaustion(t *testing.T) {
|
|
// Small range for testing
|
|
pa := scheduler.NewPortAllocator(20000, 20002)
|
|
|
|
// Allocate all ports
|
|
_, err := pa.Allocate("task-1")
|
|
require.NoError(t, err)
|
|
_, err = pa.Allocate("task-2")
|
|
require.NoError(t, err)
|
|
_, err = pa.Allocate("task-3")
|
|
require.NoError(t, err)
|
|
|
|
// Fourth allocation should fail
|
|
_, err = pa.Allocate("task-4")
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "no ports available")
|
|
}
|
|
|
|
// TestPortAllocatorTTL validates port TTL behavior
|
|
func TestPortAllocatorTTL(t *testing.T) {
|
|
pa := scheduler.NewPortAllocator(30000, 30010)
|
|
|
|
// Set short TTL for testing
|
|
pa.SetTTL(50 * time.Millisecond)
|
|
|
|
// Allocate a port
|
|
port1, err := pa.Allocate("test-task")
|
|
require.NoError(t, err)
|
|
|
|
// Release it (marks with expired timestamp due to short TTL)
|
|
pa.Release(port1)
|
|
|
|
// Immediately try to allocate - should get different port since released one is "expired"
|
|
port2, err := pa.Allocate("test-task-2")
|
|
require.NoError(t, err)
|
|
|
|
// Could be same or different depending on cleanup timing
|
|
assert.True(t, port2 >= 30000 && port2 <= 30010)
|
|
}
|
|
|
|
// TestServiceSlotPoolSeparation validates that service and batch use different pools
|
|
func TestServiceSlotPoolSeparation(t *testing.T) {
|
|
// This test validates the conceptual separation
|
|
// In practice, the scheduler maintains separate queues
|
|
|
|
// Use JupyterLabTemplate which has health checks configured
|
|
serviceJob := scheduler.JupyterLabTemplate
|
|
|
|
batchJob := scheduler.JobSpec{
|
|
ID: "batch-1",
|
|
SlotPool: "batch",
|
|
GPUCount: 1,
|
|
}
|
|
|
|
// Verify different slot pools
|
|
assert.Equal(t, "service", serviceJob.SlotPool)
|
|
assert.Equal(t, "batch", batchJob.SlotPool)
|
|
|
|
// Service job has health checks
|
|
assert.NotZero(t, serviceJob.HealthCheck.Interval)
|
|
|
|
// Batch job would typically not have health checks
|
|
// (it runs to completion)
|
|
}
|
|
|
|
// TestHealthCheckValidation validates health check configuration
|
|
func TestHealthCheckValidation(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
template scheduler.ServiceTemplate
|
|
valid bool
|
|
}{
|
|
{
|
|
name: "JupyterLab - valid",
|
|
template: scheduler.ServiceTemplate{
|
|
JobType: "service",
|
|
SlotPool: "service",
|
|
HealthCheck: scheduler.ServiceHealthCheck{
|
|
Liveness: "http://localhost:8888/api",
|
|
Readiness: "http://localhost:8888/api/kernels",
|
|
Interval: 15,
|
|
Timeout: 5,
|
|
},
|
|
},
|
|
valid: true,
|
|
},
|
|
{
|
|
name: "Missing liveness - invalid",
|
|
template: scheduler.ServiceTemplate{
|
|
JobType: "service",
|
|
SlotPool: "service",
|
|
HealthCheck: scheduler.ServiceHealthCheck{
|
|
Readiness: "http://localhost:8888/api",
|
|
Interval: 15,
|
|
},
|
|
},
|
|
valid: false,
|
|
},
|
|
{
|
|
name: "Zero interval - invalid",
|
|
template: scheduler.ServiceTemplate{
|
|
JobType: "service",
|
|
SlotPool: "service",
|
|
HealthCheck: scheduler.ServiceHealthCheck{
|
|
Liveness: "http://localhost:8888/api",
|
|
Readiness: "http://localhost:8888/api",
|
|
Interval: 0,
|
|
},
|
|
},
|
|
valid: false,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
hc := tt.template.HealthCheck
|
|
isValid := hc.Liveness != "" && hc.Interval > 0 && hc.Timeout > 0
|
|
assert.Equal(t, tt.valid, isValid)
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestDefaultPortRange validates the default service port range
|
|
func TestDefaultPortRange(t *testing.T) {
|
|
// Default range should be large enough for typical deployments
|
|
rangeSize := scheduler.DefaultServicePortEnd - scheduler.DefaultServicePortStart
|
|
assert.True(t, rangeSize >= 1000, "Default port range should be at least 1000 ports")
|
|
assert.Equal(t, 8000, scheduler.DefaultServicePortStart)
|
|
assert.Equal(t, 9000, scheduler.DefaultServicePortEnd)
|
|
}
|
|
|
|
// TestTemplateVariableExpansion validates template variables are present
|
|
func TestTemplateVariableExpansion(t *testing.T) {
|
|
template := scheduler.JupyterLabTemplate
|
|
|
|
// Check command contains template variables
|
|
hasServicePort := false
|
|
for _, cmd := range template.Command {
|
|
if cmd == "--port={{SERVICE_PORT}}" {
|
|
hasServicePort = true
|
|
break
|
|
}
|
|
}
|
|
assert.True(t, hasServicePort, "Command should contain {{SERVICE_PORT}} template variable")
|
|
|
|
// Check env contains secret template
|
|
val, ok := template.Env["JUPYTER_TOKEN"]
|
|
assert.True(t, ok, "Should have JUPYTER_TOKEN env var")
|
|
assert.Contains(t, val, "{{SECRET:", "Should use secret template")
|
|
}
|
|
|
|
// BenchmarkPortAllocation benchmarks port allocation performance
|
|
func BenchmarkPortAllocation(b *testing.B) {
|
|
pa := scheduler.NewPortAllocator(40000, 41000)
|
|
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
port, err := pa.Allocate("bench-task")
|
|
if err != nil {
|
|
b.Fatal(err)
|
|
}
|
|
pa.Release(port)
|
|
}
|
|
}
|