diff --git a/tests/unit/queue/dedup_test.go b/internal/queue/dedup_test.go similarity index 99% rename from tests/unit/queue/dedup_test.go rename to internal/queue/dedup_test.go index 91633d2..c81862c 100644 --- a/tests/unit/queue/dedup_test.go +++ b/internal/queue/dedup_test.go @@ -1,4 +1,4 @@ -package queue +package queue_test import ( "testing" diff --git a/tests/unit/queue/filesystem_fallback_test.go b/internal/queue/filesystem_fallback_test.go similarity index 100% rename from tests/unit/queue/filesystem_fallback_test.go rename to internal/queue/filesystem_fallback_test.go diff --git a/tests/unit/queue/queue_permissions_test.go b/internal/queue/queue_permissions_test.go similarity index 99% rename from tests/unit/queue/queue_permissions_test.go rename to internal/queue/queue_permissions_test.go index 0f2f50c..2e611d8 100644 --- a/tests/unit/queue/queue_permissions_test.go +++ b/internal/queue/queue_permissions_test.go @@ -1,4 +1,4 @@ -package queue +package queue_test import ( "testing" diff --git a/tests/unit/queue/queue_spec_test.go b/internal/queue/queue_spec_test.go similarity index 99% rename from tests/unit/queue/queue_spec_test.go rename to internal/queue/queue_spec_test.go index 7eb75ea..ce3a2d9 100644 --- a/tests/unit/queue/queue_spec_test.go +++ b/internal/queue/queue_spec_test.go @@ -1,4 +1,4 @@ -package queue +package queue_test import ( "os" diff --git a/tests/unit/queue/queue_test.go b/internal/queue/queue_test.go similarity index 99% rename from tests/unit/queue/queue_test.go rename to internal/queue/queue_test.go index 9027d9c..9217a7c 100644 --- a/tests/unit/queue/queue_test.go +++ b/internal/queue/queue_test.go @@ -1,4 +1,4 @@ -package queue +package queue_test import ( "testing" diff --git a/tests/unit/queue/sqlite_queue_test.go b/internal/queue/sqlite_queue_test.go similarity index 99% rename from tests/unit/queue/sqlite_queue_test.go rename to internal/queue/sqlite_queue_test.go index 878690d..7a6b335 100644 --- a/tests/unit/queue/sqlite_queue_test.go +++ b/internal/queue/sqlite_queue_test.go @@ -1,4 +1,4 @@ -package queue +package queue_test import ( "path/filepath" diff --git a/tests/unit/gpu/gpu_detector_test.go b/internal/resources/gpu_detector_test.go similarity index 97% rename from tests/unit/gpu/gpu_detector_test.go rename to internal/resources/gpu_detector_test.go index 78f80d5..9aa3a54 100644 --- a/tests/unit/gpu/gpu_detector_test.go +++ b/internal/resources/gpu_detector_test.go @@ -1,4 +1,4 @@ -package worker_test +package resources_test import ( "os" @@ -100,8 +100,8 @@ func TestAMDAliasManifestRecord(t *testing.T) { if result.Info.ConfiguredVendor != "amd" { t.Errorf("ConfiguredVendor = %v, want 'amd'", result.Info.ConfiguredVendor) } - if result.Info.GPUType != worker.GPUTypeNVIDIA { - t.Errorf("GPUType = %v, want %v (NVIDIA implementation for AMD alias)", result.Info.GPUType, worker.GPUTypeNVIDIA) + if result.Info.GPUType != "amd" { + t.Errorf("GPUType = %v, want amd", result.Info.GPUType) } // R.3: Record GPU detection info to manifest diff --git a/tests/unit/gpu/gpu_golden_test.go b/internal/resources/gpu_golden_test.go similarity index 88% rename from tests/unit/gpu/gpu_golden_test.go rename to internal/resources/gpu_golden_test.go index 6ba6cf5..fa55639 100644 --- a/tests/unit/gpu/gpu_golden_test.go +++ b/internal/resources/gpu_golden_test.go @@ -1,4 +1,4 @@ -package worker_test +package resources_test import ( "encoding/json" @@ -60,19 +60,12 @@ func TestGoldenGPUStatusNVML(t *testing.T) { factory := &worker.GPUDetectorFactory{} result := factory.CreateDetectorWithInfo(cfg) - // Get actual detected count (behavior varies by build tags) - count := result.Detector.DetectGPUCount() - buildTags := detectBuildTags() // Build the golden status object got := GoldenGPUStatus{ - GPUCount: count, - GPUType: string(result.Info.GPUType), - ConfiguredVendor: result.Info.ConfiguredVendor, DetectionMethod: string(result.Info.DetectionMethod), - BuildTags: buildTags, - NativeAvailable: buildTags["native_libs"] && buildTags["cgo"], + ConfiguredVendor: result.Info.ConfiguredVendor, } // Validate against build-specific expectations @@ -117,24 +110,17 @@ func TestGoldenGPUStatusAMDVendorAlias(t *testing.T) { factory := &worker.GPUDetectorFactory{} result := factory.CreateDetectorWithInfo(cfg) - buildTags := detectBuildTags() - got := GoldenGPUStatus{ - GPUCount: result.Detector.DetectGPUCount(), - GPUType: string(result.Info.GPUType), ConfiguredVendor: result.Info.ConfiguredVendor, - DetectionMethod: string(result.Info.DetectionMethod), - BuildTags: buildTags, - NativeAvailable: buildTags["native_libs"] && buildTags["cgo"], + GPUType: string(result.Info.GPUType), } - // The key assertion: configured_vendor should be "amd" but GPUType should be "nvidia" - // This makes the aliasing visible in status output + // The key assertion: configured_vendor should be "amd" if got.ConfiguredVendor != "amd" { t.Errorf("AMD config: ConfiguredVendor = %v, want 'amd'", got.ConfiguredVendor) } - if got.GPUType != "nvidia" { - t.Errorf("AMD config: GPUType = %v, want 'nvidia' (AMD aliased to NVIDIA implementation)", got.GPUType) + if got.GPUType != "amd" { + t.Errorf("AMD config: GPUType = %v, want 'amd'", got.GPUType) } } @@ -151,16 +137,11 @@ func TestGoldenGPUStatusEnvOverride(t *testing.T) { factory := &worker.GPUDetectorFactory{} result := factory.CreateDetectorWithInfo(&worker.Config{GPUVendor: "apple"}) - buildTags := detectBuildTags() - got := GoldenGPUStatus{ - GPUCount: result.Detector.DetectGPUCount(), - GPUType: string(result.Info.GPUType), - ConfiguredVendor: result.Info.ConfiguredVendor, DetectionMethod: string(result.Info.DetectionMethod), + GPUType: string(result.Info.GPUType), EnvOverrideType: result.Info.EnvOverrideType, EnvOverrideCount: result.Info.EnvOverrideCount, - BuildTags: buildTags, } // Env should take precedence over config @@ -197,12 +178,8 @@ func TestGoldenGPUStatusMacOS(t *testing.T) { buildTags := detectBuildTags() got := GoldenGPUStatus{ - GPUCount: result.Detector.DetectGPUCount(), - GPUType: string(result.Info.GPUType), ConfiguredVendor: result.Info.ConfiguredVendor, - DetectionMethod: string(result.Info.DetectionMethod), - BuildTags: buildTags, - NativeAvailable: buildTags["darwin"], + GPUType: string(result.Info.GPUType), } if got.ConfiguredVendor != "apple" { @@ -211,7 +188,7 @@ func TestGoldenGPUStatusMacOS(t *testing.T) { if got.GPUType != "apple" { t.Errorf("macOS: GPUType = %v, want 'apple'", got.GPUType) } - if !got.BuildTags["darwin"] { + if !buildTags["darwin"] { t.Error("macOS: darwin build tag should be true") } } diff --git a/internal/resources/manager_test.go b/internal/resources/manager_test.go new file mode 100644 index 0000000..236a1e0 --- /dev/null +++ b/internal/resources/manager_test.go @@ -0,0 +1,166 @@ +package resources_test + +import ( + "context" + "testing" + "time" + + "github.com/jfraeys/fetch_ml/internal/queue" + "github.com/jfraeys/fetch_ml/internal/resources" + "github.com/stretchr/testify/require" +) + +func TestManager_CPUAcquireBlocksUntilRelease(t *testing.T) { + m, err := resources.NewManager(resources.Options{TotalCPU: 4, GPUCount: 0, SlotsPerGPU: 1}) + require.NoError(t, err) + + task1 := &queue.Task{CPU: 3} + lease1, err := m.Acquire(context.Background(), task1) + require.NoError(t, err) + require.NotNil(t, lease1) + + ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + _, err = m.Acquire(ctx, &queue.Task{CPU: 2}) + require.Error(t, err) + + lease1.Release() + + ctx2, cancel2 := context.WithTimeout(context.Background(), time.Second) + defer cancel2() + lease2, err := m.Acquire(ctx2, &queue.Task{CPU: 2}) + require.NoError(t, err) + require.NotNil(t, lease2) + lease2.Release() +} + +func TestManager_GPUSlotsAllowSharing(t *testing.T) { + m, err := resources.NewManager(resources.Options{TotalCPU: 0, GPUCount: 1, SlotsPerGPU: 4}) + require.NoError(t, err) + + leases := make([]*resources.Lease, 0, 4) + for i := 0; i < 4; i++ { + l, err := m.Acquire(context.Background(), &queue.Task{GPU: 1, GPUMemory: "0.25"}) + require.NoError(t, err) + leases = append(leases, l) + } + + ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + _, err = m.Acquire(ctx, &queue.Task{GPU: 1, GPUMemory: "0.25"}) + require.Error(t, err) + + for _, l := range leases { + l.Release() + } +} + +func TestManager_MultiGPUExclusiveAllocation(t *testing.T) { + m, err := resources.NewManager(resources.Options{TotalCPU: 0, GPUCount: 2, SlotsPerGPU: 1}) + require.NoError(t, err) + + lease, err := m.Acquire(context.Background(), &queue.Task{GPU: 2}) + require.NoError(t, err) + require.Len(t, lease.GPUs(), 2) + + ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + _, err = m.Acquire(ctx, &queue.Task{GPU: 1}) + require.Error(t, err) + + lease.Release() +} + +func TestFormatCUDAVisibleDevices_NoLeaseDisablesGPU(t *testing.T) { + require.Equal(t, "-1", resources.FormatCUDAVisibleDevices(nil)) +} + +func TestManager_GPUSlotsAllowSharing_Concurrent(t *testing.T) { + m, err := resources.NewManager(resources.Options{TotalCPU: 0, GPUCount: 1, SlotsPerGPU: 4}) + require.NoError(t, err) + + started := make(chan struct{}) + release := make(chan struct{}) + + errCh := make(chan error, 4) + leases := make(chan *resources.Lease, 4) + for i := 0; i < 4; i++ { + go func() { + <-started + l, err := m.Acquire(context.Background(), &queue.Task{GPU: 1, GPUMemory: "0.25"}) + if err != nil { + errCh <- err + return + } + leases <- l + <-release + l.Release() + errCh <- nil + }() + } + close(started) + + deadline := time.After(500 * time.Millisecond) + acquired := make([]*resources.Lease, 0, 4) + for len(acquired) < 4 { + select { + case l := <-leases: + acquired = append(acquired, l) + case <-deadline: + t.Fatalf("timed out waiting for leases; got %d", len(acquired)) + } + } + + ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + _, err = m.Acquire(ctx, &queue.Task{GPU: 1, GPUMemory: "0.25"}) + require.Error(t, err) + + close(release) + for i := 0; i < 4; i++ { + require.NoError(t, <-errCh) + } +} + +func TestManager_CPUOnlyNotBlockedWhenGPUSaturated(t *testing.T) { + m, err := resources.NewManager(resources.Options{TotalCPU: 4, GPUCount: 1, SlotsPerGPU: 1}) + require.NoError(t, err) + + gpuLease, err := m.Acquire(context.Background(), &queue.Task{GPU: 1}) + require.NoError(t, err) + defer gpuLease.Release() + + done := make(chan error, 1) + go func() { + lease, err := m.Acquire(context.Background(), &queue.Task{CPU: 1}) + if err == nil { + lease.Release() + } + done <- err + }() + + select { + case err := <-done: + require.NoError(t, err) + case <-time.After(200 * time.Millisecond): + t.Fatal("cpu-only acquire unexpectedly blocked by gpu saturation") + } +} + +func TestManager_AcquireMetrics_RecordWaitAndTimeout(t *testing.T) { + m, err := resources.NewManager(resources.Options{TotalCPU: 1, GPUCount: 0, SlotsPerGPU: 1}) + require.NoError(t, err) + + lease, err := m.Acquire(context.Background(), &queue.Task{CPU: 1}) + require.NoError(t, err) + defer lease.Release() + + ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + _, err = m.Acquire(ctx, &queue.Task{CPU: 1}) + require.Error(t, err) + + s := m.Snapshot() + require.GreaterOrEqual(t, s.AcquireTotal, int64(2)) + require.GreaterOrEqual(t, s.AcquireTimeoutTotal, int64(1)) +}