fetch_ml/tests/unit/scheduler/plugin_quota_test.go
Jeremie Fraeys da104367d6
Some checks failed
Build Pipeline / Build Binaries (push) Failing after 1m59s
Build Pipeline / Build Docker Images (push) Has been skipped
Build Pipeline / Sign HIPAA Config (push) Has been skipped
Build Pipeline / Generate SLSA Provenance (push) Has been skipped
Checkout test / test (push) Successful in 5s
CI Pipeline / Test (ubuntu-latest on self-hosted) (push) Failing after 1s
CI Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI Pipeline / Security Scan (push) Has been skipped
CI Pipeline / Test Scripts (push) Has been skipped
CI Pipeline / Test Native Libraries (push) Has been skipped
CI Pipeline / Native Library Build Matrix (push) Has been skipped
Documentation / build-and-publish (push) Failing after 35s
CI Pipeline / Trigger Build Workflow (push) Failing after 0s
Security Scan / Security Analysis (push) Has been cancelled
Security Scan / Native Library Security (push) Has been cancelled
Verification & Maintenance / V.1 - Schema Drift Detection (push) Has been cancelled
Verification & Maintenance / V.4 - Custom Go Vet Analyzers (push) Has been cancelled
Verification & Maintenance / V.7 - Audit Chain Integrity (push) Has been cancelled
Verification & Maintenance / V.6 - Extended Security Scanning (push) Has been cancelled
Verification & Maintenance / V.10 - OpenSSF Scorecard (push) Has been cancelled
Verification & Maintenance / Verification Summary (push) Has been cancelled
feat: add Plugin GPU Quota implementation and tests
- Add plugin_quota.go with GPU quota management for scheduler

- Update scheduler hub and protocol for plugin support

- Add comprehensive plugin quota unit tests

- Update gang service and WebSocket queue integration tests
2026-02-26 14:35:05 -05:00

385 lines
10 KiB
Go

package scheduler_test
import (
"testing"
"github.com/jfraeys/fetch_ml/internal/scheduler"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestPluginQuotaManager_CheckQuota_Disabled expects CheckQuota to accept a
// request far above TotalGPUs when the quota configuration is not enabled.
func TestPluginQuotaManager_CheckQuota_Disabled(t *testing.T) {
	// TotalGPUs is deliberately tiny: the 100-GPU request below would be
	// over the limit if the limit were being applied.
	mgr := scheduler.NewPluginQuotaManager(scheduler.PluginQuotaConfig{
		Enabled:   false,
		TotalGPUs: 1,
	})

	assert.NoError(t, mgr.CheckQuota("user1", "plugin1", 100))
}
// TestPluginQuotaManager_CheckQuota_GlobalLimit fills the global GPU pool to
// exactly TotalGPUs across two users and expects the next request, from a
// third user, to be rejected with a global-limit error.
func TestPluginQuotaManager_CheckQuota_GlobalLimit(t *testing.T) {
	config := scheduler.PluginQuotaConfig{
		Enabled:   true,
		TotalGPUs: 4,
	}
	m := scheduler.NewPluginQuotaManager(config)

	// First job should succeed.
	err := m.CheckQuota("user1", "plugin1", 2)
	require.NoError(t, err)
	// Record the usage so the next check sees it.
	m.RecordUsage("user1", "plugin1", 2)

	// Second job should succeed (2+2=4, within limit).
	err = m.CheckQuota("user2", "plugin2", 2)
	require.NoError(t, err)
	m.RecordUsage("user2", "plugin2", 2)

	// Third job should fail (would exceed global limit).
	err = m.CheckQuota("user3", "plugin3", 1)
	// require rather than assert: err.Error() below would panic on a nil
	// error and take the whole test binary down with it.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "global GPU limit exceeded")
}
// TestPluginQuotaManager_CheckQuota_PerUserGPULimit brings user1 up to the
// PerUserGPUs limit, expects one more GPU for user1 to be rejected, and
// expects a different user to still be admitted.
func TestPluginQuotaManager_CheckQuota_PerUserGPULimit(t *testing.T) {
	config := scheduler.PluginQuotaConfig{
		Enabled:     true,
		TotalGPUs:   10,
		PerUserGPUs: 3,
	}
	m := scheduler.NewPluginQuotaManager(config)

	// User1: first job should succeed.
	err := m.CheckQuota("user1", "plugin1", 2)
	require.NoError(t, err)
	m.RecordUsage("user1", "plugin1", 2)

	// User1: second job should succeed (2+1=3, at limit).
	err = m.CheckQuota("user1", "plugin2", 1)
	require.NoError(t, err)
	m.RecordUsage("user1", "plugin2", 1)

	// User1: third job should fail (would exceed per-user limit).
	err = m.CheckQuota("user1", "plugin3", 1)
	// Guard the message check: err.Error() would panic on a nil error. A
	// guard (instead of require) lets the independent user2 check below
	// still run if this one fails.
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "user user1 GPU limit exceeded")
	}

	// User2: job should succeed (different user).
	err = m.CheckQuota("user2", "plugin1", 3)
	assert.NoError(t, err)
}
// TestPluginQuotaManager_CheckQuota_PerUserServiceLimit records two services
// for user1 (the configured PerUserServices) and expects a third service to
// be rejected with a service-limit error. The GPU limits are set high enough
// that the 1-GPU requests used here stay well below them.
func TestPluginQuotaManager_CheckQuota_PerUserServiceLimit(t *testing.T) {
	config := scheduler.PluginQuotaConfig{
		Enabled:         true,
		TotalGPUs:       10,
		PerUserGPUs:     10,
		PerUserServices: 2,
	}
	m := scheduler.NewPluginQuotaManager(config)

	// User1: first service should succeed.
	err := m.CheckQuota("user1", "plugin1", 1)
	require.NoError(t, err)
	m.RecordUsage("user1", "plugin1", 1)

	// User1: second service should succeed.
	err = m.CheckQuota("user1", "plugin2", 1)
	require.NoError(t, err)
	m.RecordUsage("user1", "plugin2", 1)

	// User1: third service should fail (would exceed service count limit).
	err = m.CheckQuota("user1", "plugin3", 1)
	// require rather than assert: err.Error() below would panic on a nil
	// error.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "user user1 service limit exceeded")
}
// TestPluginQuotaManager_CheckQuota_UserOverride expects a user listed in
// UserOverrides to be admitted for GPU counts that are rejected for a user
// who only gets the default PerUserGPUs limit.
func TestPluginQuotaManager_CheckQuota_UserOverride(t *testing.T) {
	config := scheduler.PluginQuotaConfig{
		Enabled:         true,
		TotalGPUs:       10,
		PerUserGPUs:     2,
		PerUserServices: 2,
		UserOverrides: map[string]scheduler.UserLimit{
			"vip-user": {
				MaxGPUs:     5,
				MaxServices: 10,
			},
		},
	}
	m := scheduler.NewPluginQuotaManager(config)

	// Regular user: limited by default (3 > PerUserGPUs).
	err := m.CheckQuota("regular", "plugin1", 3)
	// Guard the message check: err.Error() would panic on a nil error. A
	// guard (instead of require) keeps the independent VIP checks running.
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "regular GPU limit exceeded")
	}

	// VIP user: has higher limit.
	err = m.CheckQuota("vip-user", "plugin1", 4)
	require.NoError(t, err)
	m.RecordUsage("vip-user", "plugin1", 4)

	// VIP user: still within limit (4+1=5).
	err = m.CheckQuota("vip-user", "plugin2", 1)
	assert.NoError(t, err)
}
// TestPluginQuotaManager_CheckQuota_PluginSpecificLimit expects the per-plugin
// MaxGPUs from PerPluginLimits to be enforced across users: a second jupyter
// request is rejected once it would exceed jupyter's limit, while vllm, which
// has a larger limit, is still admitted.
func TestPluginQuotaManager_CheckQuota_PluginSpecificLimit(t *testing.T) {
	config := scheduler.PluginQuotaConfig{
		Enabled:     true,
		TotalGPUs:   10,
		PerUserGPUs: 10,
		PerPluginLimits: map[string]scheduler.PluginLimit{
			"jupyter": {
				MaxGPUs:     3,
				MaxServices: 2,
			},
			"vllm": {
				MaxGPUs:     8,
				MaxServices: 4,
			},
		},
	}
	m := scheduler.NewPluginQuotaManager(config)

	// Jupyter: within plugin GPU limit.
	err := m.CheckQuota("user1", "jupyter", 2)
	require.NoError(t, err)
	m.RecordUsage("user1", "jupyter", 2)

	// Jupyter: exceed plugin GPU limit (but within global and user limits).
	err = m.CheckQuota("user2", "jupyter", 2)
	// Guard the message check: err.Error() would panic on a nil error. A
	// guard (instead of require) keeps the independent vLLM check running.
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "plugin jupyter GPU limit exceeded")
	}

	// vLLM: within its higher limit.
	err = m.CheckQuota("user1", "vllm", 4)
	assert.NoError(t, err)
}
// TestPluginQuotaManager_CheckQuota_PluginServiceLimit expects the per-plugin
// MaxServices limit to count services across users: after two different users
// each record one jupyter service, a third user's request is rejected.
func TestPluginQuotaManager_CheckQuota_PluginServiceLimit(t *testing.T) {
	config := scheduler.PluginQuotaConfig{
		Enabled:         true,
		TotalGPUs:       10,
		PerUserGPUs:     10,
		PerUserServices: 10,
		PerPluginLimits: map[string]scheduler.PluginLimit{
			"jupyter": {
				MaxGPUs:     10,
				MaxServices: 2, // Only 2 jupyter services total
			},
		},
	}
	m := scheduler.NewPluginQuotaManager(config)

	// First jupyter service.
	err := m.CheckQuota("user1", "jupyter", 1)
	require.NoError(t, err)
	m.RecordUsage("user1", "jupyter", 1)

	// Second jupyter service (different user).
	err = m.CheckQuota("user2", "jupyter", 1)
	require.NoError(t, err)
	m.RecordUsage("user2", "jupyter", 1)

	// Third jupyter service should fail (plugin service limit reached).
	err = m.CheckQuota("user3", "jupyter", 1)
	// require rather than assert: err.Error() below would panic on a nil
	// error.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "plugin jupyter service limit exceeded")
}
// TestPluginQuotaManager_CheckQuota_AllowedPlugins expects a user whose
// override lists AllowedPlugins to be admitted only for the listed plugin,
// while a user without an override may use a plugin outside that list.
func TestPluginQuotaManager_CheckQuota_AllowedPlugins(t *testing.T) {
	config := scheduler.PluginQuotaConfig{
		Enabled:     true,
		TotalGPUs:   10,
		PerUserGPUs: 10,
		UserOverrides: map[string]scheduler.UserLimit{
			"restricted-user": {
				MaxGPUs:        5,
				MaxServices:    5,
				AllowedPlugins: []string{"jupyter"},
			},
		},
	}
	m := scheduler.NewPluginQuotaManager(config)

	// Restricted user can use allowed plugin.
	err := m.CheckQuota("restricted-user", "jupyter", 2)
	assert.NoError(t, err)

	// Restricted user cannot use other plugins.
	err = m.CheckQuota("restricted-user", "vllm", 2)
	// Guard the message check: err.Error() would panic on a nil error. A
	// guard keeps the independent regular-user check below running.
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "not allowed to use plugin vllm")
	}

	// Regular user can use any plugin.
	err = m.CheckQuota("regular-user", "vllm", 2)
	assert.NoError(t, err)
}
// TestPluginQuotaManager_RecordAndReleaseUsage records usage for two users
// across two plugins, checks the per-user and global views, then releases one
// of user1's allocations and checks both views again.
func TestPluginQuotaManager_RecordAndReleaseUsage(t *testing.T) {
	mgr := scheduler.NewPluginQuotaManager(scheduler.PluginQuotaConfig{
		Enabled:     true,
		TotalGPUs:   10,
		PerUserGPUs: 5,
	})

	// Seed: user1 holds jupyter(2) and vllm(1); user2 holds jupyter(3).
	mgr.RecordUsage("user1", "jupyter", 2)
	mgr.RecordUsage("user1", "vllm", 1)
	mgr.RecordUsage("user2", "jupyter", 3)

	// Per-user view for user1 before the release.
	before, beforeTotal := mgr.GetUsage("user1")
	assert.Equal(t, 2, before["jupyter"].GPUs)
	assert.Equal(t, 1, before["jupyter"].Services)
	assert.Equal(t, 1, before["vllm"].GPUs)
	assert.Equal(t, 1, before["vllm"].Services)
	assert.Equal(t, 3, beforeTotal)

	// Global view before the release; jupyter is 2 (user1) + 3 (user2).
	globalBefore, byPluginBefore := mgr.GetGlobalUsage()
	assert.Equal(t, 6, globalBefore)
	assert.Equal(t, 5, byPluginBefore["jupyter"])
	assert.Equal(t, 1, byPluginBefore["vllm"])

	// Give back user1's jupyter allocation.
	mgr.ReleaseUsage("user1", "jupyter", 2)

	// Per-user view after the release: only the vllm allocation is left.
	after, afterTotal := mgr.GetUsage("user1")
	assert.Equal(t, 0, after["jupyter"].GPUs)
	assert.Equal(t, 0, after["jupyter"].Services)
	assert.Equal(t, 1, after["vllm"].GPUs)
	assert.Equal(t, 1, afterTotal)

	// Global view after the release; the remaining jupyter GPUs are user2's.
	globalAfter, byPluginAfter := mgr.GetGlobalUsage()
	assert.Equal(t, 4, globalAfter)
	assert.Equal(t, 3, byPluginAfter["jupyter"])
	assert.Equal(t, 1, byPluginAfter["vllm"])
}
// TestPluginQuotaManager_RecordUsage_Disabled expects RecordUsage on a
// disabled manager to complete without crashing and to leave nothing tracked
// for the user.
func TestPluginQuotaManager_RecordUsage_Disabled(t *testing.T) {
	mgr := scheduler.NewPluginQuotaManager(scheduler.PluginQuotaConfig{
		Enabled:   false,
		TotalGPUs: 10,
	})

	// Must not crash even though quota tracking is switched off.
	mgr.RecordUsage("user1", "plugin1", 5)

	// Nothing should have been tracked for the user.
	perPlugin, gpuTotal := mgr.GetUsage("user1")
	assert.Empty(t, perPlugin)
	assert.Equal(t, 0, gpuTotal)
}
// TestPluginQuotaManager_ReleaseUsage_NonExistent releases GPUs for a user
// that never recorded any and expects the call to complete without crashing
// and the global GPU count to stay at zero.
func TestPluginQuotaManager_ReleaseUsage_NonExistent(t *testing.T) {
	mgr := scheduler.NewPluginQuotaManager(scheduler.PluginQuotaConfig{
		Enabled:   true,
		TotalGPUs: 10,
	})

	// No usage was ever recorded for this user/plugin pair.
	mgr.ReleaseUsage("nonexistent", "plugin1", 5)

	// The global counter must not have moved away from zero.
	gpusInUse, _ := mgr.GetGlobalUsage()
	assert.Equal(t, 0, gpusInUse)
}
// TestPluginQuotaManager_CheckQuota_AnonymousUser expects requests with an
// empty user ID to be accounted under the name "anonymous": once the empty
// user has used its per-user GPU allowance, the rejection message names
// "anonymous".
func TestPluginQuotaManager_CheckQuota_AnonymousUser(t *testing.T) {
	config := scheduler.PluginQuotaConfig{
		Enabled:         true,
		TotalGPUs:       10,
		PerUserGPUs:     2,
		PerUserServices: 2,
	}
	m := scheduler.NewPluginQuotaManager(config)

	// Empty userID should be treated as "anonymous".
	err := m.CheckQuota("", "plugin1", 2)
	require.NoError(t, err)
	m.RecordUsage("", "plugin1", 2)

	// Second request from anonymous should fail (already at the GPU limit).
	err = m.CheckQuota("", "plugin1", 1)
	// require rather than assert: err.Error() below would panic on a nil
	// error.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "user anonymous GPU limit exceeded")
}
// TestPluginQuotaManager_CheckQuota_DefaultPlugin expects requests with an
// empty plugin name to be checked against the "default" entry of
// PerPluginLimits: the rejection message names plugin "default".
func TestPluginQuotaManager_CheckQuota_DefaultPlugin(t *testing.T) {
	config := scheduler.PluginQuotaConfig{
		Enabled:         true,
		TotalGPUs:       10,
		PerUserGPUs:     5,
		PerUserServices: 5,
		PerPluginLimits: map[string]scheduler.PluginLimit{
			"default": {
				MaxGPUs:     2,
				MaxServices: 2,
			},
		},
	}
	m := scheduler.NewPluginQuotaManager(config)

	// Empty plugin name should be treated as "default".
	err := m.CheckQuota("user1", "", 1)
	require.NoError(t, err)
	m.RecordUsage("user1", "", 1)

	// Exceed default plugin limit (1 recorded + 2 requested > 2).
	err = m.CheckQuota("user2", "", 2)
	// require rather than assert: err.Error() below would panic on a nil
	// error.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "plugin default GPU limit exceeded")
}
// TestPluginQuotaManager_ConcurrentAccess records one GPU per goroutine from
// several goroutines at once, split evenly between two users, and expects the
// global and per-user totals to add up once every goroutine has finished.
// It is most useful when run with the race detector (go test -race).
func TestPluginQuotaManager_ConcurrentAccess(t *testing.T) {
	// workers must stay even so both users receive the same share.
	const workers = 10

	config := scheduler.PluginQuotaConfig{
		Enabled:         true,
		TotalGPUs:       100,
		PerUserGPUs:     50,
		PerUserServices: 50,
	}
	m := scheduler.NewPluginQuotaManager(config)

	// Buffered to workers so no goroutine blocks on its completion signal.
	done := make(chan struct{}, workers)
	for i := 0; i < workers; i++ {
		go func(idx int) {
			// Even indices record for user1, odd indices for user2.
			user := "user2"
			if idx%2 == 0 {
				user = "user1"
			}
			m.RecordUsage(user, "plugin1", 1)
			done <- struct{}{}
		}(i)
	}

	// Wait for all goroutines before reading the totals.
	for i := 0; i < workers; i++ {
		<-done
	}

	// Every goroutine recorded exactly one GPU.
	globalGPUs, _ := m.GetGlobalUsage()
	assert.Equal(t, workers, globalGPUs)

	// Each user received half of the recordings.
	usage1, _ := m.GetUsage("user1")
	assert.Equal(t, workers/2, usage1["plugin1"].GPUs)
	assert.Equal(t, workers/2, usage1["plugin1"].Services)

	usage2, _ := m.GetUsage("user2")
	assert.Equal(t, workers/2, usage2["plugin1"].GPUs)
	assert.Equal(t, workers/2, usage2["plugin1"].Services)
}