Some checks failed
Build Pipeline / Build Binaries (push) Failing after 1m59s
Build Pipeline / Build Docker Images (push) Has been skipped
Build Pipeline / Sign HIPAA Config (push) Has been skipped
Build Pipeline / Generate SLSA Provenance (push) Has been skipped
Checkout test / test (push) Successful in 5s
CI Pipeline / Test (ubuntu-latest on self-hosted) (push) Failing after 1s
CI Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI Pipeline / Security Scan (push) Has been skipped
CI Pipeline / Test Scripts (push) Has been skipped
CI Pipeline / Test Native Libraries (push) Has been skipped
CI Pipeline / Native Library Build Matrix (push) Has been skipped
Documentation / build-and-publish (push) Failing after 35s
CI Pipeline / Trigger Build Workflow (push) Failing after 0s
Security Scan / Security Analysis (push) Has been cancelled
Security Scan / Native Library Security (push) Has been cancelled
Verification & Maintenance / V.1 - Schema Drift Detection (push) Has been cancelled
Verification & Maintenance / V.4 - Custom Go Vet Analyzers (push) Has been cancelled
Verification & Maintenance / V.7 - Audit Chain Integrity (push) Has been cancelled
Verification & Maintenance / V.6 - Extended Security Scanning (push) Has been cancelled
Verification & Maintenance / V.10 - OpenSSF Scorecard (push) Has been cancelled
Verification & Maintenance / Verification Summary (push) Has been cancelled
- Add plugin_quota.go with GPU quota management for scheduler
- Update scheduler hub and protocol for plugin support
- Add comprehensive plugin quota unit tests
- Update gang service and WebSocket queue integration tests
385 lines
10 KiB
Go
385 lines
10 KiB
Go
package scheduler_test
|
|
|
|
import (
|
|
"testing"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/scheduler"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
func TestPluginQuotaManager_CheckQuota_Disabled(t *testing.T) {
|
|
// When quota is disabled, all jobs should pass
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: false,
|
|
TotalGPUs: 1, // Set a low limit that would fail if enabled
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
err := m.CheckQuota("user1", "plugin1", 100)
|
|
assert.NoError(t, err)
|
|
}
|
|
|
|
func TestPluginQuotaManager_CheckQuota_GlobalLimit(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 4,
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// First job should succeed
|
|
err := m.CheckQuota("user1", "plugin1", 2)
|
|
require.NoError(t, err)
|
|
|
|
// Record the usage
|
|
m.RecordUsage("user1", "plugin1", 2)
|
|
|
|
// Second job should succeed (2+2=4, within limit)
|
|
err = m.CheckQuota("user2", "plugin2", 2)
|
|
require.NoError(t, err)
|
|
m.RecordUsage("user2", "plugin2", 2)
|
|
|
|
// Third job should fail (would exceed global limit)
|
|
err = m.CheckQuota("user3", "plugin3", 1)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "global GPU limit exceeded")
|
|
}
|
|
|
|
func TestPluginQuotaManager_CheckQuota_PerUserGPULimit(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 10,
|
|
PerUserGPUs: 3,
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// User1: first job should succeed
|
|
err := m.CheckQuota("user1", "plugin1", 2)
|
|
require.NoError(t, err)
|
|
m.RecordUsage("user1", "plugin1", 2)
|
|
|
|
// User1: second job should succeed (2+1=3, at limit)
|
|
err = m.CheckQuota("user1", "plugin2", 1)
|
|
require.NoError(t, err)
|
|
m.RecordUsage("user1", "plugin2", 1)
|
|
|
|
// User1: third job should fail (would exceed per-user limit)
|
|
err = m.CheckQuota("user1", "plugin3", 1)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "user user1 GPU limit exceeded")
|
|
|
|
// User2: job should succeed (different user)
|
|
err = m.CheckQuota("user2", "plugin1", 3)
|
|
assert.NoError(t, err)
|
|
}
|
|
|
|
func TestPluginQuotaManager_CheckQuota_PerUserServiceLimit(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 10,
|
|
PerUserGPUs: 10,
|
|
PerUserServices: 2,
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// User1: first service should succeed
|
|
err := m.CheckQuota("user1", "plugin1", 1)
|
|
require.NoError(t, err)
|
|
m.RecordUsage("user1", "plugin1", 1)
|
|
|
|
// User1: second service should succeed
|
|
err = m.CheckQuota("user1", "plugin2", 1)
|
|
require.NoError(t, err)
|
|
m.RecordUsage("user1", "plugin2", 1)
|
|
|
|
// User1: third service should fail (would exceed service count limit)
|
|
err = m.CheckQuota("user1", "plugin3", 1)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "user user1 service limit exceeded")
|
|
}
|
|
|
|
func TestPluginQuotaManager_CheckQuota_UserOverride(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 10,
|
|
PerUserGPUs: 2,
|
|
PerUserServices: 2,
|
|
UserOverrides: map[string]scheduler.UserLimit{
|
|
"vip-user": {
|
|
MaxGPUs: 5,
|
|
MaxServices: 10,
|
|
},
|
|
},
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// Regular user: limited by default
|
|
err := m.CheckQuota("regular", "plugin1", 3)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "regular GPU limit exceeded")
|
|
|
|
// VIP user: has higher limit
|
|
err = m.CheckQuota("vip-user", "plugin1", 4)
|
|
require.NoError(t, err)
|
|
m.RecordUsage("vip-user", "plugin1", 4)
|
|
|
|
// VIP user: still within limit
|
|
err = m.CheckQuota("vip-user", "plugin2", 1)
|
|
assert.NoError(t, err)
|
|
}
|
|
|
|
func TestPluginQuotaManager_CheckQuota_PluginSpecificLimit(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 10,
|
|
PerUserGPUs: 10,
|
|
PerPluginLimits: map[string]scheduler.PluginLimit{
|
|
"jupyter": {
|
|
MaxGPUs: 3,
|
|
MaxServices: 2,
|
|
},
|
|
"vllm": {
|
|
MaxGPUs: 8,
|
|
MaxServices: 4,
|
|
},
|
|
},
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// Jupyter: within plugin GPU limit
|
|
err := m.CheckQuota("user1", "jupyter", 2)
|
|
require.NoError(t, err)
|
|
m.RecordUsage("user1", "jupyter", 2)
|
|
|
|
// Jupyter: exceed plugin GPU limit (but within global and user limits)
|
|
err = m.CheckQuota("user2", "jupyter", 2)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "plugin jupyter GPU limit exceeded")
|
|
|
|
// vLLM: within its higher limit
|
|
err = m.CheckQuota("user1", "vllm", 4)
|
|
assert.NoError(t, err)
|
|
}
|
|
|
|
func TestPluginQuotaManager_CheckQuota_PluginServiceLimit(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 10,
|
|
PerUserGPUs: 10,
|
|
PerUserServices: 10,
|
|
PerPluginLimits: map[string]scheduler.PluginLimit{
|
|
"jupyter": {
|
|
MaxGPUs: 10,
|
|
MaxServices: 2, // Only 2 jupyter services total
|
|
},
|
|
},
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// First jupyter service
|
|
err := m.CheckQuota("user1", "jupyter", 1)
|
|
require.NoError(t, err)
|
|
m.RecordUsage("user1", "jupyter", 1)
|
|
|
|
// Second jupyter service (different user)
|
|
err = m.CheckQuota("user2", "jupyter", 1)
|
|
require.NoError(t, err)
|
|
m.RecordUsage("user2", "jupyter", 1)
|
|
|
|
// Third jupyter service should fail (plugin service limit reached)
|
|
err = m.CheckQuota("user3", "jupyter", 1)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "plugin jupyter service limit exceeded")
|
|
}
|
|
|
|
func TestPluginQuotaManager_CheckQuota_AllowedPlugins(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 10,
|
|
PerUserGPUs: 10,
|
|
UserOverrides: map[string]scheduler.UserLimit{
|
|
"restricted-user": {
|
|
MaxGPUs: 5,
|
|
MaxServices: 5,
|
|
AllowedPlugins: []string{"jupyter"},
|
|
},
|
|
},
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// Restricted user can use allowed plugin
|
|
err := m.CheckQuota("restricted-user", "jupyter", 2)
|
|
assert.NoError(t, err)
|
|
|
|
// Restricted user cannot use other plugins
|
|
err = m.CheckQuota("restricted-user", "vllm", 2)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "not allowed to use plugin vllm")
|
|
|
|
// Regular user can use any plugin
|
|
err = m.CheckQuota("regular-user", "vllm", 2)
|
|
assert.NoError(t, err)
|
|
}
|
|
|
|
func TestPluginQuotaManager_RecordAndReleaseUsage(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 10,
|
|
PerUserGPUs: 5,
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// Record usage
|
|
m.RecordUsage("user1", "jupyter", 2)
|
|
m.RecordUsage("user1", "vllm", 1)
|
|
m.RecordUsage("user2", "jupyter", 3)
|
|
|
|
// Check usage tracking
|
|
usage, totalGPUs := m.GetUsage("user1")
|
|
assert.Equal(t, 2, usage["jupyter"].GPUs)
|
|
assert.Equal(t, 1, usage["jupyter"].Services)
|
|
assert.Equal(t, 1, usage["vllm"].GPUs)
|
|
assert.Equal(t, 1, usage["vllm"].Services)
|
|
assert.Equal(t, 3, totalGPUs)
|
|
|
|
// Check global usage
|
|
globalGPUs, pluginTotals := m.GetGlobalUsage()
|
|
assert.Equal(t, 6, globalGPUs)
|
|
assert.Equal(t, 5, pluginTotals["jupyter"]) // 2+3
|
|
assert.Equal(t, 1, pluginTotals["vllm"])
|
|
|
|
// Release usage
|
|
m.ReleaseUsage("user1", "jupyter", 2)
|
|
|
|
// Verify release
|
|
usage, totalGPUs = m.GetUsage("user1")
|
|
assert.Equal(t, 0, usage["jupyter"].GPUs)
|
|
assert.Equal(t, 0, usage["jupyter"].Services)
|
|
assert.Equal(t, 1, usage["vllm"].GPUs) // user1 still has vllm
|
|
assert.Equal(t, 1, totalGPUs) // only vllm remains for user1
|
|
|
|
// Check global usage after release
|
|
globalGPUs, pluginTotals = m.GetGlobalUsage()
|
|
assert.Equal(t, 4, globalGPUs)
|
|
assert.Equal(t, 3, pluginTotals["jupyter"]) // 3 from user2
|
|
assert.Equal(t, 1, pluginTotals["vllm"])
|
|
}
|
|
|
|
func TestPluginQuotaManager_RecordUsage_Disabled(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: false,
|
|
TotalGPUs: 10,
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// Recording usage when disabled should not crash
|
|
m.RecordUsage("user1", "plugin1", 5)
|
|
|
|
// Usage should be empty (not tracked)
|
|
usage, totalGPUs := m.GetUsage("user1")
|
|
assert.Empty(t, usage)
|
|
assert.Equal(t, 0, totalGPUs)
|
|
}
|
|
|
|
func TestPluginQuotaManager_ReleaseUsage_NonExistent(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 10,
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// Releasing non-existent usage should not crash or go negative
|
|
m.ReleaseUsage("nonexistent", "plugin1", 5)
|
|
|
|
// Global usage should remain 0
|
|
globalGPUs, _ := m.GetGlobalUsage()
|
|
assert.Equal(t, 0, globalGPUs)
|
|
}
|
|
|
|
func TestPluginQuotaManager_CheckQuota_AnonymousUser(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 10,
|
|
PerUserGPUs: 2,
|
|
PerUserServices: 2,
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// Empty userID should be treated as "anonymous"
|
|
err := m.CheckQuota("", "plugin1", 2)
|
|
require.NoError(t, err)
|
|
m.RecordUsage("", "plugin1", 2)
|
|
|
|
// Second request from anonymous should fail (at limit)
|
|
err = m.CheckQuota("", "plugin1", 1)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "user anonymous GPU limit exceeded")
|
|
}
|
|
|
|
func TestPluginQuotaManager_CheckQuota_DefaultPlugin(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 10,
|
|
PerUserGPUs: 5,
|
|
PerUserServices: 5,
|
|
PerPluginLimits: map[string]scheduler.PluginLimit{
|
|
"default": {
|
|
MaxGPUs: 2,
|
|
MaxServices: 2,
|
|
},
|
|
},
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// Empty plugin name should be treated as "default"
|
|
err := m.CheckQuota("user1", "", 1)
|
|
require.NoError(t, err)
|
|
m.RecordUsage("user1", "", 1)
|
|
|
|
// Exceed default plugin limit
|
|
err = m.CheckQuota("user2", "", 2)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "plugin default GPU limit exceeded")
|
|
}
|
|
|
|
func TestPluginQuotaManager_ConcurrentAccess(t *testing.T) {
|
|
config := scheduler.PluginQuotaConfig{
|
|
Enabled: true,
|
|
TotalGPUs: 100,
|
|
PerUserGPUs: 50,
|
|
PerUserServices: 50,
|
|
}
|
|
m := scheduler.NewPluginQuotaManager(config)
|
|
|
|
// Concurrently record usage from multiple goroutines
|
|
done := make(chan bool, 10)
|
|
for i := 0; i < 10; i++ {
|
|
go func(idx int) {
|
|
user := "user"
|
|
if idx%2 == 0 {
|
|
user = "user1"
|
|
} else {
|
|
user = "user2"
|
|
}
|
|
m.RecordUsage(user, "plugin1", 1)
|
|
done <- true
|
|
}(i)
|
|
}
|
|
|
|
// Wait for all goroutines
|
|
for i := 0; i < 10; i++ {
|
|
<-done
|
|
}
|
|
|
|
// Verify totals
|
|
globalGPUs, _ := m.GetGlobalUsage()
|
|
assert.Equal(t, 10, globalGPUs)
|
|
|
|
usage1, _ := m.GetUsage("user1")
|
|
assert.Equal(t, 5, usage1["plugin1"].GPUs)
|
|
assert.Equal(t, 5, usage1["plugin1"].Services)
|
|
|
|
usage2, _ := m.GetUsage("user2")
|
|
assert.Equal(t, 5, usage2["plugin1"].GPUs)
|
|
assert.Equal(t, 5, usage2["plugin1"].Services)
|
|
}
|