Refactor plugins to use interface for testability: - Add PodmanInterface to container package (StartContainer, StopContainer, RemoveContainer) - Update MLflow plugin to use container.PodmanInterface - Update TensorBoard plugin to use container.PodmanInterface - Add comprehensive mocked tests for all three plugins (wandb, mlflow, tensorboard) - Coverage increased from 18% to 91.4%
170 lines
4.5 KiB
Go
170 lines
4.5 KiB
Go
package domain_test
|
|
|
|
import (
|
|
"syscall"
|
|
"testing"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/domain"
|
|
"github.com/stretchr/testify/assert"
|
|
)
|
|
|
|
// TestClassifyFailureSIGKILL tests SIGKILL classification
|
|
func TestClassifyFailureSIGKILL(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
result := domain.ClassifyFailure(0, syscall.SIGKILL, "")
|
|
assert.Equal(t, domain.FailureInfrastructure, result)
|
|
}
|
|
|
|
// TestClassifyFailureCUDAOOM tests CUDA OOM classification
|
|
func TestClassifyFailureCUDAOOM(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
cases := []string{
|
|
"CUDA out of memory",
|
|
"cuda error: out of memory",
|
|
"GPU OOM detected",
|
|
}
|
|
|
|
for _, log := range cases {
|
|
result := domain.ClassifyFailure(1, nil, log)
|
|
assert.Equal(t, domain.FailureResource, result, "Failed for: %s", log)
|
|
}
|
|
}
|
|
|
|
// TestClassifyFailureGeneralOOM tests general OOM classification
|
|
func TestClassifyFailureGeneralOOM(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
cases := []string{
|
|
"Out of memory",
|
|
"Process was killed by OOM killer",
|
|
"cannot allocate memory",
|
|
}
|
|
|
|
for _, log := range cases {
|
|
result := domain.ClassifyFailure(1, nil, log)
|
|
assert.Equal(t, domain.FailureInfrastructure, result, "Failed for: %s", log)
|
|
}
|
|
}
|
|
|
|
// TestClassifyFailureDatasetHash tests dataset hash failure classification
|
|
func TestClassifyFailureDatasetHash(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
cases := []string{
|
|
"Hash mismatch detected",
|
|
"Checksum failed for dataset",
|
|
"dataset not found",
|
|
"dataset unreachable",
|
|
}
|
|
|
|
for _, log := range cases {
|
|
result := domain.ClassifyFailure(1, nil, log)
|
|
assert.Equal(t, domain.FailureData, result, "Failed for: %s", log)
|
|
}
|
|
}
|
|
|
|
// TestClassifyFailureDiskFull tests disk full classification
|
|
func TestClassifyFailureDiskFull(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
cases := []string{
|
|
"No space left on device",
|
|
"Disk full",
|
|
"disk quota exceeded",
|
|
}
|
|
|
|
for _, log := range cases {
|
|
result := domain.ClassifyFailure(1, nil, log)
|
|
assert.Equal(t, domain.FailureResource, result, "Failed for: %s", log)
|
|
}
|
|
}
|
|
|
|
// TestClassifyFailureTimeout tests timeout classification
|
|
func TestClassifyFailureTimeout(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
cases := []string{
|
|
"Task timeout after 300s",
|
|
"Connection timeout",
|
|
"deadline exceeded",
|
|
}
|
|
|
|
for _, log := range cases {
|
|
result := domain.ClassifyFailure(1, nil, log)
|
|
assert.Equal(t, domain.FailureResource, result, "Failed for: %s", log)
|
|
}
|
|
}
|
|
|
|
// TestClassifyFailureSegfault tests segmentation fault classification
|
|
func TestClassifyFailureSegfault(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
result := domain.ClassifyFailure(139, nil, "Segmentation fault")
|
|
assert.Equal(t, domain.FailureCode, result)
|
|
}
|
|
|
|
// TestClassifyFailureException tests exception classification
|
|
func TestClassifyFailureException(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
cases := []string{
|
|
"Traceback (most recent call last)",
|
|
"Exception: Something went wrong",
|
|
"Error: module not found",
|
|
}
|
|
|
|
for _, log := range cases {
|
|
result := domain.ClassifyFailure(1, nil, log)
|
|
assert.Equal(t, domain.FailureCode, result, "Failed for: %s", log)
|
|
}
|
|
}
|
|
|
|
// TestClassifyFailureNetwork tests network failure classification
|
|
func TestClassifyFailureNetwork(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
cases := []string{
|
|
"connection refused",
|
|
"Connection reset by peer",
|
|
"No route to host",
|
|
"Network unreachable",
|
|
}
|
|
|
|
for _, log := range cases {
|
|
result := domain.ClassifyFailure(1, nil, log)
|
|
assert.Equal(t, domain.FailureInfrastructure, result, "Failed for: %s", log)
|
|
}
|
|
}
|
|
|
|
// TestClassifyFailureUnknown tests unknown failure classification
|
|
func TestClassifyFailureUnknown(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// Use exitCode=0 and message that doesn't match any pattern
|
|
result := domain.ClassifyFailure(0, nil, "Something unexpected happened")
|
|
assert.Equal(t, domain.FailureUnknown, result)
|
|
}
|
|
|
|
// TestFailureClassString tests failure class string representation
|
|
func TestFailureClassString(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
assert.Equal(t, "infrastructure", string(domain.FailureInfrastructure))
|
|
assert.Equal(t, "code", string(domain.FailureCode))
|
|
assert.Equal(t, "data", string(domain.FailureData))
|
|
assert.Equal(t, "resource", string(domain.FailureResource))
|
|
assert.Equal(t, "unknown", string(domain.FailureUnknown))
|
|
}
|
|
|
|
// TestJobStatusString tests job status string representation
|
|
func TestJobStatusString(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
assert.Equal(t, "pending", domain.StatusPending.String())
|
|
assert.Equal(t, "queued", domain.StatusQueued.String())
|
|
assert.Equal(t, "running", domain.StatusRunning.String())
|
|
assert.Equal(t, "completed", domain.StatusCompleted.String())
|
|
assert.Equal(t, "failed", domain.StatusFailed.String())
|
|
}
|