Refactor plugins to use interface for testability: - Add PodmanInterface to container package (StartContainer, StopContainer, RemoveContainer) - Update MLflow plugin to use container.PodmanInterface - Update TensorBoard plugin to use container.PodmanInterface - Add comprehensive mocked tests for all three plugins (wandb, mlflow, tensorboard) - Coverage increased from 18% to 91.4%
84 lines
3.4 KiB
Go
84 lines
3.4 KiB
Go
package fault
|
|
|
|
import (
|
|
"os"
|
|
"testing"
|
|
)
|
|
|
|
// TestMain gates the whole fault-injection suite behind an environment
// variable. The tests require toxiproxy and are intended for nightly CI only,
// so the test binary exits successfully without running anything unless
// FETCH_ML_FAULT_INJECTION is set to "1".
func TestMain(m *testing.M) {
	if enabled := os.Getenv("FETCH_ML_FAULT_INJECTION") == "1"; enabled {
		// Opted in: run the suite and propagate its exit code.
		os.Exit(m.Run())
	}

	// Not opted in: skip all fault tests silently with a success status.
	os.Exit(0)
}
|
|
|
|
// TestNVMLUnavailableProvenanceFail is intended to verify that when NVML is
// unavailable and ProvenanceBestEffort=false, the job fails loudly (no silent
// degradation).
//
// The test is not implemented yet, so it reports itself as skipped instead of
// passing vacuously.
func TestNVMLUnavailableProvenanceFail(t *testing.T) {
	// TODO: Implement fault injection test with toxiproxy.
	// This test requires:
	//   - toxiproxy setup for GPU/NVML fault simulation
	//   - Configuration with ProvenanceBestEffort=false
	//   - A job that requires GPU
	//   - Verification that job fails with clear error, not silent degradation
	t.Skip("TODO: Implement NVML fault injection test")
}
|
|
|
|
// TestManifestWritePartialFailure is intended to verify that if a manifest
// write fails midway, no partial manifest is left on disk.
//
// The test is not implemented yet, so it reports itself as skipped instead of
// passing vacuously.
func TestManifestWritePartialFailure(t *testing.T) {
	// TODO: Implement fault injection test with disk fault simulation.
	// This test requires:
	//   - toxiproxy or disk fault injection setup
	//   - Write of large manifest that gets interrupted
	//   - Verification that no partial/corrupted manifest exists
	t.Skip("TODO: Implement manifest partial failure test")
}
|
|
|
|
// TestRedisUnavailableQueueBehavior is intended to verify that when Redis is
// unavailable, there is no silent queue item drop.
//
// The test is not implemented yet, so it reports itself as skipped instead of
// passing vacuously.
func TestRedisUnavailableQueueBehavior(t *testing.T) {
	// TODO: Implement fault injection test with Redis fault simulation.
	// This test requires:
	//   - toxiproxy for Redis fault simulation
	//   - Queue operations during Redis outage
	//   - Verification that items are not dropped (either processed or error returned)
	t.Skip("TODO: Implement Redis queue fault injection test")
}
|
|
|
|
// TestAuditLogUnavailableHaltsJob is intended to verify that if an audit log
// write fails, the job halts rather than continuing without an audit trail.
//
// The test is not implemented yet, so it reports itself as skipped instead of
// passing vacuously.
func TestAuditLogUnavailableHaltsJob(t *testing.T) {
	// TODO: Implement fault injection test for audit log failures.
	// This test requires:
	//   - toxiproxy for audit log fault simulation
	//   - Job submission when audit log is unavailable
	//   - Verification that job halts rather than continuing unaudited
	t.Skip("TODO: Implement audit log fault injection test")
}
|
|
|
|
// TestConfigHashFailureProvenanceClosed is intended to verify that if config
// hash computation fails in strict mode, the operation fails closed (secure
// default).
//
// The test is not implemented yet, so it reports itself as skipped instead of
// passing vacuously.
func TestConfigHashFailureProvenanceClosed(t *testing.T) {
	// TODO: Implement fault injection test for hash computation failures.
	// This test requires:
	//   - Fault injection framework for hash computation failures
	//   - Strict provenance mode enabled
	//   - Verification that operation fails closed (secure default)
	t.Skip("TODO: Implement config hash failure test")
}
|
|
|
|
// TestDiskFullDuringArtifactScan is intended to verify that when the disk is
// full during artifact scanning, an error is returned rather than a partial
// manifest.
//
// The test is not implemented yet, so it reports itself as skipped instead of
// passing vacuously.
func TestDiskFullDuringArtifactScan(t *testing.T) {
	// TODO: Implement fault injection test for disk full scenarios.
	// This test requires:
	//   - Disk space fault injection or container limits
	//   - Artifact scan operation that would fill disk
	//   - Verification that error is returned, not partial manifest
	t.Skip("TODO: Implement disk full artifact scan test")
}
|