fetch_ml/tests/fault/fault_test.go
Jeremie Fraeys f827ee522a
test(tracking/plugins): add PodmanInterface and comprehensive plugin tests for 91% coverage
Refactor plugins to use interface for testability:
- Add PodmanInterface to container package (StartContainer, StopContainer, RemoveContainer)
- Update MLflow plugin to use container.PodmanInterface
- Update TensorBoard plugin to use container.PodmanInterface
- Add comprehensive mocked tests for all three plugins (wandb, mlflow, tensorboard)
- Coverage increased from 18% to 91.4%
2026-03-14 16:59:16 -04:00

84 lines
3.4 KiB
Go

package fault
import (
"os"
"testing"
)
// TestMain gates every test in this package behind an opt-in switch.
// The suite only runs when the FETCH_ML_FAULT_INJECTION environment variable
// is exactly "1"; in any other case the process exits with status 0 without
// running a single test and without printing anything.
// These tests require toxiproxy and are intended for nightly CI only.
func TestMain(m *testing.M) {
	// Default to a successful exit so a disabled suite never fails a build.
	exitCode := 0
	if os.Getenv("FETCH_ML_FAULT_INJECTION") == "1" {
		// Opted in: run the suite and propagate its result.
		exitCode = m.Run()
	}
	os.Exit(exitCode)
}
// TestNVMLUnavailableProvenanceFail is a placeholder for a fault-injection
// test that should verify that when NVML is unavailable and
// ProvenanceBestEffort=false, the job fails loudly (no silent degradation).
//
// The scenario is not implemented yet, so the test marks itself as skipped
// instead of passing without having checked anything.
func TestNVMLUnavailableProvenanceFail(t *testing.T) {
	// TODO: Implement fault injection test with toxiproxy
	// This test requires:
	// - toxiproxy setup for GPU/NVML fault simulation
	// - Configuration with ProvenanceBestEffort=false
	// - A job that requires GPU
	// - Verification that job fails with clear error, not silent degradation

	// Skip (rather than Log) so the result shows SKIP, not a misleading PASS.
	t.Skip("TODO: Implement NVML fault injection test")
}
// TestManifestWritePartialFailure is a placeholder for a fault-injection test
// that should verify that if a manifest write fails midway, no partial
// manifest is left on disk.
//
// The scenario is not implemented yet, so the test marks itself as skipped
// instead of passing without having checked anything.
func TestManifestWritePartialFailure(t *testing.T) {
	// TODO: Implement fault injection test with disk fault simulation
	// This test requires:
	// - toxiproxy or disk fault injection setup
	// - Write of large manifest that gets interrupted
	// - Verification that no partial/corrupted manifest exists

	// Skip (rather than Log) so the result shows SKIP, not a misleading PASS.
	t.Skip("TODO: Implement manifest partial failure test")
}
// TestRedisUnavailableQueueBehavior is a placeholder for a fault-injection
// test that should verify that when Redis is unavailable, no queue item is
// silently dropped.
//
// The scenario is not implemented yet, so the test marks itself as skipped
// instead of passing without having checked anything.
func TestRedisUnavailableQueueBehavior(t *testing.T) {
	// TODO: Implement fault injection test with Redis fault simulation
	// This test requires:
	// - toxiproxy for Redis fault simulation
	// - Queue operations during Redis outage
	// - Verification that items are not dropped (either processed or error returned)

	// Skip (rather than Log) so the result shows SKIP, not a misleading PASS.
	t.Skip("TODO: Implement Redis queue fault injection test")
}
// TestAuditLogUnavailableHaltsJob is a placeholder for a fault-injection test
// that should verify that if an audit log write fails, the job halts rather
// than continuing without an audit trail.
//
// The scenario is not implemented yet, so the test marks itself as skipped
// instead of passing without having checked anything.
func TestAuditLogUnavailableHaltsJob(t *testing.T) {
	// TODO: Implement fault injection test for audit log failures
	// This test requires:
	// - toxiproxy for audit log fault simulation
	// - Job submission when audit log is unavailable
	// - Verification that job halts rather than continuing unaudited

	// Skip (rather than Log) so the result shows SKIP, not a misleading PASS.
	t.Skip("TODO: Implement audit log fault injection test")
}
// TestConfigHashFailureProvenanceClosed is a placeholder for a fault-injection
// test that should verify that if config hash computation fails in strict
// mode, the operation fails closed (secure default).
//
// The scenario is not implemented yet, so the test marks itself as skipped
// instead of passing without having checked anything.
func TestConfigHashFailureProvenanceClosed(t *testing.T) {
	// TODO: Implement fault injection test for hash computation failures
	// This test requires:
	// - Fault injection framework for hash computation failures
	// - Strict provenance mode enabled
	// - Verification that operation fails closed (secure default)

	// Skip (rather than Log) so the result shows SKIP, not a misleading PASS.
	t.Skip("TODO: Implement config hash failure test")
}
// TestDiskFullDuringArtifactScan is a placeholder for a fault-injection test
// that should verify that when the disk is full during artifact scanning, an
// error is returned rather than a partial manifest.
//
// The scenario is not implemented yet, so the test marks itself as skipped
// instead of passing without having checked anything.
func TestDiskFullDuringArtifactScan(t *testing.T) {
	// TODO: Implement fault injection test for disk full scenarios
	// This test requires:
	// - Disk space fault injection or container limits
	// - Artifact scan operation that would fill disk
	// - Verification that error is returned, not partial manifest

	// Skip (rather than Log) so the result shows SKIP, not a misleading PASS.
	t.Skip("TODO: Implement disk full artifact scan test")
}