fetch_ml/tests/e2e/ml_project_variants_test.go
Jeremie Fraeys c980167041 test: implement comprehensive test suite with multiple test types
- Add end-to-end tests for complete workflow validation
- Include integration tests for API and database interactions
- Add unit tests for all major components and utilities
- Include performance tests for payload handling
- Add CLI API integration tests
- Include Podman container integration tests
- Add WebSocket and queue execution tests
- Include shell script tests for setup validation

Provides comprehensive test coverage ensuring platform reliability
and functionality across all components and interactions.
2025-12-04 16:55:13 -05:00

673 lines
20 KiB
Go

package tests
import (
	"fmt"
	"os"
	"path/filepath"
	"testing"
)
// TestMLProjectVariants tests different types of ML projects with the
// zero-install workflow. Each subtest lays out a framework-specific
// experiment directory (train.py + requirements.txt) and verifies the
// expected project structure on disk. The training scripts are written
// to disk but never executed here; they are the payload a real job for
// that framework would carry.
func TestMLProjectVariants(t *testing.T) {
	testDir := t.TempDir()

	// Test 1: Scikit-learn project
	t.Run("ScikitLearnProject", func(t *testing.T) {
		trainCode := `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(f"Training Random Forest with {args.n_estimators} estimators...")

    # Generate synthetic data
    X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    logger.info(f"Training completed. Accuracy: {accuracy:.4f}")

    # Save results
    results = {
        "model_type": "RandomForest",
        "n_estimators": args.n_estimators,
        "accuracy": accuracy,
        "n_samples": len(X),
        "n_features": X.shape[1]
    }
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)
    logger.info("Results saved successfully!")

if __name__ == "__main__":
    main()
`
		requirements := `scikit-learn>=1.0.0
numpy>=1.21.0
pandas>=1.3.0
`
		setupMLProject(t, filepath.Join(testDir, "sklearn_experiment"), trainCode, requirements, "scikit-learn")
	})

	// Test 2: XGBoost project
	t.Run("XGBoostProject", func(t *testing.T) {
		trainCode := `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--max_depth", type=int, default=6)
    parser.add_argument("--learning_rate", type=float, default=0.1)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(f"Training XGBoost with {args.n_estimators} estimators, depth {args.max_depth}...")

    # Generate synthetic data
    X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert to DMatrix (XGBoost format)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Train model
    params = {
        'max_depth': args.max_depth,
        'eta': args.learning_rate,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'seed': 42
    }
    model = xgb.train(params, dtrain, args.n_estimators)

    # Evaluate
    y_pred_prob = model.predict(dtest)
    y_pred = (y_pred_prob > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    logger.info(f"Training completed. Accuracy: {accuracy:.4f}")

    # Save results
    results = {
        "model_type": "XGBoost",
        "n_estimators": args.n_estimators,
        "max_depth": args.max_depth,
        "learning_rate": args.learning_rate,
        "accuracy": accuracy,
        "n_samples": len(X),
        "n_features": X.shape[1]
    }
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)

    # Save model
    model.save_model(str(output_dir / "xgboost_model.json"))
    logger.info("Results and model saved successfully!")

if __name__ == "__main__":
    main()
`
		requirements := `xgboost>=1.5.0
scikit-learn>=1.0.0
numpy>=1.21.0
pandas>=1.3.0
`
		setupMLProject(t, filepath.Join(testDir, "xgboost_experiment"), trainCode, requirements, "XGBoost")
	})

	// Test 3: PyTorch project (deep learning)
	t.Run("PyTorchProject", func(t *testing.T) {
		trainCode := `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--learning_rate", type=float, default=0.001)
    parser.add_argument("--hidden_size", type=int, default=64)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(f"Training PyTorch model for {args.epochs} epochs...")

    # Generate synthetic data
    torch.manual_seed(42)
    X = torch.randn(1000, 20)
    y = torch.randint(0, 2, (1000,))

    # Create dataset and dataloader
    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    # Initialize model
    model = SimpleNet(20, args.hidden_size, 2)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    # Training loop
    model.train()
    for epoch in range(args.epochs):
        total_loss = 0
        correct = 0
        total = 0
        for batch_X, batch_y in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
        accuracy = correct / total
        avg_loss = total_loss / len(dataloader)
        logger.info(f"Epoch {epoch + 1}/{args.epochs}: Loss={avg_loss:.4f}, Acc={accuracy:.4f}")
        time.sleep(0.1)  # Small delay for logging

    # Final evaluation
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for batch_X, batch_y in dataloader:
            outputs = model(batch_X)
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    final_accuracy = correct / total
    logger.info(f"Training completed. Final accuracy: {final_accuracy:.4f}")

    # Save results
    results = {
        "model_type": "PyTorch",
        "epochs": args.epochs,
        "batch_size": args.batch_size,
        "learning_rate": args.learning_rate,
        "hidden_size": args.hidden_size,
        "final_accuracy": final_accuracy,
        "n_samples": len(X),
        "input_features": X.shape[1]
    }
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)

    # Save model
    torch.save(model.state_dict(), output_dir / "pytorch_model.pth")
    logger.info("Results and model saved successfully!")

if __name__ == "__main__":
    main()
`
		requirements := `torch>=1.9.0
torchvision>=0.10.0
numpy>=1.21.0
`
		setupMLProject(t, filepath.Join(testDir, "pytorch_experiment"), trainCode, requirements, "PyTorch")
	})

	// Test 4: TensorFlow/Keras project
	t.Run("TensorFlowProject", func(t *testing.T) {
		trainCode := `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path
import numpy as np
import tensorflow as tf

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--learning_rate", type=float, default=0.001)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(f"Training TensorFlow model for {args.epochs} epochs...")

    # Generate synthetic data
    np.random.seed(42)
    tf.random.set_seed(42)
    X = np.random.randn(1000, 20)
    y = np.random.randint(0, 2, (1000,))

    # Create TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    dataset = dataset.shuffle(buffer_size=1000).batch(args.batch_size)

    # Build model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(20,)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(2, activation='softmax')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # Training
    history = model.fit(
        dataset,
        epochs=args.epochs,
        verbose=1
    )
    final_accuracy = history.history['accuracy'][-1]
    logger.info(f"Training completed. Final accuracy: {final_accuracy:.4f}")

    # Save results
    results = {
        "model_type": "TensorFlow",
        "epochs": args.epochs,
        "batch_size": args.batch_size,
        "learning_rate": args.learning_rate,
        "final_accuracy": float(final_accuracy),
        "n_samples": len(X),
        "input_features": X.shape[1]
    }
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)

    # Save model
    model.save(output_dir / "tensorflow_model")
    logger.info("Results and model saved successfully!")

if __name__ == "__main__":
    main()
`
		requirements := `tensorflow>=2.8.0
numpy>=1.21.0
`
		setupMLProject(t, filepath.Join(testDir, "tensorflow_experiment"), trainCode, requirements, "TensorFlow")
	})

	// Test 5: Traditional ML (statsmodels)
	t.Run("StatsModelsProject", func(t *testing.T) {
		trainCode := `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path
import numpy as np
import pandas as pd
import statsmodels.api as sm

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info("Training statsmodels linear regression...")

    # Generate synthetic data
    np.random.seed(42)
    n_samples = 1000
    n_features = 5
    X = np.random.randn(n_samples, n_features)

    # True coefficients
    true_coef = np.array([1.5, -2.0, 0.5, 3.0, -1.0])
    noise = np.random.randn(n_samples) * 0.1
    y = X @ true_coef + noise

    # Create DataFrame
    feature_names = [f"feature_{i}" for i in range(n_features)]
    X_df = pd.DataFrame(X, columns=feature_names)
    y_series = pd.Series(y, name="target")

    # Add constant for intercept
    X_with_const = sm.add_constant(X_df)

    # Fit model
    model = sm.OLS(y_series, X_with_const).fit()
    logger.info(f"Model fitted successfully. R-squared: {model.rsquared:.4f}")

    # Save results
    results = {
        "model_type": "LinearRegression",
        "n_samples": n_samples,
        "n_features": n_features,
        "r_squared": float(model.rsquared),
        "adj_r_squared": float(model.rsquared_adj),
        "f_statistic": float(model.fvalue),
        "f_pvalue": float(model.f_pvalue),
        "coefficients": model.params.to_dict(),
        "standard_errors": model.bse.to_dict(),
        "p_values": model.pvalues.to_dict()
    }
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)

    # Save model summary
    with open(output_dir / "model_summary.txt", "w") as f:
        f.write(str(model.summary()))
    logger.info("Results and model summary saved successfully!")

if __name__ == "__main__":
    main()
`
		requirements := `statsmodels>=0.13.0
pandas>=1.3.0
numpy>=1.21.0
`
		setupMLProject(t, filepath.Join(testDir, "statsmodels_experiment"), trainCode, requirements, "statsmodels")
	})
}

// setupMLProject creates experimentDir and writes train.py (mode 0755,
// matching an executable script) and requirements.txt (mode 0644) into it,
// then verifies both files exist. label identifies the framework under test
// in failure messages. Filesystem errors during setup fail the test
// immediately; unlike a bare os.IsNotExist check, the verification step
// surfaces every Stat error (e.g. permission problems) instead of silently
// ignoring non-"not exist" failures.
func setupMLProject(t *testing.T, experimentDir, trainCode, requirements, label string) {
	t.Helper()

	if err := os.MkdirAll(experimentDir, 0755); err != nil {
		t.Fatalf("Failed to create experiment directory: %v", err)
	}

	trainScript := filepath.Join(experimentDir, "train.py")
	if err := os.WriteFile(trainScript, []byte(trainCode), 0755); err != nil {
		t.Fatalf("Failed to create train.py: %v", err)
	}

	requirementsFile := filepath.Join(experimentDir, "requirements.txt")
	if err := os.WriteFile(requirementsFile, []byte(requirements), 0644); err != nil {
		t.Fatalf("Failed to create requirements.txt: %v", err)
	}

	// Verify the project structure on disk.
	if _, err := os.Stat(trainScript); err != nil {
		t.Errorf("%s train.py should exist: %v", label, err)
	}
	if _, err := os.Stat(requirementsFile); err != nil {
		t.Errorf("%s requirements.txt should exist: %v", label, err)
	}
}
// TestMLProjectCompatibility tests that all project types work with the
// zero-install workflow: for each framework a minimal project (train.py +
// requirements.txt) is created, then "uploaded" by copying its files into a
// simulated server pending-jobs directory, and the upload is verified.
func TestMLProjectCompatibility(t *testing.T) {
	testDir := t.TempDir()

	// Test that all project types can be uploaded and processed.
	projectTypes := []string{
		"sklearn_experiment",
		"xgboost_experiment",
		"pytorch_experiment",
		"tensorflow_experiment",
		"statsmodels_experiment",
	}

	for _, projectType := range projectTypes {
		t.Run(projectType+"_UploadTest", func(t *testing.T) {
			// Create experiment directory.
			experimentDir := filepath.Join(testDir, projectType)
			if err := os.MkdirAll(experimentDir, 0755); err != nil {
				t.Fatalf("Failed to create experiment directory: %v", err)
			}

			// Create a minimal training script. The project type must be
			// interpolated by Go via fmt.Sprintf: a raw string literal alone
			// would leave a literal, undefined `projectType` name inside the
			// generated Python, which would crash at runtime. %[1]s/%[1]q
			// both refer to the single projectType argument.
			trainScript := filepath.Join(experimentDir, "train.py")
			trainCode := fmt.Sprintf(`#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info("Training %[1]s model...")

    # Simulate training
    for epoch in range(3):
        logger.info(f"Epoch {epoch + 1}: training...")
        time.sleep(0.01)

    results = {"model_type": %[1]q, "status": "completed"}
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f)
    logger.info("Training complete!")

if __name__ == "__main__":
    main()
`, projectType)
			if err := os.WriteFile(trainScript, []byte(trainCode), 0755); err != nil {
				t.Fatalf("Failed to create train.py: %v", err)
			}

			// Create requirements.txt.
			requirementsFile := filepath.Join(experimentDir, "requirements.txt")
			requirements := "# Framework-specific dependencies\n"
			if err := os.WriteFile(requirementsFile, []byte(requirements), 0644); err != nil {
				t.Fatalf("Failed to create requirements.txt: %v", err)
			}

			// Simulate upload: stage the job under the server's pending
			// directory, mirroring the path layout the platform uses.
			serverDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")
			jobDir := filepath.Join(serverDir, projectType+"_20231201_143022")
			if err := os.MkdirAll(jobDir, 0755); err != nil {
				t.Fatalf("Failed to create server directories: %v", err)
			}

			// Copy files, preserving each file's intended mode (executable
			// script vs. plain text) instead of a blanket 0755.
			modes := map[string]os.FileMode{
				"train.py":         0755,
				"requirements.txt": 0644,
			}
			for file, mode := range modes {
				src := filepath.Join(experimentDir, file)
				dst := filepath.Join(jobDir, file)
				data, err := os.ReadFile(src)
				if err != nil {
					t.Fatalf("Failed to read %s: %v", file, err)
				}
				if err := os.WriteFile(dst, data, mode); err != nil {
					t.Fatalf("Failed to copy %s: %v", file, err)
				}
			}

			// Verify upload; surface every Stat error, not only "not exist".
			for file := range modes {
				dst := filepath.Join(jobDir, file)
				if _, err := os.Stat(dst); err != nil {
					t.Errorf("Uploaded file %s should exist for %s: %v", file, projectType, err)
				}
			}
		})
	}
}