package tests import ( "os" "path/filepath" "testing" ) // TestMLProjectVariants tests different types of ML projects with zero-install workflow func TestMLProjectVariants(t *testing.T) { testDir := t.TempDir() // Test 1: Scikit-learn project t.Run("ScikitLearnProject", func(t *testing.T) { experimentDir := filepath.Join(testDir, "sklearn_experiment") if err := os.MkdirAll(experimentDir, 0755); err != nil { t.Fatalf("Failed to create experiment directory: %v", err) } // Create scikit-learn training script trainScript := filepath.Join(experimentDir, "train.py") trainCode := `#!/usr/bin/env python3 import argparse, json, logging, time from pathlib import Path import numpy as np from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from sklearn.datasets import make_classification def main(): parser = argparse.ArgumentParser() parser.add_argument("--n_estimators", type=int, default=100) parser.add_argument("--output_dir", type=str, required=True) args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) logger.info(f"Training Random Forest with {args.n_estimators} estimators...") # Generate synthetic data X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Train model model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=42) model.fit(X_train, y_train) # Evaluate y_pred = model.predict(X_test) accuracy = accuracy_score(y_test, y_pred) logger.info(f"Training completed. Accuracy: {accuracy:.4f}") # Save results results = { "model_type": "RandomForest", "n_estimators": args.n_estimators, "accuracy": accuracy, "n_samples": len(X), "n_features": X.shape[1] } output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "results.json", "w") as f: json.dump(results, f, indent=2) logger.info("Results saved successfully!") if __name__ == "__main__": main() ` if err := os.WriteFile(trainScript, []byte(trainCode), 0755); err != nil { t.Fatalf("Failed to create train.py: %v", err) } // Create requirements.txt requirementsFile := filepath.Join(experimentDir, "requirements.txt") requirements := `scikit-learn>=1.0.0 numpy>=1.21.0 pandas>=1.3.0 ` if err := os.WriteFile(requirementsFile, []byte(requirements), 0644); err != nil { t.Fatalf("Failed to create requirements.txt: %v", err) } // Verify scikit-learn project structure if _, err := os.Stat(trainScript); os.IsNotExist(err) { t.Error("scikit-learn train.py should exist") } if _, err := os.Stat(requirementsFile); os.IsNotExist(err) { t.Error("scikit-learn requirements.txt should exist") } }) // Test 2: XGBoost project t.Run("XGBoostProject", func(t *testing.T) { experimentDir := filepath.Join(testDir, "xgboost_experiment") if err := os.MkdirAll(experimentDir, 0755); err != nil { t.Fatalf("Failed to create experiment directory: %v", err) } // Create XGBoost training script trainScript := filepath.Join(experimentDir, "train.py") trainCode := `#!/usr/bin/env python3 import argparse, json, logging, time from pathlib import Path import numpy as np import xgboost as xgb from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from sklearn.datasets import make_classification def main(): parser = argparse.ArgumentParser() parser.add_argument("--n_estimators", type=int, default=100) parser.add_argument("--max_depth", type=int, default=6) parser.add_argument("--learning_rate", type=float, default=0.1) parser.add_argument("--output_dir", type=str, required=True) args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) logger.info(f"Training XGBoost with {args.n_estimators} estimators, depth {args.max_depth}...") # Generate synthetic data X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Convert to DMatrix (XGBoost format) dtrain = xgb.DMatrix(X_train, label=y_train) dtest = xgb.DMatrix(X_test, label=y_test) # Train model params = { 'max_depth': args.max_depth, 'eta': args.learning_rate, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42 } model = xgb.train(params, dtrain, args.n_estimators) # Evaluate y_pred_prob = model.predict(dtest) y_pred = (y_pred_prob > 0.5).astype(int) accuracy = accuracy_score(y_test, y_pred) logger.info(f"Training completed. Accuracy: {accuracy:.4f}") # Save results results = { "model_type": "XGBoost", "n_estimators": args.n_estimators, "max_depth": args.max_depth, "learning_rate": args.learning_rate, "accuracy": accuracy, "n_samples": len(X), "n_features": X.shape[1] } output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "results.json", "w") as f: json.dump(results, f, indent=2) # Save model model.save_model(str(output_dir / "xgboost_model.json")) logger.info("Results and model saved successfully!") if __name__ == "__main__": main() ` if err := os.WriteFile(trainScript, []byte(trainCode), 0755); err != nil { t.Fatalf("Failed to create train.py: %v", err) } // Create requirements.txt requirementsFile := filepath.Join(experimentDir, "requirements.txt") requirements := `xgboost>=1.5.0 scikit-learn>=1.0.0 numpy>=1.21.0 pandas>=1.3.0 ` if err := os.WriteFile(requirementsFile, []byte(requirements), 0644); err != nil { t.Fatalf("Failed to create requirements.txt: %v", err) } // Verify XGBoost project structure if _, err := os.Stat(trainScript); os.IsNotExist(err) { t.Error("XGBoost train.py should exist") } if _, err := os.Stat(requirementsFile); os.IsNotExist(err) { t.Error("XGBoost requirements.txt should exist") } }) // Test 3: PyTorch project (deep learning) t.Run("PyTorchProject", func(t *testing.T) { experimentDir := filepath.Join(testDir, "pytorch_experiment") if err := os.MkdirAll(experimentDir, 0755); err != nil { t.Fatalf("Failed to create experiment directory: %v", err) } // Create PyTorch training script trainScript := filepath.Join(experimentDir, "train.py") trainCode := `#!/usr/bin/env python3 import argparse, json, logging, time from pathlib import Path import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import DataLoader, TensorDataset class SimpleNet(nn.Module): def __init__(self, input_size, hidden_size, output_size): super().__init__() self.fc1 = nn.Linear(input_size, hidden_size) self.fc2 = nn.Linear(hidden_size, output_size) self.relu = nn.ReLU() def forward(self, x): x = self.fc1(x) x = self.relu(x) x = self.fc2(x) return x def main(): parser = argparse.ArgumentParser() parser.add_argument("--epochs", type=int, default=10) parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--learning_rate", type=float, default=0.001) parser.add_argument("--hidden_size", type=int, default=64) parser.add_argument("--output_dir", type=str, required=True) args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) logger.info(f"Training PyTorch model for {args.epochs} epochs...") # Generate synthetic data torch.manual_seed(42) X = torch.randn(1000, 20) y = torch.randint(0, 2, (1000,)) # Create dataset and dataloader dataset = TensorDataset(X, y) dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True) # Initialize model model = SimpleNet(20, args.hidden_size, 2) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=args.learning_rate) # Training loop model.train() for epoch in range(args.epochs): total_loss = 0 correct = 0 total = 0 for batch_X, batch_y in dataloader: optimizer.zero_grad() outputs = model(batch_X) loss = criterion(outputs, batch_y) loss.backward() optimizer.step() total_loss += loss.item() _, predicted = torch.max(outputs.data, 1) total += batch_y.size(0) correct += (predicted == batch_y).sum().item() accuracy = correct / total avg_loss = total_loss / len(dataloader) logger.info(f"Epoch {epoch + 1}/{args.epochs}: Loss={avg_loss:.4f}, Acc={accuracy:.4f}") time.sleep(0.1) # Small delay for logging # Final evaluation model.eval() with torch.no_grad(): correct = 0 total = 0 for batch_X, batch_y in dataloader: outputs = model(batch_X) _, predicted = torch.max(outputs.data, 1) total += batch_y.size(0) correct += (predicted == batch_y).sum().item() final_accuracy = correct / total logger.info(f"Training completed. Final accuracy: {final_accuracy:.4f}") # Save results results = { "model_type": "PyTorch", "epochs": args.epochs, "batch_size": args.batch_size, "learning_rate": args.learning_rate, "hidden_size": args.hidden_size, "final_accuracy": final_accuracy, "n_samples": len(X), "input_features": X.shape[1] } output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "results.json", "w") as f: json.dump(results, f, indent=2) # Save model torch.save(model.state_dict(), output_dir / "pytorch_model.pth") logger.info("Results and model saved successfully!") if __name__ == "__main__": main() ` if err := os.WriteFile(trainScript, []byte(trainCode), 0755); err != nil { t.Fatalf("Failed to create train.py: %v", err) } // Create requirements.txt requirementsFile := filepath.Join(experimentDir, "requirements.txt") requirements := `torch>=1.9.0 torchvision>=0.10.0 numpy>=1.21.0 ` if err := os.WriteFile(requirementsFile, []byte(requirements), 0644); err != nil { t.Fatalf("Failed to create requirements.txt: %v", err) } // Verify PyTorch project structure if _, err := os.Stat(trainScript); os.IsNotExist(err) { t.Error("PyTorch train.py should exist") } if _, err := os.Stat(requirementsFile); os.IsNotExist(err) { t.Error("PyTorch requirements.txt should exist") } }) // Test 4: TensorFlow/Keras project t.Run("TensorFlowProject", func(t *testing.T) { experimentDir := filepath.Join(testDir, "tensorflow_experiment") if err := os.MkdirAll(experimentDir, 0755); err != nil { t.Fatalf("Failed to create experiment directory: %v", err) } // Create TensorFlow training script trainScript := filepath.Join(experimentDir, "train.py") trainCode := `#!/usr/bin/env python3 import argparse, json, logging, time from pathlib import Path import numpy as np import tensorflow as tf def main(): parser = argparse.ArgumentParser() parser.add_argument("--epochs", type=int, default=10) parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--learning_rate", type=float, default=0.001) parser.add_argument("--output_dir", type=str, required=True) args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) logger.info(f"Training TensorFlow model for {args.epochs} epochs...") # Generate synthetic data np.random.seed(42) tf.random.set_seed(42) X = np.random.randn(1000, 20) y = np.random.randint(0, 2, (1000,)) # Create TensorFlow dataset dataset = tf.data.Dataset.from_tensor_slices((X, y)) dataset = dataset.shuffle(buffer_size=1000).batch(args.batch_size) # Build model model = tf.keras.Sequential([ tf.keras.layers.Dense(64, activation='relu', input_shape=(20,)), tf.keras.layers.Dense(32, activation='relu'), tf.keras.layers.Dense(2, activation='softmax') ]) model.compile( optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate), loss='sparse_categorical_crossentropy', metrics=['accuracy'] ) # Training history = model.fit( dataset, epochs=args.epochs, verbose=1 ) final_accuracy = history.history['accuracy'][-1] logger.info(f"Training completed. Final accuracy: {final_accuracy:.4f}") # Save results results = { "model_type": "TensorFlow", "epochs": args.epochs, "batch_size": args.batch_size, "learning_rate": args.learning_rate, "final_accuracy": float(final_accuracy), "n_samples": len(X), "input_features": X.shape[1] } output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "results.json", "w") as f: json.dump(results, f, indent=2) # Save model model.save(output_dir / "tensorflow_model") logger.info("Results and model saved successfully!") if __name__ == "__main__": main() ` if err := os.WriteFile(trainScript, []byte(trainCode), 0755); err != nil { t.Fatalf("Failed to create train.py: %v", err) } // Create requirements.txt requirementsFile := filepath.Join(experimentDir, "requirements.txt") requirements := `tensorflow>=2.8.0 numpy>=1.21.0 ` if err := os.WriteFile(requirementsFile, []byte(requirements), 0644); err != nil { t.Fatalf("Failed to create requirements.txt: %v", err) } // Verify TensorFlow project structure if _, err := os.Stat(trainScript); os.IsNotExist(err) { t.Error("TensorFlow train.py should exist") } if _, err := os.Stat(requirementsFile); os.IsNotExist(err) { t.Error("TensorFlow requirements.txt should exist") } }) // Test 5: Traditional ML (statsmodels) t.Run("StatsModelsProject", func(t *testing.T) { experimentDir := filepath.Join(testDir, "statsmodels_experiment") if err := os.MkdirAll(experimentDir, 0755); err != nil { t.Fatalf("Failed to create experiment directory: %v", err) } // Create statsmodels training script trainScript := filepath.Join(experimentDir, "train.py") trainCode := `#!/usr/bin/env python3 import argparse, json, logging, time from pathlib import Path import numpy as np import pandas as pd import statsmodels.api as sm def main(): parser = argparse.ArgumentParser() parser.add_argument("--output_dir", type=str, required=True) args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) logger.info("Training statsmodels linear regression...") # Generate synthetic data np.random.seed(42) n_samples = 1000 n_features = 5 X = np.random.randn(n_samples, n_features) # True coefficients true_coef = np.array([1.5, -2.0, 0.5, 3.0, -1.0]) noise = np.random.randn(n_samples) * 0.1 y = X @ true_coef + noise # Create DataFrame feature_names = [f"feature_{i}" for i in range(n_features)] X_df = pd.DataFrame(X, columns=feature_names) y_series = pd.Series(y, name="target") # Add constant for intercept X_with_const = sm.add_constant(X_df) # Fit model model = sm.OLS(y_series, X_with_const).fit() logger.info(f"Model fitted successfully. R-squared: {model.rsquared:.4f}") # Save results results = { "model_type": "LinearRegression", "n_samples": n_samples, "n_features": n_features, "r_squared": float(model.rsquared), "adj_r_squared": float(model.rsquared_adj), "f_statistic": float(model.fvalue), "f_pvalue": float(model.f_pvalue), "coefficients": model.params.to_dict(), "standard_errors": model.bse.to_dict(), "p_values": model.pvalues.to_dict() } output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "results.json", "w") as f: json.dump(results, f, indent=2) # Save model summary with open(output_dir / "model_summary.txt", "w") as f: f.write(str(model.summary())) logger.info("Results and model summary saved successfully!") if __name__ == "__main__": main() ` if err := os.WriteFile(trainScript, []byte(trainCode), 0755); err != nil { t.Fatalf("Failed to create train.py: %v", err) } // Create requirements.txt requirementsFile := filepath.Join(experimentDir, "requirements.txt") requirements := `statsmodels>=0.13.0 pandas>=1.3.0 numpy>=1.21.0 ` if err := os.WriteFile(requirementsFile, []byte(requirements), 0644); err != nil { t.Fatalf("Failed to create requirements.txt: %v", err) } // Verify statsmodels project structure if _, err := os.Stat(trainScript); os.IsNotExist(err) { t.Error("statsmodels train.py should exist") } if _, err := os.Stat(requirementsFile); os.IsNotExist(err) { t.Error("statsmodels requirements.txt should exist") } }) } // TestMLProjectCompatibility tests that all project types work with zero-install workflow func TestMLProjectCompatibility(t *testing.T) { testDir := t.TempDir() // Test that all project types can be uploaded and processed projectTypes := []string{ "sklearn_experiment", "xgboost_experiment", "pytorch_experiment", "tensorflow_experiment", "statsmodels_experiment", } for _, projectType := range projectTypes { t.Run(projectType+"_UploadTest", func(t *testing.T) { // Create experiment directory experimentDir := filepath.Join(testDir, projectType) if err := os.MkdirAll(experimentDir, 0755); err != nil { t.Fatalf("Failed to create experiment directory: %v", err) } // Create minimal files trainScript := filepath.Join(experimentDir, "train.py") trainCode := `#!/usr/bin/env python3 import argparse, json, logging, time from pathlib import Path def main(): parser = argparse.ArgumentParser() parser.add_argument("--output_dir", type=str, required=True) args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) logger.info(f"Training {projectType} model...") # Simulate training for epoch in range(3): logger.info(f"Epoch {epoch + 1}: training...") time.sleep(0.01) results = {"model_type": projectType, "status": "completed"} output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "results.json", "w") as f: json.dump(results, f) logger.info("Training complete!") if __name__ == "__main__": main() ` if err := os.WriteFile(trainScript, []byte(trainCode), 0755); err != nil { t.Fatalf("Failed to create train.py: %v", err) } // Create requirements.txt requirementsFile := filepath.Join(experimentDir, "requirements.txt") requirements := "# Framework-specific dependencies\n" if err := os.WriteFile(requirementsFile, []byte(requirements), 0644); err != nil { t.Fatalf("Failed to create requirements.txt: %v", err) } // Simulate upload process serverDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending") jobDir := filepath.Join(serverDir, projectType+"_20231201_143022") if err := os.MkdirAll(jobDir, 0755); err != nil { t.Fatalf("Failed to create server directories: %v", err) } // Copy files files := []string{"train.py", "requirements.txt"} for _, file := range files { src := filepath.Join(experimentDir, file) dst := filepath.Join(jobDir, file) data, err := os.ReadFile(src) if err != nil { t.Fatalf("Failed to read %s: %v", file, err) } if err := os.WriteFile(dst, data, 0755); err != nil { t.Fatalf("Failed to copy %s: %v", file, err) } } // Verify upload for _, file := range files { dst := filepath.Join(jobDir, file) if _, err := os.Stat(dst); os.IsNotExist(err) { t.Errorf("Uploaded file %s should exist for %s", file, projectType) } } }) } }