// Package tests provides ML experiment templates for testing.
package tests

// MLProjectTemplate represents a template for creating ML projects.
type MLProjectTemplate struct {
	Name         string
	TrainScript  string
	Requirements string
}

// ScikitLearnTemplate returns the Scikit-learn project template.
func ScikitLearnTemplate() MLProjectTemplate {
	return MLProjectTemplate{
		Name: "Scikit-learn",
		TrainScript: `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(f"Training Random Forest with {args.n_estimators} estimators...")

    # Generate synthetic data
    X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    logger.info(f"Training completed. Accuracy: {accuracy:.4f}")

    # Save results
    results = {
        "model_type": "RandomForest",
        "n_estimators": args.n_estimators,
        "accuracy": accuracy,
        "n_samples": len(X),
        "n_features": X.shape[1]
    }

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)

    logger.info("Results saved successfully!")


if __name__ == "__main__":
    main()
`,
		Requirements: `scikit-learn>=1.0.0
numpy>=1.21.0
pandas>=1.3.0
`,
	}
}
// StatsModelsTemplate returns the StatsModels project template.
func StatsModelsTemplate() MLProjectTemplate {
	return MLProjectTemplate{
		Name: "StatsModels",
		TrainScript: `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info("Training statsmodels linear regression...")

    # Generate synthetic data
    np.random.seed(42)
    n_samples = 1000
    n_features = 5
    X = np.random.randn(n_samples, n_features)

    # True coefficients
    true_coef = np.array([1.5, -2.0, 0.5, 3.0, -1.0])
    noise = np.random.randn(n_samples) * 0.1
    y = X @ true_coef + noise

    # Create DataFrame
    feature_names = [f"feature_{i}" for i in range(n_features)]
    X_df = pd.DataFrame(X, columns=feature_names)
    y_series = pd.Series(y, name="target")

    # Add constant for intercept
    X_with_const = sm.add_constant(X_df)

    # Fit model
    model = sm.OLS(y_series, X_with_const).fit()
    logger.info(f"Model fitted successfully. R-squared: {model.rsquared:.4f}")

    # Save results
    results = {
        "model_type": "LinearRegression",
        "n_samples": n_samples,
        "n_features": n_features,
        "r_squared": float(model.rsquared),
        "adj_r_squared": float(model.rsquared_adj),
        "f_statistic": float(model.fvalue),
        "f_pvalue": float(model.f_pvalue),
        "coefficients": model.params.to_dict(),
        "standard_errors": model.bse.to_dict(),
        "p_values": model.pvalues.to_dict()
    }

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)

    # Save model summary
    with open(output_dir / "model_summary.txt", "w") as f:
        f.write(str(model.summary()))

    logger.info("Results and model summary saved successfully!")


if __name__ == "__main__":
    main()
`,
		Requirements: `statsmodels>=0.13.0
pandas>=1.3.0
numpy>=1.21.0
`,
	}
}
// XGBoostTemplate returns the XGBoost project template.
func XGBoostTemplate() MLProjectTemplate {
	return MLProjectTemplate{
		Name: "XGBoost",
		TrainScript: `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path

import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--max_depth", type=int, default=6)
    parser.add_argument("--learning_rate", type=float, default=0.1)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(f"Training XGBoost with {args.n_estimators} estimators...")

    # Generate synthetic data
    X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert to DMatrix format
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Train model
    params = {
        'max_depth': args.max_depth,
        'eta': args.learning_rate,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss'
    }
    model = xgb.train(params, dtrain, args.n_estimators)

    # Evaluate
    y_pred = model.predict(dtest)
    y_pred_binary = (y_pred > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred_binary)
    logger.info(f"Training completed. Accuracy: {accuracy:.4f}")

    # Save results
    results = {
        "model_type": "XGBoost",
        "n_estimators": args.n_estimators,
        "max_depth": args.max_depth,
        "learning_rate": args.learning_rate,
        "accuracy": accuracy,
        "n_samples": len(X),
        "n_features": X.shape[1]
    }

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)

    # Save model
    model.save_model(str(output_dir / "xgboost_model.json"))

    logger.info("Results and model saved successfully!")


if __name__ == "__main__":
    main()
`,
		Requirements: `xgboost>=1.5.0
scikit-learn>=1.0.0
numpy>=1.21.0
`,
	}
}
// PyTorchTemplate returns the PyTorch project template.
func PyTorchTemplate() MLProjectTemplate {
	return MLProjectTemplate{
		Name: "PyTorch",
		TrainScript: `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--learning_rate", type=float, default=0.001)
    parser.add_argument("--hidden_size", type=int, default=64)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(f"Training PyTorch model for {args.epochs} epochs...")

    # Generate synthetic data
    torch.manual_seed(42)
    X = torch.randn(1000, 20)
    y = torch.randint(0, 2, (1000,))

    # Create dataset and dataloader
    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    # Initialize model
    model = SimpleNet(20, args.hidden_size, 2)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    # Training loop
    model.train()
    for epoch in range(args.epochs):
        total_loss = 0
        correct = 0
        total = 0
        for batch_X, batch_y in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

        accuracy = correct / total
        avg_loss = total_loss / len(dataloader)
        logger.info(f"Epoch {epoch + 1}/{args.epochs}: Loss={avg_loss:.4f}, Acc={accuracy:.4f}")
        time.sleep(0.1)  # Small delay for logging

    # Final evaluation
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for batch_X, batch_y in dataloader:
            outputs = model(batch_X)
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
        final_accuracy = correct / total

    logger.info(f"Training completed. Final accuracy: {final_accuracy:.4f}")

    # Save results
    results = {
        "model_type": "PyTorch",
        "epochs": args.epochs,
        "batch_size": args.batch_size,
        "learning_rate": args.learning_rate,
        "hidden_size": args.hidden_size,
        "final_accuracy": final_accuracy,
        "n_samples": len(X),
        "input_features": X.shape[1]
    }

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)

    # Save model
    torch.save(model.state_dict(), output_dir / "pytorch_model.pth")

    logger.info("Results and model saved successfully!")


if __name__ == "__main__":
    main()
`,
		Requirements: `torch>=1.9.0
torchvision>=0.10.0
numpy>=1.21.0
`,
	}
}
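
// The sketch below is illustrative only and is NOT part of the original
// template API: it shows one plausible way a caller could materialize a
// template on disk and launch its training script. The function name
// runTemplate, the file names "train.py" and "requirements.txt", the
// "results" output directory, and the python3 invocation are all
// assumptions; only standard-library calls (os, os/exec, path/filepath)
// are used, and python3 plus the listed requirements are assumed to be
// installed already.
//
//	func runTemplate(t MLProjectTemplate, dir string) error {
//		// Lay the template out on disk (hypothetical layout).
//		if err := os.MkdirAll(dir, 0o755); err != nil {
//			return err
//		}
//		if err := os.WriteFile(filepath.Join(dir, "train.py"), []byte(t.TrainScript), 0o755); err != nil {
//			return err
//		}
//		if err := os.WriteFile(filepath.Join(dir, "requirements.txt"), []byte(t.Requirements), 0o644); err != nil {
//			return err
//		}
//		// Run the training script; each template writes results.json
//		// under the directory passed via --output_dir.
//		cmd := exec.Command("python3", "train.py", "--output_dir", "results")
//		cmd.Dir = dir
//		cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
//		return cmd.Run()
//	}
//
// Hypothetical usage: runTemplate(ScikitLearnTemplate(), "/tmp/sklearn-demo").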