fetch_ml/tests/fixtures/ml_templates.go
Jeremie Fraeys ea15af1833 Fix multi-user authentication and clean up debug code
- Fix YAML tags in auth config struct (json -> yaml)
- Update CLI configs to use pre-hashed API keys
- Remove double hashing in WebSocket client
- Fix port mapping (9102 -> 9103) in CLI commands
- Update permission keys to use jobs:read, jobs:create, etc.
- Clean up all debug logging from CLI and server
- All user roles now authenticate correctly:
  * Admin: Can queue jobs and see all jobs
  * Researcher: Can queue jobs and see own jobs
  * Analyst: Can see status (read-only access)

Multi-user authentication is now fully functional.
2025-12-06 12:35:32 -05:00

364 lines
11 KiB
Go

// Package tests provides ML experiment templates for testing.
package tests
// MLProjectTemplate represents a template for creating ML projects
type MLProjectTemplate struct {
	Name         string // human-readable framework name, e.g. "Scikit-learn"
	TrainScript  string // complete contents of the project's training script (Python)
	Requirements string // contents of the project's requirements.txt
}
// ScikitLearnTemplate returns the Scikit-learn project template.
//
// The embedded training script generates a synthetic binary-classification
// dataset, fits a RandomForestClassifier, and writes metrics to
// <output_dir>/results.json.
func ScikitLearnTemplate() MLProjectTemplate {
	return MLProjectTemplate{
		Name: "Scikit-learn",
		// Raw string literal: script content is kept at column 0 so the
		// emitted Python file is indented correctly.
		TrainScript: `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(f"Training Random Forest with {args.n_estimators} estimators...")
    # Generate synthetic data
    X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Train model
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=42)
    model.fit(X_train, y_train)
    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    logger.info(f"Training completed. Accuracy: {accuracy:.4f}")
    # Save results
    results = {
        "model_type": "RandomForest",
        "n_estimators": args.n_estimators,
        "accuracy": accuracy,
        "n_samples": len(X),
        "n_features": X.shape[1]
    }
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)
    logger.info("Results saved successfully!")
if __name__ == "__main__":
    main()
`,
		Requirements: `scikit-learn>=1.0.0
numpy>=1.21.0
pandas>=1.3.0
`,
	}
}
// StatsModelsTemplate returns the StatsModels project template.
//
// The embedded training script fits an OLS linear regression on synthetic
// data with known coefficients, then writes metrics to
// <output_dir>/results.json and the full model summary to
// <output_dir>/model_summary.txt.
func StatsModelsTemplate() MLProjectTemplate {
	return MLProjectTemplate{
		Name: "StatsModels",
		// Raw string literal: script content is kept at column 0 so the
		// emitted Python file is indented correctly.
		TrainScript: `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path
import numpy as np
import pandas as pd
import statsmodels.api as sm
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info("Training statsmodels linear regression...")
    # Generate synthetic data
    np.random.seed(42)
    n_samples = 1000
    n_features = 5
    X = np.random.randn(n_samples, n_features)
    # True coefficients
    true_coef = np.array([1.5, -2.0, 0.5, 3.0, -1.0])
    noise = np.random.randn(n_samples) * 0.1
    y = X @ true_coef + noise
    # Create DataFrame
    feature_names = [f"feature_{i}" for i in range(n_features)]
    X_df = pd.DataFrame(X, columns=feature_names)
    y_series = pd.Series(y, name="target")
    # Add constant for intercept
    X_with_const = sm.add_constant(X_df)
    # Fit model
    model = sm.OLS(y_series, X_with_const).fit()
    logger.info(f"Model fitted successfully. R-squared: {model.rsquared:.4f}")
    # Save results
    results = {
        "model_type": "LinearRegression",
        "n_samples": n_samples,
        "n_features": n_features,
        "r_squared": float(model.rsquared),
        "adj_r_squared": float(model.rsquared_adj),
        "f_statistic": float(model.fvalue),
        "f_pvalue": float(model.f_pvalue),
        "coefficients": model.params.to_dict(),
        "standard_errors": model.bse.to_dict(),
        "p_values": model.pvalues.to_dict()
    }
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)
    # Save model summary
    with open(output_dir / "model_summary.txt", "w") as f:
        f.write(str(model.summary()))
    logger.info("Results and model summary saved successfully!")
if __name__ == "__main__":
    main()
`,
		Requirements: `statsmodels>=0.13.0
pandas>=1.3.0
numpy>=1.21.0
`,
	}
}
// XGBoostTemplate returns the XGBoost project template.
//
// The embedded training script trains a binary logistic booster via the
// native xgb.train API on synthetic data, then writes metrics to
// <output_dir>/results.json and the model to
// <output_dir>/xgboost_model.json.
func XGBoostTemplate() MLProjectTemplate {
	return MLProjectTemplate{
		Name: "XGBoost",
		// Raw string literal: script content is kept at column 0 so the
		// emitted Python file is indented correctly.
		TrainScript: `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--max_depth", type=int, default=6)
    parser.add_argument("--learning_rate", type=float, default=0.1)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(f"Training XGBoost with {args.n_estimators} estimators...")
    # Generate synthetic data
    X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Convert to DMatrix format
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # Train model
    params = {
        'max_depth': args.max_depth,
        'eta': args.learning_rate,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss'
    }
    model = xgb.train(params, dtrain, args.n_estimators)
    # Evaluate
    y_pred = model.predict(dtest)
    y_pred_binary = (y_pred > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred_binary)
    logger.info(f"Training completed. Accuracy: {accuracy:.4f}")
    # Save results
    results = {
        "model_type": "XGBoost",
        "n_estimators": args.n_estimators,
        "max_depth": args.max_depth,
        "learning_rate": args.learning_rate,
        "accuracy": accuracy,
        "n_samples": len(X),
        "n_features": X.shape[1]
    }
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)
    # Save model
    model.save_model(str(output_dir / "xgboost_model.json"))
    logger.info("Results and model saved successfully!")
if __name__ == "__main__":
    main()
`,
		Requirements: `xgboost>=1.5.0
scikit-learn>=1.0.0
numpy>=1.21.0
`,
	}
}
// PyTorchTemplate returns the PyTorch project template.
//
// The embedded training script defines a two-layer feed-forward network,
// trains it on synthetic data with per-epoch progress logging, then writes
// metrics to <output_dir>/results.json and the state dict to
// <output_dir>/pytorch_model.pth.
func PyTorchTemplate() MLProjectTemplate {
	return MLProjectTemplate{
		Name: "PyTorch",
		// Raw string literal: script content is kept at column 0 so the
		// emitted Python file is indented correctly.
		TrainScript: `#!/usr/bin/env python3
import argparse, json, logging, time
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--learning_rate", type=float, default=0.001)
    parser.add_argument("--hidden_size", type=int, default=64)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(f"Training PyTorch model for {args.epochs} epochs...")
    # Generate synthetic data
    torch.manual_seed(42)
    X = torch.randn(1000, 20)
    y = torch.randint(0, 2, (1000,))
    # Create dataset and dataloader
    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
    # Initialize model
    model = SimpleNet(20, args.hidden_size, 2)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    # Training loop
    model.train()
    for epoch in range(args.epochs):
        total_loss = 0
        correct = 0
        total = 0
        for batch_X, batch_y in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
        accuracy = correct / total
        avg_loss = total_loss / len(dataloader)
        logger.info(f"Epoch {epoch + 1}/{args.epochs}: Loss={avg_loss:.4f}, Acc={accuracy:.4f}")
        time.sleep(0.1) # Small delay for logging
    # Final evaluation
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for batch_X, batch_y in dataloader:
            outputs = model(batch_X)
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    final_accuracy = correct / total
    logger.info(f"Training completed. Final accuracy: {final_accuracy:.4f}")
    # Save results
    results = {
        "model_type": "PyTorch",
        "epochs": args.epochs,
        "batch_size": args.batch_size,
        "learning_rate": args.learning_rate,
        "hidden_size": args.hidden_size,
        "final_accuracy": final_accuracy,
        "n_samples": len(X),
        "input_features": X.shape[1]
    }
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)
    # Save model
    torch.save(model.state_dict(), output_dir / "pytorch_model.pth")
    logger.info("Results and model saved successfully!")
if __name__ == "__main__":
    main()
`,
		Requirements: `torch>=1.9.0
torchvision>=0.10.0
numpy>=1.21.0
`,
	}
}