- Fix YAML tags in auth config struct (json -> yaml) - Update CLI configs to use pre-hashed API keys - Remove double hashing in WebSocket client - Fix port mapping (9102 -> 9103) in CLI commands - Update permission keys to use jobs:read, jobs:create, etc. - Clean up all debug logging from CLI and server - All user roles now authenticate correctly: * Admin: Can queue jobs and see all jobs * Researcher: Can queue jobs and see own jobs * Analyst: Can see status (read-only access) Multi-user authentication is now fully functional.
364 lines
11 KiB
Go
364 lines
11 KiB
Go
// Package tests provides ML experiment templates for testing.
|
|
package tests
|
|
|
|
// MLProjectTemplate represents a template for creating ML projects
type MLProjectTemplate struct {
	// Name is the human-readable framework name (e.g. "Scikit-learn").
	Name string
	// TrainScript is the complete Python training script for the project.
	TrainScript string
	// Requirements is the pip requirements.txt content needed by TrainScript.
	Requirements string
}
|
|
|
|
// ScikitLearnTemplate returns the Scikit-learn project template
|
|
func ScikitLearnTemplate() MLProjectTemplate {
|
|
return MLProjectTemplate{
|
|
Name: "Scikit-learn",
|
|
TrainScript: `#!/usr/bin/env python3
|
|
import argparse, json, logging, time
|
|
from pathlib import Path
|
|
import numpy as np
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import accuracy_score
|
|
from sklearn.datasets import make_classification
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument("--n_estimators", type=int, default=100)
|
|
parser.add_argument("--output_dir", type=str, required=True)
|
|
args = parser.parse_args()
|
|
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
logger.info(f"Training Random Forest with {args.n_estimators} estimators...")
|
|
|
|
# Generate synthetic data
|
|
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
|
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
|
|
|
# Train model
|
|
model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=42)
|
|
model.fit(X_train, y_train)
|
|
|
|
# Evaluate
|
|
y_pred = model.predict(X_test)
|
|
accuracy = accuracy_score(y_test, y_pred)
|
|
|
|
logger.info(f"Training completed. Accuracy: {accuracy:.4f}")
|
|
|
|
# Save results
|
|
results = {
|
|
"model_type": "RandomForest",
|
|
"n_estimators": args.n_estimators,
|
|
"accuracy": accuracy,
|
|
"n_samples": len(X),
|
|
"n_features": X.shape[1]
|
|
}
|
|
|
|
output_dir = Path(args.output_dir)
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(output_dir / "results.json", "w") as f:
|
|
json.dump(results, f, indent=2)
|
|
|
|
logger.info("Results saved successfully!")
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
`,
|
|
Requirements: `scikit-learn>=1.0.0
|
|
numpy>=1.21.0
|
|
pandas>=1.3.0
|
|
`,
|
|
}
|
|
}
|
|
|
|
// StatsModelsTemplate returns the StatsModels project template
|
|
func StatsModelsTemplate() MLProjectTemplate {
|
|
return MLProjectTemplate{
|
|
Name: "StatsModels",
|
|
TrainScript: `#!/usr/bin/env python3
|
|
import argparse, json, logging, time
|
|
from pathlib import Path
|
|
import numpy as np
|
|
import pandas as pd
|
|
import statsmodels.api as sm
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument("--output_dir", type=str, required=True)
|
|
args = parser.parse_args()
|
|
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
logger.info("Training statsmodels linear regression...")
|
|
|
|
# Generate synthetic data
|
|
np.random.seed(42)
|
|
n_samples = 1000
|
|
n_features = 5
|
|
|
|
X = np.random.randn(n_samples, n_features)
|
|
# True coefficients
|
|
true_coef = np.array([1.5, -2.0, 0.5, 3.0, -1.0])
|
|
noise = np.random.randn(n_samples) * 0.1
|
|
y = X @ true_coef + noise
|
|
|
|
# Create DataFrame
|
|
feature_names = [f"feature_{i}" for i in range(n_features)]
|
|
X_df = pd.DataFrame(X, columns=feature_names)
|
|
y_series = pd.Series(y, name="target")
|
|
|
|
# Add constant for intercept
|
|
X_with_const = sm.add_constant(X_df)
|
|
|
|
# Fit model
|
|
model = sm.OLS(y_series, X_with_const).fit()
|
|
|
|
logger.info(f"Model fitted successfully. R-squared: {model.rsquared:.4f}")
|
|
|
|
# Save results
|
|
results = {
|
|
"model_type": "LinearRegression",
|
|
"n_samples": n_samples,
|
|
"n_features": n_features,
|
|
"r_squared": float(model.rsquared),
|
|
"adj_r_squared": float(model.rsquared_adj),
|
|
"f_statistic": float(model.fvalue),
|
|
"f_pvalue": float(model.f_pvalue),
|
|
"coefficients": model.params.to_dict(),
|
|
"standard_errors": model.bse.to_dict(),
|
|
"p_values": model.pvalues.to_dict()
|
|
}
|
|
|
|
output_dir = Path(args.output_dir)
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(output_dir / "results.json", "w") as f:
|
|
json.dump(results, f, indent=2)
|
|
|
|
# Save model summary
|
|
with open(output_dir / "model_summary.txt", "w") as f:
|
|
f.write(str(model.summary()))
|
|
|
|
logger.info("Results and model summary saved successfully!")
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
`,
|
|
Requirements: `statsmodels>=0.13.0
|
|
pandas>=1.3.0
|
|
numpy>=1.21.0
|
|
`,
|
|
}
|
|
}
|
|
|
|
// XGBoostTemplate returns the XGBoost project template
|
|
func XGBoostTemplate() MLProjectTemplate {
|
|
return MLProjectTemplate{
|
|
Name: "XGBoost",
|
|
TrainScript: `#!/usr/bin/env python3
|
|
import argparse, json, logging, time
|
|
from pathlib import Path
|
|
import numpy as np
|
|
import xgboost as xgb
|
|
from sklearn.datasets import make_classification
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import accuracy_score
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument("--n_estimators", type=int, default=100)
|
|
parser.add_argument("--max_depth", type=int, default=6)
|
|
parser.add_argument("--learning_rate", type=float, default=0.1)
|
|
parser.add_argument("--output_dir", type=str, required=True)
|
|
args = parser.parse_args()
|
|
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
logger.info(f"Training XGBoost with {args.n_estimators} estimators...")
|
|
|
|
# Generate synthetic data
|
|
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
|
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
|
|
|
# Convert to DMatrix format
|
|
dtrain = xgb.DMatrix(X_train, label=y_train)
|
|
dtest = xgb.DMatrix(X_test, label=y_test)
|
|
|
|
# Train model
|
|
params = {
|
|
'max_depth': args.max_depth,
|
|
'eta': args.learning_rate,
|
|
'objective': 'binary:logistic',
|
|
'eval_metric': 'logloss'
|
|
}
|
|
model = xgb.train(params, dtrain, args.n_estimators)
|
|
|
|
# Evaluate
|
|
y_pred = model.predict(dtest)
|
|
y_pred_binary = (y_pred > 0.5).astype(int)
|
|
accuracy = accuracy_score(y_test, y_pred_binary)
|
|
|
|
logger.info(f"Training completed. Accuracy: {accuracy:.4f}")
|
|
|
|
# Save results
|
|
results = {
|
|
"model_type": "XGBoost",
|
|
"n_estimators": args.n_estimators,
|
|
"max_depth": args.max_depth,
|
|
"learning_rate": args.learning_rate,
|
|
"accuracy": accuracy,
|
|
"n_samples": len(X),
|
|
"n_features": X.shape[1]
|
|
}
|
|
|
|
output_dir = Path(args.output_dir)
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(output_dir / "results.json", "w") as f:
|
|
json.dump(results, f, indent=2)
|
|
|
|
# Save model
|
|
model.save_model(str(output_dir / "xgboost_model.json"))
|
|
|
|
logger.info("Results and model saved successfully!")
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
`,
|
|
Requirements: `xgboost>=1.5.0
|
|
scikit-learn>=1.0.0
|
|
numpy>=1.21.0
|
|
`,
|
|
}
|
|
}
|
|
|
|
// PyTorchTemplate returns the PyTorch project template
|
|
func PyTorchTemplate() MLProjectTemplate {
|
|
return MLProjectTemplate{
|
|
Name: "PyTorch",
|
|
TrainScript: `#!/usr/bin/env python3
|
|
import argparse, json, logging, time
|
|
from pathlib import Path
|
|
import numpy as np
|
|
import torch
|
|
import torch.nn as nn
|
|
import torch.optim as optim
|
|
from torch.utils.data import TensorDataset, DataLoader
|
|
|
|
class SimpleNet(nn.Module):
|
|
def __init__(self, input_size, hidden_size, num_classes):
|
|
super(SimpleNet, self).__init__()
|
|
self.fc1 = nn.Linear(input_size, hidden_size)
|
|
self.relu = nn.ReLU()
|
|
self.fc2 = nn.Linear(hidden_size, num_classes)
|
|
|
|
def forward(self, x):
|
|
out = self.fc1(x)
|
|
out = self.relu(out)
|
|
out = self.fc2(out)
|
|
return out
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument("--epochs", type=int, default=10)
|
|
parser.add_argument("--batch_size", type=int, default=32)
|
|
parser.add_argument("--learning_rate", type=float, default=0.001)
|
|
parser.add_argument("--hidden_size", type=int, default=64)
|
|
parser.add_argument("--output_dir", type=str, required=True)
|
|
args = parser.parse_args()
|
|
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
logger.info(f"Training PyTorch model for {args.epochs} epochs...")
|
|
|
|
# Generate synthetic data
|
|
torch.manual_seed(42)
|
|
X = torch.randn(1000, 20)
|
|
y = torch.randint(0, 2, (1000,))
|
|
|
|
# Create dataset and dataloader
|
|
dataset = TensorDataset(X, y)
|
|
dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
|
|
|
|
# Initialize model
|
|
model = SimpleNet(20, args.hidden_size, 2)
|
|
criterion = nn.CrossEntropyLoss()
|
|
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
|
|
|
|
# Training loop
|
|
model.train()
|
|
for epoch in range(args.epochs):
|
|
total_loss = 0
|
|
correct = 0
|
|
total = 0
|
|
|
|
for batch_X, batch_y in dataloader:
|
|
optimizer.zero_grad()
|
|
outputs = model(batch_X)
|
|
loss = criterion(outputs, batch_y)
|
|
loss.backward()
|
|
optimizer.step()
|
|
|
|
total_loss += loss.item()
|
|
_, predicted = torch.max(outputs.data, 1)
|
|
total += batch_y.size(0)
|
|
correct += (predicted == batch_y).sum().item()
|
|
|
|
accuracy = correct / total
|
|
avg_loss = total_loss / len(dataloader)
|
|
|
|
logger.info(f"Epoch {epoch + 1}/{args.epochs}: Loss={avg_loss:.4f}, Acc={accuracy:.4f}")
|
|
time.sleep(0.1) # Small delay for logging
|
|
|
|
# Final evaluation
|
|
model.eval()
|
|
with torch.no_grad():
|
|
correct = 0
|
|
total = 0
|
|
for batch_X, batch_y in dataloader:
|
|
outputs = model(batch_X)
|
|
_, predicted = torch.max(outputs.data, 1)
|
|
total += batch_y.size(0)
|
|
correct += (predicted == batch_y).sum().item()
|
|
|
|
final_accuracy = correct / total
|
|
|
|
logger.info(f"Training completed. Final accuracy: {final_accuracy:.4f}")
|
|
|
|
# Save results
|
|
results = {
|
|
"model_type": "PyTorch",
|
|
"epochs": args.epochs,
|
|
"batch_size": args.batch_size,
|
|
"learning_rate": args.learning_rate,
|
|
"hidden_size": args.hidden_size,
|
|
"final_accuracy": final_accuracy,
|
|
"n_samples": len(X),
|
|
"input_features": X.shape[1]
|
|
}
|
|
|
|
output_dir = Path(args.output_dir)
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(output_dir / "results.json", "w") as f:
|
|
json.dump(results, f, indent=2)
|
|
|
|
# Save model
|
|
torch.save(model.state_dict(), output_dir / "pytorch_model.pth")
|
|
|
|
logger.info("Results and model saved successfully!")
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
`,
|
|
Requirements: `torch>=1.9.0
|
|
torchvision>=0.10.0
|
|
numpy>=1.21.0
|
|
`,
|
|
}
|
|
}
|