#!/usr/bin/env python3 """ Secure ML Experiment Runner Optimized for data scientists with maximum speed """ import argparse import json import os from pathlib import Path import subprocess import sys import time class SecurityPolicy: """Manages security policies for experiment execution""" def __init__( self, policy_file: str = "/etc/ml_runner/security_policy.json" ): self.policy_file = policy_file self.policy = self._load_policy() def _load_policy(self) -> dict: """Load security policy from file""" try: with open(self.policy_file, "r") as f: return json.load(f) except FileNotFoundError: # Default restrictive policy for Conda return { "allow_network": False, "blocked_packages": [ "requests", "urllib3", "httpx", "aiohttp", "socket", "telnetlib", "ftplib", "smtplib", "paramiko", "fabric", ], "max_execution_time": 3600, "max_memory_gb": 16, "gpu_access": True, "allow_file_writes": True, "resource_limits": { "cpu_count": 4, "memory_gb": 16, "gpu_memory_gb": 12, }, # Conda-specific settings "conda_env": "ml_env", "package_manager": "mamba", "ds_friendly": True, } def check_package_safety(self, package_name: str) -> bool: """Check if a package is allowed""" # Always allow ML tools even if they might be in blocked list allowed_tools = self.policy.get("allowed_network_tools", []) if package_name in allowed_tools: return True if package_name in self.policy.get("blocked_packages", []): return False return True def check_network_access(self, domain: str | None = None) -> bool: """Check if network access is allowed""" if not self.policy.get("allow_network", False): return False # Check if domain is in whitelist if domain: whitelist = self.policy.get("network_whitelist", []) return any(allowed in domain for allowed in whitelist) return True def check_tool_allowed(self, tool_name: str) -> bool: """Check if a specific tool is allowed network access""" allowed_tools = self.policy.get("allowed_network_tools", []) return tool_name in allowed_tools class CondaRunner: """Secure experiment runner with Conda + Mamba""" def __init__(self, workspace_dir: str = "/workspace"): self.workspace_dir = Path(workspace_dir) self.security_policy = SecurityPolicy() self.conda_env = self.security_policy.policy.get("conda_env", "ml_env") self.package_manager = self.security_policy.policy.get( "package_manager", "mamba" ) self.results_dir = self.workspace_dir / "results" # Detect if running in conda environment self.is_conda = os.environ.get("CONDA_DEFAULT_ENV") is not None # Conda paths self.conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/conda") self.env_path = f"{self.conda_prefix}/envs/{self.conda_env}" def setup_environment(self, requirements_file: Path) -> bool: """Setup Conda environment with mamba""" try: # Read requirements with open(requirements_file, "r") as f: requirements = [ line.strip() for line in f if line.strip() and not line.startswith("#") ] # Check each package for security for req in requirements: package_name = ( req.split("==")[0].split(">=")[0].split("<=")[0].strip() ) if not self.security_policy.check_package_safety(package_name): print( f"[SECURITY] Package '{package_name}' is blocked for security reasons" ) return False # Install packages with mamba (super fast!) for req in requirements: package_name = ( req.split("==")[0].split(">=")[0].split("<=")[0].strip() ) # Check if already installed with conda check_cmd = [ "conda", "run", "-n", self.conda_env, "python", "-c", f"import {package_name.replace('-', '_')}", ] result = subprocess.run( check_cmd, capture_output=True, text=True ) if result.returncode == 0: print(f"[OK] {package_name} already installed in conda env") continue # Try conda-forge first (faster and more reliable) print( f"[INSTALL] Installing {req} with {self.package_manager}..." ) install_cmd = [ self.package_manager, "install", "-n", self.conda_env, req, "-c", "conda-forge", "-y", ] result = subprocess.run( install_cmd, capture_output=True, text=True, timeout=300 ) if result.returncode == 0: print(f"[OK] Installed {req} with {self.package_manager}") continue # Fallback to pip if conda fails print(f"[FALLBACK] Trying pip for {req}...") pip_cmd = [ "conda", "run", "-n", self.conda_env, "pip", "install", req, "--no-cache-dir", ] result = subprocess.run( pip_cmd, capture_output=True, text=True, timeout=300 ) if result.returncode != 0: print(f"[ERROR] Failed to install {req}: {result.stderr}") return False print(f"[OK] Installed {req} with pip") return True except Exception as e: print(f"[ERROR] Environment setup failed: {e}") return False def run_experiment(self, train_script: Path, args: list[str]) -> bool: """Run experiment in secure Conda environment""" try: if not train_script.exists(): print(f"[ERROR] Training script not found: {train_script}") return False # Create results directory self.results_dir.mkdir(exist_ok=True) # Setup environment variables for security env = os.environ.copy() env.update( { "CONDA_DEFAULT_ENV": self.conda_env, "CUDA_VISIBLE_DEVICES": "0", # Allow GPU access "SECURE_MODE": "1", "NETWORK_ACCESS": ( "1" if self.security_policy.check_network_access(None) else "0" ), "CONDA_MODE": "1", } ) # Prepare command cmd = [ "conda", "run", "-n", self.conda_env, "python", str(train_script), ] + (args or []) # Add default output directory if not provided if "--output_dir" not in " ".join(args or []): cmd.extend(["--output_dir", str(self.results_dir)]) print(f"[CMD] Running command: {' '.join(cmd)}") print(f"[ENV] Conda environment: {self.conda_env}") print(f"[PKG] Package manager: {self.package_manager}") # Run with timeout and resource limits start_time = time.time() max_time = self.security_policy.policy.get( "max_execution_time", 3600 ) print(f"[RUN] Starting experiment: {train_script.name}") print(f"[TIME] Time limit: {max_time}s") process = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env, cwd=str(self.workspace_dir), ) try: stdout, stderr = process.communicate(timeout=max_time) execution_time = time.time() - start_time if process.returncode == 0: print( f"[DONE] Experiment completed successfully in {execution_time:.1f}s" ) # Save execution results results = { "status": "success", "execution_time": execution_time, "stdout": stdout, "stderr": stderr, "return_code": process.returncode, "gpu_accessible": True, "security_mode": "enabled", "container_type": "conda", "conda_env": self.conda_env, "package_manager": self.package_manager, "ds_friendly": True, } results_file = self.results_dir / "execution_results.json" with open(results_file, "w") as f: json.dump(results, f, indent=2) return True else: print( f"[ERROR] Experiment failed with return code {process.returncode}" ) print(f"STDERR: {stderr}") return False except subprocess.TimeoutExpired: process.kill() print(f"[TIMEOUT] Experiment timed out after {max_time}s") return False except Exception as e: print(f"[ERROR] Experiment execution failed: {e}") return False def check_gpu_access(self) -> bool: """Check if GPU is accessible""" try: # Check with conda environment result = subprocess.run( [ "conda", "run", "-n", self.conda_env, "python", "-c", "import torch; print('CUDA available:', torch.cuda.is_available())", ], capture_output=True, text=True, timeout=10, ) return result.returncode == 0 except Exception as e: print("[ERROR] GPU access check failed:", e) return False def main(): parser = argparse.ArgumentParser(description="Secure ML Experiment Runner") parser.add_argument( "--workspace", default="/workspace", help="Workspace directory" ) parser.add_argument("--requirements", help="Requirements file path") parser.add_argument("--script", help="Training script path") parser.add_argument( "--args", nargs=argparse.REMAINDER, default=[], help="Additional script arguments", ) parser.add_argument( "--check-gpu", action="store_true", help="Check GPU access" ) args = parser.parse_args() # Initialize secure runner runner = CondaRunner(args.workspace) # Check GPU access if requested if args.check_gpu: if runner.check_gpu_access(): print("[OK] GPU access available") # Show GPU info with conda result = subprocess.run( [ "conda", "run", "-n", runner.conda_env, "python", "-c", "import torch; print(f'GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"None\"}')", ], capture_output=True, text=True, ) if result.returncode == 0: print(f"GPU Info: {result.stdout.strip()}") else: print("[ERROR] No GPU access available") return 1 # If only checking GPU, exit here if args.check_gpu: return 0 # Setup environment requirements_path = Path(args.requirements) if not requirements_path.exists(): print(f"[ERROR] Requirements file not found: {requirements_path}") return 1 print("[SETUP] Setting up secure environment...") if not runner.setup_environment(requirements_path): print("[ERROR] Failed to setup secure environment") return 1 # Run experiment script_path = Path(args.script) if not script_path.exists(): print(f"[ERROR] Training script not found: {script_path}") return 1 print("[RUN] Running experiment in secure container...") if runner.run_experiment(script_path, args.args): print("[DONE] Experiment completed successfully!") return 0 else: print("[ERROR] Experiment failed!") return 1 if __name__ == "__main__": sys.exit(main())