fetch_ml/podman/secure_runner.py
Jeremie Fraeys 4aecd469a1 feat: implement comprehensive monitoring and container orchestration
- Add Prometheus, Grafana, and Loki monitoring stack
- Include pre-configured dashboards for ML metrics and logs
- Add Podman container support with security policies
- Implement ML runtime environments for multiple frameworks
- Add containerized ML project templates (PyTorch, TensorFlow, etc.)
- Include secure runner with isolation and resource limits
- Add comprehensive log aggregation and alerting
2025-12-04 16:54:49 -05:00

402 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Secure ML Experiment Runner
Optimized for data scientists with maximum speed
"""
import argparse
import json
import os
from pathlib import Path
import re
import subprocess
import sys
import time
class SecurityPolicy:
"""Manages security policies for experiment execution"""
def __init__(
self, policy_file: str = "/etc/ml_runner/security_policy.json"
):
self.policy_file = policy_file
self.policy = self._load_policy()
def _load_policy(self) -> dict:
"""Load security policy from file"""
try:
with open(self.policy_file, "r") as f:
return json.load(f)
except FileNotFoundError:
# Default restrictive policy for Conda
return {
"allow_network": False,
"blocked_packages": [
"requests",
"urllib3",
"httpx",
"aiohttp",
"socket",
"telnetlib",
"ftplib",
"smtplib",
"paramiko",
"fabric",
],
"max_execution_time": 3600,
"max_memory_gb": 16,
"gpu_access": True,
"allow_file_writes": True,
"resource_limits": {
"cpu_count": 4,
"memory_gb": 16,
"gpu_memory_gb": 12,
},
# Conda-specific settings
"conda_env": "ml_env",
"package_manager": "mamba",
"ds_friendly": True,
}
def check_package_safety(self, package_name: str) -> bool:
"""Check if a package is allowed"""
if package_name in self.policy.get("blocked_packages", []):
return False
return True
def check_network_access(self, domain: str | None) -> bool:
"""Check if network access is allowed"""
if not self.policy.get("allow_network", False):
return False
if domain:
allowed_domains = self.policy.get("allowed_domains", [])
return domain in allowed_domains
return True
class CondaRunner:
"""Secure experiment runner with Conda + Mamba"""
def __init__(self, workspace_dir: str = "/workspace"):
self.workspace_dir = Path(workspace_dir)
self.security_policy = SecurityPolicy()
self.conda_env = self.security_policy.policy.get("conda_env", "ml_env")
self.package_manager = self.security_policy.policy.get(
"package_manager", "mamba"
)
self.results_dir = self.workspace_dir / "results"
# Detect if running in conda environment
self.is_conda = os.environ.get("CONDA_DEFAULT_ENV") is not None
# Conda paths
self.conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/conda")
self.env_path = f"{self.conda_prefix}/envs/{self.conda_env}"
def setup_environment(self, requirements_file: Path) -> bool:
"""Setup Conda environment with mamba"""
try:
# Read requirements
with open(requirements_file, "r") as f:
requirements = [
line.strip()
for line in f
if line.strip() and not line.startswith("#")
]
# Check each package for security
for req in requirements:
package_name = (
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
)
if not self.security_policy.check_package_safety(package_name):
print(
f"[SECURITY] Package '{package_name}' is blocked for security reasons"
)
return False
# Install packages with mamba (super fast!)
for req in requirements:
package_name = (
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
)
# Check if already installed with conda
check_cmd = [
"conda",
"run",
"-n",
self.conda_env,
"python",
"-c",
f"import {package_name.replace('-', '_')}",
]
result = subprocess.run(
check_cmd, capture_output=True, text=True
)
if result.returncode == 0:
print(f"[OK] {package_name} already installed in conda env")
continue
# Try conda-forge first (faster and more reliable)
print(
f"[INSTALL] Installing {req} with {self.package_manager}..."
)
install_cmd = [
self.package_manager,
"install",
"-n",
self.conda_env,
req,
"-c",
"conda-forge",
"-y",
]
result = subprocess.run(
install_cmd, capture_output=True, text=True, timeout=300
)
if result.returncode == 0:
print(f"[OK] Installed {req} with {self.package_manager}")
continue
# Fallback to pip if conda fails
print(f"[FALLBACK] Trying pip for {req}...")
pip_cmd = [
"conda",
"run",
"-n",
self.conda_env,
"pip",
"install",
req,
"--no-cache-dir",
]
result = subprocess.run(
pip_cmd, capture_output=True, text=True, timeout=300
)
if result.returncode != 0:
print(f"[ERROR] Failed to install {req}: {result.stderr}")
return False
print(f"[OK] Installed {req} with pip")
return True
except Exception as e:
print(f"[ERROR] Environment setup failed: {e}")
return False
def run_experiment(self, train_script: Path, args: list[str]) -> bool:
"""Run experiment in secure Conda environment"""
try:
if not train_script.exists():
print(f"[ERROR] Training script not found: {train_script}")
return False
# Create results directory
self.results_dir.mkdir(exist_ok=True)
# Setup environment variables for security
env = os.environ.copy()
env.update(
{
"CONDA_DEFAULT_ENV": self.conda_env,
"CUDA_VISIBLE_DEVICES": "0", # Allow GPU access
"SECURE_MODE": "1",
"NETWORK_ACCESS": (
"1"
if self.security_policy.check_network_access(None)
else "0"
),
"CONDA_MODE": "1",
}
)
# Prepare command
cmd = [
"conda",
"run",
"-n",
self.conda_env,
"python",
str(train_script),
] + (args or [])
# Add default output directory if not provided
if "--output_dir" not in " ".join(args or []):
cmd.extend(["--output_dir", str(self.results_dir)])
print(f"[CMD] Running command: {' '.join(cmd)}")
print(f"[ENV] Conda environment: {self.conda_env}")
print(f"[PKG] Package manager: {self.package_manager}")
# Run with timeout and resource limits
start_time = time.time()
max_time = self.security_policy.policy.get(
"max_execution_time", 3600
)
print(f"[RUN] Starting experiment: {train_script.name}")
print(f"[TIME] Time limit: {max_time}s")
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env,
cwd=str(self.workspace_dir),
)
try:
stdout, stderr = process.communicate(timeout=max_time)
execution_time = time.time() - start_time
if process.returncode == 0:
print(
f"[DONE] Experiment completed successfully in {execution_time:.1f}s"
)
# Save execution results
results = {
"status": "success",
"execution_time": execution_time,
"stdout": stdout,
"stderr": stderr,
"return_code": process.returncode,
"gpu_accessible": True,
"security_mode": "enabled",
"container_type": "conda",
"conda_env": self.conda_env,
"package_manager": self.package_manager,
"ds_friendly": True,
}
results_file = self.results_dir / "execution_results.json"
with open(results_file, "w") as f:
json.dump(results, f, indent=2)
return True
else:
print(
f"[ERROR] Experiment failed with return code {process.returncode}"
)
print(f"STDERR: {stderr}")
return False
except subprocess.TimeoutExpired:
process.kill()
print(f"[TIMEOUT] Experiment timed out after {max_time}s")
return False
except Exception as e:
print(f"[ERROR] Experiment execution failed: {e}")
return False
def check_gpu_access(self) -> bool:
"""Check if GPU is accessible"""
try:
# Check with conda environment
result = subprocess.run(
[
"conda",
"run",
"-n",
self.conda_env,
"python",
"-c",
"import torch; print('CUDA available:', torch.cuda.is_available())",
],
capture_output=True,
text=True,
timeout=10,
)
return result.returncode == 0
except Exception as e:
print("[ERROR] GPU access check failed:", e)
return False
def main():
parser = argparse.ArgumentParser(description="Secure ML Experiment Runner")
parser.add_argument(
"--workspace", default="/workspace", help="Workspace directory"
)
parser.add_argument("--requirements", help="Requirements file path")
parser.add_argument("--script", help="Training script path")
parser.add_argument(
"--args",
nargs=argparse.REMAINDER,
default=[],
help="Additional script arguments",
)
parser.add_argument(
"--check-gpu", action="store_true", help="Check GPU access"
)
args = parser.parse_args()
# Initialize secure runner
runner = CondaRunner(args.workspace)
# Check GPU access if requested
if args.check_gpu:
if runner.check_gpu_access():
print("[OK] GPU access available")
# Show GPU info with conda
result = subprocess.run(
[
"conda",
"run",
"-n",
runner.conda_env,
"python",
"-c",
"import torch; print(f'GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"None\"}')",
],
capture_output=True,
text=True,
)
if result.returncode == 0:
print(f"GPU Info: {result.stdout.strip()}")
else:
print("[ERROR] No GPU access available")
return 1
# If only checking GPU, exit here
if args.check_gpu:
return 0
# Setup environment
requirements_path = Path(args.requirements)
if not requirements_path.exists():
print(f"[ERROR] Requirements file not found: {requirements_path}")
return 1
print("[SETUP] Setting up secure environment...")
if not runner.setup_environment(requirements_path):
print("[ERROR] Failed to setup secure environment")
return 1
# Run experiment
script_path = Path(args.script)
if not script_path.exists():
print(f"[ERROR] Training script not found: {script_path}")
return 1
print("[RUN] Running experiment in secure container...")
if runner.run_experiment(script_path, args.args):
print("[DONE] Experiment completed successfully!")
return 0
else:
print("[ERROR] Experiment failed!")
return 1
# Script entry point: run main() and use its return value as the process
# exit status. Nothing happens when the module is merely imported.
if __name__ == "__main__":
    raise SystemExit(main())