feat(jupyter): improve runtime management and update security/workflow docs

This commit is contained in:
Jeremie Fraeys 2026-01-05 12:37:27 -05:00
parent 94112f0af5
commit 6b771e4a50
14 changed files with 1476 additions and 1325 deletions

View file

@ -1,462 +0,0 @@
# Jupyter Workspace and Experiment Integration
This guide describes the integration between Jupyter workspaces and FetchML experiments, enabling seamless resource chaining and data synchronization.
## Overview
The Jupyter-experiment integration allows you to:
- Link Jupyter workspaces with specific experiments
- Automatically track experiment metadata in workspaces
- Queue experiments directly from Jupyter workspaces
- Synchronize data between workspaces and experiments
- Maintain resource sharing and context across development and production workflows
## Architecture
### Components
1. **Workspace Metadata Manager** - Tracks relationships between workspaces and experiments
2. **Service Manager Integration** - Links Jupyter services with experiment context
3. **CLI Commands** - Provides user-facing integration commands
4. **API Endpoints** - Enables programmatic workspace-experiment management
### Data Flow
```
Jupyter Workspace  ←→  Workspace Metadata  ←→  Experiment Manager
        ↓                      ↓                       ↓
    Notebooks            Link Metadata           Experiment Data
    Scripts              Sync History            Metrics & Results
    Requirements         Auto-sync Config        Job Queue
```
## Quick Start
### 1. Create a Jupyter workspace
```bash
# Create a new workspace
mkdir my_experiment_workspace
cd my_experiment_workspace
# Add notebooks and scripts
# (See examples/jupyter_experiment_integration.py for sample setup)
```
### 2. Start Jupyter service
```bash
# Start Jupyter with workspace
ml jupyter start --workspace ./my_experiment_workspace --name my_experiment
# Access at http://localhost:8888
```
### 3. Link workspace with experiment
```bash
# Create experiment
ml experiment create --name "my_experiment" --description "Test experiment"
# Link workspace with experiment
ml jupyter experiment link --workspace ./my_experiment_workspace --experiment <experiment_id>
```
### 4. Work in Jupyter
- Open notebooks in browser
- Develop and test code interactively
- Use MLflow for experiment tracking
- Save results and models
### 5. Queue for production
```bash
# Queue experiment from workspace
ml jupyter experiment queue --workspace ./my_experiment_workspace --script experiment.py --name "production_run"
# Monitor progress
ml status
ml monitor
```
### 6. Sync data
```bash
# Push workspace changes to experiment
ml jupyter experiment sync --workspace ./my_experiment_workspace --direction push
# Pull experiment results to workspace
ml jupyter experiment sync --workspace ./my_experiment_workspace --direction pull
```
## CLI Commands
### `ml jupyter experiment link`
Link a Jupyter workspace with an experiment.
```bash
ml jupyter experiment link --workspace <path> --experiment <id>
```
**Options:**
- `--workspace`: Path to Jupyter workspace (default: ./workspace)
- `--experiment`: Experiment ID to link with
**Creates:**
- `.jupyter_experiment.json` metadata file in workspace
- Link record in workspace metadata manager
- Association between workspace and experiment
### `ml jupyter experiment queue`
Queue an experiment from a linked workspace.
```bash
ml jupyter experiment queue --workspace <path> --script <file> --name <name>
```
**Options:**
- `--workspace`: Path to workspace (default: ./workspace)
- `--script`: Python script to execute
- `--name`: Name for the queued job
**Behavior:**
- Detects linked experiment automatically
- Passes experiment context to job queue
- Uses workspace resources and configuration
### `ml jupyter experiment sync`
Synchronize data between workspace and experiment.
```bash
ml jupyter experiment sync --workspace <path> --direction <pull|push>
```
**Options:**
- `--workspace`: Path to workspace (default: ./workspace)
- `--direction`: Sync direction (pull or push)
**Sync Types:**
- **Pull**: Download experiment metrics, results, and data to workspace
- **Push**: Upload workspace notebooks, scripts, and results to experiment
### `ml jupyter experiment status`
Show experiment status for a workspace.
```bash
ml jupyter experiment status [workspace_path]
```
**Displays:**
- Linked experiment information
- Last sync time
- Experiment metadata
- Service association
## API Endpoints
### `/api/jupyter/experiments/link`
**Method:** POST
Link a workspace with an experiment.
```json
{
"workspace": "/path/to/workspace",
"experiment_id": "experiment_123",
"service_id": "jupyter-service-456"
}
```
**Response:**
```json
{
"status": "linked",
"data": {
"workspace_path": "/path/to/workspace",
"experiment_id": "experiment_123",
"linked_at": "2023-12-06T10:30:00Z",
"sync_direction": "bidirectional"
}
}
```
### `/api/jupyter/experiments/sync`
**Method:** POST
Synchronize workspace with experiment.
```json
{
"workspace": "/path/to/workspace",
"experiment_id": "experiment_123",
"direction": "push",
"sync_type": "all"
}
```
**Response:**
```json
{
"workspace": "/path/to/workspace",
"experiment_id": "experiment_123",
"direction": "push",
"sync_type": "all",
"synced_at": "2023-12-06T10:35:00Z",
"status": "completed"
}
```
### `/api/jupyter/services`
**Methods:** GET, POST, DELETE
Manage Jupyter services.
**GET:** List all services
**POST:** Start new service
**DELETE:** Stop service
## Workspace Metadata
### `.jupyter_experiment.json`
Each linked workspace contains a metadata file:
```json
{
"experiment_id": "experiment_123",
"service_id": "jupyter-service-456",
"linked_at": 1701864600,
"last_sync": 1701865200,
"sync_direction": "bidirectional",
"auto_sync": false,
"jupyter_integration": true,
"workspace_path": "/path/to/workspace",
"tags": ["development", "ml-experiment"]
}
```
### Metadata Manager
The workspace metadata manager maintains:
- Workspace-experiment relationships
- Sync history and timestamps
- Auto-sync configuration
- Tags and additional metadata
- Service associations
## Best Practices
### Workspace Organization
1. **One workspace per experiment** - Keep workspaces focused on specific experiments
2. **Use descriptive names** - Name workspaces and services clearly
3. **Version control** - Track workspace changes with git
4. **Clean separation** - Separate data, code, and results
### Experiment Development Workflow
1. **Create workspace** with notebooks and scripts
2. **Link with experiment** for tracking
3. **Develop interactively** in Jupyter
4. **Test locally** with sample data
5. **Queue for production** when ready
6. **Monitor results** and iterate
### Data Management
1. **Use requirements.txt** for dependencies
2. **Store data separately** from notebooks
3. **Use MLflow** for experiment tracking
4. **Sync regularly** to preserve work
5. **Clean up** old workspaces
### Resource Management
1. **Monitor service usage** with `ml jupyter list`
2. **Stop unused services** with `ml jupyter stop`
3. **Use resource limits** in configuration
4. **Enable auto-sync** for automated workflows
## Troubleshooting
### Common Issues
**Workspace not linked:**
```bash
Error: No experiment link found in workspace
```
**Solution:** Run `ml jupyter experiment link` first
**Service not found:**
```bash
Error: Service not found
```
**Solution:** Check service name with `ml jupyter list`
**Sync failed:**
```bash
Error: Failed to sync workspace
```
**Solution:** Check workspace permissions and experiment exists
### Debug Commands
```bash
# Check workspace metadata
cat ./workspace/.jupyter_experiment.json
# List all services
ml jupyter list
# Check experiment status
ml jupyter experiment status
# View service logs
podman logs <service_id>
```
### Recovery
**Lost workspace link:**
1. Find experiment ID with `ml experiment list`
2. Re-link with `ml jupyter experiment link`
3. Sync data with `ml jupyter experiment sync --direction pull`
**Service stuck:**
1. Stop with `ml jupyter stop <service_name>`
2. Check logs for errors
3. Restart with `ml jupyter start`
## Examples
### Complete Workflow
```bash
# 1. Setup workspace
mkdir my_ml_project
cd my_ml_project
echo "numpy>=1.20.0" > requirements.txt
echo "mlflow>=1.20.0" >> requirements.txt
# 2. Start Jupyter
ml jupyter start --workspace . --name my_project
# 3. Create experiment
ml experiment create --name "my_project" --description "ML project experiment"
# 4. Link workspace
ml jupyter experiment link --workspace . --experiment <experiment_id>
# 5. Work in Jupyter (browser)
# - Create notebooks
# - Write experiment scripts
# - Test locally
# 6. Queue for production
ml jupyter experiment queue --workspace . --script train_model.py --name "production_run"
# 7. Monitor
ml status
ml monitor
# 8. Sync results
ml jupyter experiment sync --workspace . --direction pull
# 9. Cleanup
ml jupyter stop my_project
```
### Python Integration
```python
import requests
import json
# Link workspace
response = requests.post('http://localhost:9101/api/jupyter/experiments/link', json={
'workspace': '/path/to/workspace',
'experiment_id': 'experiment_123'
})
# Sync workspace
response = requests.post('http://localhost:9101/api/jupyter/experiments/sync', json={
'workspace': '/path/to/workspace',
'experiment_id': 'experiment_123',
'direction': 'push',
'sync_type': 'all'
})
```
## Configuration
### Service Configuration
Jupyter services can be configured with experiment-specific settings:
```yaml
service:
default_resources:
memory_limit: "8G"
cpu_limit: "2"
gpu_access: false
max_services: 5
auto_sync_interval: "30m"
```
### Workspace Settings
Workspace metadata supports custom configuration:
```json
{
"auto_sync": true,
"sync_interval": "15m",
"sync_direction": "bidirectional",
"tags": ["development", "production"],
"additional_data": {
"environment": "test",
"team": "ml-team"
}
}
```
## Migration Guide
### From Standalone Jupyter
1. **Create workspace** from existing notebooks
2. **Link with experiment** using new commands
3. **Update scripts** to use experiment context
4. **Migrate data** to experiment storage
5. **Update workflows** to use integration
### From Job Queue Only
1. **Create workspace** for development
2. **Link with existing experiments**
3. **Add interactive development** phase
4. **Implement sync workflows**
5. **Update CI/CD pipelines**
## Future Enhancements
Planned improvements:
- **Auto-sync with file watching**
- **Workspace templates**
- **Collaborative workspaces**
- **Advanced resource sharing**
- **Git integration**
- **Docker compose support**
- **Kubernetes integration**
- **Advanced monitoring**

View file

@ -1,477 +0,0 @@
# Jupyter Package Management
This guide describes the secure package management system for Jupyter workspaces in FetchML, allowing data scientists to install packages only from trusted channels while maintaining security and compliance.
## Overview
The package management system provides:
- **Trusted Channel Control** - Only allow packages from approved channels
- **Package Approval Workflow** - Optional approval process for package installations
- **Security Filtering** - Block potentially dangerous packages
- **Audit Trail** - Track all package requests and installations
- **Workspace Isolation** - Package management per workspace
## Security Features
### Trusted Channels
By default, only packages from these trusted channels are allowed:
- `conda-forge` - Community-maintained packages
- `defaults` - Anaconda default packages
- `pytorch` - PyTorch ecosystem packages
- `nvidia` - NVIDIA GPU packages
### Blocked Packages
Potentially dangerous packages are blocked by default:
- `requests` - HTTP client library
- `urllib3` - HTTP library
- `httpx` - Async HTTP client
- `aiohttp` - Async HTTP server/client
### Approval Workflow
Administrators can configure:
- **Auto-approval** for safe packages
- **Manual approval** for sensitive packages
- **Required approval** for all packages
- **Package allowlist** for pre-approved packages
## CLI Commands
### `ml jupyter package install`
Request package installation in a workspace.
```bash
ml jupyter package install --package <name> [options]
```
**Options:**
- `--package`: Package name (required)
- `--version`: Specific version (optional)
- `--channel`: Channel source (default: conda-forge)
- `--workspace`: Workspace path (default: ./workspace)
- `--user`: Requesting user (default: current user)
**Examples:**
```bash
# Install numpy (auto-approved)
ml jupyter package install --package numpy
# Install specific version
ml jupyter package install --package pandas --version 1.3.0
# Install from specific channel
ml jupyter package install --package pytorch --channel pytorch
# Install in specific workspace
ml jupyter package install --package scikit-learn --workspace ./ml_project
```
### `ml jupyter package list`
List installed packages in a workspace.
```bash
ml jupyter package list [workspace_path]
```
**Output:**
```
Installed packages in workspace: ./workspace
Package Name Version Channel Installed By
------------ ------- ------- ------------
numpy 1.21.0 conda-forge user1
pandas 1.3.0 conda-forge user1
scikit-learn 1.0.0 conda-forge user2
```
### `ml jupyter package pending`
List pending package installation requests.
```bash
ml jupyter package pending [workspace_path]
```
**Output:**
```
Pending package requests for workspace: ./workspace
Package Name Version Channel Requested By Time
------------ ------- ------- ------------ ----
torch 1.9.0 pytorch user3 2023-12-06 10:30
tensorflow 2.8.0 defaults user4 2023-12-06 11:15
```
### `ml jupyter package approve`
Approve a pending package request.
```bash
ml jupyter package approve <package_name>
```
**Example:**
```bash
# Approve torch installation
ml jupyter package approve torch
# Package will be installed automatically after approval
```
### `ml jupyter package reject`
Reject a pending package request.
```bash
ml jupyter package reject <package_name> --reason <reason>
```
**Options:**
- `--reason`: Rejection reason (optional)
**Examples:**
```bash
# Reject with default reason
ml jupyter package reject suspicious-package
# Reject with custom reason
ml jupyter package reject old-package --reason "Security policy violation"
```
## Configuration
### Package Configuration
Package management is configured per workspace with these settings:
```json
{
"trusted_channels": [
"conda-forge",
"defaults",
"pytorch",
"nvidia"
],
"allowed_packages": {},
"blocked_packages": [
"requests",
"urllib3",
"httpx",
"aiohttp"
],
"require_approval": false,
"auto_approve_safe": true,
"max_packages": 100,
"install_timeout": "5m",
"allow_conda_forge": true,
"allow_pypi": false,
"allow_local": false
}
```
### Custom Configuration
Create a custom configuration for your environment:
```go
config := &jupyter.PackageConfig{
TrustedChannels: []string{
"conda-forge",
"company-internal",
"research-team",
},
AllowedPackages: map[string]bool{
"numpy": true,
"pandas": true,
"scikit-learn": true,
"tensorflow": true,
"pytorch": true,
},
BlockedPackages: []string{
"requests",
"urllib3",
"httpx",
},
RequireApproval: true,
AutoApproveSafe: false,
MaxPackages: 50,
InstallTimeout: 10 * time.Minute,
AllowCondaForge: true,
AllowPyPI: false,
AllowLocal: false,
}
```
## Security Policies
### Channel Trust Model
1. **conda-forge** - Community reviewed, generally safe
2. **defaults** - Anaconda curated, high quality
3. **pytorch** - Official PyTorch packages
4. **nvidia** - Official NVIDIA packages
5. **Custom channels** - Require explicit approval
### Package Categories
#### Auto-Approved
- Data science libraries (numpy, pandas, scipy)
- Machine learning frameworks (scikit-learn, tensorflow, pytorch)
- Visualization libraries (matplotlib, seaborn, plotly)
#### Manual Review Required
- Network libraries (requests, urllib3, httpx)
- System utilities
- Custom or unknown packages
#### Blocked
- Known security risks
- Outdated versions
- Packages with vulnerable dependencies
### Approval Workflows
#### Auto-Approval Mode
```bash
# Safe packages install immediately
ml jupyter package install --package numpy
# Output: Package installed successfully
```
#### Manual Approval Mode
```bash
# Package requires approval
ml jupyter package install --package requests
# Output: Package request created, awaiting approval
# Administrator approves
ml jupyter package approve requests
# Output: Package approved and installed
```
#### Rejection Mode
```bash
# Blocked packages are rejected automatically
ml jupyter package install --package suspicious-package
# Output: Package blocked for security reasons
```
## Best Practices
### For Data Scientists
1. **Use trusted channels** - Stick to conda-forge and defaults
2. **Specify versions** - Pin specific versions for reproducibility
3. **Check dependencies** - Review package dependencies before installation
4. **Document requirements** - Keep requirements.txt updated
5. **Use workspaces** - Isolate packages per project
### For Administrators
1. **Configure trusted channels** - Define approved channels for your organization
2. **Set approval policies** - Configure approval workflows
3. **Monitor requests** - Review pending package requests regularly
4. **Audit installations** - Track package installations and usage
5. **Update blocklists** - Keep blocked packages list current
### For Security Teams
1. **Review channel policies** - Validate channel trustworthiness
2. **Monitor package updates** - Track security vulnerabilities
3. **Audit package usage** - Review installed packages across workspaces
4. **Configure timeouts** - Set reasonable installation timeouts
5. **Implement logging** - Enable comprehensive audit logging
## Troubleshooting
### Common Issues
**Package not found:**
```bash
Error: Package 'unknown-package' not found in trusted channels
```
**Solution:** Check spelling and verify package exists in trusted channels
**Channel not trusted:**
```bash
Error: Channel 'untrusted-channel' is not trusted
```
**Solution:** Use trusted channel or request channel approval
**Package blocked:**
```bash
Error: Package 'requests' is blocked for security reasons
```
**Solution:** Use alternative package or request exception
**Installation timeout:**
```bash
Error: Package installation timed out
```
**Solution:** Check network connectivity and package size
### Debug Commands
```bash
# Check package status
ml jupyter package list
# View pending requests
ml jupyter package pending
# Check workspace configuration
cat ./workspace/.package_config.json
# View installation logs
cat ./workspace/.package_cache/install_*.log
```
### Recovery Procedures
**Failed installation:**
```bash
# Check request status
ml jupyter package pending
# Retry installation
ml jupyter package approve <package_name>
```
**Corrupted package:**
```bash
# Remove and reinstall
ml jupyter package install --package <name> --force
```
## API Integration
### Package Manager API
```go
// Create package manager
pm, err := jupyter.NewPackageManager(logger, config, workspacePath)
// Request package
req, err := pm.RequestPackage("numpy", "1.21.0", "conda-forge", "user1")
// Approve request
err = pm.ApprovePackageRequest(req.PackageName, "admin")
// Install package
err = pm.InstallPackage(req.PackageName)
// List packages
packages, err := pm.ListInstalledPackages()
```
### REST API Endpoints
```bash
# Request package
POST /api/jupyter/packages/request
{
"package_name": "numpy",
"version": "1.21.0",
"channel": "conda-forge",
"workspace": "./workspace"
}
# List packages
GET /api/jupyter/packages/list?workspace=./workspace
# Approve package
POST /api/jupyter/packages/approve
{
"package_name": "numpy",
"approval_user": "admin"
}
```
## Examples
### Data Science Workflow
```bash
# 1. Create workspace
mkdir ml_project
cd ml_project
# 2. Start Jupyter
ml jupyter start --workspace .
# 3. Install common packages
ml jupyter package install --package numpy
ml jupyter package install --package pandas
ml jupyter package install --package scikit-learn
# 4. Request specialized package
ml jupyter package install --package pytorch --channel pytorch
# 5. Check status
ml jupyter package list
# 6. Work in Jupyter
# (Open browser and start coding)
```
### Administrator Workflow
```bash
# 1. Check pending requests
ml jupyter package pending
# 2. Review package requests
ml jupyter package list
# 3. Approve safe packages
ml jupyter package approve numpy
ml jupyter package approve pandas
# 4. Reject risky packages
ml jupyter package reject requests --reason "Security policy"
# 5. Monitor installations
ml jupyter package list
```
### Security Configuration
```bash
# Write the package policy as a single JSON document.
# (Appending separate JSON objects with `>>` would produce an invalid file.)
cat > .package_config.json <<'EOF'
{
  "trusted_channels": ["conda-forge", "defaults"],
  "require_approval": true,
  "blocked_packages": ["requests", "urllib3"],
  "audit_logging": true
}
EOF
```
## Integration with Experiment System
Package management integrates with the experiment system:
```bash
# Link workspace with experiment
ml jupyter experiment link --workspace ./project --experiment exp_123
# Install packages for experiment
ml jupyter package install --package tensorflow
# Sync packages with experiment
ml jupyter experiment sync --workspace ./project --direction push
# Package info included in experiment metadata
ml experiment show exp_123
```
This ensures reproducibility by tracking package dependencies alongside experiments.

View file

@ -1,332 +1,604 @@
# Jupyter Service Architecture
# Jupyter Workflow Guide
Comprehensive guide to Jupyter workspace management, experiment integration, and secure package management in FetchML.
## Overview
This guide describes the new Jupyter service architecture that provides standalone Jupyter services complementary to the FetchML job queue system. This approach solves the architectural mismatch between interactive Jupyter sessions and batch-oriented job processing.
## Architecture Design
### Separate Service Model
The new architecture treats Jupyter as a **separate development service** rather than trying to fit it into the job queue model:
```
Development Workflow:
ml jupyter start --workspace ./my_project
→ Standalone Jupyter service with ML tools
→ Interactive development in browser
→ Direct container access
Production Workflow:
ml queue experiment.py
→ Batch job execution through queue
→ Scalable, monitored production runs
```
### Key Components
1. **Service Manager** - Handles container lifecycle and service orchestration
2. **Workspace Manager** - Manages volume mounting and workspace isolation
3. **Network Manager** - Handles port allocation and browser access
4. **Health Monitor** - Tracks service health and performance
5. **Configuration Manager** - Manages service settings and environment
The Jupyter workflow system provides:
- **Workspace Management**: Isolated development environments
- **Experiment Integration**: Seamless linking with ML experiments
- **Package Management**: Secure package installation from trusted sources
- **Resource Sharing**: Data and context synchronization
- **Security Controls**: Approved channels and package filtering
## Quick Start
### 1. Start a Jupyter Service
### Create Jupyter Workspace
```bash
# Basic start with defaults
ml jupyter start
# Start development stack
make dev-up
# Custom workspace and port
ml jupyter start --workspace ./my_project --port 8889
# Create workspace
./cli/zig-out/bin/ml jupyter create my-workspace
# Named service with custom image
ml jupyter start --name "data-science" --workspace ./workspace --image custom/jupyter:latest
# Access workspace
open http://localhost:8888
```
### 2. List Running Services
### Link with Experiment
```bash
ml jupyter list
```
# Queue experiment from workspace
./cli/zig-out/bin/ml jupyter queue --workspace my-workspace --experiment my-experiment
### 3. Check Service Status
```bash
# All services
ml jupyter status
# Specific service
ml jupyter status jupyter-data-science-1703123456
```
### 4. Stop a Service
```bash
# Stop specific service
ml jupyter stop jupyter-data-science-1703123456
# Stop first running service (if no name provided)
ml jupyter stop
# Monitor progress
./cli/zig-out/bin/ml status
```
## Workspace Management
### Workspace Commands
### Creating Workspaces
```bash
# List available workspaces
ml jupyter workspace list
# Create new workspace
./cli/zig-out/bin/ml jupyter create workspace-name
# Validate a workspace
ml jupyter workspace validate ./my_project
# Create with specific configuration
./cli/zig-out/bin/ml jupyter create workspace-name \
--cpu=4 \
--memory=8g \
--gpu=1 \
--image=jupyter/scipy-notebook:latest
# Get workspace information
ml jupyter workspace info ./my_project
```
### Workspace Structure
A valid workspace should contain:
- Jupyter notebooks (`.ipynb` files)
- Python scripts (`.py` files)
- Requirements files (`requirements.txt`, `environment.yml`)
- Configuration files (`pyproject.toml`)
## Advanced Configuration
### Service Configuration
The Jupyter service uses a comprehensive configuration system:
```json
{
"version": "1.0.0",
"environment": "development",
"service": {
"default_image": "localhost/ml-tools-runner:latest",
"default_port": 8888,
"max_services": 5,
"default_resources": {
"memory_limit": "8G",
"cpu_limit": "2",
"gpu_access": false
}
},
"network": {
"bind_address": "127.0.0.1",
"enable_token": false,
"allow_remote": false
},
"security": {
"allow_network": false,
"blocked_packages": ["requests", "urllib3", "httpx"],
"read_only_root": false
},
"health": {
"enabled": true,
"check_interval": "30s",
"timeout": "10s",
"auto_cleanup": true
}
}
```
### Environment-Specific Settings
- **Development**: More permissive, debug enabled, network access allowed
- **Production**: Restricted access, health monitoring, resource limits enforced
- **Testing**: Single service, health checks disabled, debug mode
## Available ML Tools
The container includes these pre-installed tools:
- **MLflow 3.7.0** - Experiment tracking and model registry
- **Streamlit 1.52.1** - Interactive web apps
- **Dash 3.3.0** - Plotly-based dashboards
- **Panel 1.8.4** - Data apps and dashboards
- **Bokeh 3.8.1** - Interactive visualizations
- **WandB 0.23.1** - Experiment tracking (requires API key)
## Using ML Tools in Jupyter
### MLflow Example
```python
import mlflow
# Start tracking
with mlflow.start_run() as run:
mlflow.log_param("model", "random_forest")
mlflow.log_metric("accuracy", 0.95)
print(f"Run ID: {run.info.run_id}")
```
### Streamlit Example
```python
import streamlit as st
st.title("My ML App")
st.write("Interactive dashboard with real-time updates")
```
### Dash Example
```python
# Dash 2.x/3.x style: dcc and html are imported from the dash package
# (the standalone dash_core_components / dash_html_components packages
# are deprecated, and app.run_server was replaced by app.run).
from dash import Dash, dcc, html
app = Dash(__name__)
app.run(debug=True, host='0.0.0.0', port=8050)
## Integration with Job Queue
The Jupyter service complements the existing job queue system:
### Development in Jupyter
```bash
# Start Jupyter for development
ml jupyter start --workspace ./experiment
# Work interactively in browser
# http://localhost:8888
```
### Production via Job Queue
```bash
# Queue the same experiment for production
ml queue experiment.py
# Monitor production run
ml status
ml monitor
```
### Hybrid Workflow
1. **Develop**: Use Jupyter for interactive development and prototyping
2. **Test**: Validate code in Jupyter with sample data
3. **Production**: Queue validated code for scalable execution
4. **Monitor**: Track production jobs while continuing development
## Security Features
### Container Isolation
- Rootless Podman containers
- Non-root user execution
- Resource limits enforced
- Network access control
### Workspace Security
- Bind mounts with proper permissions
- Path validation and restrictions
- SELinux compatibility (`:Z` flag)
### Network Security
- Localhost binding by default
- Token/password authentication optional
- Remote access requires explicit configuration
## Health Monitoring
### Automatic Health Checks
- HTTP endpoint monitoring
- Container status tracking
- Response time measurement
- Error detection and alerting
### Service Lifecycle Management
- Automatic cleanup of stale services
- Graceful shutdown handling
- Resource usage monitoring
- Service restart policies
## Troubleshooting
### Service Won't Start
```bash
# Check container runtime
podman --version
# Validate workspace
ml jupyter workspace validate ./my_project
# Check port availability
ml jupyter status
```
### Network Issues
```bash
# Check service URL
ml jupyter status service-name
# Test connectivity
curl http://localhost:8888
# Check port conflicts
netstat -tlnp | grep :8888
```
### Workspace Problems
```bash
# List workspaces
ml jupyter workspace list
./cli/zig-out/bin/ml jupyter list
# Check permissions
ls -la ./my_project
# Validate workspace structure
ml jupyter workspace info ./my_project
# Workspace details
./cli/zig-out/bin/ml jupyter info workspace-name
```
### Service Health Issues
### Workspace Configuration
**Resource Allocation**:
```yaml
# workspace-config.yaml
resources:
cpu: 4
memory: 8g
gpu: 1
disk: 20g
environment:
python_version: "3.11"
jupyter_version: "latest"
security:
trusted_channels: ["conda-forge", "defaults", "pytorch"]
blocked_packages: ["requests", "urllib3"]
```
### Access Control
```bash
# Check service health
ml jupyter status service-name
# Set workspace permissions
./cli/zig-out/bin/ml jupyter access workspace-name \
--user=data-scientist \
--role=editor
# View container logs
podman logs container-id
# Revoke access
./cli/zig-out/bin/ml jupyter revoke workspace-name data-scientist
```
# Restart service
ml jupyter stop service-name
ml jupyter start --name service-name --workspace ./my_project
## Experiment Integration
### Architecture
```
Jupyter Workspace ←→ Workspace Metadata ←→ Experiment Manager
↓ ↓ ↓
Notebooks Link Metadata Experiment Data
Scripts Sync History Metrics & Results
```
### Linking Workspaces and Experiments
```bash
# Link existing workspace to experiment
./cli/zig-out/bin/ml jupyter link workspace-name experiment-id
# Create workspace linked to new experiment
./cli/zig-out/bin/ml jupyter create workspace-name \
--experiment experiment-id
# Queue experiment from workspace
./cli/zig-out/bin/ml jupyter queue \
--workspace workspace-name \
--config experiment-config.yaml
```
### Data Synchronization
**Automatic Sync**:
- Notebook metadata
- Experiment results
- Configuration files
- Resource usage metrics
**Manual Sync**:
```bash
# Sync workspace to experiment
./cli/zig-out/bin/ml jupyter sync workspace-name --to-experiment
# Sync experiment to workspace
./cli/zig-out/bin/ml jupyter sync workspace-name --from-experiment
# Force full sync
./cli/zig-out/bin/ml jupyter sync workspace-name --full
```
### Workspace Metadata
**Tracked Information**:
- Workspace creation and modification dates
- Linked experiment IDs
- Resource usage history
- Package installation records
- Notebook execution history
```bash
# View workspace metadata
./cli/zig-out/bin/ml jupyter metadata workspace-name
# Export metadata
./cli/zig-out/bin/ml jupyter export workspace-name --format=json
```
## Package Management
### Security Features
**Trusted Channels** (default):
- `conda-forge` - Community-maintained packages
- `defaults` - Anaconda default packages
- `pytorch` - PyTorch ecosystem packages
- `nvidia` - NVIDIA GPU packages
**Blocked Packages** (security):
- `requests` - HTTP client library
- `urllib3` - HTTP library
- `socket` - Network sockets
- `subprocess` - Process execution
- `os.system` - System commands
### Package Installation
**In Jupyter Notebook**:
```python
# Install package (checks security)
!pip install numpy pandas scikit-learn
# Install from conda
!conda install -c conda-forge matplotlib seaborn
# Check package status
!pip list
```
**From CLI**:
```bash
# Install package in workspace
./cli/zig-out/bin/ml jupyter install workspace-name numpy
# Install with version
./cli/zig-out/bin/ml jupyter install workspace-name "pandas==2.0.0"
# Install from conda
./cli/zig-out/bin/ml jupyter install workspace-name matplotlib --conda
# List installed packages
./cli/zig-out/bin/ml jupyter packages workspace-name
```
### Package Approval Workflow
**Optional Approval Process**:
1. **Request**: User requests package installation
2. **Review**: Admin reviews package security
3. **Approval**: Package added to allowlist
4. **Installation**: Package installed in workspace
```bash
# Request package (requires approval)
./cli/zig-out/bin/ml jupyter request workspace-name custom-package
# Review requests (admin)
./cli/zig-out/bin/ml jupyter review --pending
# Approve request
./cli/zig-out/bin/ml jupyter approve request-id
# Deny request
./cli/zig-out/bin/ml jupyter deny request-id --reason="Security concern"
```
### Custom Channel Configuration
```yaml
# workspace-security.yaml
package_management:
trusted_channels:
- conda-forge
- defaults
- pytorch
- nvidia
- company-internal # Custom channel
blocked_packages:
- requests
- urllib3
- socket
- subprocess
- os.system
approval_required:
- tensorflow
- pytorch
- custom-package
allowlist:
- numpy
- pandas
- scikit-learn
- matplotlib
```
## Security and Compliance
### Workspace Isolation
**Network Isolation**:
- Workspaces run in isolated networks
- Controlled outbound internet access
- Inter-workspace communication blocked
**File System Isolation**:
- Separate storage volumes per workspace
- Controlled file access permissions
- Automatic cleanup on workspace deletion
### Audit Trail
**Tracked Activities**:
- Package installations and removals
- Notebook execution history
- Data access patterns
- Resource usage metrics
- User access logs
```bash
# View audit log
./cli/zig-out/bin/ml jupyter audit workspace-name
# Export audit report
./cli/zig-out/bin/ml jupyter audit workspace-name --export=csv
# Security scan
./cli/zig-out/bin/ml jupyter security-scan workspace-name
```
### Compliance Features
**Data Protection**:
- Automatic data encryption
- Secure data transfer protocols
- GDPR compliance features
- Data retention policies
**Access Controls**:
- Role-based permissions
- Multi-factor authentication
- Session timeout management
- IP whitelisting
## Advanced Features
### Custom Images
```bash
# Build custom workspace image
./cli/zig-out/bin/ml jupyter build custom-image \
--base=jupyter/scipy-notebook \
--packages="numpy pandas scikit-learn" \
--gpu-support
# Use custom image
./cli/zig-out/bin/ml jupyter create workspace-name \
--image=custom-image
```
### Workspace Templates
```yaml
# data-science-template.yaml
name: data-science-workspace
resources:
cpu: 8
memory: 16g
gpu: 1
packages:
- numpy
- pandas
- scikit-learn
- matplotlib
- seaborn
- jupyterlab
security:
trusted_channels: ["conda-forge", "defaults"]
approval_required: []
environment:
PYTHONPATH: "/workspace"
JUPYTER_ENABLE_LAB: "yes"
```
```bash
# Create from template
./cli/zig-out/bin/ml jupyter create workspace-name \
--template=data-science-template
```
### Collaboration Features
**Workspace Sharing**:
```bash
# Share workspace with team
./cli/zig-out/bin/ml jupyter share workspace-name \
--team=data-science-team \
--role=collaborator
# Collaborative notebooks
# Multiple users can edit simultaneously
# Real-time cursor tracking
# Comment and review features
```
**Version Control**:
```bash
# Git integration
./cli/zig-out/bin/ml jupyter git workspace-name init
./cli/zig-out/bin/ml jupyter git workspace-name add .
./cli/zig-out/bin/ml jupyter git workspace-name commit -m "Initial commit"
# Notebook versioning
./cli/zig-out/bin/ml jupyter version workspace-name notebook.ipynb
```
## Monitoring and Troubleshooting
### Performance Monitoring
```bash
# Workspace resource usage
./cli/zig-out/bin/ml jupyter stats workspace-name
# Real-time monitoring
./cli/zig-out/bin/ml jupyter monitor workspace-name
# Performance report
./cli/zig-out/bin/ml jupyter report workspace-name --format=html
```
### Common Issues
**Package Installation Failures**:
```bash
# Check package security
./cli/zig-out/bin/ml jupyter check-package package-name
# Bypass security (admin only)
./cli/zig-out/bin/ml jupyter install workspace-name package-name --force
# Clear package cache
./cli/zig-out/bin/ml jupyter clear-cache workspace-name
```
**Workspace Access Issues**:
```bash
# Check workspace status
./cli/zig-out/bin/ml jupyter status workspace-name
# Restart workspace
./cli/zig-out/bin/ml jupyter restart workspace-name
# Reset workspace
./cli/zig-out/bin/ml jupyter reset workspace-name --hard
```
**Performance Issues**:
```bash
# Check resource limits
./cli/zig-out/bin/ml jupyter limits workspace-name
# Scale resources
./cli/zig-out/bin/ml jupyter scale workspace-name --cpu=8 --memory=16g
# Optimize performance
./cli/zig-out/bin/ml jupyter optimize workspace-name
```
## Best Practices
### Development
1. Use descriptive service names
2. Organize workspaces by project
3. Leverage workspace validation
4. Monitor service health regularly
### Workspace Organization
### Production
1. Use production configuration
2. Enable security restrictions
3. Monitor resource usage
4. Implement cleanup policies
1. **Use Descriptive Names**: `project-name-environment`
2. **Resource Planning**: Allocate appropriate CPU/memory
3. **Regular Cleanup**: Remove unused workspaces
4. **Version Control**: Track important changes
### Workflow Integration
1. Develop interactively in Jupyter
2. Validate code before production
3. Use job queue for scalable execution
4. Monitor both systems independently
### Package Management
## Migration from Old Architecture
1. **Minimal Packages**: Install only necessary packages
2. **Version Pinning**: Use specific package versions
3. **Security First**: Always use trusted channels
4. **Regular Updates**: Keep packages updated
The new architecture provides these benefits over the old job queue approach:
### Security Practices
- **True Interactive Sessions** - Long-running containers with persistent state
- **Direct Browser Access** - No API gateway overhead
- **Better Resource Management** - Dedicated containers per service
- **Simplified Networking** - Direct port mapping
- **Enhanced Security** - Isolated development environment
- **Improved Monitoring** - Service-specific health tracking
1. **Principle of Least Privilege**: Minimal required permissions
2. **Regular Audits**: Review workspace activities
3. **Data Classification**: Handle sensitive data appropriately
4. **Compliance**: Follow organizational policies
To migrate:
1. Stop any existing Jupyter containers
2. Use new CLI commands (`ml jupyter start/stop/list`)
3. Organize projects into workspaces
4. Configure security settings as needed
## API Integration
### Programmatic Workspace Management
```python
import requests
# Create workspace
response = requests.post('/api/v1/jupyter/workspaces', json={
'name': 'my-workspace',
'resources': {'cpu': 4, 'memory': '8g'},
'security': {'trusted_channels': ['conda-forge']}
})
# Install package
requests.post(f'/api/v1/jupyter/workspaces/my-workspace/packages', json={
'package': 'numpy',
'version': '1.24.0'
})
# Link to experiment
requests.post('/api/v1/jupyter/workspaces/my-workspace/experiments', json={
'experiment_id': 'exp-123'
})
```
### Webhooks
```yaml
# workspace-webhooks.yaml
events:
- workspace_created
- package_installed
- experiment_linked
actions:
- slack_notification
- email_alert
- log_event
```
## WebSocket Protocol
### Overview
The Jupyter CLI commands use a binary WebSocket protocol for efficient, low-latency communication with the FetchML server. This provides better performance than HTTP and allows for real-time updates.
### Connection
```bash
# WebSocket endpoint
ws://SERVER_HOST:PORT/ws
# TLS-enabled endpoint
wss://SERVER_HOST:PORT/ws
```
**Authentication**: API key is hashed using SHA256 and the first 16 bytes are sent with each request.
### Binary Message Format
All Jupyter commands follow a binary protocol for optimal performance:
**Start Jupyter Service** (Opcode: 0x0D):
```
[opcode:1][api_key_hash:16][name_len:1][name:var][workspace_len:2][workspace:var][password_len:1][password:var]
```
**Stop Jupyter Service** (Opcode: 0x0E):
```
[opcode:1][api_key_hash:16][service_id_len:1][service_id:var]
```
**List Jupyter Services** (Opcode: 0x0F):
```
[opcode:1][api_key_hash:16]
```
### Response Packets
The server responds with structured response packets:
**Success Response**:
```
[packet_type:0x00][timestamp:8][message_len:2][message:var]
```
**Error Response**:
```
[packet_type:0x01][timestamp:8][error_code:1][message_len:2][message:var][details_len:2][details:var]
```
**Data Response** (for list command):
```
[packet_type:0x04][timestamp:8][type_len:2][type:var][payload_len:4][payload:var]
```
### CLI Examples
**Start Service**:
```bash
# Basic start
ml jupyter start --name my-notebook --workspace /path/to/workspace
# With password
ml jupyter start --name my-notebook --workspace /path/to/workspace --password mypass
```
**Stop Service**:
```bash
ml jupyter stop service-id-12345
```
**List Services**:
```bash
ml jupyter list
```
### Error Codes
Common error codes in binary responses:
- `0x00`: Unknown error
- `0x01`: Invalid request format
- `0x02`: Authentication failed
- `0x03`: Permission denied
- `0x10`: Server overloaded
- `0x14`: Timeout
### WebSocket vs HTTP
**Advantages of WebSocket**:
- ✅ Lower latency (persistent connection)
- ✅ Binary protocol (smaller payloads)
- ✅ Real-time updates possible
- ✅ Reduced server load
- ✅ Single connection for CLI
**When to use HTTP**:
- For programmatic API access
- For web-based integrations
- When WebSocket is unavailable
## See Also
- **[Testing Guide](testing.md)** - Testing Jupyter workflows
- **[Deployment Guide](deployment.md)** - Production deployment
- **[Security Guide](security.md)** - Security best practices
- **[API Reference](api-key-process.md)** - API documentation
- **[CLI Reference](cli-reference.md)** - Command-line tools

View file

@ -47,7 +47,7 @@ This document outlines security features, best practices, and hardening procedur
3. **Enable TLS** (Production only)
```yaml
# configs/config-prod.yaml
# configs/api/prod.yaml
server:
tls:
enabled: true
@ -68,7 +68,7 @@ This document outlines security features, best practices, and hardening procedur
5. **Restrict IP Access**
```yaml
# configs/config-prod.yaml
# configs/api/prod.yaml
auth:
ip_whitelist:
- "10.0.0.0/8"
@ -154,19 +154,33 @@ echo -n "your-api-key" | sha256sum
### Rotate API Keys
1. Generate new API key
2. Update `config-local.yaml` with new hash
2. Update your chosen API server config (for example a private copy of `configs/api/homelab-secure.yaml`) with the new hash
3. Distribute new key to users
4. Remove old key after grace period
### Revoke API Keys
Remove user entry from `config-local.yaml`:
Remove user entry from your API server config file:
```yaml
auth:
apikeys:
api_keys:
# user_to_revoke: # Comment out or delete
```
## Secret Flow (What lives where)
- **API server config (`configs/api/*.yaml`)**
- Stores **SHA256 hashes** of API keys (never raw keys).
- The repo-shipped configs intentionally contain `CHANGE_ME_...` placeholders.
- For real deployments, make a private copy (e.g. `/etc/fetch_ml/config.yaml`) and fill in real hashes.
- **Docker Compose `.env` / secret files**
- Used for values that should not be committed (e.g. `REDIS_PASSWORD`, Grafana admin password).
- `deployments/docker-compose.homelab-secure.yml` requires `REDIS_PASSWORD` to be set explicitly.
- **TLS certs**
- Provided as mounted files (e.g. `/app/ssl/cert.pem`, `/app/ssl/key.pem`).
## Network Security
### Production Network Topology

View file

@ -4,8 +4,10 @@ package container
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"github.com/jfraeys/fetch_ml/internal/config"
@ -39,9 +41,10 @@ type ContainerConfig struct {
// ResourceConfig defines resource limits for containers
type ResourceConfig struct {
MemoryLimit string `json:"memory_limit"`
CPULimit string `json:"cpu_limit"`
GPUAccess bool `json:"gpu_access"`
MemoryLimit string `json:"memory_limit"`
CPULimit string `json:"cpu_limit"`
GPUDevices []string `json:"gpu_devices"`
AppleGPU bool `json:"apple_gpu"`
}
// NetworkConfig defines network settings for containers
@ -49,10 +52,17 @@ type NetworkConfig struct {
AllowNetwork bool `json:"allow_network"`
}
// StartContainer starts a new container
func (pm *PodmanManager) StartContainer(ctx context.Context, config *ContainerConfig) (string, error) {
// podmanCgroupsMode reports the cgroups mode requested through the
// FETCHML_PODMAN_CGROUPS environment variable, with surrounding
// whitespace stripped. It returns "" when the variable is unset.
func podmanCgroupsMode() string {
	mode := os.Getenv("FETCHML_PODMAN_CGROUPS")
	return strings.TrimSpace(mode)
}
func BuildRunArgs(config *ContainerConfig) []string {
args := []string{"run", "-d"}
if podmanCgroupsMode() == "disabled" {
args = append(args, "--cgroups=disabled")
}
// Add name
if config.Name != "" {
args = append(args, "--name", config.Name)
@ -70,8 +80,12 @@ func (pm *PodmanManager) StartContainer(ctx context.Context, config *ContainerCo
if config.Resources.CPULimit != "" {
args = append(args, "--cpus", config.Resources.CPULimit)
}
if config.Resources.GPUAccess {
args = append(args, "--device", "/dev/dri")
if config.Resources.AppleGPU {
args = append(args, "--device", "/dev/metal")
args = append(args, "--device", "/dev/mps")
}
for _, device := range config.Resources.GPUDevices {
args = append(args, "--device", device)
}
// Add volumes
@ -94,6 +108,31 @@ func (pm *PodmanManager) StartContainer(ctx context.Context, config *ContainerCo
// Add image and command
args = append(args, config.Image)
args = append(args, config.Command...)
return args
}
// ParseContainerID extracts the container ID from podman's stdout.
// Podman may print warnings before the ID, so the last non-blank line
// of the output is taken as the ID. An error is returned when the
// output contains no non-blank line at all.
func ParseContainerID(output string) (string, error) {
	lines := strings.Split(strings.TrimSpace(output), "\n")
	for i := len(lines) - 1; i >= 0; i-- {
		if candidate := strings.TrimSpace(lines[i]); candidate != "" {
			return candidate, nil
		}
	}
	return "", fmt.Errorf("no container ID returned")
}
// StartContainer starts a new container
func (pm *PodmanManager) StartContainer(
ctx context.Context,
config *ContainerConfig,
) (string, error) {
args := BuildRunArgs(config)
// Execute command
cmd := exec.CommandContext(ctx, "podman", args...)
@ -102,10 +141,9 @@ func (pm *PodmanManager) StartContainer(ctx context.Context, config *ContainerCo
return "", fmt.Errorf("failed to start container: %w, output: %s", err, string(output))
}
// Return container ID (first line of output)
containerID := strings.TrimSpace(string(output))
if containerID == "" {
return "", fmt.Errorf("no container ID returned")
containerID, err := ParseContainerID(string(output))
if err != nil {
return "", err
}
pm.logger.Info("container started", "container_id", containerID, "name", config.Name)
@ -124,6 +162,29 @@ func (pm *PodmanManager) StopContainer(ctx context.Context, containerID string)
return nil
}
// GetContainerStateStatus returns the container's lifecycle state from `podman inspect`.
// Typical values: running, exited, created, paused. Returns "unknown" when
// inspect succeeds but prints nothing usable.
func (pm *PodmanManager) GetContainerStateStatus(
	ctx context.Context,
	containerID string,
) (string, error) {
	// Reject IDs containing shell metacharacters to prevent injection.
	if containerID == "" || strings.ContainsAny(containerID, "&;|<>$`\"'") {
		return "", fmt.Errorf("invalid container ID: %s", containerID)
	}

	inspect := exec.CommandContext(ctx, "podman", "inspect", "--format", "{{.State.Status}}", containerID) //nolint:gosec
	out, err := inspect.CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("failed to inspect container: %w, output: %s", err, string(out))
	}

	if state := strings.TrimSpace(string(out)); state != "" {
		return state, nil
	}
	return "unknown", nil
}
// RemoveContainer removes a container
func (pm *PodmanManager) RemoveContainer(ctx context.Context, containerID string) error {
cmd := exec.CommandContext(ctx, "podman", "rm", containerID)
@ -137,7 +198,10 @@ func (pm *PodmanManager) RemoveContainer(ctx context.Context, containerID string
}
// GetContainerStatus gets the status of a container
func (pm *PodmanManager) GetContainerStatus(ctx context.Context, containerID string) (string, error) {
func (pm *PodmanManager) GetContainerStatus(
ctx context.Context,
containerID string,
) (string, error) {
// Validate containerID to prevent injection
if containerID == "" || strings.ContainsAny(containerID, "&;|<>$`\"'") {
return "", fmt.Errorf("invalid container ID: %s", containerID)
@ -153,7 +217,16 @@ func (pm *PodmanManager) GetContainerStatus(ctx context.Context, containerID str
status := strings.TrimSpace(string(output))
if status == "" {
// Container might be stopped, check all containers
cmd = exec.CommandContext(ctx, "podman", "ps", "-a", "--filter", "id="+containerID, "--format", "{{.Status}}") //nolint:gosec
cmd = exec.CommandContext(
ctx,
"podman",
"ps",
"-a",
"--filter",
"id="+containerID,
"--format",
"{{.Status}}",
) //nolint:gosec
output, err = cmd.CombinedOutput()
if err != nil {
return "", fmt.Errorf("failed to get container status: %w, output: %s", err, string(output))
@ -167,6 +240,33 @@ func (pm *PodmanManager) GetContainerStatus(ctx context.Context, containerID str
return status, nil
}
// ExecContainer executes a command inside a running container and returns the output
// (combined stdout/stderr). Both the container ID and every command argument are
// screened for shell metacharacters before being handed to podman.
func (pm *PodmanManager) ExecContainer(ctx context.Context, containerID string, command []string) (string, error) {
	// Reject IDs containing shell metacharacters to prevent injection.
	if containerID == "" || strings.ContainsAny(containerID, "&;|<>$`\"'") {
		return "", fmt.Errorf("invalid container ID: %s", containerID)
	}

	// Apply the same screening to each command argument.
	for _, arg := range command {
		if strings.ContainsAny(arg, "&;|<>$`\"'") {
			return "", fmt.Errorf("invalid command argument: %s", arg)
		}
	}

	execArgs := append([]string{"exec", containerID}, command...)
	out, err := exec.CommandContext(ctx, "podman", execArgs...).CombinedOutput() //nolint:gosec
	if err != nil {
		return "", fmt.Errorf("failed to execute command in container: %w, output: %s", err, string(out))
	}
	return string(out), nil
}
// PodmanConfig holds configuration for Podman container execution
type PodmanConfig struct {
Image string
@ -174,16 +274,33 @@ type PodmanConfig struct {
Results string
ContainerWorkspace string
ContainerResults string
GPUAccess bool
AppleGPU bool
GPUDevices []string
Env map[string]string
Volumes map[string]string
Memory string
CPUs string
}
// PodmanResourceOverrides converts per-task resource requests into Podman-compatible
// `--cpus` and `--memory` flag values.
//
// cpu and memoryGB are treated as optional; values <= 0 return empty overrides.
func PodmanResourceOverrides(cpu int, memoryGB int) (string, string) {
	var cpuFlag, memFlag string
	if cpu > 0 {
		cpuFlag = strconv.Itoa(cpu)
	}
	if memoryGB > 0 {
		memFlag = strconv.Itoa(memoryGB) + "g"
	}
	return cpuFlag, memFlag
}
// BuildPodmanCommand builds a Podman command for executing ML experiments
func BuildPodmanCommand(
ctx context.Context,
cfg PodmanConfig,
scriptPath, requirementsPath string,
scriptPath, depsPath string,
extraArgs []string,
) *exec.Cmd {
args := []string{
@ -214,14 +331,26 @@ func BuildPodmanCommand(
resultsMount := fmt.Sprintf("%s:%s:rw", cfg.Results, cfg.ContainerResults)
args = append(args, "-v", resultsMount)
if cfg.GPUAccess {
args = append(args, "--device", "/dev/dri")
// Mount additional volumes
for hostPath, containerPath := range cfg.Volumes {
mount := fmt.Sprintf("%s:%s", hostPath, containerPath)
args = append(args, "-v", mount)
}
// Use injected GPU device paths for Apple GPU or custom configurations
for _, device := range cfg.GPUDevices {
args = append(args, "--device", device)
}
// Add environment variables
for key, value := range cfg.Env {
args = append(args, "-e", fmt.Sprintf("%s=%s", key, value))
}
// Image and command
args = append(args, cfg.Image,
"--workspace", cfg.ContainerWorkspace,
"--requirements", requirementsPath,
"--deps", depsPath,
"--script", scriptPath,
)

View file

@ -14,6 +14,38 @@ import (
// defaultBlockedPackages lists the packages blocked in Jupyter workspaces by default.
var defaultBlockedPackages = []string{"requests", "urllib3", "httpx"}

// DefaultBlockedPackages returns a fresh copy of the default blocked-package
// list, so callers may modify the result without affecting the shared default.
func DefaultBlockedPackages() []string {
	out := make([]string, len(defaultBlockedPackages))
	copy(out, defaultBlockedPackages)
	return out
}
// DefaultEnhancedSecurityConfigFromEnv builds the default security config and
// applies environment overrides:
//
//   - FETCHML_JUPYTER_BLOCKED_PACKAGES: comma-separated list that replaces
//     the default blocked-package list.
//   - FETCHML_JUPYTER_ALLOWED_PACKAGES: comma-separated list that replaces
//     the allowed-package set.
//
// Each entry is whitespace-trimmed; empty entries (e.g. produced by trailing
// or doubled commas) are dropped rather than recorded as ""-named packages.
func DefaultEnhancedSecurityConfigFromEnv() *EnhancedSecurityConfig {
	securityConfig := GetDefaultSecurityConfig()

	if blockedPkgs := os.Getenv("FETCHML_JUPYTER_BLOCKED_PACKAGES"); blockedPkgs != "" {
		blocked := make([]string, 0)
		for _, pkg := range strings.Split(blockedPkgs, ",") {
			if pkg = strings.TrimSpace(pkg); pkg != "" {
				blocked = append(blocked, pkg)
			}
		}
		securityConfig.BlockedPackages = blocked
	}

	if allowedPkgs := os.Getenv("FETCHML_JUPYTER_ALLOWED_PACKAGES"); allowedPkgs != "" {
		securityConfig.AllowedPackages = make(map[string]bool)
		for _, pkg := range strings.Split(allowedPkgs, ",") {
			if pkg = strings.TrimSpace(pkg); pkg != "" {
				securityConfig.AllowedPackages[pkg] = true
			}
		}
	}

	return securityConfig
}
// envDefaultImage returns the container image used for new Jupyter services,
// honoring the FETCHML_JUPYTER_DEFAULT_IMAGE environment variable when it is
// set to a non-blank value; otherwise the built-in default image is used.
func envDefaultImage() string {
	image := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_DEFAULT_IMAGE"))
	if image == "" {
		return "localhost/ml-tools-runner:latest"
	}
	return image
}
// ConfigManager manages Jupyter service configuration
type ConfigManager struct {
logger *logging.Logger
@ -92,7 +124,11 @@ type AdvancedSettingsConfig struct {
}
// NewConfigManager creates a new configuration manager
func NewConfigManager(logger *logging.Logger, configPath string, environment string) (*ConfigManager, error) {
func NewConfigManager(
logger *logging.Logger,
configPath string,
environment string,
) (*ConfigManager, error) {
cm := &ConfigManager{
logger: logger,
configPath: configPath,
@ -215,14 +251,14 @@ func (cm *ConfigManager) getDefaultConfig() *JupyterConfig {
Version: "1.0.0",
Environment: cm.environment,
Service: ServiceConfig{
DefaultImage: "localhost/ml-tools-runner:latest",
DefaultImage: envDefaultImage(),
DefaultPort: 8888,
DefaultWorkspace: "./workspace",
MaxServices: 5,
DefaultResources: ResourceConfig{
MemoryLimit: "8G",
CPULimit: "2",
GPUAccess: false,
GPUDevices: nil,
},
SecuritySettings: SecurityConfig{
AllowNetwork: false,
@ -270,7 +306,7 @@ func (cm *ConfigManager) getDefaultConfig() *JupyterConfig {
Resources: ResourceConfig{
MemoryLimit: "8G",
CPULimit: "2",
GPUAccess: false,
GPUDevices: nil,
},
Health: HealthConfig{
Enabled: true,
@ -290,7 +326,7 @@ func (cm *ConfigManager) getDefaultConfig() *JupyterConfig {
MaxAge: "7d",
},
DefaultSettings: DefaultSettingsConfig{
Image: "localhost/ml-tools-runner:latest",
Image: envDefaultImage(),
Port: 8888,
Workspace: "./workspace",
Environment: map[string]string{"JUPYTER_ENABLE_LAB": "yes"},

View file

@ -74,7 +74,10 @@ func (hm *HealthMonitor) RemoveService(serviceID string) {
}
// CheckServiceHealth checks the health of a specific service
func (hm *HealthMonitor) CheckServiceHealth(ctx context.Context, serviceID string) (*HealthStatus, error) {
func (hm *HealthMonitor) CheckServiceHealth(
ctx context.Context,
serviceID string,
) (*HealthStatus, error) {
service, exists := hm.services[serviceID]
if !exists {
return nil, fmt.Errorf("service %s not found", serviceID)
@ -100,7 +103,10 @@ func (hm *HealthMonitor) CheckServiceHealth(ctx context.Context, serviceID strin
responseTime := time.Since(start)
if err != nil {
healthStatus.Status = statusUnhealthy
healthStatus.Errors = append(healthStatus.Errors, fmt.Sprintf("HTTP request failed: %v", err))
healthStatus.Errors = append(
healthStatus.Errors,
fmt.Sprintf("HTTP request failed: %v", err),
)
healthStatus.ResponseTime = responseTime
return healthStatus, nil
}
@ -119,7 +125,10 @@ func (hm *HealthMonitor) CheckServiceHealth(ctx context.Context, serviceID strin
healthStatus.Status = "healthy"
} else {
healthStatus.Status = statusUnhealthy
healthStatus.Errors = append(healthStatus.Errors, fmt.Sprintf("HTTP status %d", resp.StatusCode))
healthStatus.Errors = append(
healthStatus.Errors,
fmt.Sprintf("HTTP status %d", resp.StatusCode),
)
}
// Check response headers for Jupyter-specific indicators
@ -213,13 +222,16 @@ func (hm *HealthMonitor) StartMonitoring(ctx context.Context) {
}
// GetServiceMetrics returns detailed metrics for a service
func (hm *HealthMonitor) GetServiceMetrics(ctx context.Context, serviceID string) (map[string]interface{}, error) {
func (hm *HealthMonitor) GetServiceMetrics(
ctx context.Context,
serviceID string,
) (map[string]any, error) {
service, exists := hm.services[serviceID]
if !exists {
return nil, fmt.Errorf("service %s not found", serviceID)
}
metrics := make(map[string]interface{})
metrics := make(map[string]any)
// Basic service info
metrics["service_id"] = service.ID
@ -253,9 +265,12 @@ func (hm *HealthMonitor) GetServiceMetrics(ctx context.Context, serviceID string
}
// getContainerMetrics gets container-specific metrics
func (hm *HealthMonitor) getContainerMetrics(_ context.Context, _ string) map[string]interface{} {
func (hm *HealthMonitor) getContainerMetrics(
_ context.Context,
_ string,
) map[string]any {
// Lightweight container metrics - avoid heavy system calls
metrics := make(map[string]interface{})
metrics := make(map[string]any)
// Basic status check only - keep it minimal
metrics["status"] = "running"
@ -295,7 +310,10 @@ func (hm *HealthMonitor) ValidateService(service *JupyterService) []string {
}
// StartContinuousMonitoring starts continuous health monitoring
func (hm *HealthMonitor) StartContinuousMonitoring(ctx context.Context, interval time.Duration) {
func (hm *HealthMonitor) StartContinuousMonitoring(
ctx context.Context,
interval time.Duration,
) {
ticker := time.NewTicker(interval)
defer ticker.Stop()
@ -355,7 +373,10 @@ func isValidURL(url string) bool {
}
// GetHealthHistory returns health check history for a service (lightweight version)
func (hm *HealthMonitor) GetHealthHistory(_ string, duration time.Duration) ([]*HealthStatus, error) {
func (hm *HealthMonitor) GetHealthHistory(
_ string,
_ time.Duration,
) ([]*HealthStatus, error) {
// Return empty for now - keep it lightweight
return []*HealthStatus{}, nil
}
@ -366,11 +387,11 @@ func (hm *HealthMonitor) SetInterval(interval time.Duration) {
}
// GetMonitoringStatus returns the current monitoring status
func (hm *HealthMonitor) GetMonitoringStatus() map[string]interface{} {
func (hm *HealthMonitor) GetMonitoringStatus() map[string]any {
hm.servicesMutex.RLock()
defer hm.servicesMutex.RUnlock()
return map[string]interface{}{
return map[string]any{
"monitored_services": len(hm.services),
"check_interval": hm.interval.String(),
"timeout": hm.client.Timeout.String(),

View file

@ -113,7 +113,10 @@ func (nm *NetworkManager) ValidateNetworkConfig(config *NetworkConfig) error {
}
// PrepareNetworkConfig prepares network configuration for a service
func (nm *NetworkManager) PrepareNetworkConfig(serviceID string, userConfig *NetworkConfig) (*NetworkConfig, error) {
func (nm *NetworkManager) PrepareNetworkConfig(
serviceID string,
userConfig *NetworkConfig,
) (*NetworkConfig, error) {
config := &NetworkConfig{
ContainerPort: 8888,
BindAddress: "127.0.0.1",
@ -323,7 +326,11 @@ func (nm *NetworkManager) TestConnectivity(_ context.Context, config *NetworkCon
// Simple connectivity test
dialer := &net.Dialer{Timeout: 5 * time.Second}
conn, err := dialer.DialContext(context.Background(), "tcp", fmt.Sprintf("%s:%d", config.BindAddress, config.HostPort))
conn, err := dialer.DialContext(
context.Background(),
"tcp",
fmt.Sprintf("%s:%d", config.BindAddress, config.HostPort),
)
if err != nil {
return fmt.Errorf("cannot connect to %s: %w", url, err)
}

View file

@ -66,7 +66,11 @@ type PackageInfo struct {
}
// NewPackageManager creates a new package manager
func NewPackageManager(logger *logging.Logger, config *PackageConfig, workspacePath string) (*PackageManager, error) {
func NewPackageManager(
logger *logging.Logger,
config *PackageConfig,
workspacePath string,
) (*PackageManager, error) {
pm := &PackageManager{
logger: logger,
trustedChannels: config.TrustedChannels,
@ -106,7 +110,11 @@ func (pm *PackageManager) ValidatePackageRequest(req *PackageRequest) error {
// Check if channel is trusted
if req.Channel != "" {
if !pm.isChannelTrusted(req.Channel) {
return fmt.Errorf("channel '%s' is not trusted. Allowed channels: %v", req.Channel, pm.trustedChannels)
return fmt.Errorf(
"channel '%s' is not trusted. Allowed channels: %v",
req.Channel,
pm.trustedChannels,
)
}
} else {
// Default to conda-forge if no channel specified
@ -158,7 +166,12 @@ func (pm *PackageManager) isValidPackageName(name string) bool {
}
// RequestPackage creates a package installation request
func (pm *PackageManager) RequestPackage(packageName, version, channel, requestedBy string) (*PackageRequest, error) {
func (pm *PackageManager) RequestPackage(
packageName,
version,
channel,
requestedBy string,
) (*PackageRequest, error) {
req := &PackageRequest{
PackageName: strings.ToLower(strings.TrimSpace(packageName)),
Version: version,

View file

@ -232,7 +232,13 @@ func (sm *SecurityManager) GenerateSecureToken() (string, error) {
token := base64.URLEncoding.EncodeToString(bytes)
// Log token generation (without the token itself for security)
sm.logSecurityEvent("token_generation", "low", "system", "generate_token", "Secure token generated")
sm.logSecurityEvent(
"token_generation",
"low",
"system",
"generate_token",
"Secure token generated",
)
return token, nil
}

View file

@ -6,6 +6,7 @@ import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
@ -15,8 +16,185 @@ import (
const (
serviceStatusRunning = "running"
defaultWorkspaceBase = "/data/active/workspaces"
)
// stateDir returns the directory used to persist Jupyter service state.
// Resolution order: the FETCHML_JUPYTER_STATE_DIR environment override,
// then the preferred /data/active/jupyter path (created if possible),
// falling back to the OS temp directory when the preferred path cannot
// be created.
func stateDir() string {
	if override := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_STATE_DIR")); override != "" {
		return override
	}
	const preferred = "/data/active/jupyter"
	if err := os.MkdirAll(preferred, 0o750); err != nil {
		// Preferred location unavailable (e.g. read-only root); use temp dir.
		return os.TempDir()
	}
	return preferred
}
// workspaceBaseDir returns the base directory under which relative
// workspace paths are resolved. The FETCHML_JUPYTER_WORKSPACE_BASE
// environment variable overrides the compiled-in default.
func workspaceBaseDir() string {
	override := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_WORKSPACE_BASE"))
	if override == "" {
		return defaultWorkspaceBase
	}
	return override
}
// resolveWorkspacePath normalizes a user-supplied workspace path.
// It rejects empty input and obvious traversal attempts ("..",
// "../…"), returns absolute paths cleaned but otherwise untouched,
// and anchors relative paths under the workspace base directory
// (container deployments treat relative paths as base-relative).
func resolveWorkspacePath(workspace string) (string, error) {
	trimmed := strings.TrimSpace(workspace)
	if trimmed == "" {
		return "", fmt.Errorf("workspace is required")
	}
	normalized := filepath.Clean(trimmed)
	sep := string(filepath.Separator)
	// Reject obvious traversal attempts.
	if normalized == ".." || strings.HasPrefix(normalized, ".."+sep) {
		return "", fmt.Errorf("invalid workspace path: %s", workspace)
	}
	if filepath.IsAbs(normalized) {
		return normalized, nil
	}
	// Relative paths refer to the workspace base directory.
	normalized = strings.TrimPrefix(normalized, "."+sep)
	return filepath.Join(workspaceBaseDir(), normalized), nil
}
// trashBaseDir returns the directory that holds soft-deleted workspaces.
// FETCHML_JUPYTER_TRASH_DIR overrides the default, which lives under the
// state dir to keep behavior consistent across deployments.
func trashBaseDir() string {
	override := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_TRASH_DIR"))
	if override != "" {
		return override
	}
	return filepath.Join(stateDir(), "trash", "jupyter")
}
// trashInfo records provenance for a soft-deleted workspace. It is
// serialized as a ".trashinfo" JSON file inside the trashed directory so
// the workspace can later be restored or purged.
type trashInfo struct {
	OriginalName string    `json:"original_name"` // workspace name before deletion
	DeletedAt    time.Time `json:"deleted_at"`    // UTC time of the move to trash
	DeletedBy    string    `json:"deleted_by"`    // deleting actor; currently always "system"
	SizeBytes    int64     `json:"size_bytes"`    // best-effort directory size at deletion time
	PurgeAfter   time.Time `json:"purge_after"`   // eligible for permanent removal after this instant
	Reason       string    `json:"reason"`        // deletion reason; currently always "user_request"
}
// moveWorkspaceToTrash soft-deletes a workspace by renaming it into the
// trash directory under "<name>_<UTC timestamp>" and writing a
// ".trashinfo" manifest describing the deletion. It returns the trash
// destination path and the manifest that was recorded.
//
// NOTE(review): os.Rename fails across filesystems — confirm the trash
// directory shares a mount with the workspace base in all deployments.
func (sm *ServiceManager) moveWorkspaceToTrash(workspacePath string, originalName string) (string, *trashInfo, error) {
	ws := strings.TrimSpace(workspacePath)
	if ws == "" {
		return "", nil, fmt.Errorf("workspace is required")
	}
	name := strings.TrimSpace(originalName)
	if name == "" {
		return "", nil, fmt.Errorf("original name is required")
	}
	// Security: the name becomes a path component under the trash dir, so
	// it must not contain separators or traversal sequences.
	if strings.ContainsRune(name, filepath.Separator) || strings.ContainsRune(name, '/') || strings.Contains(name, "..") {
		return "", nil, fmt.Errorf("invalid workspace name: %s", originalName)
	}
	// Best-effort normalization; keep the raw path if resolution fails.
	if wsResolved, err := resolveWorkspacePath(ws); err == nil {
		ws = wsResolved
	}
	if err := os.MkdirAll(trashBaseDir(), 0o750); err != nil {
		return "", nil, fmt.Errorf("failed to create trash directory: %w", err)
	}
	ts := time.Now().UTC().Format("20060102_150405")
	dest := filepath.Join(trashBaseDir(), fmt.Sprintf("%s_%s", name, ts))
	// Size is recorded best-effort; on error it stays zero.
	sizeBytes, _ := dirSizeBytes(ws)
	info := &trashInfo{
		OriginalName: name,
		DeletedAt:    time.Now().UTC(),
		DeletedBy:    "system",
		SizeBytes:    sizeBytes,
		PurgeAfter:   time.Now().UTC().Add(30 * 24 * time.Hour),
		Reason:       "user_request",
	}
	if err := os.Rename(ws, dest); err != nil {
		return "", nil, fmt.Errorf("failed to move workspace to trash: %w", err)
	}
	// The manifest is advisory; failure to write it is not fatal.
	if b, err := json.MarshalIndent(info, "", "  "); err == nil {
		_ = os.WriteFile(filepath.Join(dest, ".trashinfo"), b, 0o600)
	}
	return dest, info, nil
}
// RestoreWorkspace moves the most recently trashed copy of the named
// workspace back under the workspace base directory and removes its
// ".trashinfo" manifest. It fails if no trashed copy exists or if a live
// workspace with the same name already exists.
func (sm *ServiceManager) RestoreWorkspace(ctx context.Context, name string) (string, error) {
	_ = ctx // reserved for future cancellation support
	wsName := strings.TrimSpace(name)
	if wsName == "" {
		return "", fmt.Errorf("workspace name is required")
	}
	// Security: the name is used as a path component; reject separators
	// and traversal sequences so a caller cannot restore outside the base.
	if strings.ContainsRune(wsName, filepath.Separator) || strings.ContainsRune(wsName, '/') || strings.Contains(wsName, "..") {
		return "", fmt.Errorf("invalid workspace name: %s", name)
	}
	base := trashBaseDir()
	entries, err := os.ReadDir(base)
	if err != nil {
		if os.IsNotExist(err) {
			return "", fmt.Errorf("no trash directory found")
		}
		return "", fmt.Errorf("failed to read trash directory: %w", err)
	}
	// Trash entries are named "<name>_<timestamp>". The timestamp format
	// (20060102_150405) is fixed-width, so lexicographic order equals
	// chronological order and the max suffix is the newest copy.
	prefix := wsName + "_"
	var best, bestTs string
	for _, e := range entries {
		if !e.IsDir() || !strings.HasPrefix(e.Name(), prefix) {
			continue
		}
		ts := strings.TrimPrefix(e.Name(), prefix)
		if best == "" || ts > bestTs {
			best, bestTs = e.Name(), ts
		}
	}
	if best == "" {
		return "", fmt.Errorf("no trashed workspace found for %q", wsName)
	}
	src := filepath.Join(base, best)
	dest := filepath.Join(workspaceBaseDir(), wsName)
	if _, err := os.Stat(dest); err == nil {
		return "", fmt.Errorf("workspace %q already exists", wsName)
	}
	if err := os.MkdirAll(workspaceBaseDir(), 0o750); err != nil {
		return "", fmt.Errorf("failed to create workspace base directory: %w", err)
	}
	if err := os.Rename(src, dest); err != nil {
		return "", fmt.Errorf("failed to restore workspace: %w", err)
	}
	// The manifest is no longer relevant once restored; best-effort cleanup.
	_ = os.Remove(filepath.Join(dest, ".trashinfo"))
	return dest, nil
}
func dirSizeBytes(path string) (int64, error) {
var total int64
err := filepath.WalkDir(path, func(_ string, d os.DirEntry, err error) error {
if err != nil {
return err
}
if d.IsDir() {
return nil
}
info, err := d.Info()
if err != nil {
return err
}
total += info.Size()
return nil
})
if err != nil {
return 0, err
}
return total, nil
}
// ServiceManager manages standalone Jupyter services
type ServiceManager struct {
logger *logging.Logger
@ -24,6 +202,7 @@ type ServiceManager struct {
config *ServiceConfig
services map[string]*JupyterService
workspaceMetadataMgr *WorkspaceMetadataManager
securityMgr *SecurityManager
}
// ServiceConfig holds configuration for Jupyter services
@ -52,9 +231,9 @@ type NetworkConfig struct {
// ResourceConfig defines resource limits for Jupyter containers
type ResourceConfig struct {
MemoryLimit string `json:"memory_limit"`
CPULimit string `json:"cpu_limit"`
GPUAccess bool `json:"gpu_access"`
MemoryLimit string `json:"memory_limit"`
CPULimit string `json:"cpu_limit"`
GPUDevices []string `json:"gpu_devices"`
}
// SecurityConfig holds security settings for Jupyter services
@ -112,15 +291,39 @@ func NewServiceManager(logger *logging.Logger, config *ServiceConfig) (*ServiceM
}
// Initialize workspace metadata manager
dataFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_workspaces.json")
dataFile := filepath.Join(stateDir(), "fetch_ml_jupyter_workspaces.json")
workspaceMetadataMgr := NewWorkspaceMetadataManager(logger, dataFile)
// Initialize security manager with enhanced config
securityConfig := GetDefaultSecurityConfig()
// Override blocked packages from environment variable if provided
if blockedPkgs := os.Getenv("FETCHML_JUPYTER_BLOCKED_PACKAGES"); blockedPkgs != "" {
securityConfig.BlockedPackages = strings.Split(strings.TrimSpace(blockedPkgs), ",")
// Trim whitespace from each package name
for i, pkg := range securityConfig.BlockedPackages {
securityConfig.BlockedPackages[i] = strings.TrimSpace(pkg)
}
}
// Override allowed packages from environment variable if provided
if allowedPkgs := os.Getenv("FETCHML_JUPYTER_ALLOWED_PACKAGES"); allowedPkgs != "" {
securityConfig.AllowedPackages = make(map[string]bool)
allowed := strings.Split(strings.TrimSpace(allowedPkgs), ",")
for _, pkg := range allowed {
securityConfig.AllowedPackages[strings.TrimSpace(pkg)] = true
}
}
securityMgr := NewSecurityManager(logger, securityConfig)
sm := &ServiceManager{
logger: logger,
podman: podman,
config: config,
services: make(map[string]*JupyterService),
workspaceMetadataMgr: workspaceMetadataMgr,
securityMgr: securityMgr,
}
// Load existing services
@ -132,7 +335,10 @@ func NewServiceManager(logger *logging.Logger, config *ServiceConfig) (*ServiceM
}
// StartService starts a new Jupyter service
func (sm *ServiceManager) StartService(ctx context.Context, req *StartRequest) (*JupyterService, error) {
func (sm *ServiceManager) StartService(
ctx context.Context,
req *StartRequest,
) (*JupyterService, error) {
// Validate request
if err := sm.validateStartRequest(req); err != nil {
return nil, err
@ -155,6 +361,13 @@ func (sm *ServiceManager) StartService(ctx context.Context, req *StartRequest) (
return nil, fmt.Errorf("failed to start container: %w", err)
}
// Check for blacklisted packages in the container
if err := sm.checkPackageBlacklist(ctx, containerID); err != nil {
// Cleanup on blacklist violation
_ = sm.podman.StopContainer(ctx, containerID)
return nil, err
}
// Wait for Jupyter to be ready
url, err := sm.waitForJupyterReady(ctx, containerID, req.Network)
if err != nil {
@ -206,6 +419,146 @@ func (sm *ServiceManager) StartService(ctx context.Context, req *StartRequest) (
return service, nil
}
// checkPackageBlacklist scans the container's installed packages and fails
// if any of them violate the security manager's package policy. When the
// package list cannot be obtained, the check is skipped (logged, not
// fatal) so inspection failures do not block startup.
func (sm *ServiceManager) checkPackageBlacklist(ctx context.Context, containerID string) error {
	installed, err := sm.getInstalledPackages(ctx, containerID)
	if err != nil {
		// Best-effort: log and continue rather than blocking startup.
		sm.logger.Warn("failed to get installed packages for blacklist check", "error", err)
		return nil
	}

	// Validate every installed package against the security policy.
	var violations []string
	for _, pkgName := range installed {
		req := &PackageRequest{
			PackageName: pkgName,
			RequestedBy: "system",
			Channel:     "",
			Version:     "",
		}
		if sm.securityMgr.ValidatePackageRequest(req) != nil {
			violations = append(violations, pkgName)
		}
	}

	if len(violations) == 0 {
		return nil
	}
	// Any policy violation aborts the container start.
	return fmt.Errorf("container startup failed: blacklisted packages detected: %v. "+
		"These packages are blocked by security policy. "+
		"Remove them from the image or use FETCHML_JUPYTER_BLOCKED_PACKAGES to configure the blacklist",
		violations)
}
// getInstalledPackages returns the union of package names reported by pip
// and conda inside the container. Either tool failing (or producing empty
// output) is tolerated — the other's output is still used — and duplicate
// names are removed while preserving first-seen order.
func (sm *ServiceManager) getInstalledPackages(ctx context.Context, containerID string) ([]string, error) {
	var all []string

	if out, err := sm.podman.ExecContainer(ctx, containerID, []string{"pip", "list", "--format=freeze"}); err == nil && out != "" {
		all = append(all, sm.parsePipList(out)...)
	}

	if out, err := sm.podman.ExecContainer(ctx, containerID, []string{"conda", "list", "--export"}); err == nil && out != "" {
		all = append(all, sm.parseCondaList(out)...)
	}

	// De-duplicate while keeping first-seen order.
	seen := make(map[string]bool, len(all))
	var result []string
	for _, name := range all {
		if seen[name] {
			continue
		}
		seen[name] = true
		result = append(result, name)
	}
	return result, nil
}
// parsePipList parses `pip list --format=freeze` output into package
// names. Regular lines have the form "name==version"; direct references
// use "name @ url". Blank lines, comments, and option lines such as
// "-e git+..." (editable installs, which carry no plain package name)
// are skipped.
func (sm *ServiceManager) parsePipList(output string) []string {
	var packages []string
	for _, raw := range strings.Split(output, "\n") {
		line := strings.TrimSpace(raw)
		if line == "" || strings.HasPrefix(line, "#") || strings.HasPrefix(line, "-") {
			// "-e ..." and other option lines are not plain package entries.
			continue
		}
		name, _, _ := strings.Cut(line, "==")
		// Direct references ("name @ url") have no "==" separator.
		name, _, _ = strings.Cut(name, " @ ")
		name = strings.TrimSpace(name)
		if name != "" {
			packages = append(packages, name)
		}
	}
	return packages
}
// parseCondaList parses `conda list --export` output into package names.
// Each non-comment line has the form "name=version=build"; only the name
// component (text before the first "=") is kept.
func (sm *ServiceManager) parseCondaList(output string) []string {
	var packages []string
	for _, raw := range strings.Split(output, "\n") {
		line := strings.TrimSpace(raw)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		name, _, _ := strings.Cut(line, "=")
		if name = strings.TrimSpace(name); name != "" {
			packages = append(packages, name)
		}
	}
	return packages
}
// ParsePipList parses pip list --format=freeze output. It is an exported
// convenience wrapper around ServiceManager.parsePipList, which uses no
// manager state.
func ParsePipList(output string) []string {
	return (&ServiceManager{}).parsePipList(output)
}
// ParseCondaList parses conda list --export output. It is an exported
// convenience wrapper around ServiceManager.parseCondaList, which uses no
// manager state.
func ParseCondaList(output string) []string {
	return (&ServiceManager{}).parseCondaList(output)
}
// PrepareContainerConfig builds the Podman container config for a start
// request. It is an exported wrapper around
// ServiceManager.prepareContainerConfig on a zero-value manager.
func PrepareContainerConfig(serviceID string, req *StartRequest) *container.ContainerConfig {
	return (&ServiceManager{}).prepareContainerConfig(serviceID, req)
}
// MoveWorkspaceToTrash moves a workspace directory to the configured trash
// directory, discarding the trash manifest and returning only the
// destination path.
func MoveWorkspaceToTrash(workspacePath string, originalName string) (string, error) {
	dest, _, err := (&ServiceManager{}).moveWorkspaceToTrash(workspacePath, originalName)
	return dest, err
}
// RestoreWorkspace restores the most recently trashed workspace for the
// given name. It delegates to ServiceManager.RestoreWorkspace on a
// zero-value manager.
func RestoreWorkspace(ctx context.Context, name string) (string, error) {
	return (&ServiceManager{}).RestoreWorkspace(ctx, name)
}
// StopService stops a Jupyter service
func (sm *ServiceManager) StopService(ctx context.Context, serviceID string) error {
service, exists := sm.services[serviceID]
@ -240,6 +593,67 @@ func (sm *ServiceManager) StopService(ctx context.Context, serviceID string) err
return nil
}
// RemoveService removes a Jupyter service. If purge is false, it soft-deletes the workspace by moving it to trash.
func (sm *ServiceManager) RemoveService(ctx context.Context, serviceID string, purge bool) error {
	service, exists := sm.services[serviceID]
	if !exists {
		return fmt.Errorf("service %s not found", serviceID)
	}

	// Stop container first; a stop failure is logged but does not abort removal.
	if err := sm.podman.StopContainer(ctx, service.ContainerID); err != nil {
		sm.logger.Warn("failed to stop container before removal", "service_id", serviceID, "error", err)
	}

	// Remove container; unlike stop, a removal failure aborts so we never
	// forget a service entry while its container still exists.
	if err := sm.podman.RemoveContainer(ctx, service.ContainerID); err != nil {
		sm.logger.Warn("failed to remove container", "service_id", serviceID, "error", err)
		return fmt.Errorf("failed to remove container: %w", err)
	}

	// Best-effort: unlink workspace metadata.
	if sm.workspaceMetadataMgr != nil && strings.TrimSpace(service.Workspace) != "" {
		_ = sm.workspaceMetadataMgr.UnlinkWorkspace(service.Workspace)
	}

	// Workspace deletion policy: purge deletes permanently; otherwise the
	// workspace is moved to trash so it can be restored later.
	ws := strings.TrimSpace(service.Workspace)
	if ws != "" {
		// Best-effort path normalization; keep the raw path on error.
		wsPath, err := resolveWorkspacePath(ws)
		if err == nil {
			ws = wsPath
		}
		if purge {
			if err := os.RemoveAll(ws); err != nil && !os.IsNotExist(err) {
				return fmt.Errorf("failed to delete workspace: %w", err)
			}
		} else {
			dest, info, err := sm.moveWorkspaceToTrash(ws, strings.TrimSpace(service.Name))
			if err != nil {
				return err
			}
			// Persist trash path for observability.
			// NOTE(review): the service is deleted from sm.services below, so
			// these Metadata entries are not written out by saveServices —
			// confirm whether a separate trash registry is intended.
			if service.Metadata == nil {
				service.Metadata = make(map[string]string)
			}
			service.Metadata["trash_path"] = dest
			service.Metadata["purge_after"] = strconv.FormatInt(info.PurgeAfter.Unix(), 10)
		}
	}

	// Remove from active services
	delete(sm.services, serviceID)

	// Save services to disk
	if err := sm.saveServices(); err != nil {
		sm.logger.Warn("failed to save services", "error", err)
	}

	sm.logger.Info("jupyter service removed", "service_id", serviceID, "name", service.Name, "purge", purge)

	return nil
}
// GetService retrieves a service by ID
func (sm *ServiceManager) GetService(serviceID string) (*JupyterService, error) {
service, exists := sm.services[serviceID]
@ -296,9 +710,30 @@ func (sm *ServiceManager) validateStartRequest(req *StartRequest) error {
req.Workspace = sm.config.DefaultWorkspace
}
// Check if workspace exists
if _, err := os.Stat(req.Workspace); os.IsNotExist(err) {
return fmt.Errorf("workspace %s does not exist", req.Workspace)
// Resolve/normalize workspace path before comparing.
wsPath, err := resolveWorkspacePath(req.Workspace)
if err != nil {
return err
}
req.Workspace = wsPath
// Enforce reproducibility: do not allow creating a service/workspace with the same name
// as any existing service (regardless of status).
for _, svc := range sm.services {
if svc == nil {
continue
}
if strings.EqualFold(strings.TrimSpace(svc.Name), strings.TrimSpace(req.Name)) {
return fmt.Errorf("a jupyter service/workspace named %q already exists", req.Name)
}
if svc.Workspace != "" && svc.Workspace == req.Workspace {
return fmt.Errorf("workspace path %q is already in use by service %q", req.Workspace, svc.Name)
}
}
// Ensure workspace directory exists.
if err := os.MkdirAll(req.Workspace, 0o750); err != nil {
return fmt.Errorf("failed to create workspace directory: %w", err)
}
if req.Image == "" {
@ -331,11 +766,20 @@ func (sm *ServiceManager) generateServiceID(name string) string {
}
// prepareContainerConfig prepares container configuration
func (sm *ServiceManager) prepareContainerConfig(serviceID string, req *StartRequest) *container.ContainerConfig {
func (sm *ServiceManager) prepareContainerConfig(
serviceID string,
req *StartRequest,
) *container.ContainerConfig {
imageLower := strings.ToLower(strings.TrimSpace(req.Image))
isPublicJupyter := strings.Contains(imageLower, "quay.io/jupyter/") || strings.Contains(imageLower, "jupyter/")
// Prepare volume mounts
volumes := map[string]string{
req.Workspace: "/workspace",
volumes := map[string]string{}
workspaceMount := "/workspace"
if isPublicJupyter {
workspaceMount = "/home/jovyan/work"
}
volumes[req.Workspace] = workspaceMount
// Prepare environment variables
env := map[string]string{
@ -363,17 +807,71 @@ func (sm *ServiceManager) prepareContainerConfig(serviceID string, req *StartReq
}
// Prepare container command
cmd := []string{
"conda", "run", "-n", "ml_env", "jupyter", "notebook",
"--no-browser",
"--ip=0.0.0.0",
fmt.Sprintf("--port=%d", req.Network.ContainerPort),
"--NotebookApp.allow-root=True",
"--NotebookApp.ip=0.0.0.0",
}
var cmd []string
if isPublicJupyter {
condaEnv := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_CONDA_ENV"))
if condaEnv == "" {
condaEnv = "base"
}
kernelName := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_KERNEL_NAME"))
if kernelName == "" {
kernelName = condaEnv
}
displayName := fmt.Sprintf("Python (%s)", kernelName)
if !req.Network.EnableToken {
cmd = append(cmd, "--NotebookApp.token=")
jupyterTokenArg := ""
if !req.Network.EnableToken {
jupyterTokenArg = " --NotebookApp.token= --ServerApp.token="
}
script := fmt.Sprintf(
"set -euo pipefail; "+
"if conda run -n %s python -c 'import ipykernel' >/dev/null 2>&1; then "+
"conda run -n %s python -m ipykernel install --user --name %s --display-name %q; "+
"fi; "+
"exec start-notebook.sh --ip=0.0.0.0 --port=%d --no-browser --notebook-dir=%s%s",
condaEnv,
condaEnv,
kernelName,
displayName,
req.Network.ContainerPort,
workspaceMount,
jupyterTokenArg,
)
cmd = []string{"bash", "-lc", script}
} else {
condaEnv := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_CONDA_ENV"))
if condaEnv == "" {
condaEnv = "ml_env"
}
kernelName := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_KERNEL_NAME"))
if kernelName == "" {
kernelName = condaEnv
}
displayName := fmt.Sprintf("Python (%s)", kernelName)
jupyterTokenArg := ""
if !req.Network.EnableToken {
jupyterTokenArg = " --NotebookApp.token="
}
script := fmt.Sprintf(
"set -euo pipefail; "+
"if conda run -n %s python -c 'import ipykernel' >/dev/null 2>&1; then "+
"conda run -n %s python -m ipykernel install --user --name %s --display-name %q; "+
"fi; "+
"exec conda run -n %s jupyter notebook --no-browser --ip=0.0.0.0 --port=%d --NotebookApp.allow-root=True --NotebookApp.ip=0.0.0.0%s",
condaEnv,
condaEnv,
kernelName,
displayName,
condaEnv,
req.Network.ContainerPort,
jupyterTokenArg,
)
cmd = []string{"bash", "-lc", script}
}
// Prepare security options
@ -397,7 +895,7 @@ func (sm *ServiceManager) prepareContainerConfig(serviceID string, req *StartReq
Resources: container.ResourceConfig{
MemoryLimit: req.Resources.MemoryLimit,
CPULimit: req.Resources.CPULimit,
GPUAccess: req.Resources.GPUAccess,
GPUDevices: req.Resources.GPUDevices,
},
Network: container.NetworkConfig{
AllowNetwork: req.Security.AllowNetwork,
@ -417,7 +915,7 @@ func (sm *ServiceManager) waitForJupyterReady(
deadline := time.Now().Add(maxWait)
for time.Now().Before(deadline) {
status, err := sm.podman.GetContainerStatus(ctx, containerID)
status, err := sm.podman.GetContainerStateStatus(ctx, containerID)
if err != nil {
return "", fmt.Errorf("failed to check container status: %w", err)
}
@ -426,7 +924,7 @@ func (sm *ServiceManager) waitForJupyterReady(
break
}
if status == "exited" || status == "error" {
if status == "exited" || status == "dead" {
return "", fmt.Errorf("container failed to start (status: %s)", status)
}
@ -451,7 +949,7 @@ func (sm *ServiceManager) waitForJupyterReady(
// loadServices loads existing services from disk
func (sm *ServiceManager) loadServices() error {
servicesFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_services.json")
servicesFile := filepath.Join(stateDir(), "fetch_ml_jupyter_services.json")
data, err := os.ReadFile(servicesFile)
if err != nil {
@ -466,26 +964,78 @@ func (sm *ServiceManager) loadServices() error {
return err
}
// Validate services are still running
for id, service := range services {
if service.Status == serviceStatusRunning {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
status, err := sm.podman.GetContainerStatus(ctx, service.ContainerID)
cancel()
// Reset in-memory map before re-hydrating from disk.
sm.services = make(map[string]*JupyterService)
if err != nil || status != "running" {
service.Status = "stopped"
}
// Normalize service status + de-dupe by service name.
// Prefer: running service > newest CreatedAt.
byName := make(map[string]*JupyterService)
for id, service := range services {
if service == nil {
continue
}
sm.services[id] = service
// Ensure stable ID even if old payloads didn't persist it.
if strings.TrimSpace(service.ID) == "" {
service.ID = id
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
status, err := sm.podman.GetContainerStateStatus(ctx, service.ContainerID)
cancel()
if err != nil {
sm.logger.Warn("failed to check service status", "service_id", id, "error", err)
service.Status = "stopped"
} else if status == serviceStatusRunning {
service.Status = serviceStatusRunning
} else {
service.Status = "stopped"
}
nameKey := strings.ToLower(strings.TrimSpace(service.Name))
if nameKey == "" {
nameKey = service.ID
}
if existing, ok := byName[nameKey]; ok {
// prefer running
if existing.Status != serviceStatusRunning && service.Status == serviceStatusRunning {
byName[nameKey] = service
continue
}
if existing.Status == serviceStatusRunning && service.Status != serviceStatusRunning {
continue
}
// otherwise prefer newest
if service.CreatedAt.After(existing.CreatedAt) {
byName[nameKey] = service
}
continue
}
byName[nameKey] = service
}
for _, service := range byName {
if service == nil {
continue
}
if strings.TrimSpace(service.ID) == "" {
continue
}
sm.services[service.ID] = service
}
// Best-effort: persist the cleaned registry to avoid accumulating duplicates.
_ = sm.saveServices()
return nil
}
// saveServices saves services to disk
func (sm *ServiceManager) saveServices() error {
servicesFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_services.json")
servicesFile := filepath.Join(stateDir(), "fetch_ml_jupyter_services.json")
if err := os.MkdirAll(filepath.Dir(servicesFile), 0o750); err != nil {
return err
}
data, err := json.MarshalIndent(sm.services, "", " ")
if err != nil {
@ -496,7 +1046,11 @@ func (sm *ServiceManager) saveServices() error {
}
// LinkWorkspaceWithExperiment links a workspace with an experiment
func (sm *ServiceManager) LinkWorkspaceWithExperiment(workspacePath, experimentID, serviceID string) error {
func (sm *ServiceManager) LinkWorkspaceWithExperiment(
workspacePath,
experimentID,
serviceID string,
) error {
return sm.workspaceMetadataMgr.LinkWorkspace(workspacePath, experimentID, serviceID)
}
@ -550,7 +1104,11 @@ func (sm *ServiceManager) ClearAllMetadata() error {
}
// SetAutoSync enables or disables auto-sync for a workspace
func (sm *ServiceManager) SetAutoSync(workspacePath string, enabled bool, interval time.Duration) error {
func (sm *ServiceManager) SetAutoSync(
workspacePath string,
enabled bool,
interval time.Duration,
) error {
return sm.workspaceMetadataMgr.SetAutoSync(workspacePath, enabled, interval)
}

View file

@ -34,7 +34,10 @@ type WorkspaceMetadataManager struct {
}
// NewWorkspaceMetadataManager creates a new workspace metadata manager
func NewWorkspaceMetadataManager(logger *logging.Logger, dataFile string) *WorkspaceMetadataManager {
func NewWorkspaceMetadataManager(
logger *logging.Logger,
dataFile string,
) *WorkspaceMetadataManager {
wmm := &WorkspaceMetadataManager{
logger: logger,
metadata: make(map[string]*WorkspaceMetadata),
@ -50,7 +53,11 @@ func NewWorkspaceMetadataManager(logger *logging.Logger, dataFile string) *Works
}
// LinkWorkspace links a workspace with an experiment
func (wmm *WorkspaceMetadataManager) LinkWorkspace(workspacePath, experimentID, serviceID string) error {
func (wmm *WorkspaceMetadataManager) LinkWorkspace(
workspacePath,
experimentID,
serviceID string,
) error {
wmm.mutex.Lock()
defer wmm.mutex.Unlock()
@ -97,7 +104,9 @@ func (wmm *WorkspaceMetadataManager) LinkWorkspace(workspacePath, experimentID,
}
// GetWorkspaceMetadata retrieves metadata for a workspace
func (wmm *WorkspaceMetadataManager) GetWorkspaceMetadata(workspacePath string) (*WorkspaceMetadata, error) {
func (wmm *WorkspaceMetadataManager) GetWorkspaceMetadata(
workspacePath string,
) (*WorkspaceMetadata, error) {
wmm.mutex.RLock()
defer wmm.mutex.RUnlock()
@ -176,7 +185,13 @@ func (wmm *WorkspaceMetadataManager) UnlinkWorkspace(workspacePath string) error
// Remove workspace metadata file
workspaceMetaFile := filepath.Join(absPath, ".jupyter_experiment.json")
if err := os.Remove(workspaceMetaFile); err != nil && !os.IsNotExist(err) {
wmm.logger.Warn("failed to remove workspace metadata file", "file", workspaceMetaFile, "error", err)
wmm.logger.Warn(
"failed to remove workspace metadata file",
"file",
workspaceMetaFile,
"error",
err,
)
}
wmm.logger.Info("workspace unlinked", "workspace", absPath)
@ -203,7 +218,11 @@ func (wmm *WorkspaceMetadataManager) ClearAllMetadata() error {
}
// SetAutoSync enables or disables auto-sync for a workspace
func (wmm *WorkspaceMetadataManager) SetAutoSync(workspacePath string, enabled bool, interval time.Duration) error {
func (wmm *WorkspaceMetadataManager) SetAutoSync(
workspacePath string,
enabled bool,
interval time.Duration,
) error {
wmm.mutex.Lock()
defer wmm.mutex.Unlock()
@ -353,7 +372,9 @@ func (wmm *WorkspaceMetadataManager) createWorkspaceMetadataFile(
}
// GetWorkspacesForExperiment returns all workspaces linked to an experiment
func (wmm *WorkspaceMetadataManager) GetWorkspacesForExperiment(experimentID string) []*WorkspaceMetadata {
func (wmm *WorkspaceMetadataManager) GetWorkspacesForExperiment(
experimentID string,
) []*WorkspaceMetadata {
wmm.mutex.RLock()
defer wmm.mutex.RUnlock()

View file

@ -14,7 +14,10 @@ var (
jwtPattern = regexp.MustCompile(`eyJ[a-zA-Z0-9_-]{10,}\.eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}`)
// Password-like fields in logs
passwordPattern = regexp.MustCompile(`(?i)(password|passwd|pwd|secret|token|key)["']?\s*[:=]\s*["']?([^"'\s,}]+)`)
passwordPattern = regexp.MustCompile(
`(?i)(password|passwd|pwd|secret|token|key)["']?\s*[:=]\s*["']?` +
`([^"'\s,}]+)`,
)
// Redis URLs with passwords
redisPasswordPattern = regexp.MustCompile(`redis://:[^@]+@`)

View file

@ -67,12 +67,18 @@ func NewSSHClient(host, user, keyPath string, port int, knownHostsPath string) (
if _, err := os.Stat(knownHostsPath); err == nil {
callback, err := knownhosts.New(knownHostsPath)
if err != nil {
log.Printf("Warning: failed to parse known_hosts: %v; using insecure host key verification", err)
log.Printf(
"Warning: failed to parse known_hosts: %v; using insecure host key verification",
err,
)
} else {
hostKeyCallback = callback
}
} else if !os.IsNotExist(err) {
log.Printf("Warning: known_hosts not found at %s; using insecure host key verification", knownHostsPath)
log.Printf(
"Warning: known_hosts not found at %s; using insecure host key verification",
knownHostsPath,
)
}
}
@ -101,7 +107,7 @@ func NewSSHClient(host, user, keyPath string, port int, knownHostsPath string) (
return &SSHClient{client: client, host: host}, nil
}
// NewLocalClient creates a local-mode SSHClient that executes commands on the host using the provided base path.
// NewLocalClient creates a local-mode SSHClient that executes commands on the host.
func NewLocalClient(basePath string) *SSHClient {
if basePath != "" {
basePath = config.ExpandPath(basePath)
@ -177,7 +183,11 @@ func (c *SSHClient) ExecContext(ctx context.Context, cmd string) (string, error)
// Wait a bit more for final result
select {
case res := <-resultCh:
return res.output, fmt.Errorf("command cancelled and force closed: %w (output: %s)", ctx.Err(), res.output)
return res.output, fmt.Errorf(
"command cancelled and force closed: %w (output: %s)",
ctx.Err(),
res.output,
)
case <-time.After(5 * time.Second):
return "", fmt.Errorf("command cancelled and cleanup timeout: %w", ctx.Err())
}