feat(jupyter): improve runtime management and update security/workflow docs
This commit is contained in:
parent
94112f0af5
commit
6b771e4a50
14 changed files with 1476 additions and 1325 deletions
|
|
@ -1,462 +0,0 @@
|
|||
# Jupyter Workspace and Experiment Integration
|
||||
|
||||
This guide describes the integration between Jupyter workspaces and FetchML experiments, enabling seamless resource chaining and data synchronization.
|
||||
|
||||
## Overview
|
||||
|
||||
The Jupyter-experiment integration allows you to:
|
||||
|
||||
- Link Jupyter workspaces with specific experiments
|
||||
- Automatically track experiment metadata in workspaces
|
||||
- Queue experiments directly from Jupyter workspaces
|
||||
- Synchronize data between workspaces and experiments
|
||||
- Maintain resource sharing and context across development and production workflows
|
||||
|
||||
## Architecture
|
||||
|
||||
### Components
|
||||
|
||||
1. **Workspace Metadata Manager** - Tracks relationships between workspaces and experiments
|
||||
2. **Service Manager Integration** - Links Jupyter services with experiment context
|
||||
3. **CLI Commands** - Provides user-facing integration commands
|
||||
4. **API Endpoints** - Enables programmatic workspace-experiment management
|
||||
|
||||
### Data Flow
|
||||
|
||||
```
|
||||
Jupyter Workspace ←→ Workspace Metadata ←→ Experiment Manager
|
||||
↓ ↓ ↓
|
||||
Notebooks Link Metadata Experiment Data
|
||||
Scripts Sync History Metrics & Results
|
||||
Requirements Auto-sync Config Job Queue
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Create a Jupyter workspace
|
||||
|
||||
```bash
|
||||
# Create a new workspace
|
||||
mkdir my_experiment_workspace
|
||||
cd my_experiment_workspace
|
||||
|
||||
# Add notebooks and scripts
|
||||
# (See examples/jupyter_experiment_integration.py for sample setup)
|
||||
```
|
||||
|
||||
### 2. Start Jupyter service
|
||||
|
||||
```bash
|
||||
# Start Jupyter with workspace
|
||||
ml jupyter start --workspace ./my_experiment_workspace --name my_experiment
|
||||
|
||||
# Access at http://localhost:8888
|
||||
```
|
||||
|
||||
### 3. Link workspace with experiment
|
||||
|
||||
```bash
|
||||
# Create experiment
|
||||
ml experiment create --name "my_experiment" --description "Test experiment"
|
||||
|
||||
# Link workspace with experiment
|
||||
ml jupyter experiment link --workspace ./my_experiment_workspace --experiment <experiment_id>
|
||||
```
|
||||
|
||||
### 4. Work in Jupyter
|
||||
|
||||
- Open notebooks in browser
|
||||
- Develop and test code interactively
|
||||
- Use MLflow for experiment tracking
|
||||
- Save results and models
|
||||
|
||||
### 5. Queue for production
|
||||
|
||||
```bash
|
||||
# Queue experiment from workspace
|
||||
ml jupyter experiment queue --workspace ./my_experiment_workspace --script experiment.py --name "production_run"
|
||||
|
||||
# Monitor progress
|
||||
ml status
|
||||
ml monitor
|
||||
```
|
||||
|
||||
### 6. Sync data
|
||||
|
||||
```bash
|
||||
# Push workspace changes to experiment
|
||||
ml jupyter experiment sync --workspace ./my_experiment_workspace --direction push
|
||||
|
||||
# Pull experiment results to workspace
|
||||
ml jupyter experiment sync --workspace ./my_experiment_workspace --direction pull
|
||||
```
|
||||
|
||||
## CLI Commands
|
||||
|
||||
### `ml jupyter experiment link`
|
||||
|
||||
Link a Jupyter workspace with an experiment.
|
||||
|
||||
```bash
|
||||
ml jupyter experiment link --workspace <path> --experiment <id>
|
||||
```
|
||||
|
||||
**Options:**
|
||||
- `--workspace`: Path to Jupyter workspace (default: ./workspace)
|
||||
- `--experiment`: Experiment ID to link with
|
||||
|
||||
**Creates:**
|
||||
- `.jupyter_experiment.json` metadata file in workspace
|
||||
- Link record in workspace metadata manager
|
||||
- Association between workspace and experiment
|
||||
|
||||
### `ml jupyter experiment queue`
|
||||
|
||||
Queue an experiment from a linked workspace.
|
||||
|
||||
```bash
|
||||
ml jupyter experiment queue --workspace <path> --script <file> --name <name>
|
||||
```
|
||||
|
||||
**Options:**
|
||||
- `--workspace`: Path to workspace (default: ./workspace)
|
||||
- `--script`: Python script to execute
|
||||
- `--name`: Name for the queued job
|
||||
|
||||
**Behavior:**
|
||||
- Detects linked experiment automatically
|
||||
- Passes experiment context to job queue
|
||||
- Uses workspace resources and configuration
|
||||
|
||||
### `ml jupyter experiment sync`
|
||||
|
||||
Synchronize data between workspace and experiment.
|
||||
|
||||
```bash
|
||||
ml jupyter experiment sync --workspace <path> --direction <pull|push>
|
||||
```
|
||||
|
||||
**Options:**
|
||||
- `--workspace`: Path to workspace (default: ./workspace)
|
||||
- `--direction`: Sync direction (pull or push)
|
||||
|
||||
**Sync Types:**
|
||||
- **Pull**: Download experiment metrics, results, and data to workspace
|
||||
- **Push**: Upload workspace notebooks, scripts, and results to experiment
|
||||
|
||||
### `ml jupyter experiment status`
|
||||
|
||||
Show experiment status for a workspace.
|
||||
|
||||
```bash
|
||||
ml jupyter experiment status [workspace_path]
|
||||
```
|
||||
|
||||
**Displays:**
|
||||
- Linked experiment information
|
||||
- Last sync time
|
||||
- Experiment metadata
|
||||
- Service association
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### `/api/jupyter/experiments/link`
|
||||
|
||||
**Method:** POST
|
||||
|
||||
Link a workspace with an experiment.
|
||||
|
||||
```json
|
||||
{
|
||||
"workspace": "/path/to/workspace",
|
||||
"experiment_id": "experiment_123",
|
||||
"service_id": "jupyter-service-456"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"status": "linked",
|
||||
"data": {
|
||||
"workspace_path": "/path/to/workspace",
|
||||
"experiment_id": "experiment_123",
|
||||
"linked_at": "2023-12-06T10:30:00Z",
|
||||
"sync_direction": "bidirectional"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### `/api/jupyter/experiments/sync`
|
||||
|
||||
**Method:** POST
|
||||
|
||||
Synchronize workspace with experiment.
|
||||
|
||||
```json
|
||||
{
|
||||
"workspace": "/path/to/workspace",
|
||||
"experiment_id": "experiment_123",
|
||||
"direction": "push",
|
||||
"sync_type": "all"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"workspace": "/path/to/workspace",
|
||||
"experiment_id": "experiment_123",
|
||||
"direction": "push",
|
||||
"sync_type": "all",
|
||||
"synced_at": "2023-12-06T10:35:00Z",
|
||||
"status": "completed"
|
||||
}
|
||||
```
|
||||
|
||||
### `/api/jupyter/services`
|
||||
|
||||
**Methods:** GET, POST, DELETE
|
||||
|
||||
Manage Jupyter services.
|
||||
|
||||
**GET:** List all services
|
||||
**POST:** Start new service
|
||||
**DELETE:** Stop service
|
||||
|
||||
## Workspace Metadata
|
||||
|
||||
### `.jupyter_experiment.json`
|
||||
|
||||
Each linked workspace contains a metadata file:
|
||||
|
||||
```json
|
||||
{
|
||||
"experiment_id": "experiment_123",
|
||||
"service_id": "jupyter-service-456",
|
||||
"linked_at": 1701864600,
|
||||
"last_sync": 1701865200,
|
||||
"sync_direction": "bidirectional",
|
||||
"auto_sync": false,
|
||||
"jupyter_integration": true,
|
||||
"workspace_path": "/path/to/workspace",
|
||||
"tags": ["development", "ml-experiment"]
|
||||
}
|
||||
```
|
||||
|
||||
### Metadata Manager
|
||||
|
||||
The workspace metadata manager maintains:
|
||||
|
||||
- Workspace-experiment relationships
|
||||
- Sync history and timestamps
|
||||
- Auto-sync configuration
|
||||
- Tags and additional metadata
|
||||
- Service associations
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Workspace Organization
|
||||
|
||||
1. **One workspace per experiment** - Keep workspaces focused on specific experiments
|
||||
2. **Use descriptive names** - Name workspaces and services clearly
|
||||
3. **Version control** - Track workspace changes with git
|
||||
4. **Clean separation** - Separate data, code, and results
|
||||
|
||||
### Experiment Development Workflow
|
||||
|
||||
1. **Create workspace** with notebooks and scripts
|
||||
2. **Link with experiment** for tracking
|
||||
3. **Develop interactively** in Jupyter
|
||||
4. **Test locally** with sample data
|
||||
5. **Queue for production** when ready
|
||||
6. **Monitor results** and iterate
|
||||
|
||||
### Data Management
|
||||
|
||||
1. **Use requirements.txt** for dependencies
|
||||
2. **Store data separately** from notebooks
|
||||
3. **Use MLflow** for experiment tracking
|
||||
4. **Sync regularly** to preserve work
|
||||
5. **Clean up** old workspaces
|
||||
|
||||
### Resource Management
|
||||
|
||||
1. **Monitor service usage** with `ml jupyter list`
|
||||
2. **Stop unused services** with `ml jupyter stop`
|
||||
3. **Use resource limits** in configuration
|
||||
4. **Enable auto-sync** for automated workflows
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
**Workspace not linked:**
|
||||
```bash
|
||||
Error: No experiment link found in workspace
|
||||
```
|
||||
**Solution:** Run `ml jupyter experiment link` first
|
||||
|
||||
**Service not found:**
|
||||
```bash
|
||||
Error: Service not found
|
||||
```
|
||||
**Solution:** Check service name with `ml jupyter list`
|
||||
|
||||
**Sync failed:**
|
||||
```bash
|
||||
Error: Failed to sync workspace
|
||||
```
|
||||
**Solution:** Check workspace permissions and experiment exists
|
||||
|
||||
### Debug Commands
|
||||
|
||||
```bash
|
||||
# Check workspace metadata
|
||||
cat ./workspace/.jupyter_experiment.json
|
||||
|
||||
# List all services
|
||||
ml jupyter list
|
||||
|
||||
# Check experiment status
|
||||
ml jupyter experiment status
|
||||
|
||||
# View service logs
|
||||
podman logs <service_id>
|
||||
```
|
||||
|
||||
### Recovery
|
||||
|
||||
**Lost workspace link:**
|
||||
1. Find experiment ID with `ml experiment list`
|
||||
2. Re-link with `ml jupyter experiment link`
|
||||
3. Sync data with `ml jupyter experiment sync --direction pull`
|
||||
|
||||
**Service stuck:**
|
||||
1. Stop with `ml jupyter stop <service_name>`
|
||||
2. Check logs for errors
|
||||
3. Restart with `ml jupyter start`
|
||||
|
||||
## Examples
|
||||
|
||||
### Complete Workflow
|
||||
|
||||
```bash
|
||||
# 1. Setup workspace
|
||||
mkdir my_ml_project
|
||||
cd my_ml_project
|
||||
echo "numpy>=1.20.0" > requirements.txt
|
||||
echo "mlflow>=1.20.0" >> requirements.txt
|
||||
|
||||
# 2. Start Jupyter
|
||||
ml jupyter start --workspace . --name my_project
|
||||
|
||||
# 3. Create experiment
|
||||
ml experiment create --name "my_project" --description "ML project experiment"
|
||||
|
||||
# 4. Link workspace
|
||||
ml jupyter experiment link --workspace . --experiment <experiment_id>
|
||||
|
||||
# 5. Work in Jupyter (browser)
|
||||
# - Create notebooks
|
||||
# - Write experiment scripts
|
||||
# - Test locally
|
||||
|
||||
# 6. Queue for production
|
||||
ml jupyter experiment queue --workspace . --script train_model.py --name "production_run"
|
||||
|
||||
# 7. Monitor
|
||||
ml status
|
||||
ml monitor
|
||||
|
||||
# 8. Sync results
|
||||
ml jupyter experiment sync --workspace . --direction pull
|
||||
|
||||
# 9. Cleanup
|
||||
ml jupyter stop my_project
|
||||
```
|
||||
|
||||
### Python Integration
|
||||
|
||||
```python
|
||||
import requests
|
||||
import json
|
||||
|
||||
# Link workspace
|
||||
response = requests.post('http://localhost:9101/api/jupyter/experiments/link', json={
|
||||
'workspace': '/path/to/workspace',
|
||||
'experiment_id': 'experiment_123'
|
||||
})
|
||||
|
||||
# Sync workspace
|
||||
response = requests.post('http://localhost:9101/api/jupyter/experiments/sync', json={
|
||||
'workspace': '/path/to/workspace',
|
||||
'experiment_id': 'experiment_123',
|
||||
'direction': 'push',
|
||||
'sync_type': 'all'
|
||||
})
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Service Configuration
|
||||
|
||||
Jupyter services can be configured with experiment-specific settings:
|
||||
|
||||
```yaml
|
||||
service:
|
||||
default_resources:
|
||||
memory_limit: "8G"
|
||||
cpu_limit: "2"
|
||||
gpu_access: false
|
||||
max_services: 5
|
||||
auto_sync_interval: "30m"
|
||||
```
|
||||
|
||||
### Workspace Settings
|
||||
|
||||
Workspace metadata supports custom configuration:
|
||||
|
||||
```json
|
||||
{
|
||||
"auto_sync": true,
|
||||
"sync_interval": "15m",
|
||||
"sync_direction": "bidirectional",
|
||||
"tags": ["development", "production"],
|
||||
"additional_data": {
|
||||
"environment": "test",
|
||||
"team": "ml-team"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Migration Guide
|
||||
|
||||
### From Standalone Jupyter
|
||||
|
||||
1. **Create workspace** from existing notebooks
|
||||
2. **Link with experiment** using new commands
|
||||
3. **Update scripts** to use experiment context
|
||||
4. **Migrate data** to experiment storage
|
||||
5. **Update workflows** to use integration
|
||||
|
||||
### From Job Queue Only
|
||||
|
||||
1. **Create workspace** for development
|
||||
2. **Link with existing experiments**
|
||||
3. **Add interactive development** phase
|
||||
4. **Implement sync workflows**
|
||||
5. **Update CI/CD pipelines**
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
Planned improvements:
|
||||
|
||||
- **Auto-sync with file watching**
|
||||
- **Workspace templates**
|
||||
- **Collaborative workspaces**
|
||||
- **Advanced resource sharing**
|
||||
- **Git integration**
|
||||
- **Docker compose support**
|
||||
- **Kubernetes integration**
|
||||
- **Advanced monitoring**
|
||||
|
|
@ -1,477 +0,0 @@
|
|||
# Jupyter Package Management
|
||||
|
||||
This guide describes the secure package management system for Jupyter workspaces in FetchML, allowing data scientists to install packages only from trusted channels while maintaining security and compliance.
|
||||
|
||||
## Overview
|
||||
|
||||
The package management system provides:
|
||||
|
||||
- **Trusted Channel Control** - Only allow packages from approved channels
|
||||
- **Package Approval Workflow** - Optional approval process for package installations
|
||||
- **Security Filtering** - Block potentially dangerous packages
|
||||
- **Audit Trail** - Track all package requests and installations
|
||||
- **Workspace Isolation** - Package management per workspace
|
||||
|
||||
## Security Features
|
||||
|
||||
### Trusted Channels
|
||||
|
||||
By default, only packages from these trusted channels are allowed:
|
||||
|
||||
- `conda-forge` - Community-maintained packages
|
||||
- `defaults` - Anaconda default packages
|
||||
- `pytorch` - PyTorch ecosystem packages
|
||||
- `nvidia` - NVIDIA GPU packages
|
||||
|
||||
### Blocked Packages
|
||||
|
||||
Potentially dangerous packages are blocked by default:
|
||||
|
||||
- `requests` - HTTP client library
|
||||
- `urllib3` - HTTP library
|
||||
- `httpx` - Async HTTP client
|
||||
- `aiohttp` - Async HTTP server/client
|
||||
|
||||
### Approval Workflow
|
||||
|
||||
Administrators can configure:
|
||||
|
||||
- **Auto-approval** for safe packages
|
||||
- **Manual approval** for sensitive packages
|
||||
- **Required approval** for all packages
|
||||
- **Package allowlist** for pre-approved packages
|
||||
|
||||
## CLI Commands
|
||||
|
||||
### `ml jupyter package install`
|
||||
|
||||
Request package installation in a workspace.
|
||||
|
||||
```bash
|
||||
ml jupyter package install --package <name> [options]
|
||||
```
|
||||
|
||||
**Options:**
|
||||
- `--package`: Package name (required)
|
||||
- `--version`: Specific version (optional)
|
||||
- `--channel`: Channel source (default: conda-forge)
|
||||
- `--workspace`: Workspace path (default: ./workspace)
|
||||
- `--user`: Requesting user (default: current user)
|
||||
|
||||
**Examples:**
|
||||
```bash
|
||||
# Install numpy (auto-approved)
|
||||
ml jupyter package install --package numpy
|
||||
|
||||
# Install specific version
|
||||
ml jupyter package install --package pandas --version 1.3.0
|
||||
|
||||
# Install from specific channel
|
||||
ml jupyter package install --package pytorch --channel pytorch
|
||||
|
||||
# Install in specific workspace
|
||||
ml jupyter package install --package scikit-learn --workspace ./ml_project
|
||||
```
|
||||
|
||||
### `ml jupyter package list`
|
||||
|
||||
List installed packages in a workspace.
|
||||
|
||||
```bash
|
||||
ml jupyter package list [workspace_path]
|
||||
```
|
||||
|
||||
**Output:**
|
||||
```
|
||||
Installed packages in workspace: ./workspace
|
||||
Package Name Version Channel Installed By
|
||||
------------ ------- ------- ------------
|
||||
numpy 1.21.0 conda-forge user1
|
||||
pandas 1.3.0 conda-forge user1
|
||||
scikit-learn 1.0.0 conda-forge user2
|
||||
```
|
||||
|
||||
### `ml jupyter package pending`
|
||||
|
||||
List pending package installation requests.
|
||||
|
||||
```bash
|
||||
ml jupyter package pending [workspace_path]
|
||||
```
|
||||
|
||||
**Output:**
|
||||
```
|
||||
Pending package requests for workspace: ./workspace
|
||||
Package Name Version Channel Requested By Time
|
||||
------------ ------- ------- ------------ ----
|
||||
torch 1.9.0 pytorch user3 2023-12-06 10:30
|
||||
tensorflow 2.8.0 defaults user4 2023-12-06 11:15
|
||||
```
|
||||
|
||||
### `ml jupyter package approve`
|
||||
|
||||
Approve a pending package request.
|
||||
|
||||
```bash
|
||||
ml jupyter package approve <package_name>
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
# Approve torch installation
|
||||
ml jupyter package approve torch
|
||||
|
||||
# Package will be installed automatically after approval
|
||||
```
|
||||
|
||||
### `ml jupyter package reject`
|
||||
|
||||
Reject a pending package request.
|
||||
|
||||
```bash
|
||||
ml jupyter package reject <package_name> --reason <reason>
|
||||
```
|
||||
|
||||
**Options:**
|
||||
- `--reason`: Rejection reason (optional)
|
||||
|
||||
**Examples:**
|
||||
```bash
|
||||
# Reject with default reason
|
||||
ml jupyter package reject suspicious-package
|
||||
|
||||
# Reject with custom reason
|
||||
ml jupyter package reject old-package --reason "Security policy violation"
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Package Configuration
|
||||
|
||||
Package management is configured per workspace with these settings:
|
||||
|
||||
```json
|
||||
{
|
||||
"trusted_channels": [
|
||||
"conda-forge",
|
||||
"defaults",
|
||||
"pytorch",
|
||||
"nvidia"
|
||||
],
|
||||
"allowed_packages": {},
|
||||
"blocked_packages": [
|
||||
"requests",
|
||||
"urllib3",
|
||||
"httpx",
|
||||
"aiohttp"
|
||||
],
|
||||
"require_approval": false,
|
||||
"auto_approve_safe": true,
|
||||
"max_packages": 100,
|
||||
"install_timeout": "5m",
|
||||
"allow_conda_forge": true,
|
||||
"allow_pypi": false,
|
||||
"allow_local": false
|
||||
}
|
||||
```
|
||||
|
||||
### Custom Configuration
|
||||
|
||||
Create a custom configuration for your environment:
|
||||
|
||||
```go
|
||||
config := &jupyter.PackageConfig{
|
||||
TrustedChannels: []string{
|
||||
"conda-forge",
|
||||
"company-internal",
|
||||
"research-team",
|
||||
},
|
||||
AllowedPackages: map[string]bool{
|
||||
"numpy": true,
|
||||
"pandas": true,
|
||||
"scikit-learn": true,
|
||||
"tensorflow": true,
|
||||
"pytorch": true,
|
||||
},
|
||||
BlockedPackages: []string{
|
||||
"requests",
|
||||
"urllib3",
|
||||
"httpx",
|
||||
},
|
||||
RequireApproval: true,
|
||||
AutoApproveSafe: false,
|
||||
MaxPackages: 50,
|
||||
InstallTimeout: 10 * time.Minute,
|
||||
AllowCondaForge: true,
|
||||
AllowPyPI: false,
|
||||
AllowLocal: false,
|
||||
}
|
||||
```
|
||||
|
||||
## Security Policies
|
||||
|
||||
### Channel Trust Model
|
||||
|
||||
1. **conda-forge** - Community reviewed, generally safe
|
||||
2. **defaults** - Anaconda curated, high quality
|
||||
3. **pytorch** - Official PyTorch packages
|
||||
4. **nvidia** - Official NVIDIA packages
|
||||
5. **Custom channels** - Require explicit approval
|
||||
|
||||
### Package Categories
|
||||
|
||||
#### Auto-Approved
|
||||
- Data science libraries (numpy, pandas, scipy)
|
||||
- Machine learning frameworks (scikit-learn, tensorflow, pytorch)
|
||||
- Visualization libraries (matplotlib, seaborn, plotly)
|
||||
|
||||
#### Manual Review Required
|
||||
- Network libraries (requests, urllib3, httpx)
|
||||
- System utilities
|
||||
- Custom or unknown packages
|
||||
|
||||
#### Blocked
|
||||
- Known security risks
|
||||
- Outdated versions
|
||||
- Packages with vulnerable dependencies
|
||||
|
||||
### Approval Workflows
|
||||
|
||||
#### Auto-Approval Mode
|
||||
```bash
|
||||
# Safe packages install immediately
|
||||
ml jupyter package install --package numpy
|
||||
# Output: Package installed successfully
|
||||
```
|
||||
|
||||
#### Manual Approval Mode
|
||||
```bash
|
||||
# Package requires approval
|
||||
ml jupyter package install --package requests
|
||||
# Output: Package request created, awaiting approval
|
||||
|
||||
# Administrator approves
|
||||
ml jupyter package approve requests
|
||||
# Output: Package approved and installed
|
||||
```
|
||||
|
||||
#### Rejection Mode
|
||||
```bash
|
||||
# Blocked packages are rejected automatically
|
||||
ml jupyter package install --package suspicious-package
|
||||
# Output: Package blocked for security reasons
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### For Data Scientists
|
||||
|
||||
1. **Use trusted channels** - Stick to conda-forge and defaults
|
||||
2. **Specify versions** - Pin specific versions for reproducibility
|
||||
3. **Check dependencies** - Review package dependencies before installation
|
||||
4. **Document requirements** - Keep requirements.txt updated
|
||||
5. **Use workspaces** - Isolate packages per project
|
||||
|
||||
### For Administrators
|
||||
|
||||
1. **Configure trusted channels** - Define approved channels for your organization
|
||||
2. **Set approval policies** - Configure approval workflows
|
||||
3. **Monitor requests** - Review pending package requests regularly
|
||||
4. **Audit installations** - Track package installations and usage
|
||||
5. **Update blocklists** - Keep blocked packages list current
|
||||
|
||||
### For Security Teams
|
||||
|
||||
1. **Review channel policies** - Validate channel trustworthiness
|
||||
2. **Monitor package updates** - Track security vulnerabilities
|
||||
3. **Audit package usage** - Review installed packages across workspaces
|
||||
4. **Configure timeouts** - Set reasonable installation timeouts
|
||||
5. **Implement logging** - Enable comprehensive audit logging
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
**Package not found:**
|
||||
```bash
|
||||
Error: Package 'unknown-package' not found in trusted channels
|
||||
```
|
||||
**Solution:** Check spelling and verify package exists in trusted channels
|
||||
|
||||
**Channel not trusted:**
|
||||
```bash
|
||||
Error: Channel 'untrusted-channel' is not trusted
|
||||
```
|
||||
**Solution:** Use trusted channel or request channel approval
|
||||
|
||||
**Package blocked:**
|
||||
```bash
|
||||
Error: Package 'requests' is blocked for security reasons
|
||||
```
|
||||
**Solution:** Use alternative package or request exception
|
||||
|
||||
**Installation timeout:**
|
||||
```bash
|
||||
Error: Package installation timed out
|
||||
```
|
||||
**Solution:** Check network connectivity and package size
|
||||
|
||||
### Debug Commands
|
||||
|
||||
```bash
|
||||
# Check package status
|
||||
ml jupyter package list
|
||||
|
||||
# View pending requests
|
||||
ml jupyter package pending
|
||||
|
||||
# Check workspace configuration
|
||||
cat ./workspace/.package_config.json
|
||||
|
||||
# View installation logs
|
||||
cat ./workspace/.package_cache/install_*.log
|
||||
```
|
||||
|
||||
### Recovery Procedures
|
||||
|
||||
**Failed installation:**
|
||||
```bash
|
||||
# Check request status
|
||||
ml jupyter package pending
|
||||
|
||||
# Retry installation
|
||||
ml jupyter package approve <package_name>
|
||||
```
|
||||
|
||||
**Corrupted package:**
|
||||
```bash
|
||||
# Remove and reinstall
|
||||
ml jupyter package install --package <name> --force
|
||||
```
|
||||
|
||||
## API Integration
|
||||
|
||||
### Package Manager API
|
||||
|
||||
```go
|
||||
// Create package manager
|
||||
pm, err := jupyter.NewPackageManager(logger, config, workspacePath)
|
||||
|
||||
// Request package
|
||||
req, err := pm.RequestPackage("numpy", "1.21.0", "conda-forge", "user1")
|
||||
|
||||
// Approve request
|
||||
err = pm.ApprovePackageRequest(req.PackageName, "admin")
|
||||
|
||||
// Install package
|
||||
err = pm.InstallPackage(req.PackageName)
|
||||
|
||||
// List packages
|
||||
packages, err := pm.ListInstalledPackages()
|
||||
```
|
||||
|
||||
### REST API Endpoints
|
||||
|
||||
```bash
|
||||
# Request package
|
||||
POST /api/jupyter/packages/request
|
||||
{
|
||||
"package_name": "numpy",
|
||||
"version": "1.21.0",
|
||||
"channel": "conda-forge",
|
||||
"workspace": "./workspace"
|
||||
}
|
||||
|
||||
# List packages
|
||||
GET /api/jupyter/packages/list?workspace=./workspace
|
||||
|
||||
# Approve package
|
||||
POST /api/jupyter/packages/approve
|
||||
{
|
||||
"package_name": "numpy",
|
||||
"approval_user": "admin"
|
||||
}
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Data Science Workflow
|
||||
|
||||
```bash
|
||||
# 1. Create workspace
|
||||
mkdir ml_project
|
||||
cd ml_project
|
||||
|
||||
# 2. Start Jupyter
|
||||
ml jupyter start --workspace .
|
||||
|
||||
# 3. Install common packages
|
||||
ml jupyter package install --package numpy
|
||||
ml jupyter package install --package pandas
|
||||
ml jupyter package install --package scikit-learn
|
||||
|
||||
# 4. Request specialized package
|
||||
ml jupyter package install --package pytorch --channel pytorch
|
||||
|
||||
# 5. Check status
|
||||
ml jupyter package list
|
||||
|
||||
# 6. Work in Jupyter
|
||||
# (Open browser and start coding)
|
||||
```
|
||||
|
||||
### Administrator Workflow
|
||||
|
||||
```bash
|
||||
# 1. Check pending requests
|
||||
ml jupyter package pending
|
||||
|
||||
# 2. Review package requests
|
||||
ml jupyter package list
|
||||
|
||||
# 3. Approve safe packages
|
||||
ml jupyter package approve numpy
|
||||
ml jupyter package approve pandas
|
||||
|
||||
# 4. Reject risky packages
|
||||
ml jupyter package reject requests --reason "Security policy"
|
||||
|
||||
# 5. Monitor installations
|
||||
ml jupyter package list
|
||||
```
|
||||
|
||||
### Security Configuration
|
||||
|
||||
```bash
|
||||
# Write the full configuration as a single JSON document.
# (Appending separate JSON objects with '>>' would produce an invalid
# .package_config.json — JSON files may contain only one top-level value.)
cat > .package_config.json <<'EOF'
{
  "trusted_channels": ["conda-forge", "defaults"],
  "require_approval": true,
  "blocked_packages": ["requests", "urllib3"],
  "audit_logging": true
}
EOF
|
||||
```
|
||||
|
||||
## Integration with Experiment System
|
||||
|
||||
Package management integrates with the experiment system:
|
||||
|
||||
```bash
|
||||
# Link workspace with experiment
|
||||
ml jupyter experiment link --workspace ./project --experiment exp_123
|
||||
|
||||
# Install packages for experiment
|
||||
ml jupyter package install --package tensorflow
|
||||
|
||||
# Sync packages with experiment
|
||||
ml jupyter experiment sync --workspace ./project --direction push
|
||||
|
||||
# Package info included in experiment metadata
|
||||
ml experiment show exp_123
|
||||
```
|
||||
|
||||
This ensures reproducibility by tracking package dependencies alongside experiments.
|
||||
|
|
@ -1,332 +1,604 @@
|
|||
# Jupyter Service Architecture
|
||||
# Jupyter Workflow Guide
|
||||
|
||||
Comprehensive guide to Jupyter workspace management, experiment integration, and secure package management in FetchML.
|
||||
|
||||
## Overview
|
||||
|
||||
This guide describes the new Jupyter service architecture that provides standalone Jupyter services complementary to the FetchML job queue system. This approach solves the architectural mismatch between interactive Jupyter sessions and batch-oriented job processing.
|
||||
|
||||
## Architecture Design
|
||||
|
||||
### Separate Service Model
|
||||
|
||||
The new architecture treats Jupyter as a **separate development service** rather than trying to fit it into the job queue model:
|
||||
|
||||
```
|
||||
Development Workflow:
|
||||
ml jupyter start --workspace ./my_project
|
||||
→ Standalone Jupyter service with ML tools
|
||||
→ Interactive development in browser
|
||||
→ Direct container access
|
||||
|
||||
Production Workflow:
|
||||
ml queue experiment.py
|
||||
→ Batch job execution through queue
|
||||
→ Scalable, monitored production runs
|
||||
```
|
||||
|
||||
### Key Components
|
||||
|
||||
1. **Service Manager** - Handles container lifecycle and service orchestration
|
||||
2. **Workspace Manager** - Manages volume mounting and workspace isolation
|
||||
3. **Network Manager** - Handles port allocation and browser access
|
||||
4. **Health Monitor** - Tracks service health and performance
|
||||
5. **Configuration Manager** - Manages service settings and environment
|
||||
The Jupyter workflow system provides:
|
||||
- **Workspace Management**: Isolated development environments
|
||||
- **Experiment Integration**: Seamless linking with ML experiments
|
||||
- **Package Management**: Secure package installation from trusted sources
|
||||
- **Resource Sharing**: Data and context synchronization
|
||||
- **Security Controls**: Approved channels and package filtering
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Start a Jupyter Service
|
||||
### Create Jupyter Workspace
|
||||
|
||||
```bash
|
||||
# Basic start with defaults
|
||||
ml jupyter start
|
||||
# Start development stack
|
||||
make dev-up
|
||||
|
||||
# Custom workspace and port
|
||||
ml jupyter start --workspace ./my_project --port 8889
|
||||
# Create workspace
|
||||
./cli/zig-out/bin/ml jupyter create my-workspace
|
||||
|
||||
# Named service with custom image
|
||||
ml jupyter start --name "data-science" --workspace ./workspace --image custom/jupyter:latest
|
||||
# Access workspace
|
||||
open http://localhost:8888
|
||||
```
|
||||
|
||||
### 2. List Running Services
|
||||
### Link with Experiment
|
||||
|
||||
```bash
|
||||
ml jupyter list
|
||||
```
|
||||
# Queue experiment from workspace
|
||||
./cli/zig-out/bin/ml jupyter queue --workspace my-workspace --experiment my-experiment
|
||||
|
||||
### 3. Check Service Status
|
||||
|
||||
```bash
|
||||
# All services
|
||||
ml jupyter status
|
||||
|
||||
# Specific service
|
||||
ml jupyter status jupyter-data-science-1703123456
|
||||
```
|
||||
|
||||
### 4. Stop a Service
|
||||
|
||||
```bash
|
||||
# Stop specific service
|
||||
ml jupyter stop jupyter-data-science-1703123456
|
||||
|
||||
# Stop first running service (if no name provided)
|
||||
ml jupyter stop
|
||||
# Monitor progress
|
||||
./cli/zig-out/bin/ml status
|
||||
```
|
||||
|
||||
## Workspace Management
|
||||
|
||||
### Workspace Commands
|
||||
### Creating Workspaces
|
||||
|
||||
```bash
|
||||
# List available workspaces
|
||||
ml jupyter workspace list
|
||||
# Create new workspace
|
||||
./cli/zig-out/bin/ml jupyter create workspace-name
|
||||
|
||||
# Validate a workspace
|
||||
ml jupyter workspace validate ./my_project
|
||||
# Create with specific configuration
|
||||
./cli/zig-out/bin/ml jupyter create workspace-name \
|
||||
--cpu=4 \
|
||||
--memory=8g \
|
||||
--gpu=1 \
|
||||
--image=jupyter/scipy-notebook:latest
|
||||
|
||||
# Get workspace information
|
||||
ml jupyter workspace info ./my_project
|
||||
```
|
||||
|
||||
### Workspace Structure
|
||||
|
||||
A valid workspace should contain:
|
||||
- Jupyter notebooks (`.ipynb` files)
|
||||
- Python scripts (`.py` files)
|
||||
- Requirements files (`requirements.txt`, `environment.yml`)
|
||||
- Configuration files (`pyproject.toml`)
|
||||
|
||||
## Advanced Configuration
|
||||
|
||||
### Service Configuration
|
||||
|
||||
The Jupyter service uses a comprehensive configuration system:
|
||||
|
||||
```json
|
||||
{
|
||||
"version": "1.0.0",
|
||||
"environment": "development",
|
||||
"service": {
|
||||
"default_image": "localhost/ml-tools-runner:latest",
|
||||
"default_port": 8888,
|
||||
"max_services": 5,
|
||||
"default_resources": {
|
||||
"memory_limit": "8G",
|
||||
"cpu_limit": "2",
|
||||
"gpu_access": false
|
||||
}
|
||||
},
|
||||
"network": {
|
||||
"bind_address": "127.0.0.1",
|
||||
"enable_token": false,
|
||||
"allow_remote": false
|
||||
},
|
||||
"security": {
|
||||
"allow_network": false,
|
||||
"blocked_packages": ["requests", "urllib3", "httpx"],
|
||||
"read_only_root": false
|
||||
},
|
||||
"health": {
|
||||
"enabled": true,
|
||||
"check_interval": "30s",
|
||||
"timeout": "10s",
|
||||
"auto_cleanup": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Environment-Specific Settings
|
||||
|
||||
- **Development**: More permissive, debug enabled, network access allowed
|
||||
- **Production**: Restricted access, health monitoring, resource limits enforced
|
||||
- **Testing**: Single service, health checks disabled, debug mode
|
||||
|
||||
## Available ML Tools
|
||||
|
||||
The container includes these pre-installed tools:
|
||||
|
||||
- **MLflow 3.7.0** - Experiment tracking and model registry
|
||||
- **Streamlit 1.52.1** - Interactive web apps
|
||||
- **Dash 3.3.0** - Plotly-based dashboards
|
||||
- **Panel 1.8.4** - Data apps and dashboards
|
||||
- **Bokeh 3.8.1** - Interactive visualizations
|
||||
- **WandB 0.23.1** - Experiment tracking (requires API key)
|
||||
|
||||
## Using ML Tools in Jupyter
|
||||
|
||||
### MLflow Example
|
||||
```python
|
||||
import mlflow
|
||||
|
||||
# Start tracking
|
||||
with mlflow.start_run() as run:
|
||||
mlflow.log_param("model", "random_forest")
|
||||
mlflow.log_metric("accuracy", 0.95)
|
||||
print(f"Run ID: {run.info.run_id}")
|
||||
```
|
||||
|
||||
### Streamlit Example
|
||||
```python
|
||||
import streamlit as st
|
||||
|
||||
st.title("My ML App")
|
||||
st.write("Interactive dashboard with real-time updates")
|
||||
```
|
||||
|
||||
### Dash Example
|
||||
```python
import dash
from dash import dcc, html

app = dash.Dash(__name__)
app.run(debug=True, host='0.0.0.0', port=8050)
```
|
||||
|
||||
## Integration with Job Queue
|
||||
|
||||
The Jupyter service complements the existing job queue system:
|
||||
|
||||
### Development in Jupyter
|
||||
```bash
|
||||
# Start Jupyter for development
|
||||
ml jupyter start --workspace ./experiment
|
||||
|
||||
# Work interactively in browser
|
||||
# http://localhost:8888
|
||||
```
|
||||
|
||||
### Production via Job Queue
|
||||
```bash
|
||||
# Queue the same experiment for production
|
||||
ml queue experiment.py
|
||||
|
||||
# Monitor production run
|
||||
ml status
|
||||
ml monitor
|
||||
```
|
||||
|
||||
### Hybrid Workflow
|
||||
1. **Develop**: Use Jupyter for interactive development and prototyping
|
||||
2. **Test**: Validate code in Jupyter with sample data
|
||||
3. **Production**: Queue validated code for scalable execution
|
||||
4. **Monitor**: Track production jobs while continuing development
|
||||
|
||||
## Security Features
|
||||
|
||||
### Container Isolation
|
||||
- Rootless Podman containers
|
||||
- Non-root user execution
|
||||
- Resource limits enforced
|
||||
- Network access control
|
||||
|
||||
### Workspace Security
|
||||
- Bind mounts with proper permissions
|
||||
- Path validation and restrictions
|
||||
- SELinux compatibility (`:Z` flag)
|
||||
|
||||
### Network Security
|
||||
- Localhost binding by default
|
||||
- Token/password authentication optional
|
||||
- Remote access requires explicit configuration
|
||||
|
||||
## Health Monitoring
|
||||
|
||||
### Automatic Health Checks
|
||||
- HTTP endpoint monitoring
|
||||
- Container status tracking
|
||||
- Response time measurement
|
||||
- Error detection and alerting
|
||||
|
||||
### Service Lifecycle Management
|
||||
- Automatic cleanup of stale services
|
||||
- Graceful shutdown handling
|
||||
- Resource usage monitoring
|
||||
- Service restart policies
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Service Won't Start
|
||||
```bash
|
||||
# Check container runtime
|
||||
podman --version
|
||||
|
||||
# Validate workspace
|
||||
ml jupyter workspace validate ./my_project
|
||||
|
||||
# Check port availability
|
||||
ml jupyter status
|
||||
```
|
||||
|
||||
### Network Issues
|
||||
```bash
|
||||
# Check service URL
|
||||
ml jupyter status service-name
|
||||
|
||||
# Test connectivity
|
||||
curl http://localhost:8888
|
||||
|
||||
# Check port conflicts
|
||||
netstat -tlnp | grep :8888
|
||||
```
|
||||
|
||||
### Workspace Problems
|
||||
```bash
|
||||
# List workspaces
|
||||
ml jupyter workspace list
|
||||
./cli/zig-out/bin/ml jupyter list
|
||||
|
||||
# Check permissions
|
||||
ls -la ./my_project
|
||||
|
||||
# Validate workspace structure
|
||||
ml jupyter workspace info ./my_project
|
||||
# Workspace details
|
||||
./cli/zig-out/bin/ml jupyter info workspace-name
|
||||
```
|
||||
|
||||
### Service Health Issues
|
||||
### Workspace Configuration
|
||||
|
||||
**Resource Allocation**:
|
||||
```yaml
|
||||
# workspace-config.yaml
|
||||
resources:
|
||||
cpu: 4
|
||||
memory: 8g
|
||||
gpu: 1
|
||||
disk: 20g
|
||||
|
||||
environment:
|
||||
python_version: "3.11"
|
||||
jupyter_version: "latest"
|
||||
|
||||
security:
|
||||
trusted_channels: ["conda-forge", "defaults", "pytorch"]
|
||||
blocked_packages: ["requests", "urllib3"]
|
||||
```
|
||||
|
||||
### Access Control
|
||||
|
||||
```bash
|
||||
# Check service health
|
||||
ml jupyter status service-name
|
||||
# Set workspace permissions
|
||||
./cli/zig-out/bin/ml jupyter access workspace-name \
|
||||
--user=data-scientist \
|
||||
--role=editor
|
||||
|
||||
# View container logs
|
||||
podman logs container-id
|
||||
# Revoke access
|
||||
./cli/zig-out/bin/ml jupyter revoke workspace-name data-scientist
|
||||
```
|
||||
|
||||
# Restart service
|
||||
ml jupyter stop service-name
|
||||
ml jupyter start --name service-name --workspace ./my_project
|
||||
## Experiment Integration
|
||||
|
||||
### Architecture
|
||||
|
||||
```
|
||||
Jupyter Workspace ←→ Workspace Metadata ←→ Experiment Manager
|
||||
↓ ↓ ↓
|
||||
Notebooks Link Metadata Experiment Data
|
||||
Scripts Sync History Metrics & Results
|
||||
```
|
||||
|
||||
### Linking Workspaces and Experiments
|
||||
|
||||
```bash
|
||||
# Link existing workspace to experiment
|
||||
./cli/zig-out/bin/ml jupyter link workspace-name experiment-id
|
||||
|
||||
# Create workspace linked to new experiment
|
||||
./cli/zig-out/bin/ml jupyter create workspace-name \
|
||||
--experiment experiment-id
|
||||
|
||||
# Queue experiment from workspace
|
||||
./cli/zig-out/bin/ml jupyter queue \
|
||||
--workspace workspace-name \
|
||||
--config experiment-config.yaml
|
||||
```
|
||||
|
||||
### Data Synchronization
|
||||
|
||||
**Automatic Sync**:
|
||||
- Notebook metadata
|
||||
- Experiment results
|
||||
- Configuration files
|
||||
- Resource usage metrics
|
||||
|
||||
**Manual Sync**:
|
||||
```bash
|
||||
# Sync workspace to experiment
|
||||
./cli/zig-out/bin/ml jupyter sync workspace-name --to-experiment
|
||||
|
||||
# Sync experiment to workspace
|
||||
./cli/zig-out/bin/ml jupyter sync workspace-name --from-experiment
|
||||
|
||||
# Force full sync
|
||||
./cli/zig-out/bin/ml jupyter sync workspace-name --full
|
||||
```
|
||||
|
||||
### Workspace Metadata
|
||||
|
||||
**Tracked Information**:
|
||||
- Workspace creation and modification dates
|
||||
- Linked experiment IDs
|
||||
- Resource usage history
|
||||
- Package installation records
|
||||
- Notebook execution history
|
||||
|
||||
```bash
|
||||
# View workspace metadata
|
||||
./cli/zig-out/bin/ml jupyter metadata workspace-name
|
||||
|
||||
# Export metadata
|
||||
./cli/zig-out/bin/ml jupyter export workspace-name --format=json
|
||||
```
|
||||
|
||||
## Package Management
|
||||
|
||||
### Security Features
|
||||
|
||||
**Trusted Channels** (default):
|
||||
- `conda-forge` - Community-maintained packages
|
||||
- `defaults` - Anaconda default packages
|
||||
- `pytorch` - PyTorch ecosystem packages
|
||||
- `nvidia` - NVIDIA GPU packages
|
||||
|
||||
**Blocked Packages** (security):
|
||||
- `requests` - HTTP client library
|
||||
- `urllib3` - HTTP library
|
||||
- `socket` - Network sockets
|
||||
- `subprocess` - Process execution
|
||||
- `os.system` - System commands
|
||||
|
||||
### Package Installation
|
||||
|
||||
**In Jupyter Notebook**:
|
||||
```python
|
||||
# Install package (checks security)
|
||||
!pip install numpy pandas scikit-learn
|
||||
|
||||
# Install from conda
|
||||
!conda install -c conda-forge matplotlib seaborn
|
||||
|
||||
# Check package status
|
||||
!pip list
|
||||
```
|
||||
|
||||
**From CLI**:
|
||||
```bash
|
||||
# Install package in workspace
|
||||
./cli/zig-out/bin/ml jupyter install workspace-name numpy
|
||||
|
||||
# Install with version
|
||||
./cli/zig-out/bin/ml jupyter install workspace-name "pandas==2.0.0"
|
||||
|
||||
# Install from conda
|
||||
./cli/zig-out/bin/ml jupyter install workspace-name matplotlib --conda
|
||||
|
||||
# List installed packages
|
||||
./cli/zig-out/bin/ml jupyter packages workspace-name
|
||||
```
|
||||
|
||||
### Package Approval Workflow
|
||||
|
||||
**Optional Approval Process**:
|
||||
1. **Request**: User requests package installation
|
||||
2. **Review**: Admin reviews package security
|
||||
3. **Approval**: Package added to allowlist
|
||||
4. **Installation**: Package installed in workspace
|
||||
|
||||
```bash
|
||||
# Request package (requires approval)
|
||||
./cli/zig-out/bin/ml jupyter request workspace-name custom-package
|
||||
|
||||
# Review requests (admin)
|
||||
./cli/zig-out/bin/ml jupyter review --pending
|
||||
|
||||
# Approve request
|
||||
./cli/zig-out/bin/ml jupyter approve request-id
|
||||
|
||||
# Deny request
|
||||
./cli/zig-out/bin/ml jupyter deny request-id --reason="Security concern"
|
||||
```
|
||||
|
||||
### Custom Channel Configuration
|
||||
|
||||
```yaml
|
||||
# workspace-security.yaml
|
||||
package_management:
|
||||
trusted_channels:
|
||||
- conda-forge
|
||||
- defaults
|
||||
- pytorch
|
||||
- nvidia
|
||||
- company-internal # Custom channel
|
||||
|
||||
blocked_packages:
|
||||
- requests
|
||||
- urllib3
|
||||
- socket
|
||||
- subprocess
|
||||
- os.system
|
||||
|
||||
approval_required:
|
||||
- tensorflow
|
||||
- pytorch
|
||||
- custom-package
|
||||
|
||||
allowlist:
|
||||
- numpy
|
||||
- pandas
|
||||
- scikit-learn
|
||||
- matplotlib
|
||||
```
|
||||
|
||||
## Security and Compliance
|
||||
|
||||
### Workspace Isolation
|
||||
|
||||
**Network Isolation**:
|
||||
- Workspaces run in isolated networks
|
||||
- Controlled outbound internet access
|
||||
- Inter-workspace communication blocked
|
||||
|
||||
**File System Isolation**:
|
||||
- Separate storage volumes per workspace
|
||||
- Controlled file access permissions
|
||||
- Automatic cleanup on workspace deletion
|
||||
|
||||
### Audit Trail
|
||||
|
||||
**Tracked Activities**:
|
||||
- Package installations and removals
|
||||
- Notebook execution history
|
||||
- Data access patterns
|
||||
- Resource usage metrics
|
||||
- User access logs
|
||||
|
||||
```bash
|
||||
# View audit log
|
||||
./cli/zig-out/bin/ml jupyter audit workspace-name
|
||||
|
||||
# Export audit report
|
||||
./cli/zig-out/bin/ml jupyter audit workspace-name --export=csv
|
||||
|
||||
# Security scan
|
||||
./cli/zig-out/bin/ml jupyter security-scan workspace-name
|
||||
```
|
||||
|
||||
### Compliance Features
|
||||
|
||||
**Data Protection**:
|
||||
- Automatic data encryption
|
||||
- Secure data transfer protocols
|
||||
- GDPR compliance features
|
||||
- Data retention policies
|
||||
|
||||
**Access Controls**:
|
||||
- Role-based permissions
|
||||
- Multi-factor authentication
|
||||
- Session timeout management
|
||||
- IP whitelisting
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Custom Images
|
||||
|
||||
```bash
|
||||
# Build custom workspace image
|
||||
./cli/zig-out/bin/ml jupyter build custom-image \
|
||||
--base=jupyter/scipy-notebook \
|
||||
--packages="numpy pandas scikit-learn" \
|
||||
--gpu-support
|
||||
|
||||
# Use custom image
|
||||
./cli/zig-out/bin/ml jupyter create workspace-name \
|
||||
--image=custom-image
|
||||
```
|
||||
|
||||
### Workspace Templates
|
||||
|
||||
```yaml
|
||||
# data-science-template.yaml
|
||||
name: data-science-workspace
|
||||
resources:
|
||||
cpu: 8
|
||||
memory: 16g
|
||||
gpu: 1
|
||||
|
||||
packages:
|
||||
- numpy
|
||||
- pandas
|
||||
- scikit-learn
|
||||
- matplotlib
|
||||
- seaborn
|
||||
- jupyterlab
|
||||
|
||||
security:
|
||||
trusted_channels: ["conda-forge", "defaults"]
|
||||
approval_required: []
|
||||
|
||||
environment:
|
||||
PYTHONPATH: "/workspace"
|
||||
JUPYTER_ENABLE_LAB: "yes"
|
||||
```
|
||||
|
||||
```bash
|
||||
# Create from template
|
||||
./cli/zig-out/bin/ml jupyter create workspace-name \
|
||||
--template=data-science-template
|
||||
```
|
||||
|
||||
### Collaboration Features
|
||||
|
||||
**Workspace Sharing**:
|
||||
```bash
|
||||
# Share workspace with team
|
||||
./cli/zig-out/bin/ml jupyter share workspace-name \
|
||||
--team=data-science-team \
|
||||
--role=collaborator
|
||||
|
||||
# Collaborative notebooks
|
||||
# Multiple users can edit simultaneously
|
||||
# Real-time cursor tracking
|
||||
# Comment and review features
|
||||
```
|
||||
|
||||
**Version Control**:
|
||||
```bash
|
||||
# Git integration
|
||||
./cli/zig-out/bin/ml jupyter git workspace-name init
|
||||
./cli/zig-out/bin/ml jupyter git workspace-name add .
|
||||
./cli/zig-out/bin/ml jupyter git workspace-name commit -m "Initial commit"
|
||||
|
||||
# Notebook versioning
|
||||
./cli/zig-out/bin/ml jupyter version workspace-name notebook.ipynb
|
||||
```
|
||||
|
||||
## Monitoring and Troubleshooting
|
||||
|
||||
### Performance Monitoring
|
||||
|
||||
```bash
|
||||
# Workspace resource usage
|
||||
./cli/zig-out/bin/ml jupyter stats workspace-name
|
||||
|
||||
# Real-time monitoring
|
||||
./cli/zig-out/bin/ml jupyter monitor workspace-name
|
||||
|
||||
# Performance report
|
||||
./cli/zig-out/bin/ml jupyter report workspace-name --format=html
|
||||
```
|
||||
|
||||
### Common Issues
|
||||
|
||||
**Package Installation Failures**:
|
||||
```bash
|
||||
# Check package security
|
||||
./cli/zig-out/bin/ml jupyter check-package package-name
|
||||
|
||||
# Bypass security (admin only)
|
||||
./cli/zig-out/bin/ml jupyter install workspace-name package-name --force
|
||||
|
||||
# Clear package cache
|
||||
./cli/zig-out/bin/ml jupyter clear-cache workspace-name
|
||||
```
|
||||
|
||||
**Workspace Access Issues**:
|
||||
```bash
|
||||
# Check workspace status
|
||||
./cli/zig-out/bin/ml jupyter status workspace-name
|
||||
|
||||
# Restart workspace
|
||||
./cli/zig-out/bin/ml jupyter restart workspace-name
|
||||
|
||||
# Reset workspace
|
||||
./cli/zig-out/bin/ml jupyter reset workspace-name --hard
|
||||
```
|
||||
|
||||
**Performance Issues**:
|
||||
```bash
|
||||
# Check resource limits
|
||||
./cli/zig-out/bin/ml jupyter limits workspace-name
|
||||
|
||||
# Scale resources
|
||||
./cli/zig-out/bin/ml jupyter scale workspace-name --cpu=8 --memory=16g
|
||||
|
||||
# Optimize performance
|
||||
./cli/zig-out/bin/ml jupyter optimize workspace-name
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Development
|
||||
1. Use descriptive service names
|
||||
2. Organize workspaces by project
|
||||
3. Leverage workspace validation
|
||||
4. Monitor service health regularly
|
||||
### Workspace Organization
|
||||
|
||||
### Production
|
||||
1. Use production configuration
|
||||
2. Enable security restrictions
|
||||
3. Monitor resource usage
|
||||
4. Implement cleanup policies
|
||||
1. **Use Descriptive Names**: `project-name-environment`
|
||||
2. **Resource Planning**: Allocate appropriate CPU/memory
|
||||
3. **Regular Cleanup**: Remove unused workspaces
|
||||
4. **Version Control**: Track important changes
|
||||
|
||||
### Workflow Integration
|
||||
1. Develop interactively in Jupyter
|
||||
2. Validate code before production
|
||||
3. Use job queue for scalable execution
|
||||
4. Monitor both systems independently
|
||||
### Package Management
|
||||
|
||||
## Migration from Old Architecture
|
||||
1. **Minimal Packages**: Install only necessary packages
|
||||
2. **Version Pinning**: Use specific package versions
|
||||
3. **Security First**: Always use trusted channels
|
||||
4. **Regular Updates**: Keep packages updated
|
||||
|
||||
The new architecture provides these benefits over the old job queue approach:
|
||||
### Security Practices
|
||||
|
||||
- **True Interactive Sessions** - Long-running containers with persistent state
|
||||
- **Direct Browser Access** - No API gateway overhead
|
||||
- **Better Resource Management** - Dedicated containers per service
|
||||
- **Simplified Networking** - Direct port mapping
|
||||
- **Enhanced Security** - Isolated development environment
|
||||
- **Improved Monitoring** - Service-specific health tracking
|
||||
1. **Principle of Least Privilege**: Minimal required permissions
|
||||
2. **Regular Audits**: Review workspace activities
|
||||
3. **Data Classification**: Handle sensitive data appropriately
|
||||
4. **Compliance**: Follow organizational policies
|
||||
|
||||
To migrate:
|
||||
1. Stop any existing Jupyter containers
|
||||
2. Use new CLI commands (`ml jupyter start/stop/list`)
|
||||
3. Organize projects into workspaces
|
||||
4. Configure security settings as needed
|
||||
## API Integration
|
||||
|
||||
### Programmatic Workspace Management
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# Create workspace
|
||||
response = requests.post('/api/v1/jupyter/workspaces', json={
|
||||
'name': 'my-workspace',
|
||||
'resources': {'cpu': 4, 'memory': '8g'},
|
||||
'security': {'trusted_channels': ['conda-forge']}
|
||||
})
|
||||
|
||||
# Install package
|
||||
requests.post(f'/api/v1/jupyter/workspaces/my-workspace/packages', json={
|
||||
'package': 'numpy',
|
||||
'version': '1.24.0'
|
||||
})
|
||||
|
||||
# Link to experiment
|
||||
requests.post('/api/v1/jupyter/workspaces/my-workspace/experiments', json={
|
||||
'experiment_id': 'exp-123'
|
||||
})
|
||||
```
|
||||
|
||||
### Webhooks
|
||||
|
||||
```yaml
|
||||
# workspace-webhooks.yaml
|
||||
events:
|
||||
- workspace_created
|
||||
- package_installed
|
||||
- experiment_linked
|
||||
|
||||
actions:
|
||||
- slack_notification
|
||||
- email_alert
|
||||
- log_event
|
||||
```
|
||||
|
||||
## WebSocket Protocol
|
||||
|
||||
### Overview
|
||||
|
||||
The Jupyter CLI commands use a binary WebSocket protocol for efficient, low-latency communication with the FetchML server. This provides better performance than HTTP and allows for real-time updates.
|
||||
|
||||
### Connection
|
||||
|
||||
```bash
|
||||
# WebSocket endpoint
|
||||
ws://SERVER_HOST:PORT/ws
|
||||
|
||||
# TLS-enabled endpoint
|
||||
wss://SERVER_HOST:PORT/ws
|
||||
```
|
||||
|
||||
**Authentication**: API key is hashed using SHA256 and the first 16 bytes are sent with each request.
|
||||
|
||||
### Binary Message Format
|
||||
|
||||
All Jupyter commands follow a binary protocol for optimal performance:
|
||||
|
||||
**Start Jupyter Service** (Opcode: 0x0D):
|
||||
```
|
||||
[opcode:1][api_key_hash:16][name_len:1][name:var][workspace_len:2][workspace:var][password_len:1][password:var]
|
||||
```
|
||||
|
||||
**Stop Jupyter Service** (Opcode: 0x0E):
|
||||
```
|
||||
[opcode:1][api_key_hash:16][service_id_len:1][service_id:var]
|
||||
```
|
||||
|
||||
**List Jupyter Services** (Opcode: 0x0F):
|
||||
```
|
||||
[opcode:1][api_key_hash:16]
|
||||
```
|
||||
|
||||
### Response Packets
|
||||
|
||||
The server responds with structured response packets:
|
||||
|
||||
**Success Response**:
|
||||
```
|
||||
[packet_type:0x00][timestamp:8][message_len:2][message:var]
|
||||
```
|
||||
|
||||
**Error Response**:
|
||||
```
|
||||
[packet_type:0x01][timestamp:8][error_code:1][message_len:2][message:var][details_len:2][details:var]
|
||||
```
|
||||
|
||||
**Data Response** (for list command):
|
||||
```
|
||||
[packet_type:0x04][timestamp:8][type_len:2][type:var][payload_len:4][payload:var]
|
||||
```
|
||||
|
||||
### CLI Examples
|
||||
|
||||
**Start Service**:
|
||||
```bash
|
||||
# Basic start
|
||||
ml jupyter start --name my-notebook --workspace /path/to/workspace
|
||||
|
||||
# With password
|
||||
ml jupyter start --name my-notebook --workspace /path/to/workspace --password mypass
|
||||
```
|
||||
|
||||
**Stop Service**:
|
||||
```bash
|
||||
ml jupyter stop service-id-12345
|
||||
```
|
||||
|
||||
**List Services**:
|
||||
```bash
|
||||
ml jupyter list
|
||||
```
|
||||
|
||||
### Error Codes
|
||||
|
||||
Common error codes in binary responses:
|
||||
|
||||
- `0x00`: Unknown error
|
||||
- `0x01`: Invalid request format
|
||||
- `0x02`: Authentication failed
|
||||
- `0x03`: Permission denied
|
||||
- `0x10`: Server overloaded
|
||||
- `0x14`: Timeout
|
||||
|
||||
### WebSocket vs HTTP
|
||||
|
||||
**Advantages of WebSocket**:
|
||||
- ✅ Lower latency (persistent connection)
|
||||
- ✅ Binary protocol (smaller payloads)
|
||||
- ✅ Real-time updates possible
|
||||
- ✅ Reduced server load
|
||||
- ✅ Single connection for CLI
|
||||
|
||||
**When to use HTTP**:
|
||||
- For programmatic API access
|
||||
- For web-based integrations
|
||||
- When WebSocket is unavailable
|
||||
|
||||
## See Also
|
||||
|
||||
- **[Testing Guide](testing.md)** - Testing Jupyter workflows
|
||||
- **[Deployment Guide](deployment.md)** - Production deployment
|
||||
- **[Security Guide](security.md)** - Security best practices
|
||||
- **[API Reference](api-key-process.md)** - API documentation
|
||||
- **[CLI Reference](cli-reference.md)** - Command-line tools
|
||||
|
|
@ -47,7 +47,7 @@ This document outlines security features, best practices, and hardening procedur
|
|||
|
||||
3. **Enable TLS** (Production only)
|
||||
```yaml
|
||||
# configs/config-prod.yaml
|
||||
# configs/api/prod.yaml
|
||||
server:
|
||||
tls:
|
||||
enabled: true
|
||||
|
|
@ -68,7 +68,7 @@ This document outlines security features, best practices, and hardening procedur
|
|||
|
||||
5. **Restrict IP Access**
|
||||
```yaml
|
||||
# configs/config-prod.yaml
|
||||
# configs/api/prod.yaml
|
||||
auth:
|
||||
ip_whitelist:
|
||||
- "10.0.0.0/8"
|
||||
|
|
@ -154,19 +154,33 @@ echo -n "your-api-key" | sha256sum
|
|||
### Rotate API Keys
|
||||
|
||||
1. Generate new API key
|
||||
2. Update `config-local.yaml` with new hash
|
||||
2. Update your chosen API server config (for example a private copy of `configs/api/homelab-secure.yaml`) with the new hash
|
||||
3. Distribute new key to users
|
||||
4. Remove old key after grace period
|
||||
|
||||
### Revoke API Keys
|
||||
|
||||
Remove user entry from `config-local.yaml`:
|
||||
Remove user entry from your API server config file:
|
||||
```yaml
|
||||
auth:
|
||||
apikeys:
|
||||
api_keys:
|
||||
# user_to_revoke: # Comment out or delete
|
||||
```
|
||||
|
||||
## Secret Flow (What lives where)
|
||||
|
||||
- **API server config (`configs/api/*.yaml`)**
|
||||
- Stores **SHA256 hashes** of API keys (never raw keys).
|
||||
- The repo-shipped configs intentionally contain `CHANGE_ME_...` placeholders.
|
||||
- For real deployments, make a private copy (e.g. `/etc/fetch_ml/config.yaml`) and fill in real hashes.
|
||||
|
||||
- **Docker Compose `.env` / secret files**
|
||||
- Used for values that should not be committed (e.g. `REDIS_PASSWORD`, Grafana admin password).
|
||||
- `deployments/docker-compose.homelab-secure.yml` requires `REDIS_PASSWORD` to be set explicitly.
|
||||
|
||||
- **TLS certs**
|
||||
- Provided as mounted files (e.g. `/app/ssl/cert.pem`, `/app/ssl/key.pem`).
|
||||
|
||||
## Network Security
|
||||
|
||||
### Production Network Topology
|
||||
|
|
|
|||
|
|
@ -4,8 +4,10 @@ package container
|
|||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/jfraeys/fetch_ml/internal/config"
|
||||
|
|
@ -39,9 +41,10 @@ type ContainerConfig struct {
|
|||
|
||||
// ResourceConfig defines resource limits for containers
|
||||
type ResourceConfig struct {
|
||||
MemoryLimit string `json:"memory_limit"`
|
||||
CPULimit string `json:"cpu_limit"`
|
||||
GPUAccess bool `json:"gpu_access"`
|
||||
MemoryLimit string `json:"memory_limit"`
|
||||
CPULimit string `json:"cpu_limit"`
|
||||
GPUDevices []string `json:"gpu_devices"`
|
||||
AppleGPU bool `json:"apple_gpu"`
|
||||
}
|
||||
|
||||
// NetworkConfig defines network settings for containers
|
||||
|
|
@ -49,10 +52,17 @@ type NetworkConfig struct {
|
|||
AllowNetwork bool `json:"allow_network"`
|
||||
}
|
||||
|
||||
// StartContainer starts a new container
|
||||
func (pm *PodmanManager) StartContainer(ctx context.Context, config *ContainerConfig) (string, error) {
|
||||
func podmanCgroupsMode() string {
|
||||
return strings.TrimSpace(os.Getenv("FETCHML_PODMAN_CGROUPS"))
|
||||
}
|
||||
|
||||
func BuildRunArgs(config *ContainerConfig) []string {
|
||||
args := []string{"run", "-d"}
|
||||
|
||||
if podmanCgroupsMode() == "disabled" {
|
||||
args = append(args, "--cgroups=disabled")
|
||||
}
|
||||
|
||||
// Add name
|
||||
if config.Name != "" {
|
||||
args = append(args, "--name", config.Name)
|
||||
|
|
@ -70,8 +80,12 @@ func (pm *PodmanManager) StartContainer(ctx context.Context, config *ContainerCo
|
|||
if config.Resources.CPULimit != "" {
|
||||
args = append(args, "--cpus", config.Resources.CPULimit)
|
||||
}
|
||||
if config.Resources.GPUAccess {
|
||||
args = append(args, "--device", "/dev/dri")
|
||||
if config.Resources.AppleGPU {
|
||||
args = append(args, "--device", "/dev/metal")
|
||||
args = append(args, "--device", "/dev/mps")
|
||||
}
|
||||
for _, device := range config.Resources.GPUDevices {
|
||||
args = append(args, "--device", device)
|
||||
}
|
||||
|
||||
// Add volumes
|
||||
|
|
@ -94,6 +108,31 @@ func (pm *PodmanManager) StartContainer(ctx context.Context, config *ContainerCo
|
|||
// Add image and command
|
||||
args = append(args, config.Image)
|
||||
args = append(args, config.Command...)
|
||||
return args
|
||||
}
|
||||
|
||||
// ParseContainerID extracts the container ID from podman's stdout. Podman may
// print warnings before the ID, so the ID is taken to be the last non-blank
// line of the output. An error is returned when the output contains no
// non-blank line at all.
func ParseContainerID(output string) (string, error) {
	trimmed := strings.TrimSpace(output)
	if trimmed == "" {
		return "", fmt.Errorf("no container ID returned")
	}
	candidates := strings.Split(trimmed, "\n")
	for idx := len(candidates) - 1; idx >= 0; idx-- {
		if id := strings.TrimSpace(candidates[idx]); id != "" {
			return id, nil
		}
	}
	return "", fmt.Errorf("no container ID returned")
}
|
||||
|
||||
// StartContainer starts a new container
|
||||
func (pm *PodmanManager) StartContainer(
|
||||
ctx context.Context,
|
||||
config *ContainerConfig,
|
||||
) (string, error) {
|
||||
args := BuildRunArgs(config)
|
||||
|
||||
// Execute command
|
||||
cmd := exec.CommandContext(ctx, "podman", args...)
|
||||
|
|
@ -102,10 +141,9 @@ func (pm *PodmanManager) StartContainer(ctx context.Context, config *ContainerCo
|
|||
return "", fmt.Errorf("failed to start container: %w, output: %s", err, string(output))
|
||||
}
|
||||
|
||||
// Return container ID (first line of output)
|
||||
containerID := strings.TrimSpace(string(output))
|
||||
if containerID == "" {
|
||||
return "", fmt.Errorf("no container ID returned")
|
||||
containerID, err := ParseContainerID(string(output))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
pm.logger.Info("container started", "container_id", containerID, "name", config.Name)
|
||||
|
|
@ -124,6 +162,29 @@ func (pm *PodmanManager) StopContainer(ctx context.Context, containerID string)
|
|||
return nil
|
||||
}
|
||||
|
||||
// GetContainerStateStatus returns the container's lifecycle state from `podman inspect`.
|
||||
// Typical values: running, exited, created, paused.
|
||||
func (pm *PodmanManager) GetContainerStateStatus(
|
||||
ctx context.Context,
|
||||
containerID string,
|
||||
) (string, error) {
|
||||
// Validate containerID to prevent injection
|
||||
if containerID == "" || strings.ContainsAny(containerID, "&;|<>$`\"'") {
|
||||
return "", fmt.Errorf("invalid container ID: %s", containerID)
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(ctx, "podman", "inspect", "--format", "{{.State.Status}}", containerID) //nolint:gosec
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to inspect container: %w, output: %s", err, string(output))
|
||||
}
|
||||
status := strings.TrimSpace(string(output))
|
||||
if status == "" {
|
||||
return "unknown", nil
|
||||
}
|
||||
return status, nil
|
||||
}
|
||||
|
||||
// RemoveContainer removes a container
|
||||
func (pm *PodmanManager) RemoveContainer(ctx context.Context, containerID string) error {
|
||||
cmd := exec.CommandContext(ctx, "podman", "rm", containerID)
|
||||
|
|
@ -137,7 +198,10 @@ func (pm *PodmanManager) RemoveContainer(ctx context.Context, containerID string
|
|||
}
|
||||
|
||||
// GetContainerStatus gets the status of a container
|
||||
func (pm *PodmanManager) GetContainerStatus(ctx context.Context, containerID string) (string, error) {
|
||||
func (pm *PodmanManager) GetContainerStatus(
|
||||
ctx context.Context,
|
||||
containerID string,
|
||||
) (string, error) {
|
||||
// Validate containerID to prevent injection
|
||||
if containerID == "" || strings.ContainsAny(containerID, "&;|<>$`\"'") {
|
||||
return "", fmt.Errorf("invalid container ID: %s", containerID)
|
||||
|
|
@ -153,7 +217,16 @@ func (pm *PodmanManager) GetContainerStatus(ctx context.Context, containerID str
|
|||
status := strings.TrimSpace(string(output))
|
||||
if status == "" {
|
||||
// Container might be stopped, check all containers
|
||||
cmd = exec.CommandContext(ctx, "podman", "ps", "-a", "--filter", "id="+containerID, "--format", "{{.Status}}") //nolint:gosec
|
||||
cmd = exec.CommandContext(
|
||||
ctx,
|
||||
"podman",
|
||||
"ps",
|
||||
"-a",
|
||||
"--filter",
|
||||
"id="+containerID,
|
||||
"--format",
|
||||
"{{.Status}}",
|
||||
) //nolint:gosec
|
||||
output, err = cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to get container status: %w, output: %s", err, string(output))
|
||||
|
|
@ -167,6 +240,33 @@ func (pm *PodmanManager) GetContainerStatus(ctx context.Context, containerID str
|
|||
return status, nil
|
||||
}
|
||||
|
||||
// ExecContainer executes a command inside a running container via
// `podman exec` and returns the combined stdout/stderr output.
//
// Both the container ID and every command argument are rejected if they
// contain shell metacharacters. exec.CommandContext never invokes a shell,
// so this is defense in depth — but NOTE(review): it also means commands
// whose arguments legitimately contain quotes, `$`, or `|` cannot be run
// through this method.
func (pm *PodmanManager) ExecContainer(ctx context.Context, containerID string, command []string) (string, error) {
	// Validate containerID to prevent injection
	if containerID == "" || strings.ContainsAny(containerID, "&;|<>$`\"'") {
		return "", fmt.Errorf("invalid container ID: %s", containerID)
	}

	// Validate command to prevent injection
	for _, arg := range command {
		if strings.ContainsAny(arg, "&;|<>$`\"'") {
			return "", fmt.Errorf("invalid command argument: %s", arg)
		}
	}

	// Build podman exec command
	args := []string{"exec", containerID}
	args = append(args, command...)

	cmd := exec.CommandContext(ctx, "podman", args...) //nolint:gosec
	// CombinedOutput merges stdout and stderr, which is also what the error
	// path below reports for diagnostics.
	output, err := cmd.CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("failed to execute command in container: %w, output: %s", err, string(output))
	}

	return string(output), nil
}
||||
|
||||
// PodmanConfig holds configuration for Podman container execution
|
||||
type PodmanConfig struct {
|
||||
Image string
|
||||
|
|
@ -174,16 +274,33 @@ type PodmanConfig struct {
|
|||
Results string
|
||||
ContainerWorkspace string
|
||||
ContainerResults string
|
||||
GPUAccess bool
|
||||
AppleGPU bool
|
||||
GPUDevices []string
|
||||
Env map[string]string
|
||||
Volumes map[string]string
|
||||
Memory string
|
||||
CPUs string
|
||||
}
|
||||
|
||||
// PodmanResourceOverrides converts per-task resource requests into values for
// Podman's `--cpus` and `--memory` flags.
//
// Both cpu and memoryGB are optional: any value <= 0 yields an empty string
// for the corresponding flag, meaning "no override".
func PodmanResourceOverrides(cpu int, memoryGB int) (cpus string, memory string) {
	if cpu > 0 {
		cpus = strconv.Itoa(cpu)
	}
	if memoryGB > 0 {
		// Podman accepts a "g" suffix for gigabytes, e.g. "8g".
		memory = strconv.Itoa(memoryGB) + "g"
	}
	return cpus, memory
}
|
||||
|
||||
// BuildPodmanCommand builds a Podman command for executing ML experiments
|
||||
func BuildPodmanCommand(
|
||||
ctx context.Context,
|
||||
cfg PodmanConfig,
|
||||
scriptPath, requirementsPath string,
|
||||
scriptPath, depsPath string,
|
||||
extraArgs []string,
|
||||
) *exec.Cmd {
|
||||
args := []string{
|
||||
|
|
@ -214,14 +331,26 @@ func BuildPodmanCommand(
|
|||
resultsMount := fmt.Sprintf("%s:%s:rw", cfg.Results, cfg.ContainerResults)
|
||||
args = append(args, "-v", resultsMount)
|
||||
|
||||
if cfg.GPUAccess {
|
||||
args = append(args, "--device", "/dev/dri")
|
||||
// Mount additional volumes
|
||||
for hostPath, containerPath := range cfg.Volumes {
|
||||
mount := fmt.Sprintf("%s:%s", hostPath, containerPath)
|
||||
args = append(args, "-v", mount)
|
||||
}
|
||||
|
||||
// Use injected GPU device paths for Apple GPU or custom configurations
|
||||
for _, device := range cfg.GPUDevices {
|
||||
args = append(args, "--device", device)
|
||||
}
|
||||
|
||||
// Add environment variables
|
||||
for key, value := range cfg.Env {
|
||||
args = append(args, "-e", fmt.Sprintf("%s=%s", key, value))
|
||||
}
|
||||
|
||||
// Image and command
|
||||
args = append(args, cfg.Image,
|
||||
"--workspace", cfg.ContainerWorkspace,
|
||||
"--requirements", requirementsPath,
|
||||
"--deps", depsPath,
|
||||
"--script", scriptPath,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -14,6 +14,38 @@ import (
|
|||
|
||||
var defaultBlockedPackages = []string{"requests", "urllib3", "httpx"}
|
||||
|
||||
func DefaultBlockedPackages() []string {
|
||||
return append([]string{}, defaultBlockedPackages...)
|
||||
}
|
||||
|
||||
func DefaultEnhancedSecurityConfigFromEnv() *EnhancedSecurityConfig {
|
||||
securityConfig := GetDefaultSecurityConfig()
|
||||
|
||||
if blockedPkgs := os.Getenv("FETCHML_JUPYTER_BLOCKED_PACKAGES"); blockedPkgs != "" {
|
||||
securityConfig.BlockedPackages = strings.Split(strings.TrimSpace(blockedPkgs), ",")
|
||||
for i, pkg := range securityConfig.BlockedPackages {
|
||||
securityConfig.BlockedPackages[i] = strings.TrimSpace(pkg)
|
||||
}
|
||||
}
|
||||
|
||||
if allowedPkgs := os.Getenv("FETCHML_JUPYTER_ALLOWED_PACKAGES"); allowedPkgs != "" {
|
||||
securityConfig.AllowedPackages = make(map[string]bool)
|
||||
allowed := strings.Split(strings.TrimSpace(allowedPkgs), ",")
|
||||
for _, pkg := range allowed {
|
||||
securityConfig.AllowedPackages[strings.TrimSpace(pkg)] = true
|
||||
}
|
||||
}
|
||||
|
||||
return securityConfig
|
||||
}
|
||||
|
||||
func envDefaultImage() string {
|
||||
if v := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_DEFAULT_IMAGE")); v != "" {
|
||||
return v
|
||||
}
|
||||
return "localhost/ml-tools-runner:latest"
|
||||
}
|
||||
|
||||
// ConfigManager manages Jupyter service configuration
|
||||
type ConfigManager struct {
|
||||
logger *logging.Logger
|
||||
|
|
@ -92,7 +124,11 @@ type AdvancedSettingsConfig struct {
|
|||
}
|
||||
|
||||
// NewConfigManager creates a new configuration manager
|
||||
func NewConfigManager(logger *logging.Logger, configPath string, environment string) (*ConfigManager, error) {
|
||||
func NewConfigManager(
|
||||
logger *logging.Logger,
|
||||
configPath string,
|
||||
environment string,
|
||||
) (*ConfigManager, error) {
|
||||
cm := &ConfigManager{
|
||||
logger: logger,
|
||||
configPath: configPath,
|
||||
|
|
@ -215,14 +251,14 @@ func (cm *ConfigManager) getDefaultConfig() *JupyterConfig {
|
|||
Version: "1.0.0",
|
||||
Environment: cm.environment,
|
||||
Service: ServiceConfig{
|
||||
DefaultImage: "localhost/ml-tools-runner:latest",
|
||||
DefaultImage: envDefaultImage(),
|
||||
DefaultPort: 8888,
|
||||
DefaultWorkspace: "./workspace",
|
||||
MaxServices: 5,
|
||||
DefaultResources: ResourceConfig{
|
||||
MemoryLimit: "8G",
|
||||
CPULimit: "2",
|
||||
GPUAccess: false,
|
||||
GPUDevices: nil,
|
||||
},
|
||||
SecuritySettings: SecurityConfig{
|
||||
AllowNetwork: false,
|
||||
|
|
@ -270,7 +306,7 @@ func (cm *ConfigManager) getDefaultConfig() *JupyterConfig {
|
|||
Resources: ResourceConfig{
|
||||
MemoryLimit: "8G",
|
||||
CPULimit: "2",
|
||||
GPUAccess: false,
|
||||
GPUDevices: nil,
|
||||
},
|
||||
Health: HealthConfig{
|
||||
Enabled: true,
|
||||
|
|
@ -290,7 +326,7 @@ func (cm *ConfigManager) getDefaultConfig() *JupyterConfig {
|
|||
MaxAge: "7d",
|
||||
},
|
||||
DefaultSettings: DefaultSettingsConfig{
|
||||
Image: "localhost/ml-tools-runner:latest",
|
||||
Image: envDefaultImage(),
|
||||
Port: 8888,
|
||||
Workspace: "./workspace",
|
||||
Environment: map[string]string{"JUPYTER_ENABLE_LAB": "yes"},
|
||||
|
|
|
|||
|
|
@ -74,7 +74,10 @@ func (hm *HealthMonitor) RemoveService(serviceID string) {
|
|||
}
|
||||
|
||||
// CheckServiceHealth checks the health of a specific service
|
||||
func (hm *HealthMonitor) CheckServiceHealth(ctx context.Context, serviceID string) (*HealthStatus, error) {
|
||||
func (hm *HealthMonitor) CheckServiceHealth(
|
||||
ctx context.Context,
|
||||
serviceID string,
|
||||
) (*HealthStatus, error) {
|
||||
service, exists := hm.services[serviceID]
|
||||
if !exists {
|
||||
return nil, fmt.Errorf("service %s not found", serviceID)
|
||||
|
|
@ -100,7 +103,10 @@ func (hm *HealthMonitor) CheckServiceHealth(ctx context.Context, serviceID strin
|
|||
responseTime := time.Since(start)
|
||||
if err != nil {
|
||||
healthStatus.Status = statusUnhealthy
|
||||
healthStatus.Errors = append(healthStatus.Errors, fmt.Sprintf("HTTP request failed: %v", err))
|
||||
healthStatus.Errors = append(
|
||||
healthStatus.Errors,
|
||||
fmt.Sprintf("HTTP request failed: %v", err),
|
||||
)
|
||||
healthStatus.ResponseTime = responseTime
|
||||
return healthStatus, nil
|
||||
}
|
||||
|
|
@ -119,7 +125,10 @@ func (hm *HealthMonitor) CheckServiceHealth(ctx context.Context, serviceID strin
|
|||
healthStatus.Status = "healthy"
|
||||
} else {
|
||||
healthStatus.Status = statusUnhealthy
|
||||
healthStatus.Errors = append(healthStatus.Errors, fmt.Sprintf("HTTP status %d", resp.StatusCode))
|
||||
healthStatus.Errors = append(
|
||||
healthStatus.Errors,
|
||||
fmt.Sprintf("HTTP status %d", resp.StatusCode),
|
||||
)
|
||||
}
|
||||
|
||||
// Check response headers for Jupyter-specific indicators
|
||||
|
|
@ -213,13 +222,16 @@ func (hm *HealthMonitor) StartMonitoring(ctx context.Context) {
|
|||
}
|
||||
|
||||
// GetServiceMetrics returns detailed metrics for a service
|
||||
func (hm *HealthMonitor) GetServiceMetrics(ctx context.Context, serviceID string) (map[string]interface{}, error) {
|
||||
func (hm *HealthMonitor) GetServiceMetrics(
|
||||
ctx context.Context,
|
||||
serviceID string,
|
||||
) (map[string]any, error) {
|
||||
service, exists := hm.services[serviceID]
|
||||
if !exists {
|
||||
return nil, fmt.Errorf("service %s not found", serviceID)
|
||||
}
|
||||
|
||||
metrics := make(map[string]interface{})
|
||||
metrics := make(map[string]any)
|
||||
|
||||
// Basic service info
|
||||
metrics["service_id"] = service.ID
|
||||
|
|
@ -253,9 +265,12 @@ func (hm *HealthMonitor) GetServiceMetrics(ctx context.Context, serviceID string
|
|||
}
|
||||
|
||||
// getContainerMetrics gets container-specific metrics
|
||||
func (hm *HealthMonitor) getContainerMetrics(_ context.Context, _ string) map[string]interface{} {
|
||||
func (hm *HealthMonitor) getContainerMetrics(
|
||||
_ context.Context,
|
||||
_ string,
|
||||
) map[string]any {
|
||||
// Lightweight container metrics - avoid heavy system calls
|
||||
metrics := make(map[string]interface{})
|
||||
metrics := make(map[string]any)
|
||||
|
||||
// Basic status check only - keep it minimal
|
||||
metrics["status"] = "running"
|
||||
|
|
@ -295,7 +310,10 @@ func (hm *HealthMonitor) ValidateService(service *JupyterService) []string {
|
|||
}
|
||||
|
||||
// StartContinuousMonitoring starts continuous health monitoring
|
||||
func (hm *HealthMonitor) StartContinuousMonitoring(ctx context.Context, interval time.Duration) {
|
||||
func (hm *HealthMonitor) StartContinuousMonitoring(
|
||||
ctx context.Context,
|
||||
interval time.Duration,
|
||||
) {
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
|
|
@ -355,7 +373,10 @@ func isValidURL(url string) bool {
|
|||
}
|
||||
|
||||
// GetHealthHistory returns health check history for a service (lightweight version)
|
||||
func (hm *HealthMonitor) GetHealthHistory(_ string, duration time.Duration) ([]*HealthStatus, error) {
|
||||
func (hm *HealthMonitor) GetHealthHistory(
|
||||
_ string,
|
||||
_ time.Duration,
|
||||
) ([]*HealthStatus, error) {
|
||||
// Return empty for now - keep it lightweight
|
||||
return []*HealthStatus{}, nil
|
||||
}
|
||||
|
|
@ -366,11 +387,11 @@ func (hm *HealthMonitor) SetInterval(interval time.Duration) {
|
|||
}
|
||||
|
||||
// GetMonitoringStatus returns the current monitoring status
|
||||
func (hm *HealthMonitor) GetMonitoringStatus() map[string]interface{} {
|
||||
func (hm *HealthMonitor) GetMonitoringStatus() map[string]any {
|
||||
hm.servicesMutex.RLock()
|
||||
defer hm.servicesMutex.RUnlock()
|
||||
|
||||
return map[string]interface{}{
|
||||
return map[string]any{
|
||||
"monitored_services": len(hm.services),
|
||||
"check_interval": hm.interval.String(),
|
||||
"timeout": hm.client.Timeout.String(),
|
||||
|
|
|
|||
|
|
@ -113,7 +113,10 @@ func (nm *NetworkManager) ValidateNetworkConfig(config *NetworkConfig) error {
|
|||
}
|
||||
|
||||
// PrepareNetworkConfig prepares network configuration for a service
|
||||
func (nm *NetworkManager) PrepareNetworkConfig(serviceID string, userConfig *NetworkConfig) (*NetworkConfig, error) {
|
||||
func (nm *NetworkManager) PrepareNetworkConfig(
|
||||
serviceID string,
|
||||
userConfig *NetworkConfig,
|
||||
) (*NetworkConfig, error) {
|
||||
config := &NetworkConfig{
|
||||
ContainerPort: 8888,
|
||||
BindAddress: "127.0.0.1",
|
||||
|
|
@ -323,7 +326,11 @@ func (nm *NetworkManager) TestConnectivity(_ context.Context, config *NetworkCon
|
|||
|
||||
// Simple connectivity test
|
||||
dialer := &net.Dialer{Timeout: 5 * time.Second}
|
||||
conn, err := dialer.DialContext(context.Background(), "tcp", fmt.Sprintf("%s:%d", config.BindAddress, config.HostPort))
|
||||
conn, err := dialer.DialContext(
|
||||
context.Background(),
|
||||
"tcp",
|
||||
fmt.Sprintf("%s:%d", config.BindAddress, config.HostPort),
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot connect to %s: %w", url, err)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -66,7 +66,11 @@ type PackageInfo struct {
|
|||
}
|
||||
|
||||
// NewPackageManager creates a new package manager
|
||||
func NewPackageManager(logger *logging.Logger, config *PackageConfig, workspacePath string) (*PackageManager, error) {
|
||||
func NewPackageManager(
|
||||
logger *logging.Logger,
|
||||
config *PackageConfig,
|
||||
workspacePath string,
|
||||
) (*PackageManager, error) {
|
||||
pm := &PackageManager{
|
||||
logger: logger,
|
||||
trustedChannels: config.TrustedChannels,
|
||||
|
|
@ -106,7 +110,11 @@ func (pm *PackageManager) ValidatePackageRequest(req *PackageRequest) error {
|
|||
// Check if channel is trusted
|
||||
if req.Channel != "" {
|
||||
if !pm.isChannelTrusted(req.Channel) {
|
||||
return fmt.Errorf("channel '%s' is not trusted. Allowed channels: %v", req.Channel, pm.trustedChannels)
|
||||
return fmt.Errorf(
|
||||
"channel '%s' is not trusted. Allowed channels: %v",
|
||||
req.Channel,
|
||||
pm.trustedChannels,
|
||||
)
|
||||
}
|
||||
} else {
|
||||
// Default to conda-forge if no channel specified
|
||||
|
|
@ -158,7 +166,12 @@ func (pm *PackageManager) isValidPackageName(name string) bool {
|
|||
}
|
||||
|
||||
// RequestPackage creates a package installation request
|
||||
func (pm *PackageManager) RequestPackage(packageName, version, channel, requestedBy string) (*PackageRequest, error) {
|
||||
func (pm *PackageManager) RequestPackage(
|
||||
packageName,
|
||||
version,
|
||||
channel,
|
||||
requestedBy string,
|
||||
) (*PackageRequest, error) {
|
||||
req := &PackageRequest{
|
||||
PackageName: strings.ToLower(strings.TrimSpace(packageName)),
|
||||
Version: version,
|
||||
|
|
|
|||
|
|
@ -232,7 +232,13 @@ func (sm *SecurityManager) GenerateSecureToken() (string, error) {
|
|||
token := base64.URLEncoding.EncodeToString(bytes)
|
||||
|
||||
// Log token generation (without the token itself for security)
|
||||
sm.logSecurityEvent("token_generation", "low", "system", "generate_token", "Secure token generated")
|
||||
sm.logSecurityEvent(
|
||||
"token_generation",
|
||||
"low",
|
||||
"system",
|
||||
"generate_token",
|
||||
"Secure token generated",
|
||||
)
|
||||
|
||||
return token, nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import (
|
|||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
|
|
@ -15,8 +16,185 @@ import (
|
|||
|
||||
const (
|
||||
serviceStatusRunning = "running"
|
||||
defaultWorkspaceBase = "/data/active/workspaces"
|
||||
)
|
||||
|
||||
func stateDir() string {
|
||||
if v := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_STATE_DIR")); v != "" {
|
||||
return v
|
||||
}
|
||||
preferred := "/data/active/jupyter"
|
||||
if err := os.MkdirAll(preferred, 0o750); err == nil {
|
||||
return preferred
|
||||
}
|
||||
|
||||
return os.TempDir()
|
||||
}
|
||||
|
||||
func workspaceBaseDir() string {
|
||||
if v := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_WORKSPACE_BASE")); v != "" {
|
||||
return v
|
||||
}
|
||||
return defaultWorkspaceBase
|
||||
}
|
||||
|
||||
func resolveWorkspacePath(workspace string) (string, error) {
|
||||
ws := strings.TrimSpace(workspace)
|
||||
if ws == "" {
|
||||
return "", fmt.Errorf("workspace is required")
|
||||
}
|
||||
|
||||
clean := filepath.Clean(ws)
|
||||
// Reject obvious traversal attempts.
|
||||
if clean == ".." || strings.HasPrefix(clean, ".."+string(filepath.Separator)) {
|
||||
return "", fmt.Errorf("invalid workspace path: %s", workspace)
|
||||
}
|
||||
|
||||
// For container deployments, relative paths refer to the workspace base directory.
|
||||
if !filepath.IsAbs(clean) {
|
||||
clean = strings.TrimPrefix(clean, "."+string(filepath.Separator))
|
||||
clean = filepath.Join(workspaceBaseDir(), clean)
|
||||
}
|
||||
|
||||
return clean, nil
|
||||
}
|
||||
|
||||
func trashBaseDir() string {
|
||||
if v := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_TRASH_DIR")); v != "" {
|
||||
return v
|
||||
}
|
||||
// Default to state dir to keep behavior consistent across deployments.
|
||||
return filepath.Join(stateDir(), "trash", "jupyter")
|
||||
}
|
||||
|
||||
type trashInfo struct {
|
||||
OriginalName string `json:"original_name"`
|
||||
DeletedAt time.Time `json:"deleted_at"`
|
||||
DeletedBy string `json:"deleted_by"`
|
||||
SizeBytes int64 `json:"size_bytes"`
|
||||
PurgeAfter time.Time `json:"purge_after"`
|
||||
Reason string `json:"reason"`
|
||||
}
|
||||
|
||||
func (sm *ServiceManager) moveWorkspaceToTrash(workspacePath string, originalName string) (string, *trashInfo, error) {
|
||||
ws := strings.TrimSpace(workspacePath)
|
||||
if ws == "" {
|
||||
return "", nil, fmt.Errorf("workspace is required")
|
||||
}
|
||||
name := strings.TrimSpace(originalName)
|
||||
if name == "" {
|
||||
return "", nil, fmt.Errorf("original name is required")
|
||||
}
|
||||
|
||||
wsResolved, err := resolveWorkspacePath(ws)
|
||||
if err == nil {
|
||||
ws = wsResolved
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(trashBaseDir(), 0o750); err != nil {
|
||||
return "", nil, fmt.Errorf("failed to create trash directory: %w", err)
|
||||
}
|
||||
|
||||
ts := time.Now().UTC().Format("20060102_150405")
|
||||
destName := fmt.Sprintf("%s_%s", name, ts)
|
||||
dest := filepath.Join(trashBaseDir(), destName)
|
||||
|
||||
sizeBytes, _ := dirSizeBytes(ws)
|
||||
info := &trashInfo{
|
||||
OriginalName: name,
|
||||
DeletedAt: time.Now().UTC(),
|
||||
DeletedBy: "system",
|
||||
SizeBytes: sizeBytes,
|
||||
PurgeAfter: time.Now().UTC().Add(30 * 24 * time.Hour),
|
||||
Reason: "user_request",
|
||||
}
|
||||
|
||||
if err := os.Rename(ws, dest); err != nil {
|
||||
return "", nil, fmt.Errorf("failed to move workspace to trash: %w", err)
|
||||
}
|
||||
b, err := json.MarshalIndent(info, "", " ")
|
||||
if err == nil {
|
||||
_ = os.WriteFile(filepath.Join(dest, ".trashinfo"), b, 0o600)
|
||||
}
|
||||
|
||||
return dest, info, nil
|
||||
}
|
||||
|
||||
func (sm *ServiceManager) RestoreWorkspace(ctx context.Context, name string) (string, error) {
|
||||
_ = ctx
|
||||
wsName := strings.TrimSpace(name)
|
||||
if wsName == "" {
|
||||
return "", fmt.Errorf("workspace name is required")
|
||||
}
|
||||
|
||||
base := trashBaseDir()
|
||||
entries, err := os.ReadDir(base)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return "", fmt.Errorf("no trash directory found")
|
||||
}
|
||||
return "", fmt.Errorf("failed to read trash directory: %w", err)
|
||||
}
|
||||
|
||||
prefix := wsName + "_"
|
||||
var best string
|
||||
var bestTs string
|
||||
for _, e := range entries {
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
n := e.Name()
|
||||
if !strings.HasPrefix(n, prefix) {
|
||||
continue
|
||||
}
|
||||
ts := strings.TrimPrefix(n, prefix)
|
||||
if best == "" || ts > bestTs {
|
||||
best = n
|
||||
bestTs = ts
|
||||
}
|
||||
}
|
||||
if best == "" {
|
||||
return "", fmt.Errorf("no trashed workspace found for %q", wsName)
|
||||
}
|
||||
|
||||
src := filepath.Join(base, best)
|
||||
dest := filepath.Join(workspaceBaseDir(), wsName)
|
||||
if _, err := os.Stat(dest); err == nil {
|
||||
return "", fmt.Errorf("workspace %q already exists", wsName)
|
||||
}
|
||||
if err := os.MkdirAll(workspaceBaseDir(), 0o750); err != nil {
|
||||
return "", fmt.Errorf("failed to create workspace base directory: %w", err)
|
||||
}
|
||||
if err := os.Rename(src, dest); err != nil {
|
||||
return "", fmt.Errorf("failed to restore workspace: %w", err)
|
||||
}
|
||||
_ = os.Remove(filepath.Join(dest, ".trashinfo"))
|
||||
|
||||
return dest, nil
|
||||
}
|
||||
|
||||
func dirSizeBytes(path string) (int64, error) {
|
||||
var total int64
|
||||
err := filepath.WalkDir(path, func(_ string, d os.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if d.IsDir() {
|
||||
return nil
|
||||
}
|
||||
info, err := d.Info()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
total += info.Size()
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return total, nil
|
||||
}
|
||||
|
||||
// ServiceManager manages standalone Jupyter services
|
||||
type ServiceManager struct {
|
||||
logger *logging.Logger
|
||||
|
|
@ -24,6 +202,7 @@ type ServiceManager struct {
|
|||
config *ServiceConfig
|
||||
services map[string]*JupyterService
|
||||
workspaceMetadataMgr *WorkspaceMetadataManager
|
||||
securityMgr *SecurityManager
|
||||
}
|
||||
|
||||
// ServiceConfig holds configuration for Jupyter services
|
||||
|
|
@ -52,9 +231,9 @@ type NetworkConfig struct {
|
|||
|
||||
// ResourceConfig defines resource limits for Jupyter containers
|
||||
type ResourceConfig struct {
|
||||
MemoryLimit string `json:"memory_limit"`
|
||||
CPULimit string `json:"cpu_limit"`
|
||||
GPUAccess bool `json:"gpu_access"`
|
||||
MemoryLimit string `json:"memory_limit"`
|
||||
CPULimit string `json:"cpu_limit"`
|
||||
GPUDevices []string `json:"gpu_devices"`
|
||||
}
|
||||
|
||||
// SecurityConfig holds security settings for Jupyter services
|
||||
|
|
@ -112,15 +291,39 @@ func NewServiceManager(logger *logging.Logger, config *ServiceConfig) (*ServiceM
|
|||
}
|
||||
|
||||
// Initialize workspace metadata manager
|
||||
dataFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_workspaces.json")
|
||||
dataFile := filepath.Join(stateDir(), "fetch_ml_jupyter_workspaces.json")
|
||||
workspaceMetadataMgr := NewWorkspaceMetadataManager(logger, dataFile)
|
||||
|
||||
// Initialize security manager with enhanced config
|
||||
securityConfig := GetDefaultSecurityConfig()
|
||||
|
||||
// Override blocked packages from environment variable if provided
|
||||
if blockedPkgs := os.Getenv("FETCHML_JUPYTER_BLOCKED_PACKAGES"); blockedPkgs != "" {
|
||||
securityConfig.BlockedPackages = strings.Split(strings.TrimSpace(blockedPkgs), ",")
|
||||
// Trim whitespace from each package name
|
||||
for i, pkg := range securityConfig.BlockedPackages {
|
||||
securityConfig.BlockedPackages[i] = strings.TrimSpace(pkg)
|
||||
}
|
||||
}
|
||||
|
||||
// Override allowed packages from environment variable if provided
|
||||
if allowedPkgs := os.Getenv("FETCHML_JUPYTER_ALLOWED_PACKAGES"); allowedPkgs != "" {
|
||||
securityConfig.AllowedPackages = make(map[string]bool)
|
||||
allowed := strings.Split(strings.TrimSpace(allowedPkgs), ",")
|
||||
for _, pkg := range allowed {
|
||||
securityConfig.AllowedPackages[strings.TrimSpace(pkg)] = true
|
||||
}
|
||||
}
|
||||
|
||||
securityMgr := NewSecurityManager(logger, securityConfig)
|
||||
|
||||
sm := &ServiceManager{
|
||||
logger: logger,
|
||||
podman: podman,
|
||||
config: config,
|
||||
services: make(map[string]*JupyterService),
|
||||
workspaceMetadataMgr: workspaceMetadataMgr,
|
||||
securityMgr: securityMgr,
|
||||
}
|
||||
|
||||
// Load existing services
|
||||
|
|
@ -132,7 +335,10 @@ func NewServiceManager(logger *logging.Logger, config *ServiceConfig) (*ServiceM
|
|||
}
|
||||
|
||||
// StartService starts a new Jupyter service
|
||||
func (sm *ServiceManager) StartService(ctx context.Context, req *StartRequest) (*JupyterService, error) {
|
||||
func (sm *ServiceManager) StartService(
|
||||
ctx context.Context,
|
||||
req *StartRequest,
|
||||
) (*JupyterService, error) {
|
||||
// Validate request
|
||||
if err := sm.validateStartRequest(req); err != nil {
|
||||
return nil, err
|
||||
|
|
@ -155,6 +361,13 @@ func (sm *ServiceManager) StartService(ctx context.Context, req *StartRequest) (
|
|||
return nil, fmt.Errorf("failed to start container: %w", err)
|
||||
}
|
||||
|
||||
// Check for blacklisted packages in the container
|
||||
if err := sm.checkPackageBlacklist(ctx, containerID); err != nil {
|
||||
// Cleanup on blacklist violation
|
||||
_ = sm.podman.StopContainer(ctx, containerID)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Wait for Jupyter to be ready
|
||||
url, err := sm.waitForJupyterReady(ctx, containerID, req.Network)
|
||||
if err != nil {
|
||||
|
|
@ -206,6 +419,146 @@ func (sm *ServiceManager) StartService(ctx context.Context, req *StartRequest) (
|
|||
return service, nil
|
||||
}
|
||||
|
||||
// checkPackageBlacklist validates that no blacklisted packages are installed in the container
|
||||
func (sm *ServiceManager) checkPackageBlacklist(ctx context.Context, containerID string) error {
|
||||
// Get list of installed packages from the container
|
||||
// Try both pip and conda package managers
|
||||
packages, err := sm.getInstalledPackages(ctx, containerID)
|
||||
if err != nil {
|
||||
sm.logger.Warn("failed to get installed packages for blacklist check", "error", err)
|
||||
// Don't fail startup if we can't check packages, but log it
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check each installed package against the blacklist
|
||||
var blockedPackages []string
|
||||
for _, pkg := range packages {
|
||||
// Create a package request for validation
|
||||
pkgReq := &PackageRequest{
|
||||
PackageName: pkg,
|
||||
RequestedBy: "system",
|
||||
Channel: "",
|
||||
Version: "",
|
||||
}
|
||||
|
||||
if err := sm.securityMgr.ValidatePackageRequest(pkgReq); err != nil {
|
||||
blockedPackages = append(blockedPackages, pkg)
|
||||
}
|
||||
}
|
||||
|
||||
// If any blocked packages are found, fail startup
|
||||
if len(blockedPackages) > 0 {
|
||||
return fmt.Errorf("container startup failed: blacklisted packages detected: %v. "+
|
||||
"These packages are blocked by security policy. "+
|
||||
"Remove them from the image or use FETCHML_JUPYTER_BLOCKED_PACKAGES to configure the blacklist",
|
||||
blockedPackages)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// getInstalledPackages retrieves the list of installed packages from the container
|
||||
func (sm *ServiceManager) getInstalledPackages(ctx context.Context, containerID string) ([]string, error) {
|
||||
var packages []string
|
||||
|
||||
// Try pip list first
|
||||
pipOutput, err := sm.podman.ExecContainer(ctx, containerID, []string{"pip", "list", "--format=freeze"})
|
||||
if err == nil && pipOutput != "" {
|
||||
pipPackages := sm.parsePipList(pipOutput)
|
||||
packages = append(packages, pipPackages...)
|
||||
}
|
||||
|
||||
// Try conda list as well
|
||||
condaOutput, err := sm.podman.ExecContainer(ctx, containerID, []string{"conda", "list", "--export"})
|
||||
if err == nil && condaOutput != "" {
|
||||
condaPackages := sm.parseCondaList(condaOutput)
|
||||
packages = append(packages, condaPackages...)
|
||||
}
|
||||
|
||||
// Remove duplicates
|
||||
uniquePackages := make(map[string]bool)
|
||||
var result []string
|
||||
for _, pkg := range packages {
|
||||
if !uniquePackages[pkg] {
|
||||
uniquePackages[pkg] = true
|
||||
result = append(result, pkg)
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// parsePipList parses pip list --format=freeze output
|
||||
func (sm *ServiceManager) parsePipList(output string) []string {
|
||||
var packages []string
|
||||
lines := strings.Split(output, "\n")
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if line != "" && !strings.HasPrefix(line, "#") {
|
||||
// Format: package==version
|
||||
parts := strings.Split(line, "==")
|
||||
if len(parts) > 0 {
|
||||
pkgName := strings.TrimSpace(parts[0])
|
||||
if pkgName != "" {
|
||||
packages = append(packages, pkgName)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return packages
|
||||
}
|
||||
|
||||
// parseCondaList parses conda list --export output
|
||||
func (sm *ServiceManager) parseCondaList(output string) []string {
|
||||
var packages []string
|
||||
lines := strings.Split(output, "\n")
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if line != "" && !strings.HasPrefix(line, "#") {
|
||||
// Format: package=version=build
|
||||
parts := strings.Split(line, "=")
|
||||
if len(parts) > 0 {
|
||||
pkgName := strings.TrimSpace(parts[0])
|
||||
if pkgName != "" {
|
||||
packages = append(packages, pkgName)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return packages
|
||||
}
|
||||
|
||||
// ParsePipList parses pip list --format=freeze output.
|
||||
func ParsePipList(output string) []string {
|
||||
sm := &ServiceManager{}
|
||||
return sm.parsePipList(output)
|
||||
}
|
||||
|
||||
// ParseCondaList parses conda list --export output.
|
||||
func ParseCondaList(output string) []string {
|
||||
sm := &ServiceManager{}
|
||||
return sm.parseCondaList(output)
|
||||
}
|
||||
|
||||
// PrepareContainerConfig builds the Podman container config for a start request.
|
||||
func PrepareContainerConfig(serviceID string, req *StartRequest) *container.ContainerConfig {
|
||||
sm := &ServiceManager{}
|
||||
return sm.prepareContainerConfig(serviceID, req)
|
||||
}
|
||||
|
||||
// MoveWorkspaceToTrash moves a workspace directory to the configured trash directory.
|
||||
func MoveWorkspaceToTrash(workspacePath string, originalName string) (string, error) {
|
||||
sm := &ServiceManager{}
|
||||
trashPath, _, err := sm.moveWorkspaceToTrash(workspacePath, originalName)
|
||||
return trashPath, err
|
||||
}
|
||||
|
||||
// RestoreWorkspace restores the most recently trashed workspace for the given name.
|
||||
func RestoreWorkspace(ctx context.Context, name string) (string, error) {
|
||||
sm := &ServiceManager{}
|
||||
return sm.RestoreWorkspace(ctx, name)
|
||||
}
|
||||
|
||||
// StopService stops a Jupyter service
|
||||
func (sm *ServiceManager) StopService(ctx context.Context, serviceID string) error {
|
||||
service, exists := sm.services[serviceID]
|
||||
|
|
@ -240,6 +593,67 @@ func (sm *ServiceManager) StopService(ctx context.Context, serviceID string) err
|
|||
return nil
|
||||
}
|
||||
|
||||
// RemoveService removes a Jupyter service. If purge is false, it soft-deletes the workspace by moving it to trash.
|
||||
func (sm *ServiceManager) RemoveService(ctx context.Context, serviceID string, purge bool) error {
|
||||
service, exists := sm.services[serviceID]
|
||||
if !exists {
|
||||
return fmt.Errorf("service %s not found", serviceID)
|
||||
}
|
||||
|
||||
// Stop container first
|
||||
if err := sm.podman.StopContainer(ctx, service.ContainerID); err != nil {
|
||||
sm.logger.Warn("failed to stop container before removal", "service_id", serviceID, "error", err)
|
||||
}
|
||||
|
||||
// Remove container
|
||||
if err := sm.podman.RemoveContainer(ctx, service.ContainerID); err != nil {
|
||||
sm.logger.Warn("failed to remove container", "service_id", serviceID, "error", err)
|
||||
return fmt.Errorf("failed to remove container: %w", err)
|
||||
}
|
||||
|
||||
// Best-effort: unlink workspace metadata.
|
||||
if sm.workspaceMetadataMgr != nil && strings.TrimSpace(service.Workspace) != "" {
|
||||
_ = sm.workspaceMetadataMgr.UnlinkWorkspace(service.Workspace)
|
||||
}
|
||||
|
||||
// Workspace deletion policy.
|
||||
ws := strings.TrimSpace(service.Workspace)
|
||||
if ws != "" {
|
||||
wsPath, err := resolveWorkspacePath(ws)
|
||||
if err == nil {
|
||||
ws = wsPath
|
||||
}
|
||||
if purge {
|
||||
if err := os.RemoveAll(ws); err != nil && !os.IsNotExist(err) {
|
||||
return fmt.Errorf("failed to delete workspace: %w", err)
|
||||
}
|
||||
} else {
|
||||
dest, info, err := sm.moveWorkspaceToTrash(ws, strings.TrimSpace(service.Name))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// Persist trash path for observability.
|
||||
if service.Metadata == nil {
|
||||
service.Metadata = make(map[string]string)
|
||||
}
|
||||
service.Metadata["trash_path"] = dest
|
||||
service.Metadata["purge_after"] = strconv.FormatInt(info.PurgeAfter.Unix(), 10)
|
||||
}
|
||||
}
|
||||
|
||||
// Remove from active services
|
||||
delete(sm.services, serviceID)
|
||||
|
||||
// Save services to disk
|
||||
if err := sm.saveServices(); err != nil {
|
||||
sm.logger.Warn("failed to save services", "error", err)
|
||||
}
|
||||
|
||||
sm.logger.Info("jupyter service removed", "service_id", serviceID, "name", service.Name, "purge", purge)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetService retrieves a service by ID
|
||||
func (sm *ServiceManager) GetService(serviceID string) (*JupyterService, error) {
|
||||
service, exists := sm.services[serviceID]
|
||||
|
|
@ -296,9 +710,30 @@ func (sm *ServiceManager) validateStartRequest(req *StartRequest) error {
|
|||
req.Workspace = sm.config.DefaultWorkspace
|
||||
}
|
||||
|
||||
// Check if workspace exists
|
||||
if _, err := os.Stat(req.Workspace); os.IsNotExist(err) {
|
||||
return fmt.Errorf("workspace %s does not exist", req.Workspace)
|
||||
// Resolve/normalize workspace path before comparing.
|
||||
wsPath, err := resolveWorkspacePath(req.Workspace)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Workspace = wsPath
|
||||
|
||||
// Enforce reproducibility: do not allow creating a service/workspace with the same name
|
||||
// as any existing service (regardless of status).
|
||||
for _, svc := range sm.services {
|
||||
if svc == nil {
|
||||
continue
|
||||
}
|
||||
if strings.EqualFold(strings.TrimSpace(svc.Name), strings.TrimSpace(req.Name)) {
|
||||
return fmt.Errorf("a jupyter service/workspace named %q already exists", req.Name)
|
||||
}
|
||||
if svc.Workspace != "" && svc.Workspace == req.Workspace {
|
||||
return fmt.Errorf("workspace path %q is already in use by service %q", req.Workspace, svc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure workspace directory exists.
|
||||
if err := os.MkdirAll(req.Workspace, 0o750); err != nil {
|
||||
return fmt.Errorf("failed to create workspace directory: %w", err)
|
||||
}
|
||||
|
||||
if req.Image == "" {
|
||||
|
|
@ -331,11 +766,20 @@ func (sm *ServiceManager) generateServiceID(name string) string {
|
|||
}
|
||||
|
||||
// prepareContainerConfig prepares container configuration
|
||||
func (sm *ServiceManager) prepareContainerConfig(serviceID string, req *StartRequest) *container.ContainerConfig {
|
||||
func (sm *ServiceManager) prepareContainerConfig(
|
||||
serviceID string,
|
||||
req *StartRequest,
|
||||
) *container.ContainerConfig {
|
||||
imageLower := strings.ToLower(strings.TrimSpace(req.Image))
|
||||
isPublicJupyter := strings.Contains(imageLower, "quay.io/jupyter/") || strings.Contains(imageLower, "jupyter/")
|
||||
|
||||
// Prepare volume mounts
|
||||
volumes := map[string]string{
|
||||
req.Workspace: "/workspace",
|
||||
volumes := map[string]string{}
|
||||
workspaceMount := "/workspace"
|
||||
if isPublicJupyter {
|
||||
workspaceMount = "/home/jovyan/work"
|
||||
}
|
||||
volumes[req.Workspace] = workspaceMount
|
||||
|
||||
// Prepare environment variables
|
||||
env := map[string]string{
|
||||
|
|
@ -363,17 +807,71 @@ func (sm *ServiceManager) prepareContainerConfig(serviceID string, req *StartReq
|
|||
}
|
||||
|
||||
// Prepare container command
|
||||
cmd := []string{
|
||||
"conda", "run", "-n", "ml_env", "jupyter", "notebook",
|
||||
"--no-browser",
|
||||
"--ip=0.0.0.0",
|
||||
fmt.Sprintf("--port=%d", req.Network.ContainerPort),
|
||||
"--NotebookApp.allow-root=True",
|
||||
"--NotebookApp.ip=0.0.0.0",
|
||||
}
|
||||
var cmd []string
|
||||
if isPublicJupyter {
|
||||
condaEnv := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_CONDA_ENV"))
|
||||
if condaEnv == "" {
|
||||
condaEnv = "base"
|
||||
}
|
||||
kernelName := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_KERNEL_NAME"))
|
||||
if kernelName == "" {
|
||||
kernelName = condaEnv
|
||||
}
|
||||
displayName := fmt.Sprintf("Python (%s)", kernelName)
|
||||
|
||||
if !req.Network.EnableToken {
|
||||
cmd = append(cmd, "--NotebookApp.token=")
|
||||
jupyterTokenArg := ""
|
||||
if !req.Network.EnableToken {
|
||||
jupyterTokenArg = " --NotebookApp.token= --ServerApp.token="
|
||||
}
|
||||
|
||||
script := fmt.Sprintf(
|
||||
"set -euo pipefail; "+
|
||||
"if conda run -n %s python -c 'import ipykernel' >/dev/null 2>&1; then "+
|
||||
"conda run -n %s python -m ipykernel install --user --name %s --display-name %q; "+
|
||||
"fi; "+
|
||||
"exec start-notebook.sh --ip=0.0.0.0 --port=%d --no-browser --notebook-dir=%s%s",
|
||||
condaEnv,
|
||||
condaEnv,
|
||||
kernelName,
|
||||
displayName,
|
||||
req.Network.ContainerPort,
|
||||
workspaceMount,
|
||||
jupyterTokenArg,
|
||||
)
|
||||
|
||||
cmd = []string{"bash", "-lc", script}
|
||||
} else {
|
||||
condaEnv := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_CONDA_ENV"))
|
||||
if condaEnv == "" {
|
||||
condaEnv = "ml_env"
|
||||
}
|
||||
kernelName := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_KERNEL_NAME"))
|
||||
if kernelName == "" {
|
||||
kernelName = condaEnv
|
||||
}
|
||||
displayName := fmt.Sprintf("Python (%s)", kernelName)
|
||||
|
||||
jupyterTokenArg := ""
|
||||
if !req.Network.EnableToken {
|
||||
jupyterTokenArg = " --NotebookApp.token="
|
||||
}
|
||||
|
||||
script := fmt.Sprintf(
|
||||
"set -euo pipefail; "+
|
||||
"if conda run -n %s python -c 'import ipykernel' >/dev/null 2>&1; then "+
|
||||
"conda run -n %s python -m ipykernel install --user --name %s --display-name %q; "+
|
||||
"fi; "+
|
||||
"exec conda run -n %s jupyter notebook --no-browser --ip=0.0.0.0 --port=%d --NotebookApp.allow-root=True --NotebookApp.ip=0.0.0.0%s",
|
||||
condaEnv,
|
||||
condaEnv,
|
||||
kernelName,
|
||||
displayName,
|
||||
condaEnv,
|
||||
req.Network.ContainerPort,
|
||||
jupyterTokenArg,
|
||||
)
|
||||
|
||||
cmd = []string{"bash", "-lc", script}
|
||||
}
|
||||
|
||||
// Prepare security options
|
||||
|
|
@ -397,7 +895,7 @@ func (sm *ServiceManager) prepareContainerConfig(serviceID string, req *StartReq
|
|||
Resources: container.ResourceConfig{
|
||||
MemoryLimit: req.Resources.MemoryLimit,
|
||||
CPULimit: req.Resources.CPULimit,
|
||||
GPUAccess: req.Resources.GPUAccess,
|
||||
GPUDevices: req.Resources.GPUDevices,
|
||||
},
|
||||
Network: container.NetworkConfig{
|
||||
AllowNetwork: req.Security.AllowNetwork,
|
||||
|
|
@ -417,7 +915,7 @@ func (sm *ServiceManager) waitForJupyterReady(
|
|||
deadline := time.Now().Add(maxWait)
|
||||
|
||||
for time.Now().Before(deadline) {
|
||||
status, err := sm.podman.GetContainerStatus(ctx, containerID)
|
||||
status, err := sm.podman.GetContainerStateStatus(ctx, containerID)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to check container status: %w", err)
|
||||
}
|
||||
|
|
@ -426,7 +924,7 @@ func (sm *ServiceManager) waitForJupyterReady(
|
|||
break
|
||||
}
|
||||
|
||||
if status == "exited" || status == "error" {
|
||||
if status == "exited" || status == "dead" {
|
||||
return "", fmt.Errorf("container failed to start (status: %s)", status)
|
||||
}
|
||||
|
||||
|
|
@ -451,7 +949,7 @@ func (sm *ServiceManager) waitForJupyterReady(
|
|||
|
||||
// loadServices loads existing services from disk
|
||||
func (sm *ServiceManager) loadServices() error {
|
||||
servicesFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_services.json")
|
||||
servicesFile := filepath.Join(stateDir(), "fetch_ml_jupyter_services.json")
|
||||
|
||||
data, err := os.ReadFile(servicesFile)
|
||||
if err != nil {
|
||||
|
|
@ -466,26 +964,78 @@ func (sm *ServiceManager) loadServices() error {
|
|||
return err
|
||||
}
|
||||
|
||||
// Validate services are still running
|
||||
for id, service := range services {
|
||||
if service.Status == serviceStatusRunning {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
status, err := sm.podman.GetContainerStatus(ctx, service.ContainerID)
|
||||
cancel()
|
||||
// Reset in-memory map before re-hydrating from disk.
|
||||
sm.services = make(map[string]*JupyterService)
|
||||
|
||||
if err != nil || status != "running" {
|
||||
service.Status = "stopped"
|
||||
}
|
||||
// Normalize service status + de-dupe by service name.
|
||||
// Prefer: running service > newest CreatedAt.
|
||||
byName := make(map[string]*JupyterService)
|
||||
for id, service := range services {
|
||||
if service == nil {
|
||||
continue
|
||||
}
|
||||
sm.services[id] = service
|
||||
// Ensure stable ID even if old payloads didn't persist it.
|
||||
if strings.TrimSpace(service.ID) == "" {
|
||||
service.ID = id
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
status, err := sm.podman.GetContainerStateStatus(ctx, service.ContainerID)
|
||||
cancel()
|
||||
if err != nil {
|
||||
sm.logger.Warn("failed to check service status", "service_id", id, "error", err)
|
||||
service.Status = "stopped"
|
||||
} else if status == serviceStatusRunning {
|
||||
service.Status = serviceStatusRunning
|
||||
} else {
|
||||
service.Status = "stopped"
|
||||
}
|
||||
|
||||
nameKey := strings.ToLower(strings.TrimSpace(service.Name))
|
||||
if nameKey == "" {
|
||||
nameKey = service.ID
|
||||
}
|
||||
|
||||
if existing, ok := byName[nameKey]; ok {
|
||||
// prefer running
|
||||
if existing.Status != serviceStatusRunning && service.Status == serviceStatusRunning {
|
||||
byName[nameKey] = service
|
||||
continue
|
||||
}
|
||||
if existing.Status == serviceStatusRunning && service.Status != serviceStatusRunning {
|
||||
continue
|
||||
}
|
||||
// otherwise prefer newest
|
||||
if service.CreatedAt.After(existing.CreatedAt) {
|
||||
byName[nameKey] = service
|
||||
}
|
||||
continue
|
||||
}
|
||||
byName[nameKey] = service
|
||||
}
|
||||
|
||||
for _, service := range byName {
|
||||
if service == nil {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(service.ID) == "" {
|
||||
continue
|
||||
}
|
||||
sm.services[service.ID] = service
|
||||
}
|
||||
|
||||
// Best-effort: persist the cleaned registry to avoid accumulating duplicates.
|
||||
_ = sm.saveServices()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// saveServices saves services to disk
|
||||
func (sm *ServiceManager) saveServices() error {
|
||||
servicesFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_services.json")
|
||||
servicesFile := filepath.Join(stateDir(), "fetch_ml_jupyter_services.json")
|
||||
if err := os.MkdirAll(filepath.Dir(servicesFile), 0o750); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
data, err := json.MarshalIndent(sm.services, "", " ")
|
||||
if err != nil {
|
||||
|
|
@ -496,7 +1046,11 @@ func (sm *ServiceManager) saveServices() error {
|
|||
}
|
||||
|
||||
// LinkWorkspaceWithExperiment links a workspace with an experiment
|
||||
func (sm *ServiceManager) LinkWorkspaceWithExperiment(workspacePath, experimentID, serviceID string) error {
|
||||
func (sm *ServiceManager) LinkWorkspaceWithExperiment(
|
||||
workspacePath,
|
||||
experimentID,
|
||||
serviceID string,
|
||||
) error {
|
||||
return sm.workspaceMetadataMgr.LinkWorkspace(workspacePath, experimentID, serviceID)
|
||||
}
|
||||
|
||||
|
|
@ -550,7 +1104,11 @@ func (sm *ServiceManager) ClearAllMetadata() error {
|
|||
}
|
||||
|
||||
// SetAutoSync enables or disables auto-sync for a workspace
|
||||
func (sm *ServiceManager) SetAutoSync(workspacePath string, enabled bool, interval time.Duration) error {
|
||||
func (sm *ServiceManager) SetAutoSync(
|
||||
workspacePath string,
|
||||
enabled bool,
|
||||
interval time.Duration,
|
||||
) error {
|
||||
return sm.workspaceMetadataMgr.SetAutoSync(workspacePath, enabled, interval)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -34,7 +34,10 @@ type WorkspaceMetadataManager struct {
|
|||
}
|
||||
|
||||
// NewWorkspaceMetadataManager creates a new workspace metadata manager
|
||||
func NewWorkspaceMetadataManager(logger *logging.Logger, dataFile string) *WorkspaceMetadataManager {
|
||||
func NewWorkspaceMetadataManager(
|
||||
logger *logging.Logger,
|
||||
dataFile string,
|
||||
) *WorkspaceMetadataManager {
|
||||
wmm := &WorkspaceMetadataManager{
|
||||
logger: logger,
|
||||
metadata: make(map[string]*WorkspaceMetadata),
|
||||
|
|
@ -50,7 +53,11 @@ func NewWorkspaceMetadataManager(logger *logging.Logger, dataFile string) *Works
|
|||
}
|
||||
|
||||
// LinkWorkspace links a workspace with an experiment
|
||||
func (wmm *WorkspaceMetadataManager) LinkWorkspace(workspacePath, experimentID, serviceID string) error {
|
||||
func (wmm *WorkspaceMetadataManager) LinkWorkspace(
|
||||
workspacePath,
|
||||
experimentID,
|
||||
serviceID string,
|
||||
) error {
|
||||
wmm.mutex.Lock()
|
||||
defer wmm.mutex.Unlock()
|
||||
|
||||
|
|
@ -97,7 +104,9 @@ func (wmm *WorkspaceMetadataManager) LinkWorkspace(workspacePath, experimentID,
|
|||
}
|
||||
|
||||
// GetWorkspaceMetadata retrieves metadata for a workspace
|
||||
func (wmm *WorkspaceMetadataManager) GetWorkspaceMetadata(workspacePath string) (*WorkspaceMetadata, error) {
|
||||
func (wmm *WorkspaceMetadataManager) GetWorkspaceMetadata(
|
||||
workspacePath string,
|
||||
) (*WorkspaceMetadata, error) {
|
||||
wmm.mutex.RLock()
|
||||
defer wmm.mutex.RUnlock()
|
||||
|
||||
|
|
@ -176,7 +185,13 @@ func (wmm *WorkspaceMetadataManager) UnlinkWorkspace(workspacePath string) error
|
|||
// Remove workspace metadata file
|
||||
workspaceMetaFile := filepath.Join(absPath, ".jupyter_experiment.json")
|
||||
if err := os.Remove(workspaceMetaFile); err != nil && !os.IsNotExist(err) {
|
||||
wmm.logger.Warn("failed to remove workspace metadata file", "file", workspaceMetaFile, "error", err)
|
||||
wmm.logger.Warn(
|
||||
"failed to remove workspace metadata file",
|
||||
"file",
|
||||
workspaceMetaFile,
|
||||
"error",
|
||||
err,
|
||||
)
|
||||
}
|
||||
|
||||
wmm.logger.Info("workspace unlinked", "workspace", absPath)
|
||||
|
|
@ -203,7 +218,11 @@ func (wmm *WorkspaceMetadataManager) ClearAllMetadata() error {
|
|||
}
|
||||
|
||||
// SetAutoSync enables or disables auto-sync for a workspace
|
||||
func (wmm *WorkspaceMetadataManager) SetAutoSync(workspacePath string, enabled bool, interval time.Duration) error {
|
||||
func (wmm *WorkspaceMetadataManager) SetAutoSync(
|
||||
workspacePath string,
|
||||
enabled bool,
|
||||
interval time.Duration,
|
||||
) error {
|
||||
wmm.mutex.Lock()
|
||||
defer wmm.mutex.Unlock()
|
||||
|
||||
|
|
@ -353,7 +372,9 @@ func (wmm *WorkspaceMetadataManager) createWorkspaceMetadataFile(
|
|||
}
|
||||
|
||||
// GetWorkspacesForExperiment returns all workspaces linked to an experiment
|
||||
func (wmm *WorkspaceMetadataManager) GetWorkspacesForExperiment(experimentID string) []*WorkspaceMetadata {
|
||||
func (wmm *WorkspaceMetadataManager) GetWorkspacesForExperiment(
|
||||
experimentID string,
|
||||
) []*WorkspaceMetadata {
|
||||
wmm.mutex.RLock()
|
||||
defer wmm.mutex.RUnlock()
|
||||
|
||||
|
|
|
|||
|
|
@ -14,7 +14,10 @@ var (
|
|||
jwtPattern = regexp.MustCompile(`eyJ[a-zA-Z0-9_-]{10,}\.eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}`)
|
||||
|
||||
// Password-like fields in logs
|
||||
passwordPattern = regexp.MustCompile(`(?i)(password|passwd|pwd|secret|token|key)["']?\s*[:=]\s*["']?([^"'\s,}]+)`)
|
||||
passwordPattern = regexp.MustCompile(
|
||||
`(?i)(password|passwd|pwd|secret|token|key)["']?\s*[:=]\s*["']?` +
|
||||
`([^"'\s,}]+)`,
|
||||
)
|
||||
|
||||
// Redis URLs with passwords
|
||||
redisPasswordPattern = regexp.MustCompile(`redis://:[^@]+@`)
|
||||
|
|
|
|||
|
|
@ -67,12 +67,18 @@ func NewSSHClient(host, user, keyPath string, port int, knownHostsPath string) (
|
|||
if _, err := os.Stat(knownHostsPath); err == nil {
|
||||
callback, err := knownhosts.New(knownHostsPath)
|
||||
if err != nil {
|
||||
log.Printf("Warning: failed to parse known_hosts: %v; using insecure host key verification", err)
|
||||
log.Printf(
|
||||
"Warning: failed to parse known_hosts: %v; using insecure host key verification",
|
||||
err,
|
||||
)
|
||||
} else {
|
||||
hostKeyCallback = callback
|
||||
}
|
||||
} else if !os.IsNotExist(err) {
|
||||
log.Printf("Warning: known_hosts not found at %s; using insecure host key verification", knownHostsPath)
|
||||
log.Printf(
|
||||
"Warning: known_hosts not found at %s; using insecure host key verification",
|
||||
knownHostsPath,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -101,7 +107,7 @@ func NewSSHClient(host, user, keyPath string, port int, knownHostsPath string) (
|
|||
return &SSHClient{client: client, host: host}, nil
|
||||
}
|
||||
|
||||
// NewLocalClient creates a local-mode SSHClient that executes commands on the host using the provided base path.
|
||||
// NewLocalClient creates a local-mode SSHClient that executes commands on the host.
|
||||
func NewLocalClient(basePath string) *SSHClient {
|
||||
if basePath != "" {
|
||||
basePath = config.ExpandPath(basePath)
|
||||
|
|
@ -177,7 +183,11 @@ func (c *SSHClient) ExecContext(ctx context.Context, cmd string) (string, error)
|
|||
// Wait a bit more for final result
|
||||
select {
|
||||
case res := <-resultCh:
|
||||
return res.output, fmt.Errorf("command cancelled and force closed: %w (output: %s)", ctx.Err(), res.output)
|
||||
return res.output, fmt.Errorf(
|
||||
"command cancelled and force closed: %w (output: %s)",
|
||||
ctx.Err(),
|
||||
res.output,
|
||||
)
|
||||
case <-time.After(5 * time.Second):
|
||||
return "", fmt.Errorf("command cancelled and cleanup timeout: %w", ctx.Err())
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue