feat: add comprehensive configuration and deployment infrastructure

- Add development and production configuration templates
- Include Docker build files for containerized deployment
- Add Nginx configuration with SSL/TLS setup
- Include environment configuration examples
- Add SSL certificate setup and management
- Configure application schemas and validation
- Support for both local and production deployment scenarios

Provides flexible deployment options from development to production with proper security, monitoring, and configuration management.
2025-12-04 16:54:02 -05:00 · 2025-12-04 16:54:02 -05:00 · 3de1e6e9ab
commit 3de1e6e9ab
parent d225ea1f00
21 changed files with 1514 additions and 0 deletions
--- a/.env.dev
+++ b/.env.dev
@ -0,0 +1,6 @@
+# Development environment variables
+# NOTE: these values are committed to version control, so treat them as
+# compromised: use them for local development only and generate fresh
+# secrets (e.g. `openssl rand -base64 32`) for any shared or production host.
+REDIS_PASSWORD=JZVd2Y6IDaLNaYLBOFgQ7ae4Ox5t37NTIyPMQlLJD4k=
+# Keep the secret on ONE line: the base64 value had been wrapped after 64
+# characters, which truncated the secret and left a stray, malformed entry.
+JWT_SECRET=M/11uD5waf4glbTmFQiqSJaMCtCXTFwxvxRiFZL3GuFQO82PoURsIfFbmzyxrbPJL5uc9Qj3Gd3Ijw7/kRMhwA==
+GRAFANA_USER=admin
+GRAFANA_PASSWORD=pd/UiVYlS+wmXlMmvh6mTw==
--- a/.env.example
+++ b/.env.example
@ -0,0 +1,63 @@
+# Fetch ML Environment Variables
+# Copy this file to .env and modify as needed
+
+# Server Configuration
+FETCH_ML_HOST=localhost
+FETCH_ML_PORT=8080
+FETCH_ML_LOG_LEVEL=info
+FETCH_ML_LOG_FILE=logs/fetch_ml.log
+
+# Database Configuration
+FETCH_ML_DB_TYPE=sqlite
+FETCH_ML_DB_PATH=db/fetch_ml.db
+
+# Redis Configuration
+FETCH_ML_REDIS_URL=redis://localhost:6379
+FETCH_ML_REDIS_PASSWORD=
+FETCH_ML_REDIS_DB=0
+
+# Authentication
+FETCH_ML_AUTH_ENABLED=true
+FETCH_ML_AUTH_CONFIG=configs/config-local.yaml
+
+# Security
+FETCH_ML_SECRET_KEY=your-secret-key-here
+FETCH_ML_JWT_EXPIRY=24h
+
+# Container Runtime
+FETCH_ML_CONTAINER_RUNTIME=podman
+FETCH_ML_CONTAINER_REGISTRY=docker.io
+
+# Storage
+FETCH_ML_STORAGE_PATH=data
+FETCH_ML_RESULTS_PATH=results
+FETCH_ML_TEMP_PATH=/tmp/fetch_ml
+
+# Development
+FETCH_ML_DEBUG=false
+FETCH_ML_DEV_MODE=false
+
+# CLI Configuration (overrides ~/.ml/config.toml)
+FETCH_ML_CLI_HOST=localhost
+FETCH_ML_CLI_USER=mluser
+FETCH_ML_CLI_BASE=/opt/ml
+FETCH_ML_CLI_PORT=22
+FETCH_ML_CLI_API_KEY=your-api-key-here
+
+# TUI Configuration (overrides TUI config file)
+FETCH_ML_TUI_HOST=localhost
+FETCH_ML_TUI_USER=mluser
+FETCH_ML_TUI_SSH_KEY=~/.ssh/id_rsa
+FETCH_ML_TUI_PORT=22
+FETCH_ML_TUI_BASE_PATH=/opt/ml
+FETCH_ML_TUI_TRAIN_SCRIPT=train.py
+FETCH_ML_TUI_REDIS_ADDR=localhost:6379
+FETCH_ML_TUI_REDIS_PASSWORD=
+FETCH_ML_TUI_REDIS_DB=0
+FETCH_ML_TUI_KNOWN_HOSTS=~/.ssh/known_hosts
+
+# Monitoring Security
+# Generate with: openssl rand -base64 32
+GRAFANA_ADMIN_PASSWORD=changeme-generate-secure-password
+REDIS_PASSWORD=changeme-generate-secure-password
+
--- a/build/README.md
+++ b/build/README.md
@ -0,0 +1,30 @@
+# Build Configuration
+
+This directory contains build configurations for containerization.
+
+## Docker
+
+**Location**: `build/docker/`
+
+### Dockerfiles
+
+- **`simple.Dockerfile`** - Lightweight API server image
+- **`api-server.Dockerfile`** - Full-featured API server
+
+### Usage
+
+```bash
+# Build from project root
+docker build -f build/docker/simple.Dockerfile -t fetchml:latest .
+
+# Or use Makefile
+make docker-build
+```
+
+## Podman
+
+**Location**: `../podman/`
+
+Podman configurations for running ML experiments with GPU support.
+
+**Note**: Not for building - these are runtime configs for experiment execution.
--- a/build/docker/.dockerignore
+++ b/build/docker/.dockerignore
@ -0,0 +1,39 @@
+# NOTE(review): Docker reads `.dockerignore` from the root of the build
+# context, or from `<Dockerfile name>.dockerignore` next to the Dockerfile.
+# The documented build (`docker build -f build/docker/simple.Dockerfile .`)
+# uses the project root as context, so confirm this file is actually applied
+# from its current location.
+
+# Development files
+.git/
+.github/
+.windsurf/
+*.md
+
+# IDE files
+.vscode/
+.idea/
+*.swp
+
+# Test files
+*_test.go
+tests/
+
+# Build artifacts
+bin/
+cli/zig-out/
+.zig-cache/
+
+# Runtime data
+data/
+logs/
+experiments/
+dump.rdb
+
+# Secrets
+# Environment files hold passwords and signing secrets; keep them out of the
+# build context so `COPY . .` cannot bake them into an image layer.
+.env
+.env.*
+!.env.example
+*.key
+*.pem
+secrets/
+
+# Python
+__pycache__/
+*.pyc
+.venv/
+
+# Documentation
+docs/
+examples/
--- a/build/docker/api-server.Dockerfile
+++ b/build/docker/api-server.Dockerfile
@ -0,0 +1,71 @@
+# ML Experiment Manager image, build stage: compile the Go binaries.
+FROM golang:1.25-alpine AS go-builder
+
+# Tools and helper packages available while building.
+RUN apk add --no-cache \
+        git \
+        make \
+        podman \
+        redis
+
+WORKDIR /app
+
+# Resolve modules from the manifests alone, so this layer is reused until
+# go.mod or go.sum changes.
+COPY go.mod go.sum ./
+RUN go mod download
+
+# Bring in the rest of the sources and run the Makefile's build target.
+COPY . .
+RUN make build
+
+# Zig CLI stage
+FROM alpine:3.19 AS zig-builder
+
+# Install dependencies
+RUN apk add --no-cache curl xz
+
+# Make a piped RUN fail when any command in the pipe fails; without this a
+# failed download would be hidden behind tar's exit status.
+SHELL ["/bin/ash", "-o", "pipefail", "-c"]
+
+# Install Zig for the architecture this stage runs on instead of hard-coding
+# aarch64. Release archives from 0.14.1 onwards are named
+# zig-<arch>-<os>-<version>, and `uname -m` yields the same arch strings
+# (x86_64, aarch64). The archive is unpacked to a version-independent path.
+ARG ZIG_VERSION=0.15.2
+RUN mkdir -p /opt/zig && \
+    curl -fsSL "https://ziglang.org/download/${ZIG_VERSION}/zig-$(uname -m)-linux-${ZIG_VERSION}.tar.xz" \
+      | tar -xJ -C /opt/zig --strip-components=1
+ENV PATH="/opt/zig:${PATH}"
+
+# Copy CLI source
+COPY cli/ /app/cli/
+
+# Build Zig CLI
+WORKDIR /app/cli
+RUN zig build cross
+
+# Runtime stage: Alpine image that carries only the built binaries.
+FROM alpine:3.19
+
+# Packages needed while the container is running.
+RUN apk add --no-cache \
+        ca-certificates \
+        openssh-client \
+        redis \
+        rsync
+
+# Unprivileged system account (UID/GID 1001) for the service.
+RUN addgroup -g 1001 -S appgroup && \
+    adduser -u 1001 -S appuser -G appgroup
+
+WORKDIR /app
+
+# Binaries produced by the two builder stages.
+COPY --from=go-builder /app/bin/ /usr/local/bin/
+COPY --from=zig-builder /app/cli/zig-out/bin/ml /usr/local/bin/
+
+# Experiment data directory and the user's ~/.ml directory, owned by the
+# app user so they stay writable after the USER switch.
+RUN mkdir -p /data/ml-experiments /home/appuser/.ml && \
+    chown -R appuser:appgroup /data /home/appuser
+
+# From here on, including at runtime, everything runs unprivileged.
+USER appuser
+
+EXPOSE 9100 9101
+
+# Probe /health on port 9100 every 30 seconds.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD wget --no-verbose --tries=1 --spider http://localhost:9100/health || exit 1
+
+CMD ["/usr/local/bin/api-server"]
--- a/build/docker/simple.Dockerfile
+++ b/build/docker/simple.Dockerfile
@ -0,0 +1,61 @@
+# Homelab image, build stage: compile the Go binaries.
+FROM golang:1.25-alpine AS builder
+
+# Tools required by the build.
+RUN apk add --no-cache \
+        git \
+        make
+
+WORKDIR /app
+
+# Resolve modules from the manifests alone, so this layer is reused until
+# go.mod or go.sum changes.
+COPY go.mod go.sum ./
+RUN go mod download
+
+# Bring in the rest of the sources and run the Makefile's build target.
+COPY . .
+RUN make build
+
+# Final stage
+FROM alpine:3.19
+
+# Install runtime dependencies (curl is required by the HEALTHCHECK below;
+# the base image does not ship it)
+RUN apk add --no-cache ca-certificates curl openssl redis
+
+# Create app user
+RUN addgroup -g 1001 -S appgroup && \
+    adduser -u 1001 -S appuser -G appgroup
+
+# Set working directory
+WORKDIR /app
+
+# Copy binaries from builder
+COPY --from=builder /app/bin/ /usr/local/bin/
+
+# Copy configs and templates
+COPY --from=builder /app/configs/ /app/configs/
+COPY --from=builder /app/nginx/ /app/nginx/
+
+# Create the writable runtime directories and generate a self-signed
+# certificate for container use. The directories are handed to the app user
+# so the service can still write data and logs and read its TLS key after the
+# USER switch; the private key is readable by its owner only.
+RUN mkdir -p /app/data/experiments /app/logs /app/ssl && \
+    openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
+        -subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \
+    chmod 644 /app/ssl/cert.pem && \
+    chmod 600 /app/ssl/key.pem && \
+    chown -R appuser:appgroup /app/data /app/logs /app/ssl
+
+# Switch to app user
+USER appuser
+
+# Expose ports
+EXPOSE 9101
+
+# Health check (-k because the certificate generated above is self-signed)
+HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
+  CMD curl -k -f https://localhost:9101/health || exit 1
+
+# Default command
+# NOTE(review): confirm /app/configs/config.yaml is provided (for example
+# mounted at runtime); verify it exists under configs/ in the build context.
+CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
--- a/configs/config-dev.yaml
+++ b/configs/config-dev.yaml
@ -0,0 +1,36 @@
+# Development configuration: plain HTTP on :9102, log output on stderr.
+base_path: "./data/experiments"
+
+auth:
+  enabled: true
+  apikeys:
+    test_user:
+      hash: "02d4e2b0d8b4869a34511cc01ff1ebbc3cac581a6b361988106eaedca9886a38"
+      admin: true
+      roles:
+        - "data_scientist"
+        - "admin"
+      permissions: {read: true, write: true, delete: true}
+
+server:
+  address: ":9102"
+  tls: {enabled: false}
+
+security:
+  rate_limit: {enabled: true, requests_per_minute: 60, burst_size: 10}
+  # Loopback clients only.
+  ip_whitelist: ["127.0.0.1", "::1", "localhost"]
+
+redis:
+  url: "redis://localhost:6379"
+  password: "${REDIS_PASSWORD}"
+
+logging:
+  level: "info"
+  file: ""  # Empty = stderr only (dev mode)
--- a/configs/config-docker.yaml
+++ b/configs/config-docker.yaml
@ -0,0 +1,39 @@
+# Container configuration: HTTPS on :9101 using the certificate under /app/ssl.
+base_path: "/app/data/experiments"
+
+auth:
+  enabled: true
+  # Key name follows configs/schema/config_schema.yaml and the other configs
+  # (`apikeys`); the schema rejects unknown keys such as `api_keys`.
+  apikeys:
+    homelab_user:
+      # WARNING: this is the SHA-256 of "password" - replace it before exposing the service.
+      hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
+      admin: true
+      # Only roles from the schema enum are valid; "user" is not one of them.
+      roles: ["admin"]
+      permissions:
+        read: true
+        write: true
+        delete: true
+
+server:
+  address: ":9101"
+  tls:
+    enabled: true
+    cert_file: "/app/ssl/cert.pem"
+    key_file: "/app/ssl/key.pem"
+
+security:
+  rate_limit:
+    enabled: true
+    requests_per_minute: 30
+  ip_whitelist:
+    - "127.0.0.1"
+    - "::1"
+    - "192.168.0.0/16"
+    - "10.0.0.0/8"
+
+redis:
+  url: "redis://redis:6379"
+  pool_size: 10  # schema name for the connection-pool size
+
+logging:
+  level: "info"
+  file: "/app/logs/app.log"
+  audit_log: "/app/logs/audit.log"  # schema key is `audit_log`
--- a/configs/config-local.toml
+++ b/configs/config-local.toml
@ -0,0 +1,6 @@
+# Local development settings (development-only API key).
+protocol    = "http"
+worker_host = "127.0.0.1"
+worker_port = 9101
+worker_user = "dev_user"
+worker_base = "/tmp/ml-experiments"
+api_key     = "dev_test_api_key_12345"
--- a/configs/config-local.yaml
+++ b/configs/config-local.yaml
@ -0,0 +1,33 @@
+# Local configuration: API on :9101 over plain HTTP, rate limiting off.
+auth:
+  enabled: true
+  apikeys:
+    dev_user:
+      hash: "2baf1f40105d9501fe319a8ec463fdf4325a2a5df445adf3f572f626253678c9"
+      admin: true
+      roles: ["admin"]
+      permissions:
+        "*": true
+
+server:
+  address: ":9101"
+  tls:
+    enabled: false
+
+security:
+  rate_limit:
+    enabled: false
+  # Loopback plus the private IPv4 ranges.
+  ip_whitelist:
+    - "127.0.0.1"
+    - "::1"
+    - "localhost"
+    - "10.0.0.0/8"
+    - "192.168.0.0/16"
+    - "172.16.0.0/12"
+
+# Prometheus metrics
+metrics:
+  enabled: true
+  listen_addr: ":9100"
+  tls:
+    enabled: false
--- a/configs/config-no-tls.yaml
+++ b/configs/config-no-tls.yaml
@ -0,0 +1,27 @@
+# Plain-HTTP configuration on :9102 with file logging.
+base_path: "./data/experiments"
+
+auth:
+  enabled: true
+
+server:
+  address: ":9102"
+  tls: {enabled: false}
+
+security:
+  rate_limit: {enabled: true, requests_per_minute: 60, burst_size: 10}
+  # Loopback clients only.
+  ip_whitelist: ["127.0.0.1", "::1", "localhost"]
+
+redis:
+  url: "redis://localhost:6379"
+  password: "${REDIS_PASSWORD}"
+
+logging:
+  level: "info"
+  file: "./logs/fetch_ml.log"
--- a/configs/config-prod.yaml
+++ b/configs/config-prod.yaml
@ -0,0 +1,53 @@
+base_path: "./data/ml-experiments"
+
+auth:
+  enabled: true
+  apikeys:
+    homelab_user:
+      # WARNING: this is the SHA-256 of "password" - replace it before production use.
+      hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
+      admin: true
+      roles: ["admin"]
+      permissions:
+        read: true
+        write: true
+        delete: true
+
+server:
+  address: ":9101"
+  tls:
+    enabled: false  # Disabled for local testing
+    cert_file: "./ssl/cert.pem"
+    key_file: "./ssl/key.pem"
+    min_version: "1.3"
+
+security:
+  rate_limit:
+    enabled: true
+    requests_per_minute: 60
+    burst_size: 10
+  ip_whitelist:
+    - "127.0.0.1"
+    - "::1"
+    - "localhost"
+    - "10.0.0.0/8"
+    - "192.168.0.0/16"
+    - "172.16.0.0/12"
+  failed_login_lockout:
+    enabled: true
+    max_attempts: 5
+    lockout_duration: "15m"
+
+# SQLite database for production
+database:
+  type: "sqlite"
+  connection: "data/fetch_ml.db"
+
+redis:
+  url: "redis://localhost:6379"
+  addr: "localhost:6379"
+  # Read the password from the environment, as config-dev.yaml and
+  # config-no-tls.yaml do, instead of committing the secret to the repository.
+  password: "${REDIS_PASSWORD}"
+
+logging:
+  level: "info"
+  file: "logs/fetch_ml.log"
+  audit_log: "logs/audit.log"
--- a/configs/examples/config-postgres.yaml
+++ b/configs/examples/config-postgres.yaml
@ -0,0 +1,64 @@
+# Fetch ML Configuration Example for PostgreSQL
+# This example shows how to configure Fetch ML to use PostgreSQL as the database
+
+auth:
+  enabled: true
+  apikeys:
+    # Example key only - replace with the hash of a real API key.
+    admin:
+      # SHA-256 of "password" (the previous value did not match that input).
+      hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
+      admin: true
+      roles: ["admin"]
+
+server:
+  address: ":9101"
+  tls:
+    enabled: false
+
+database:
+  type: "postgres"
+  host: "localhost"
+  port: 5432
+  username: "fetchml"
+  password: "your_password_here"
+  database: "fetchml"
+  # Alternatively, you can use a full connection string:
+  # connection: "postgres://fetchml:your_password_here@localhost:5432/fetchml?sslmode=disable"
+
+redis:
+  host: "localhost"
+  port: 6379
+  password: ""
+  db: 0
+  pool_size: 10
+  max_retries: 3
+
+logging:
+  level: "info"
+  console: true
+  format: "text"
+
+security:
+  secret_key: "your-secret-key-here-at-least-16-characters"
+  jwt_expiry: "24h"
+  rate_limit:
+    enabled: false
+    requests_per_minute: 60
+    burst_size: 10
+
+containers:
+  runtime: "podman"
+  registry: "docker.io"
+  pull_policy: "missing"
+  resources:
+    cpu_limit: "2"
+    memory_limit: "4Gi"
+    gpu_limit: 1
+
+storage:
+  data_path: "data"
+  results_path: "results"
+  temp_path: "/tmp/fetch_ml"
+  cleanup:
+    enabled: true
+    max_age_hours: 168
+    max_size_gb: 10
--- a/configs/examples/config.yaml.example
+++ b/configs/examples/config.yaml.example
@ -0,0 +1,66 @@
+# Fetch ML Configuration Example
+# Copy this file to config.yaml and customize for your environment
+
+auth:
+  enabled: true
+  # The schema names this key `apikeys` and rejects unknown keys.
+  apikeys:
+    # Example API key (replace with real hashed keys)
+    admin:
+      hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8"  # "password"
+      admin: true
+      roles: ["admin"]
+      permissions:
+        "*": true
+
+server:
+  # The schema requires a single listen address and rejects separate
+  # host/port keys.
+  address: "localhost:8080"
+
+database:
+  type: "sqlite"
+  connection: "data/fetch_ml.db"
+  host: ""
+  port: 5432
+  username: ""
+  password: ""
+  database: "fetch_ml"
+
+redis:
+  url: "redis://localhost:6379"
+  host: "localhost"
+  port: 6379
+  password: ""
+  db: 0
+  pool_size: 10
+  max_retries: 3
+
+logging:
+  level: "info"
+  file: "logs/fetch_ml.log"
+  format: "text"
+  console: true
+
+security:
+  secret_key: "your-secret-key-at-least-16-chars"
+  jwt_expiry: "24h"
+  rate_limit:
+    enabled: false
+    requests_per_minute: 60
+
+containers:
+  runtime: "podman"
+  registry: "docker.io"
+  pull_policy: "missing"
+  resources:
+    cpu_limit: "2"
+    memory_limit: "4Gi"
+    gpu_limit: 1
+
+storage:
+  data_path: "data"
+  results_path: "results"
+  temp_path: "/tmp/fetch_ml"
+  cleanup:
+    enabled: true
+    max_age_hours: 168
+    max_size_gb: 10
--- a/configs/schema/config_schema.yaml
+++ b/configs/schema/config_schema.yaml
@ -0,0 +1,238 @@
+# Fetch ML Configuration Schema (JSON Schema expressed as YAML)
+
+$schema: "http://json-schema.org/draft-07/schema#"
+title: "Fetch ML Configuration"
+type: object
+additionalProperties: false
+required:
+  - auth
+  - server
+properties:
+  base_path:
+    type: string
+    description: Base path for experiment data
+  auth:
+    type: object
+    additionalProperties: false
+    required:
+      - enabled
+    properties:
+      enabled:
+        type: boolean
+        description: Enable or disable authentication
+      apikeys:
+        type: object
+        description: API key registry
+        additionalProperties:
+          type: object
+          additionalProperties: false
+          required:
+            - hash
+          properties:
+            hash:
+              type: string
+              description: SHA256 hash of the API key
+            admin:
+              type: boolean
+              default: false
+            roles:
+              type: array
+              items:
+                type: string
+                enum: [admin, data_scientist, data_engineer, viewer, operator]
+            permissions:
+              type: object
+              additionalProperties:
+                type: boolean
+  server:
+    type: object
+    additionalProperties: false
+    required: [address]
+    properties:
+      address:
+        type: string
+        description: Listen address, e.g. ":9101"
+      tls:
+        type: object
+        additionalProperties: false
+        properties:
+          enabled:
+            type: boolean
+            default: false
+          cert_file:
+            type: string
+          key_file:
+            type: string
+          min_version:
+            type: string
+            description: Minimum TLS version (e.g. "1.3")
+  database:
+    type: object
+    additionalProperties: false
+    properties:
+      type:
+        type: string
+        enum: [sqlite, postgres, mysql]
+        default: sqlite
+      connection:
+        type: string
+      host:
+        type: string
+      port:
+        type: integer
+        minimum: 1
+        maximum: 65535
+      username:
+        type: string
+      password:
+        type: string
+      database:
+        type: string
+  redis:
+    type: object
+    additionalProperties: false
+    properties:
+      url:
+        type: string
+        pattern: "^redis://"
+      addr:
+        type: string
+        description: Optional host:port shorthand for Redis
+      host:
+        type: string
+        default: "localhost"
+      port:
+        type: integer
+        minimum: 1
+        maximum: 65535
+        default: 6379
+      password:
+        type: string
+      db:
+        type: integer
+        minimum: 0
+        default: 0
+      pool_size:
+        type: integer
+        minimum: 1
+        default: 10
+      max_retries:
+        type: integer
+        minimum: 0
+        default: 3
+  logging:
+    type: object
+    additionalProperties: false
+    properties:
+      level:
+        type: string
+        enum: [debug, info, warn, error, fatal]
+        default: "info"
+      file:
+        type: string
+      audit_log:
+        type: string
+      format:
+        type: string
+        enum: [text, json]
+        default: "text"
+      console:
+        type: boolean
+        default: true
+  security:
+    type: object
+    additionalProperties: false
+    properties:
+      secret_key:
+        type: string
+        minLength: 16
+      jwt_expiry:
+        type: string
+        pattern: "^\\d+[smhd]$"
+        default: "24h"
+      ip_whitelist:
+        type: array
+        items:
+          type: string
+      failed_login_lockout:
+        type: object
+        additionalProperties: false
+        properties:
+          enabled:
+            type: boolean
+          max_attempts:
+            type: integer
+            minimum: 1
+          lockout_duration:
+            type: string
+            description: Duration string, e.g. "15m"
+      rate_limit:
+        type: object
+        additionalProperties: false
+        properties:
+          enabled:
+            type: boolean
+            default: false
+          requests_per_minute:
+            type: integer
+            minimum: 1
+            default: 60
+          burst_size:
+            type: integer
+            minimum: 1
+  containers:
+    type: object
+    additionalProperties: false
+    properties:
+      runtime:
+        type: string
+        enum: [podman, docker]
+        default: "podman"
+      registry:
+        type: string
+        default: "docker.io"
+      pull_policy:
+        type: string
+        enum: [always, missing, never]
+        default: "missing"
+      resources:
+        type: object
+        additionalProperties: false
+        properties:
+          cpu_limit:
+            type: string
+            description: CPU limit (e.g., "2" or "500m")
+          memory_limit:
+            type: string
+            description: Memory limit (e.g., "1Gi" or "512Mi")
+          gpu_limit:
+            type: integer
+            minimum: 0
+  storage:
+    type: object
+    additionalProperties: false
+    properties:
+      data_path:
+        type: string
+        default: "data"
+      results_path:
+        type: string
+        default: "results"
+      temp_path:
+        type: string
+        default: "/tmp/fetch_ml"
+      cleanup:
+        type: object
+        additionalProperties: false
+        properties:
+          enabled:
+            type: boolean
+            default: true
+          max_age_hours:
+            type: integer
+            minimum: 1
+            default: 168
+          max_size_gb:
+            type: integer
+            minimum: 1
+            default: 10
+  # Optional metrics endpoint. Declared here because the root object forbids
+  # unknown keys and configs/config-local.yaml sets a top-level `metrics` block.
+  metrics:
+    type: object
+    additionalProperties: false
+    properties:
+      enabled:
+        type: boolean
+      listen_addr:
+        type: string
+        description: Listen address for the metrics endpoint, e.g. ":9100"
+      tls:
+        type: object
+        additionalProperties: false
+        properties:
+          enabled:
+            type: boolean
--- a/configs/schema/permissions.yaml
+++ b/configs/schema/permissions.yaml
@ -0,0 +1,139 @@
+# Role-based permissions configuration
+# Defines what each role can do in the system
+
+# Permission format: resource:action
+# Examples: jobs:create, data:read, users:manage
+
+roles:
+  admin:
+    description: "Full system access"
+    permissions:
+      - "*"
+    
+  data_scientist:
+    description: "ML experiment management"
+    permissions:
+      - "jobs:create"
+      - "jobs:read"
+      - "jobs:update"
+      - "jobs:delete:own"
+      - "data:read"
+      - "data:create"
+      - "models:read"
+      - "models:create"
+      - "models:update:own"
+      - "metrics:read"
+    
+  data_engineer:
+    description: "Data pipeline and infrastructure"
+    permissions:
+      - "data:create"
+      - "data:read"
+      - "data:update"
+      - "data:delete"
+      - "jobs:read"
+      - "jobs:update"
+      - "pipelines:create"
+      - "pipelines:read"
+      - "pipelines:update"
+      - "storage:read"
+      - "storage:write"
+    
+  viewer:
+    description: "Read-only access"
+    permissions:
+      - "jobs:read"
+      - "data:read"
+      - "models:read"
+      - "metrics:read"
+      - "pipelines:read"
+    
+  operator:
+    description: "System operations and monitoring"
+    permissions:
+      - "jobs:read"
+      - "jobs:update"
+      - "jobs:restart"
+      - "metrics:read"
+      - "system:read"
+      - "system:status"
+      - "logs:read"
+
+# Permission groups for easier management
+groups:
+  ml_developer:
+    description: "Combined data scientist and data engineer"
+    inherits:
+      - data_scientist
+      - data_engineer
+    
+  read_only:
+    description: "Read access to all resources"
+    permissions:
+      - "jobs:read"
+      - "data:read"
+      - "models:read"
+      - "pipelines:read"
+      - "metrics:read"
+      - "system:read"
+
+# Resource hierarchy for permission inheritance
+hierarchy:
+  jobs:
+    children:
+      create: true
+      read: true
+      update: true
+      delete: true
+      restart: true
+    special:
+      own: "User can only access their own resources"
+  
+  data:
+    children:
+      create: true
+      read: true
+      update: true
+      delete: true
+      upload: true
+      download: true
+  
+  models:
+    children:
+      create: true
+      read: true
+      update: true
+      delete: true
+      deploy: true
+    special:
+      own: "User can only access their own models"
+  
+  system:
+    children:
+      read: true
+      status: true
+      manage: true
+      config: true
+  
+  metrics:
+    children:
+      read: true
+      export: true
+      delete: true
+  
+  pipelines:
+    children:
+      create: true
+      read: true
+      update: true
+      delete: true
+      run: true
+      stop: true
+
+# Default permissions for new users
+defaults:
+  new_user_role: "viewer"
+  admin_users:
+    - "admin"
+    - "root"
+    - "system"
--- a/configs/worker-prod.toml
+++ b/configs/worker-prod.toml
@ -0,0 +1,39 @@
+worker_id = "worker-prod-01"
+base_path = "/data/ml-experiments"
+max_workers = 4
+
+# Redis connection
+redis_addr = "localhost:6379"
+redis_password = "JZVd2Y6IDaLNaYLBOFgQ7ae4Ox5t37NTIyPMQlLJD4k="
+redis_db = 0
+
+# SSH connection (for remote operations)
+host = "localhost"
+user = "ml-user"
+port = 22
+ssh_key = "~/.ssh/id_rsa"
+
+# Podman configuration
+podman_image = "ml-training:latest"
+gpu_access = true
+container_workspace = "/workspace"
+container_results = "/results"
+train_script = "train.py"
+
+# Dataset management
+auto_fetch_data = true
+data_dir = "/data/datasets"
+data_manager_path = "/usr/local/bin/data_manager"
+dataset_cache_ttl = "24h"
+
+# Task management
+task_lease_duration = "1h"
+heartbeat_interval = "30s"
+graceful_timeout = "5m"
+poll_interval = "100ms"
+metrics_flush_interval = "10s"
+
+# Metrics exporter
+[metrics]
+enabled = true
+listen_addr = ":9090"
--- a/nginx/README.md
+++ b/nginx/README.md
@ -0,0 +1,138 @@
+# Nginx Configuration for FetchML
+
+This directory contains nginx configurations for FetchML.
+
+## Files
+
+- **`fetchml-site.conf`** - Ready-to-use site configuration (recommended)
+- **`nginx-secure.conf`** - Full standalone nginx config (advanced)
+- **`setup-nginx.sh`** - Helper script for easy installation
+
+## Quick Setup
+
+### Option 1: Automated (Recommended)
+
+```bash
+sudo ./nginx/setup-nginx.sh
+```
+
+This will:
+- Detect your nginx setup (Debian or RHEL style)
+- Prompt for your domain and SSL certificates  
+- Install the configuration
+- Test and reload nginx
+
+### Option 2: Manual
+
+**For Debian/Ubuntu:**
+```bash
+# 1. Edit fetchml-site.conf and change:
+#    - ml.example.com to your domain
+#    - SSL certificate paths
+#    - Port if not using 9102
+
+# 2. Install
+sudo cp nginx/fetchml-site.conf /etc/nginx/sites-available/fetchml
+sudo ln -s /etc/nginx/sites-available/fetchml /etc/nginx/sites-enabled/
+
+# 3. Test and reload
+sudo nginx -t
+sudo systemctl reload nginx
+```
+
+**For RHEL/Rocky/CentOS:**
+```bash
+# 1. Edit fetchml-site.conf (same as above)
+
+# 2. Install
+sudo cp nginx/fetchml-site.conf /etc/nginx/conf.d/fetchml.conf
+
+# 3. Test and reload
+sudo nginx -t
+sudo systemctl reload nginx
+```
+
+## Configuration Details
+
+### Endpoints
+
+- `/ws` - WebSocket API (rate limited: 5 req/s)
+- `/api/` - REST API (rate limited: 10 req/s)  
+- `/health` - Health check
+- `/grafana/` - Grafana (commented out by default)
+
+### Security Features
+
+- TLSv1.2 and TLSv1.3 only
+- Security headers (HSTS, CSP, etc.)
+- Rate limiting per endpoint
+- Request size limits (10MB)
+- Version hiding
+
+### What to Change
+
+Before using, update these values in `fetchml-site.conf`:
+
+1. **Domain**: Replace `ml.example.com` with your domain
+2. **SSL Certificates**: Update paths to your actual certificates
+3. **Port**: Change `9102` if using a different port
+4. **Grafana**: Uncomment if you want to expose it
+
+## SSL Certificates
+
+### Self-Signed (Dev/Testing)
+
+```bash
+sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 \
+  -keyout /etc/ssl/private/fetchml.key \
+  -out /etc/ssl/certs/fetchml.crt \
+  -subj "/CN=ml.example.com"
+```
+
+### Let's Encrypt (Production)
+
+```bash
+sudo apt-get install certbot python3-certbot-nginx
+sudo certbot --nginx -d ml.example.com
+```
+
+## Troubleshooting
+
+### Test Configuration
+```bash
+sudo nginx -t
+```
+
+### Check Logs
+```bash
+sudo tail -f /var/log/nginx/fetchml_error.log
+sudo tail -f /var/log/nginx/fetchml_access.log
+```
+
+### Verify Proxy
+```bash
+curl -I https://ml.example.com/health
+```
+
+### Common Issues
+
+**"Permission denied" error**: Check that nginx user can access SSL certificates
+```bash
+sudo chmod 644 /etc/ssl/certs/fetchml.crt
+sudo chmod 600 /etc/ssl/private/fetchml.key
+```
+
+**WebSocket not working**: Ensure your firewall allows the connection and backend is running
+```bash
+# Check backend
+curl http://localhost:9102/health
+
+# Check firewall
+sudo firewall-cmd --list-all
+```
+
+## Integration with Existing Nginx
+
+If you already have nginx running, just drop `fetchml-site.conf` into your sites directory. It won't conflict with other sites.
+
+The configuration is self-contained and only handles the specified `server_name`.
--- a/nginx/fetchml-site.conf
+++ b/nginx/fetchml-site.conf
@ -0,0 +1,100 @@
+# FetchML Nginx Site Configuration
+# Drop this file into /etc/nginx/sites-available/fetchml
+# Then: sudo ln -s /etc/nginx/sites-available/fetchml /etc/nginx/sites-enabled/
+# Test: sudo nginx -t
+# Reload: sudo systemctl reload nginx
+
+server {
+    listen 80;
+    server_name ml.example.com;  # CHANGE THIS to your domain
+    
+    # Redirect HTTP to HTTPS
+    return 301 https://$server_name$request_uri;
+}
+
+# Rate-limit zones for the API and WebSocket endpoints.
+# `limit_req_zone` is only valid in the `http` context, so it must sit outside
+# `server {}`. A site file is included from inside `http {}`, which makes the
+# top level of this file the right place.
+limit_req_zone $binary_remote_addr zone=fetchml_api:10m rate=10r/s;
+limit_req_zone $binary_remote_addr zone=fetchml_ws:10m rate=5r/s;
+
+# HTTPS server: terminates TLS and proxies to the FetchML backend.
+server {
+    listen 443 ssl http2;
+    server_name ml.example.com;  # CHANGE THIS to your domain
+    
+    # SSL Configuration
+    # CHANGE THESE paths to your actual SSL certificates
+    ssl_certificate /etc/ssl/certs/ml.example.com.crt;
+    ssl_certificate_key /etc/ssl/private/ml.example.com.key;
+    
+    # Modern SSL settings
+    ssl_protocols TLSv1.3 TLSv1.2;
+    ssl_prefer_server_ciphers on;
+    ssl_ciphers ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305;
+    ssl_session_timeout 1d;
+    ssl_session_cache shared:MozSSL:10m;
+    ssl_session_tickets off;
+    
+    # Security headers
+    add_header X-Frame-Options DENY always;
+    add_header X-Content-Type-Options nosniff always;
+    add_header X-XSS-Protection "1; mode=block" always;
+    add_header Referrer-Policy "strict-origin-when-cross-origin" always;
+    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
+    
+    # Hide nginx version
+    server_tokens off;
+    
+    # Client limits
+    client_max_body_size 10M;
+    client_body_timeout 12s;
+    client_header_timeout 12s;
+    
+    # Logging
+    access_log /var/log/nginx/fetchml_access.log;
+    error_log /var/log/nginx/fetchml_error.log warn;
+    
+    # WebSocket endpoint (rate limited via the fetchml_ws zone)
+    location /ws {
+        limit_req zone=fetchml_ws burst=10 nodelay;
+        
+        proxy_pass http://localhost:9102;  # CHANGE PORT if needed
+        proxy_http_version 1.1;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection "upgrade";
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        
+        # WebSocket timeouts
+        proxy_connect_timeout 7d;
+        proxy_send_timeout 7d;
+        proxy_read_timeout 7d;
+    }
+    
+    # API endpoints (rate limited via the fetchml_api zone)
+    location /api/ {
+        limit_req zone=fetchml_api burst=20 nodelay;
+        
+        proxy_pass http://localhost:9102;  # CHANGE PORT if needed
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_set_header X-API-Key $http_x_api_key;
+    }
+    
+    # Health check
+    location /health {
+        proxy_pass http://localhost:9102;  # CHANGE PORT if needed
+        proxy_set_header Host $host;
+        access_log off;
+    }
+    
+    # Grafana (optional - only if you want to expose it)
+    # Uncomment if you want Grafana accessible via nginx
+    # location /grafana/ {
+    #     proxy_pass http://localhost:3000/;
+    #     proxy_set_header Host $host;
+    #     proxy_set_header X-Real-IP $remote_addr;
+    # }
+}
--- a/nginx/nginx-secure.conf
+++ b/nginx/nginx-secure.conf
@ -0,0 +1,157 @@
+events {
+    # Upper bound on simultaneous connections handled by each worker process.
+    worker_connections 1024;
+}
+
+http {
+    # Security headers
+    # "always" attaches each header to every response code, including errors.
+    # NOTE: nginx inherits add_header directives from this level only into
+    # blocks that declare no add_header of their own.
+    add_header X-Frame-Options DENY always;
+    add_header X-Content-Type-Options nosniff always;
+    add_header X-XSS-Protection "1; mode=block" always;
+    add_header Referrer-Policy "strict-origin-when-cross-origin" always;
+    add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'" always;
+    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;
+
+    # Hide server version
+    server_tokens off;
+
+    # Rate limiting
+    # Shared-memory zones keyed by client address, 10 MB each: 10 requests/s
+    # for zone "api" and 5 requests/s for zone "ws".
+    limit_req_zone $binary_remote_addr zone=api:10m rate=10r/s;
+    limit_req_zone $binary_remote_addr zone=ws:10m rate=5r/s;
+
+    # Connection limiting
+    # Zone used by limit_conn to count concurrent connections per client address.
+    limit_conn_zone $binary_remote_addr zone=conn_limit_per_ip:10m;
+
+    # Logging
+    # "security" format: the usual request fields plus total request time and
+    # upstream response time.
+    log_format security '$remote_addr - $remote_user [$time_local] '
+                       '"$request" $status $body_bytes_sent '
+                       '"$http_referer" "$http_user_agent" '
+                       '$request_time $upstream_response_time';
+
+    access_log /var/log/nginx/security.log security;
+    error_log /var/log/nginx/error.log warn;
+
+    # Redirect HTTP to HTTPS
+    server {
+        # Plain-HTTP listener: serves no content, only redirects.
+        listen 80;
+        # "_" is a catch-all name, so this block answers for any Host on port 80.
+        server_name _;
+        # Permanent redirect to the same host and URI over HTTPS.
+        return 301 https://$host$request_uri;
+    }
+
+    # HTTPS server
+    server {
+        # Public TLS entry point for ml-experiments.example.com. Proxies
+        # /health, /ws, /api/ and /admin/ to api-server:9101 over HTTPS and
+        # answers 404 for every other path.
+        listen 443 ssl http2;
+        server_name ml-experiments.example.com;
+
+        # SSL configuration
+        ssl_certificate /etc/nginx/ssl/cert.pem;
+        ssl_certificate_key /etc/nginx/ssl/key.pem;
+        # CA chain used to verify stapled OCSP responses (ssl_stapling_verify).
+        ssl_trusted_certificate /etc/nginx/ssl/ca.pem;
+
+        # Modern SSL configuration
+        ssl_protocols TLSv1.3;
+        ssl_prefer_server_ciphers on;
+        # NOTE(review): these are TLS 1.2 suite names; with TLSv1.3 as the only
+        # enabled protocol this list presumably has no effect - confirm.
+        ssl_ciphers ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305;
+        ssl_session_timeout 1d;
+        ssl_session_cache shared:SSL:50m;
+        ssl_session_tickets off;
+
+        # OCSP stapling
+        # NOTE(review): nginx needs a `resolver` directive to look up the OCSP
+        # responder host; none is configured in this file, so add one that
+        # suits the deployment or stapling may not take effect.
+        ssl_stapling on;
+        ssl_stapling_verify on;
+
+        # Security limits
+        client_max_body_size 10M;
+        client_body_timeout 12s;
+        client_header_timeout 12s;
+        keepalive_timeout 15s;
+        send_timeout 10s;
+        # At most 20 concurrent connections per client address.
+        limit_conn conn_limit_per_ip 20;
+
+        # Backend health endpoint (proxied, not rate limited)
+        location /health {
+            proxy_pass https://api-server:9101;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_connect_timeout 5s;
+            proxy_send_timeout 10s;
+            proxy_read_timeout 10s;
+        }
+
+        # WebSocket endpoint with special rate limiting
+        location /ws {
+            limit_req zone=ws burst=10 nodelay;
+
+            proxy_pass https://api-server:9101;
+            # HTTP/1.1 plus Upgrade/Connection are required for the protocol switch.
+            proxy_http_version 1.1;
+            proxy_set_header Upgrade $http_upgrade;
+            proxy_set_header Connection "upgrade";
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            # nginx documents that the connect timeout cannot usually exceed
+            # 75s, so the previous 7d value was not meaningful here.
+            proxy_connect_timeout 60s;
+            # Allow established sockets to stay idle for up to 7 days.
+            proxy_send_timeout 7d;
+            proxy_read_timeout 7d;
+
+            # WebSocket specific headers
+            proxy_set_header Sec-WebSocket-Key $http_sec_websocket_key;
+            proxy_set_header Sec-WebSocket-Protocol $http_sec_websocket_protocol;
+            proxy_set_header Sec-WebSocket-Version $http_sec_websocket_version;
+        }
+
+        # API endpoints with rate limiting
+        location /api/ {
+            limit_req zone=api burst=20 nodelay;
+
+            proxy_pass https://api-server:9101;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_set_header X-API-Key $http_x_api_key;
+            proxy_connect_timeout 5s;
+            proxy_send_timeout 10s;
+            proxy_read_timeout 10s;
+        }
+
+        # Fallback: any path not matched by a more specific location gets 404.
+        # Prefix locations are chosen by longest match, so the position of this
+        # block relative to the others does not matter.
+        location / {
+            return 404;
+        }
+
+        # Security monitoring endpoints (admin only)
+        location /admin/ {
+            # IP whitelist for admin access (private address ranges only)
+            allow 10.0.0.0/8;
+            allow 192.168.0.0/16;
+            allow 172.16.0.0/12;
+            deny all;
+
+            proxy_pass https://api-server:9101;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+
+        # Health check for load balancers (answered by nginx itself)
+        location /lb-health {
+            access_log off;
+            # default_type, not add_header, sets the Content-Type of a `return`
+            # body. An add_header here would emit a second Content-Type header
+            # and stop this location from inheriting the http-level security
+            # headers.
+            default_type text/plain;
+            return 200 "healthy\n";
+        }
+    }
+
+    # Default server to catch unknown hosts
+    server {
+        # default_server: handles TLS requests whose host matches no other
+        # server block on port 443.
+        listen 443 ssl http2 default_server;
+        server_name _;
+        
+        # A certificate is still required so the TLS handshake can complete.
+        ssl_certificate /etc/nginx/ssl/cert.pem;
+        ssl_certificate_key /etc/nginx/ssl/key.pem;
+        
+        # 444 is nginx's non-standard code for closing the connection without
+        # sending a response.
+        return 444;
+    }
+}
--- a/nginx/setup-nginx.sh
+++ b/nginx/setup-nginx.sh
@ -0,0 +1,109 @@
+#!/bin/bash
+# Nginx Setup Helper for FetchML
+# This script helps integrate FetchML into an existing nginx setup
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SITE_CONFIG="$SCRIPT_DIR/fetchml-site.conf"
+
+# Colors
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+echo -e "${BLUE}FetchML Nginx Setup Helper${NC}"
+echo ""
+
+# Check if nginx is installed
+if ! command -v nginx &>/dev/null; then
+    echo -e "${YELLOW}Nginx is not installed.${NC}"
+    echo "Install with:"
+    echo "  Ubuntu/Debian: sudo apt-get install nginx"
+    echo "  RHEL/Rocky:    sudo dnf install nginx"
+    exit 1
+fi
+
+# Detect nginx config structure
+if [ -d "/etc/nginx/sites-available" ]; then
+    # Debian/Ubuntu style
+    SITES_AVAILABLE="/etc/nginx/sites-available"
+    SITES_ENABLED="/etc/nginx/sites-enabled"
+    STYLE="debian"
+elif [ -d "/etc/nginx/conf.d" ]; then
+    # RHEL/CentOS style
+    SITES_AVAILABLE="/etc/nginx/conf.d"
+    SITES_ENABLED=""
+    STYLE="rhel"
+else
+    echo -e "${YELLOW}Could not detect nginx configuration directory.${NC}"
+    echo "Please manually copy $SITE_CONFIG to your nginx config directory."
+    exit 1
+fi
+
+echo "Detected nginx style: $STYLE"
+echo ""
+
+# Read values
+read -p "Enter your domain name (e.g., ml.example.com): " domain
+read -p "Enter API server port [9102]: " port
+port=${port:-9102}
+
+read -p "Enter SSL certificate path: " cert_path
+read -p "Enter SSL key path: " key_path
+
+# Create temp config with substitutions
+temp_config="/tmp/fetchml-site.conf"
+sed -e "s|ml\.example\.com|$domain|g" \
+    -e "s|localhost:9102|localhost:$port|g" \
+    -e "s|/etc/ssl/certs/ml\.example\.com\.crt|$cert_path|g" \
+    -e "s|/etc/ssl/private/ml\.example\.com\.key|$key_path|g" \
+    "$SITE_CONFIG" > "$temp_config"
+
+# Install config
+echo ""
+echo -e "${BLUE}Installing nginx configuration...${NC}"
+
+if [ "$STYLE" = "debian" ]; then
+    sudo cp "$temp_config" "$SITES_AVAILABLE/fetchml"
+    sudo ln -sf "$SITES_AVAILABLE/fetchml" "$SITES_ENABLED/fetchml"
+    echo -e "${GREEN}✓${NC} Config installed to $SITES_AVAILABLE/fetchml"
+    echo -e "${GREEN}✓${NC} Symlink created in $SITES_ENABLED/"
+else
+    sudo cp "$temp_config" "$SITES_AVAILABLE/fetchml.conf"
+    echo -e "${GREEN}✓${NC} Config installed to $SITES_AVAILABLE/fetchml.conf"
+fi
+
+# Test nginx config
+echo ""
+echo -e "${BLUE}Testing nginx configuration...${NC}"
+if sudo nginx -t; then
+    echo -e "${GREEN}✓${NC} Nginx configuration is valid"
+    
+    # Offer to reload
+    read -p "Reload nginx now? [y/N]: " -n 1 -r
+    echo
+    if [[ $REPLY =~ ^[Yy]$ ]]; then
+        sudo systemctl reload nginx
+        echo -e "${GREEN}✓${NC} Nginx reloaded"
+    else
+        echo "Reload later with: sudo systemctl reload nginx"
+    fi
+else
+    echo -e "${YELLOW}!${NC} Nginx configuration test failed"
+    echo "Please fix the errors and run: sudo nginx -t"
+fi
+
+# Cleanup
+rm -f "$temp_config"
+
+echo ""
+echo -e "${GREEN}Setup complete!${NC}"
+echo ""
+echo "Your site is configured for: https://$domain"
+echo ""
+echo "Next steps:"
+echo "  1. Ensure your DNS points to this server"
+echo "  2. Start FetchML API server on port $port"
+echo "  3. Visit https://$domain/health to test"