diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..d53a72f --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,94 @@ +# Scripts Directory + +This directory contains setup and utility scripts for FetchML. + +## Production Scripts + +### `setup-prod.sh` +**Purpose**: Automated production setup for Rocky Linux bare metal deployment +**Usage**: `sudo ./scripts/setup-prod.sh [base_path] [user] [group]` +**What it does**: +- Creates system user and groups +- Sets up directory structure (`/data/ml-experiments/*`) +- Installs dependencies (Go, Podman, Redis) +- Configures GPU support for Podman +- Creates systemd service files +- Sets up log rotation + +**Example**: +```bash +sudo ./scripts/setup-prod.sh /data/ml-experiments ml-user ml-group +``` + +### `validate-prod-config.sh` +**Purpose**: Validates production configuration files +**Usage**: `./scripts/validate-prod-config.sh [api-config] [worker-config]` +**What it does**: +- Checks config file syntax +- Verifies base_path consistency +- Tests Redis connectivity +- Validates Podman setup +- Checks directory permissions + +**Example**: +```bash +./scripts/validate-prod-config.sh configs/config-prod.yaml configs/worker-prod.toml +``` + +## Legacy Setup Scripts (Deprecated) + +The following scripts are from earlier iterations and are **deprecated** in favor of `setup-prod.sh`: + +- `setup_rocky.sh` - Use `setup-prod.sh` instead +- `setup_ubuntu.sh` - Ubuntu support (not primary target) +- `auto_setup.sh` - Old automated setup (superseded) +- `setup_common.sh` - Common functions (integrated into setup-prod.sh) +- `quick_start.sh` - Quick dev setup (use docker-compose on macOS instead) +- `test_tools.sh` - Tool testing (integrated into validate-prod-config.sh) + +### Cleanup Recommendation +These legacy scripts can be removed or archived. 
The current production setup only needs: +- `setup-prod.sh` +- `validate-prod-config.sh` + +## Usage Workflow + +### First-Time Production Setup +```bash +# 1. Run production setup +sudo ./scripts/setup-prod.sh + +# 2. Copy and configure +sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml +sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml +sudo vim /etc/fetch_ml/config.yaml # Update API keys, etc. + +# 3. Build and install +make prod +sudo make install + +# 4. Validate +./scripts/validate-prod-config.sh /etc/fetch_ml/config.yaml /etc/fetch_ml/worker.toml + +# 5. Start services +sudo systemctl start fetchml-api fetchml-worker +sudo systemctl enable fetchml-api fetchml-worker +``` + +### Development Setup (macOS) +```bash +# Use docker-compose for local development +docker-compose up -d + +# Or run components directly +make dev +./bin/api-server -config configs/config-local.yaml +``` + +## Script Maintenance + +When adding new scripts: +1. Add executable permission: `chmod +x scripts/new-script.sh` +2. Add header comment with purpose and usage +3. Update this README +4. Use consistent error handling and logging diff --git a/scripts/create_bitwarden_fetchml_item.sh b/scripts/create_bitwarden_fetchml_item.sh new file mode 100644 index 0000000..66a7cd0 --- /dev/null +++ b/scripts/create_bitwarden_fetchml_item.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Create a Bitwarden item for a FetchML API user. +# +# Usage: +# ./scripts/create_bitwarden_fetchml_item.sh <username> <api_key> <api_key_hash> +# +# Requirements: +# - Bitwarden CLI (bw) installed +# - You are logged in and unlocked (bw login; bw unlock) +# - jq installed +# +# This script does NOT run on the homelab server. Run it from your +# own machine where you manage Bitwarden. 
+ +if [[ $# -ne 3 ]]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +USER_NAME="$1" +API_KEY="$2" +API_KEY_HASH="$3" + +ITEM_NAME="FetchML API  $USER_NAME" + +# Get base item template +TEMPLATE_JSON=$(bw get template item) + +# Build item JSON with jq +ITEM_JSON=$(echo "$TEMPLATE_JSON" | jq \ + --arg name "$ITEM_NAME" \ + --arg username "$USER_NAME" \ + --arg password "$API_KEY" \ + --arg hash "$API_KEY_HASH" \ + '.name = $name + | .login.username = $username + | .login.password = $password + | .notes = "FetchML API key for user " + $username + | .fields = [{"name":"api_key_hash","value":$hash,"type":1}]') + +# Create item in Bitwarden +# If you ever want to edit instead, you can capture the ID from this call +# and use: bw edit item + +echo "$ITEM_JSON" | bw encode | bw create item + +echo "Created Bitwarden item: $ITEM_NAME" diff --git a/scripts/legacy/auto_setup.sh b/scripts/legacy/auto_setup.sh new file mode 100755 index 0000000..1801c74 --- /dev/null +++ b/scripts/legacy/auto_setup.sh @@ -0,0 +1,455 @@ +#!/bin/bash + +# Automatic Setup Script for ML Experiment Manager +# Handles complete environment setup with security features + +set -euo pipefail + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +print_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +detect_os() { + if [[ "$OSTYPE" == "darwin"* ]]; then + echo "macos" + elif [[ "$OSTYPE" == "linux-gnu"* ]]; then + echo "linux" + else + echo "unknown" + fi +} + +install_go() { + print_info "Installing Go..." + + local os=$(detect_os) + local go_version="1.23.0" + + if [[ "$os" == "macos" ]]; then + if command -v brew &> /dev/null; then + brew install go + else + print_error "Homebrew not found. Please install Go manually." 
+ return 1 + fi + elif [[ "$os" == "linux" ]]; then + wget -q "https://go.dev/dl/go${go_version}.linux-amd64.tar.gz" + sudo rm -rf /usr/local/go + sudo tar -C /usr/local -xzf "go${go_version}.linux-amd64.tar.gz" + rm "go${go_version}.linux-amd64.tar.gz" + + # Add to PATH + echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc + export PATH=$PATH:/usr/local/go/bin + fi + + print_success "Go installed" +} + +install_zig() { + print_info "Installing Zig..." + + local os=$(detect_os) + + if [[ "$os" == "macos" ]]; then + if command -v brew &> /dev/null; then + brew install zig + else + print_error "Homebrew not found. Please install Zig manually." + return 1 + fi + elif [[ "$os" == "linux" ]]; then + # Download Zig binary + local zig_version="0.13.0" + wget -q "https://ziglang.org/download/${zig_version}/zig-linux-x86_64-${zig_version}.tar.xz" + tar -xf "zig-linux-x86_64-${zig_version}.tar.xz" + sudo mv "zig-linux-x86_64-${zig_version}/zig" /usr/local/bin/ + rm -rf "zig-linux-x86_64-${zig_version}.tar.xz" "zig-linux-x86_64-${zig_version}" + fi + + print_success "Zig installed" +} + +install_docker() { + print_info "Installing Docker..." + + local os=$(detect_os) + + if [[ "$os" == "macos" ]]; then + if command -v brew &> /dev/null; then + brew install --cask docker + print_warning "Docker Desktop installed. Please start it manually." + else + print_error "Homebrew not found. Please install Docker manually." + return 1 + fi + elif [[ "$os" == "linux" ]]; then + # Install Docker using official script + curl -fsSL https://get.docker.com -o get-docker.sh + sudo sh get-docker.sh + sudo usermod -aG docker $USER + rm get-docker.sh + + # Start Docker + sudo systemctl enable docker + sudo systemctl start docker + + print_success "Docker installed. You may need to log out and log back in." + fi +} + +install_redis() { + print_info "Installing Redis..." 
+ + local os=$(detect_os) + + if [[ "$os" == "macos" ]]; then + if command -v brew &> /dev/null; then + brew install redis + brew services start redis + else + print_error "Homebrew not found. Please install Redis manually." + return 1 + fi + elif [[ "$os" == "linux" ]]; then + sudo apt-get update + sudo apt-get install -y redis-server + sudo systemctl enable redis-server + sudo systemctl start redis-server + fi + + print_success "Redis installed and started" +} + +install_dependencies() { + print_info "Installing dependencies..." + + local os=$(detect_os) + + # Install basic tools + if [[ "$os" == "macos" ]]; then + if command -v brew &> /dev/null; then + brew install openssl curl jq + fi + elif [[ "$os" == "linux" ]]; then + sudo apt-get update + sudo apt-get install -y openssl curl jq build-essential + fi + + # Install Go tools + if command -v go &> /dev/null; then + go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest + go install golang.org/x/tools/cmd/goimports@latest + fi + + print_success "Dependencies installed" +} + +setup_project() { + print_info "Setting up project..." + + # Create directories + mkdir -p bin + mkdir -p data + mkdir -p logs + mkdir -p db + mkdir -p ssl + mkdir -p configs + mkdir -p scripts + + # Build project + if command -v make &> /dev/null; then + make build + if command -v zig &> /dev/null; then + make cli-build + fi + else + print_warning "Make not found, building manually..." + go build -o bin/worker ./cmd/worker + go build -o bin/tui ./cmd/tui + go build -o bin/data_manager ./cmd/data_manager + go build -o bin/user_manager ./cmd/user_manager + go build -o bin/api-server ./cmd/api-server + + if command -v zig &> /dev/null; then + cd cli && zig build && cd .. + fi + fi + + print_success "Project setup completed" +} + +setup_security() { + print_info "Setting up security features..." 
+ + # Generate SSL certificates + if command -v openssl &> /dev/null; then + openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \ + -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \ + -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || { + print_warning "Failed to generate SSL certificates" + } + print_success "SSL certificates generated" + fi + + # Generate secure configuration + local redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123") + local jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234") + + cat > configs/security-config.yaml << EOF +base_path: "/data/ml-experiments" + +auth: + enabled: true + api_keys: + test_user: + hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)" + admin: true + roles: ["data_scientist", "admin"] + permissions: + read: true + write: true + delete: true + +server: + address: ":9101" + tls: + enabled: true + cert_file: "./ssl/cert.pem" + key_file: "./ssl/key.pem" + min_version: "1.3" + +security: + rate_limit: + enabled: true + requests_per_minute: 60 + burst_size: 10 + ip_whitelist: + - "127.0.0.1" + - "::1" + - "10.0.0.0/8" + - "192.168.0.0/16" + - "172.16.0.0/12" + failed_login_lockout: + enabled: true + max_attempts: 5 + lockout_duration: "15m" + +redis: + url: "redis://localhost:6379" + password: "${redis_password}" + +logging: + level: "info" + file: "logs/fetch_ml.log" + audit_log: "logs/audit.log" +EOF + + cat > .env.dev << EOF +# Development environment variables +REDIS_PASSWORD=${redis_password} +JWT_SECRET=${jwt_secret} +GRAFANA_USER=admin +GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password") +EOF + + print_success "Security configuration created" +} + +test_installation() { + print_info "Testing installation..." 
+ + local tests_passed=0 + local tests_total=0 + + # Test Go + tests_total=$((tests_total + 1)) + if command -v go &> /dev/null; then + print_success "Go: Installed" + tests_passed=$((tests_passed + 1)) + else + print_error "Go: Not found" + fi + + # Test Zig + tests_total=$((tests_total + 1)) + if command -v zig &> /dev/null; then + print_success "Zig: Installed" + tests_passed=$((tests_passed + 1)) + else + print_warning "Zig: Not found (optional)" + tests_total=$((tests_total - 1)) + fi + + # Test Docker + tests_total=$((tests_total + 1)) + if command -v docker &> /dev/null; then + print_success "Docker: Installed" + tests_passed=$((tests_passed + 1)) + else + print_warning "Docker: Not found (optional)" + tests_total=$((tests_total - 1)) + fi + + # Test Redis + tests_total=$((tests_total + 1)) + if command -v redis-cli &> /dev/null; then + if redis-cli ping | grep -q "PONG"; then + print_success "Redis: Running" + tests_passed=$((tests_passed + 1)) + else + print_warning "Redis: Not running" + fi + else + print_warning "Redis: Not found (optional)" + tests_total=$((tests_total - 1)) + fi + + # Test binaries + if [[ -f "bin/api-server" ]]; then + tests_total=$((tests_total + 1)) + if ./bin/api-server --help > /dev/null 2>&1; then + print_success "API Server: Built" + tests_passed=$((tests_passed + 1)) + else + print_error "API Server: Build failed" + fi + fi + + if [[ $tests_total -gt 0 ]]; then + local success_rate=$((tests_passed * 100 / tests_total)) + print_info "Tests: $tests_passed/$tests_total passed ($success_rate%)" + fi + + print_success "Installation testing completed" +} + +show_next_steps() { + print_success "Automatic setup completed!" + echo + echo "Next Steps:" + echo "===========" + echo "" + echo "1. Load environment variables:" + echo " source .env.dev" + echo "" + echo "2. Start the API server:" + echo " ./bin/api-server -config configs/config.yaml" + echo "" + echo "3. 
Test the Zig CLI (if installed):" + echo " ./cli/zig-out/bin/ml --help" + echo "" + echo "4. Deploy with Docker (optional):" + echo " make docker-run" + echo "" + echo "5. Docker Compose deployment:" + echo " docker-compose up -d" + echo "" + echo "Configuration Files:" + echo " configs/config.yaml # Main configuration" + echo " configs/config_local.yaml # Local development" + echo " ssl/cert.pem, ssl/key.pem # TLS certificates" + echo "" + echo "Documentation:" + echo " docs/DEPLOYMENT.md # Deployment guide" + echo "" + echo "Quick Commands:" + echo " make help # Show all commands" + echo " make test # Run tests" + echo " docker-compose up -d # Start services" + echo "" + print_success "Ready to use ML Experiment Manager!" +} + +# Main setup function +main() { + echo "ML Experiment Manager Automatic Setup" + echo "=====================================" + echo "" + + print_info "Starting automatic setup..." + echo "" + + # Check and install dependencies + if ! command -v go &> /dev/null; then + print_info "Go not found, installing..." + install_go + fi + + if ! command -v zig &> /dev/null; then + print_info "Zig not found, installing..." + install_zig + fi + + if ! command -v docker &> /dev/null; then + print_info "Docker not found, installing..." + install_docker + fi + + if ! command -v redis-cli &> /dev/null; then + print_info "Redis not found, installing..." 
+ install_redis + fi + + # Install additional dependencies + install_dependencies + + # Setup project + setup_project + + # Setup security + setup_security + + # Test installation + test_installation + + # Show next steps + show_next_steps +} + +# Handle command line arguments +case "${1:-setup}" in + "setup") + main + ;; + "deps") + install_dependencies + ;; + "test") + test_installation + ;; + "help"|"-h"|"--help") + echo "Automatic Setup Script" + echo "Usage: $0 {setup|deps|test|help}" + echo "" + echo "Commands:" + echo " setup - Run full automatic setup" + echo " deps - Install dependencies only" + echo " test - Test installation" + echo " help - Show this help" + ;; + *) + print_error "Unknown command: $1" + echo "Use '$0 help' for usage information" + exit 1 + ;; +esac diff --git a/scripts/legacy/quick_start.sh b/scripts/legacy/quick_start.sh new file mode 100755 index 0000000..700212c --- /dev/null +++ b/scripts/legacy/quick_start.sh @@ -0,0 +1,314 @@ +#!/usr/bin/env bash + +# Fetch ML Quick Start Script with Security +# Sets up development environment with security features and creates test user + +set -euo pipefail + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +print_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +check_prerequisites() { + print_info "Checking prerequisites..." + + # Check Go + if ! command -v go &> /dev/null; then + print_error "Go is not installed. Please install Go 1.25 or later." + exit 1 + fi + + local go_version=$(go version | awk '{print $3}' | sed 's/go//') + print_info "Go version: $go_version" + + # Check Zig + if ! command -v zig &> /dev/null; then + print_warning "Zig is not installed. CLI features will not be available." 
+ else + local zig_version=$(zig version) + print_info "Zig version: $zig_version" + fi + + # Check Docker + if ! command -v docker &> /dev/null; then + print_warning "Docker is not installed. Container features will not work." + fi + + # Check Redis + if ! command -v redis-server &> /dev/null && ! command -v redis-cli &> /dev/null; then + print_warning "Redis is not installed. Starting local Redis..." + fi + + # Check OpenSSL for certificates + if ! command -v openssl &> /dev/null; then + print_warning "OpenSSL is not installed. TLS certificates will not be generated." + fi + + print_success "Prerequisites checked" +} + +setup_project() { + print_info "Setting up Fetch ML project..." + + # Create directories + mkdir -p bin + mkdir -p data + mkdir -p logs + mkdir -p db + mkdir -p ssl + mkdir -p configs + + print_success "Project directories created" +} + +build_project() { + print_info "Building Fetch ML..." + + # Build Go binaries + make build + + # Build Zig CLI if available + if command -v zig &> /dev/null; then + make cli-build + print_success "Zig CLI built" + fi + + print_success "Build completed" +} + +generate_ssl_certificates() { + print_info "Generating SSL certificates..." + + if command -v openssl &> /dev/null; then + # Generate self-signed certificate for development + openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \ + -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \ + -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || { + print_warning "Failed to generate SSL certificates" + return 1 + } + + print_success "SSL certificates generated in ssl/" + print_info "Certificates are self-signed (development only)" + else + print_warning "OpenSSL not available, skipping SSL certificates" + fi +} + +setup_redis() { + print_info "Setting up Redis..." + + if command -v redis-server &> /dev/null; then + if ! 
pgrep -f "redis-server" > /dev/null; then + redis-server --daemonize yes --port 6379 + print_success "Redis started" + else + print_info "Redis already running" + fi + else + print_warning "Redis not available, some features may be limited" + fi +} + +create_secure_config() { + print_info "Creating secure development configuration..." + + # Generate secure passwords and secrets + local redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123") + local jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234") + + # Create development config + cat > configs/config.yaml << EOF +base_path: "/data/ml-experiments" + +auth: + enabled: true + api_keys: + test_user: + hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)" + admin: true + roles: ["data_scientist", "admin"] + permissions: + read: true + write: true + delete: true + +server: + address: ":9101" + tls: + enabled: true + cert_file: "./ssl/cert.pem" + key_file: "./ssl/key.pem" + min_version: "1.3" + +security: + rate_limit: + enabled: true + requests_per_minute: 60 + burst_size: 10 + ip_whitelist: + - "127.0.0.1" + - "::1" + - "10.0.0.0/8" + - "192.168.0.0/16" + - "172.16.0.0/12" + failed_login_lockout: + enabled: true + max_attempts: 5 + lockout_duration: "15m" + +redis: + url: "redis://localhost:6379" + password: "${redis_password}" + +logging: + level: "info" + file: "logs/fetch_ml.log" + audit_log: "logs/audit.log" +EOF + + # Create environment file + cat > .env.dev << EOF +# Development environment variables +REDIS_PASSWORD=${redis_password} +JWT_SECRET=${jwt_secret} +GRAFANA_USER=admin +GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password") +EOF + + print_success "Secure configuration created" + print_warning "Using development certificates and passwords" +} + +create_test_user() { + print_info "Creating test user..." 
+ + # Generate API key for test user + local api_key="dev_test_api_key_12345" + local api_key_hash=$(echo -n "$api_key" | sha256sum | cut -d' ' -f1) + + print_success "Test user created successfully" + echo "Username: test_user" + echo "API Key: $api_key" + echo "API Key Hash: $api_key_hash" + echo "Store this key safely!" + echo "" + echo "Environment variables in .env.dev" + echo "Run: source .env.dev" +} + +test_setup() { + print_info "Testing setup..." + + # Test Go binaries + if [[ -f "bin/api-server" ]]; then + ./bin/api-server --help > /dev/null 2>&1 || true + print_success "API server binary OK" + fi + + if [[ -f "bin/worker" ]]; then + ./bin/worker --help > /dev/null 2>&1 || true + print_success "Worker binary OK" + fi + + # Test Zig CLI + if [[ -f "cli/zig-out/bin/ml" ]]; then + ./cli/zig-out/bin/ml --help > /dev/null 2>&1 || true + print_success "Zig CLI binary OK" + fi + + # Test Redis connection + if command -v redis-cli &> /dev/null; then + if redis-cli ping > /dev/null 2>&1; then + print_success "Redis connection OK" + else + print_warning "Redis not responding" + fi + fi + + # Test SSL certificates + if [[ -f "ssl/cert.pem" && -f "ssl/key.pem" ]]; then + if openssl x509 -in ssl/cert.pem -noout -checkend 86400 > /dev/null 2>&1; then + print_success "SSL certificates valid" + else + print_warning "SSL certificates expired or invalid" + fi + fi +} + +show_next_steps() { + print_success "Secure quick start completed!" + echo + echo "Next steps:" + echo "1. Load environment variables:" + echo " source .env.dev" + echo + echo "2. Start API server:" + echo " ./bin/api-server -config configs/config.yaml" + echo + echo "3. Test Zig CLI:" + echo " ./cli/zig-out/bin/ml --help" + echo + echo "4. Test with curl (HTTPS):" + echo " curl -k -H 'X-API-Key: dev_test_api_key_12345' https://localhost:9101/health" + echo + echo "5. 
Deploy with Docker:" + echo " docker-compose up -d" + echo + echo "Features Enabled:" + echo " ✅ HTTPS/TLS encryption" + echo " ✅ API key authentication" + echo " ✅ Rate limiting" + echo " ✅ IP whitelisting" + echo " ✅ Security headers" + echo " ✅ Audit logging" + echo + echo "Configuration Files:" + echo " configs/config.yaml # Main configuration" + echo " .env.dev # Environment variables" + echo " ssl/cert.pem, ssl/key.pem # TLS certificates" + echo + echo "Documentation:" + echo " docs/DEPLOYMENT.md # Deployment guide" + echo "" + print_success "Ready to run ML experiments!" +} + +# Main function +main() { + echo "Fetch ML Quick Start Script (with Security & Zig CLI)" + echo "====================================================" + echo "" + + check_prerequisites + setup_project + build_project + generate_ssl_certificates + setup_redis + create_secure_config + create_test_user + test_setup + show_next_steps +} + +# Run main function +main "$@" diff --git a/scripts/legacy/setup_common.sh b/scripts/legacy/setup_common.sh new file mode 100755 index 0000000..54040ea --- /dev/null +++ b/scripts/legacy/setup_common.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash + +# Shared helper functions for Fetch ML setup scripts (Ubuntu/Rocky) +set -euo pipefail + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# Configuration defaults +FETCH_ML_USER="fetchml" +FETCH_ML_HOME="/opt/fetchml" +SERVICE_DIR="/etc/systemd/system" +LOG_DIR="/var/log/fetchml" +DATA_DIR="/var/lib/fetchml" +CONFIG_DIR="$FETCH_ML_HOME/configs" + +log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } +log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; } +log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +# Download file with checksum verification +# Args: url, checksum, dest +secure_download() { + local url="$1" checksum="$2" dest="$3" + curl -fsSL "$url" -o "$dest" + echo "$checksum $dest" | sha256sum 
--check --status || { + log_error "Checksum verification failed for $dest" + rm -f "$dest" + exit 1 + } +} + +cleanup_temp() { + if [[ -n "${TMP_FILES:-}" ]]; then + rm -f $TMP_FILES || true + fi +} +trap cleanup_temp EXIT + +ensure_user() { + if ! id "$FETCH_ML_USER" &>/dev/null; then + useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER" + fi + usermod -aG podman "$FETCH_ML_USER" || true +} + +create_directories() { + mkdir -p "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR" "$FETCH_ML_HOME/bin" "$CONFIG_DIR" + chown -R "$FETCH_ML_USER":"$FETCH_ML_USER" "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR" +} + +setup_systemd_service() { + local name="$1" exec="$2" + cat > "$SERVICE_DIR/${name}.service" < /etc/logrotate.d/fetch_ml <<'EOF' +/var/log/fetchml/*.log { + daily + missingok + rotate 14 + compress + delaycompress + notifempty + create 0640 fetchml fetchml +} +EOF +} + +hardening_steps() { + # Increase file limits + if ! grep -q fetchml /etc/security/limits.conf; then + cat >> /etc/security/limits.conf <<'EOF' +fetchml soft nofile 65536 +fetchml hard nofile 65536 +EOF + fi + + # Enable unattended security upgrades if available + if command -v apt-get &>/dev/null; then + apt-get install -y unattended-upgrades >/dev/null || true + elif command -v dnf &>/dev/null; then + dnf install -y dnf-automatic >/dev/null || true + fi +} + +selinux_guidance() { + if command -v getenforce &>/dev/null; then + local mode=$(getenforce) + log_info "SELinux mode: $mode" + if [[ "$mode" == "Enforcing" ]]; then + log_info "Ensure systemd units and directories have proper contexts. 
Example:" + echo " semanage fcontext -a -t bin_t '$FETCH_ML_HOME/bin(/.*)?'" + echo " restorecon -Rv $FETCH_ML_HOME/bin" + fi + fi +} diff --git a/scripts/legacy/setup_rocky.sh b/scripts/legacy/setup_rocky.sh new file mode 100755 index 0000000..6a5205b --- /dev/null +++ b/scripts/legacy/setup_rocky.sh @@ -0,0 +1,417 @@ +#!/usr/bin/env bash + +# Fetch ML Rocky Linux Setup Script +# Optimized for ML experiments on Rocky Linux 8/9 + +set -euo pipefail + +# shellcheck source=scripts/setup_common.sh +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +source "$SCRIPT_DIR/setup_common.sh" + +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "This script must be run as root" + exit 1 + fi +} + +check_rocky() { + if ! command -v dnf &> /dev/null && ! command -v yum &> /dev/null; then + log_error "This script is designed for Rocky Linux systems" + exit 1 + fi + + local rocky_version=$(cat /etc/rocky-release | grep -oE '[0-9]+\.[0-9]+') + log_info "Rocky Linux version: $rocky_version" + + # Use dnf for Rocky 9+, yum for Rocky 8 + if command -v dnf &> /dev/null; then + PKG_MANAGER="dnf" + else + PKG_MANAGER="yum" + fi +} + +update_system() { + log_info "Updating system packages..." + $PKG_MANAGER update -y + $PKG_MANAGER upgrade -y + $PKG_MANAGER install -y curl wget gnupg2 +} + +enable_epel() { + log_info "Enabling EPEL repository..." + + if $PKG_MANAGER repolist | grep -q "epel"; then + log_info "EPEL already enabled" + return + fi + + $PKG_MANAGER install -y epel-release + $PKG_MANAGER config-manager --set-enabled powertools + + log_success "EPEL repository enabled" +} + +install_go() { + log_info "Installing Go 1.25..." 
+ + if command -v go &> /dev/null; then + local go_version=$(go version | awk '{print $3}' | sed 's/go//') + log_info "Go already installed: $go_version" + return + fi + + cd /tmp + TMP_FILES="/tmp/go1.25.0.linux-amd64.tar.gz" + secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" "/tmp/go1.25.0.linux-amd64.tar.gz" + tar -C /usr/local -xzf go1.25.0.linux-amd64.tar.gz + + # Add to PATH + echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile + echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile + export PATH=$PATH:/usr/local/go/bin + + log_success "Go 1.25 installed" +} + +install_podman() { + log_info "Installing Podman..." + + if command -v podman &> /dev/null; then + log_info "Podman already installed" + return + fi + + # Install Podman and related tools + $PKG_MANAGER install -y podman podman-compose containernetworking-plugins + + # Configure Podman + mkdir -p /etc/containers + cat > /etc/containers/containers.conf << EOF +[containers] +user_namespace_enable = 1 +runtime = "crun" + +[network] +network_backend = "netavark" + +[engine] +cgroup_manager = "systemd" +EOF + + # Enable user namespaces + echo "user.max_user_namespaces=15000" >> /etc/sysctl.conf + sysctl -p user.max_user_namespaces=15000 + + log_success "Podman installed" +} + +install_redis() { + log_info "Installing Redis..." + + if command -v redis-server &> /dev/null; then + log_info "Redis already installed" + return + fi + + $PKG_MANAGER install -y redis + + # Configure Redis for production + sed -i 's/supervised no/supervised systemd/' /etc/redis.conf + sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis.conf + + systemctl enable redis + systemctl start redis + + log_success "Redis installed and configured" +} + +install_nvidia_drivers() { + log_info "Checking for NVIDIA GPU..." 
+ + if command -v nvidia-smi &> /dev/null; then + log_info "NVIDIA drivers already installed" + nvidia-smi + return + fi + + if lspci | grep -i nvidia &> /dev/null; then + log_info "NVIDIA GPU detected, installing drivers..." + + # Enable NVIDIA repository + $PKG_MANAGER config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel$(rpm -E %rhel)/x86_64/cuda-rhel.repo + + # Clean and install + $PKG_MANAGER clean all + $PKG_MANAGER module enable -y nvidia-driver:latest-dkms + $PKG_MANAGER install -y nvidia-driver nvidia-cuda-toolkit + + # Configure Podman for NVIDIA (only if needed) + if ! podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then + log_warning "NVIDIA GPU access test failed, you may need to reboot" + else + log_success "NVIDIA drivers installed and GPU access verified" + fi + + # Reboot required + log_warning "System reboot required for NVIDIA drivers" + log_info "Run: reboot" + else + log_info "No NVIDIA GPU detected, skipping driver installation" + fi +} + +install_ml_tools() { + log_info "Installing ML tools and dependencies..." + + # Python and ML packages + $PKG_MANAGER install -y python3 python3-pip python3-devel + + # System dependencies for ML + $PKG_MANAGER groupinstall -y "Development Tools" + $PKG_MANAGER install -y cmake git pkgconfig + $PKG_MANAGER install -y libjpeg-turbo-devel libpng-devel libtiff-devel + $PKG_MANAGER install -y mesa-libGL-devel mesa-libGLU-devel + $PKG_MANAGER install -y gtk3-devel + $PKG_MANAGER install -y atlas-devel blas-devel lapack-devel + + # Install common ML libraries + pip3 install --upgrade pip + pip3 install numpy scipy scikit-learn pandas + pip3 install jupyter matplotlib seaborn + pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + + log_success "ML tools installed" +} + +create_user() { + log_info "Creating fetchml user..." 
+ + if id "$FETCH_ML_USER" &>/dev/null; then + log_info "User $FETCH_ML_USER already exists" + return + fi + + useradd -m -d $FETCH_ML_HOME -s /bin/bash $FETCH_ML_USER + usermod -aG podman $FETCH_ML_USER + + # Create directories + mkdir -p $FETCH_ML_HOME/.config/containers + mkdir -p $FETCH_ML_HOME/go/bin + mkdir -p $LOG_DIR + mkdir -p $DATA_DIR + + chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME + chown -R $FETCH_ML_USER:$FETCH_ML_USER $LOG_DIR + chown -R $FETCH_ML_USER:$FETCH_ML_USER $DATA_DIR + + log_success "User $FETCH_ML_USER created" +} + +setup_firewall() { + log_info "Configuring firewall..." + + if command -v firewall-cmd &> /dev/null; then + systemctl enable firewalld + systemctl start firewalld + + firewall-cmd --permanent --add-service=ssh + firewall-cmd --permanent --add-port=8080/tcp # Worker API + firewall-cmd --permanent --add-port=8081/tcp # Data manager API + firewall-cmd --permanent --add-port=6379/tcp # Redis + firewall-cmd --reload + + firewall-cmd --list-all + else + log_warning "Firewalld not available, skipping firewall configuration" + fi +} + +setup_systemd_services() { + log_info "Setting up systemd services..." 
+ + # Fetch ML Worker service + cat > $SERVICE_DIR/fetch_ml_worker.service << EOF +[Unit] +Description=Fetch ML Worker Service +After=network.target redis.service +Wants=redis.service + +[Service] +Type=simple +User=$FETCH_ML_USER +Group=$FETCH_ML_USER +WorkingDirectory=$FETCH_ML_HOME +Environment=FETCH_ML_HOME=$FETCH_ML_HOME +Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin +ExecStart=$FETCH_ML_HOME/bin/worker --config $FETCH_ML_HOME/configs/config-local.yaml +Restart=always +RestartSec=5 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=fetch_ml_worker + +[Install] +WantedBy=multi-user.target +EOF + + # Fetch ML Data Manager service + cat > $SERVICE_DIR/fetch_ml_data_manager.service << EOF +[Unit] +Description=Fetch ML Data Manager Service +After=network.target redis.service +Wants=redis.service + +[Service] +Type=simple +User=$FETCH_ML_USER +Group=$FETCH_ML_USER +WorkingDirectory=$FETCH_ML_HOME +Environment=FETCH_ML_HOME=$FETCH_ML_HOME +Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin +ExecStart=$FETCH_ML_HOME/bin/data_manager --config $FETCH_ML_HOME/configs/config-local.yaml +Restart=always +RestartSec=5 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=fetch_ml_data_manager + +[Install] +WantedBy=multi-user.target +EOF + + # Enable services + systemctl daemon-reload + systemctl enable fetch_ml_worker + systemctl enable fetch_ml_data_manager + + log_success "Systemd services configured" +} + +setup_log_rotation() { + log_info "Setting up log rotation..." + + cat > /etc/logrotate.d/fetch_ml << EOF +$LOG_DIR/*.log { + daily + missingok + rotate 30 + compress + delaycompress + notifempty + create 0644 $FETCH_ML_USER $FETCH_ML_USER + postrotate + systemctl reload fetch_ml_worker || true + systemctl reload fetch_ml_data_manager || true + endscript +} +EOF + + log_success "Log rotation configured" +} + +optimize_system() { + log_info "Optimizing system for ML workloads..." 
+ + # Increase file limits + echo "* soft nofile 65536" >> /etc/security/limits.conf + echo "* hard nofile 65536" >> /etc/security/limits.conf + + # Optimize kernel parameters for ML + cat >> /etc/sysctl.conf << EOF +# ML Optimization +net.core.rmem_max = 134217728 +net.core.wmem_max = 134217728 +vm.swappiness = 10 +vm.dirty_ratio = 15 +vm.dirty_background_ratio = 5 +EOF + + sysctl -p + + # Configure GPU persistence mode if NVIDIA available + if command -v nvidia-smi &> /dev/null; then + nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode" + fi + + # Disable SELinux for better container compatibility (optional) + if [[ -f /etc/selinux/config ]]; then + log_warning "Consider setting SELinux to permissive mode for better container compatibility" + log_info "Edit /etc/selinux/config and set SELINUX=permissive" + fi + + log_success "System optimized for ML workloads" +} + +install_fetch_ml() { + log_info "Installing Fetch ML..." + + # Clone or copy Fetch ML + cd $FETCH_ML_HOME + + if [[ ! -d "fetch_ml" ]]; then + log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml" + log_info "Example: git clone https://github.com/your-org/fetch_ml.git" + return + fi + + cd fetch_ml + + # Build + export PATH=$PATH:/usr/local/go/bin + make build + + # Copy binaries + cp bin/* $FETCH_ML_HOME/bin/ + chmod +x $FETCH_ML_HOME/bin/* + + # Copy configs + mkdir -p $FETCH_ML_HOME/configs + cp configs/config-local.yaml.example $FETCH_ML_HOME/configs/config-local.yaml + + # Set permissions + chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME + + log_success "Fetch ML installed" +} + +main() { + log_info "Starting Fetch ML Rocky Linux server setup..." 
+ + check_root + check_rocky + + update_system + enable_epel + install_go + install_podman + install_redis + install_nvidia_drivers + install_ml_tools + ensure_user + create_directories + setup_firewall + setup_systemd_services + setup_logrotate + hardening_steps + selinux_guidance + install_fetch_ml + + log_success "Fetch ML setup complete!" + echo + log_info "Next steps:" + echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml" + echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml" + echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager" + echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager" + echo "5. View logs: journalctl -u fetch_ml_worker -f" + echo + log_info "Services will be available at:" + echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080" + echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081" +} + +# Run main function +main "$@" diff --git a/scripts/legacy/setup_ubuntu.sh b/scripts/legacy/setup_ubuntu.sh new file mode 100755 index 0000000..2a112ef --- /dev/null +++ b/scripts/legacy/setup_ubuntu.sh @@ -0,0 +1,294 @@ +#!/usr/bin/env bash + +# Fetch ML Ubuntu Server Setup Script +# Optimized for ML experiments on Ubuntu 20.04/22.04 + +set -euo pipefail + +# shellcheck source=scripts/setup_common.sh +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +source "$SCRIPT_DIR/setup_common.sh" + +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "This script must be run as root" + exit 1 + fi +} + +check_ubuntu() { + if ! 
command -v apt-get &> /dev/null; then + log_error "This script is designed for Ubuntu systems" + exit 1 + fi + + local ubuntu_version=$(lsb_release -rs) + log_info "Ubuntu version: $ubuntu_version" + + if (( $(echo "$ubuntu_version < 20.04" | bc -l) == 1 )); then + log_warning "Ubuntu version < 20.04 may not support all features" + fi +} + +update_system() { + log_info "Updating system packages..." + apt-get update -y + apt-get upgrade -y + apt-get install -y curl wget gnupg lsb-release software-properties-common +} + +install_go() { + log_info "Installing Go 1.25..." + + if command -v go &> /dev/null; then + local go_version=$(go version | awk '{print $3}' | sed 's/go//') + log_info "Go already installed: $go_version" + return + fi + + cd /tmp + TMP_FILES="/tmp/go1.25.0.linux-amd64.tar.gz" + secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" "/tmp/go1.25.0.linux-amd64.tar.gz" + tar -C /usr/local -xzf go1.25.0.linux-amd64.tar.gz + + # Add to PATH + echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile + echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile + export PATH=$PATH:/usr/local/go/bin + + log_success "Go 1.25 installed" +} + +install_podman() { + log_info "Installing Podman..." 
+ + if command -v podman &> /dev/null; then + log_info "Podman already installed" + return + fi + + # Add official Podman repository + echo "deb https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_$(lsb_release -rs)/ /" | tee /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list + curl -L "https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_$(lsb_release -rs)/Release.key" | apt-key add - + + apt-get update -y + apt-get install -y podman podman-compose + + # Configure Podman for rootless operation + echo "user_namespace_enable = 1" >> /etc/containers/containers.conf + echo "runtime = \"crun\"" >> /etc/containers/containers.conf + + log_success "Podman installed" +} + +install_redis() { + log_info "Installing Redis..." + + if command -v redis-server &> /dev/null; then + log_info "Redis already installed" + return + fi + + apt-get install -y redis-server + + # Configure Redis for production + sed -i 's/supervised no/supervised systemd/' /etc/redis/redis.conf + sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis/redis.conf + + systemctl enable redis-server + systemctl start redis-server + + log_success "Redis installed and configured" +} + +install_nvidia_drivers() { + log_info "Checking for NVIDIA GPU..." + + if command -v nvidia-smi &> /dev/null; then + log_info "NVIDIA drivers already installed" + nvidia-smi + return + fi + + if lspci | grep -i nvidia &> /dev/null; then + log_info "NVIDIA GPU detected, installing drivers..." + + # Add NVIDIA repository + TMP_FILES="/tmp/cuda-keyring_1.1-1_all.deb" + secure_download "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu$(lsb_release -rs | cut -d. 
-f1)/x86_64/cuda-keyring_1.1-1_all.deb" "cfa6b4109e7e3d9be060a016b7dc07e8edcd5356c0eabcc0c537a76e6c603d76" "/tmp/cuda-keyring_1.1-1_all.deb" + dpkg -i /tmp/cuda-keyring_1.1-1_all.deb + apt-get update -y + + # Install drivers + apt-get install -y nvidia-driver-535 nvidia-cuda-toolkit + + # Configure Podman for NVIDIA (only if needed) + if ! podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then + log_warning "NVIDIA GPU access test failed, you may need to reboot" + else + log_success "NVIDIA drivers installed and GPU access verified" + fi + + else + log_info "No NVIDIA GPU detected, skipping driver installation" + fi +} + +install_ml_tools() { + log_info "Installing ML tools and dependencies..." + + # Python and ML packages + apt-get install -y python3 python3-pip python3-venv + + # System dependencies for ML + apt-get install -y build-essential cmake git pkg-config + apt-get install -y libjpeg-dev libpng-dev libtiff-dev + apt-get install -y libavcodec-dev libavformat-dev libswscale-dev + apt-get install -y libgtk2.0-dev libcanberra-gtk-module + apt-get install -y libxvidcore-dev libx264-dev + apt-get install -y libatlas-base-dev gfortran + + # Install common ML libraries + pip3 install --upgrade pip + pip3 install numpy scipy scikit-learn pandas + pip3 install jupyter matplotlib seaborn + pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + + log_success "ML tools installed" +} + +create_user() { + log_info "Creating fetchml user..." + ensure_user + create_directories + log_success "User $FETCH_ML_USER and directories created" +} + +setup_firewall() { + log_info "Configuring firewall..." 
+ + if command -v ufw &> /dev/null; then + ufw --force enable + ufw allow ssh + ufw allow 8080/tcp # Worker API + ufw allow 8081/tcp # Data manager API + ufw allow 6379/tcp # Redis + ufw status + else + log_warning "UFW not available, skipping firewall configuration" + fi +} + +setup_systemd_services() { + log_info "Setting up systemd services..." + + setup_systemd_service "fetch_ml_worker" "$FETCH_ML_HOME/bin/worker --config $FETCH_ML_HOME/configs/config-local.yaml" + setup_systemd_service "fetch_ml_data_manager" "$FETCH_ML_HOME/bin/data_manager --config $FETCH_ML_HOME/configs/config-local.yaml" + + # Enable services + systemctl daemon-reload + systemctl enable fetch_ml_worker + systemctl enable fetch_ml_data_manager + + log_success "Systemd services configured" +} + +setup_log_rotation() { + log_info "Setting up log rotation..." + setup_logrotate + log_success "Log rotation configured" +} + +optimize_system() { + log_info "Optimizing system for ML workloads..." + hardening_steps + + # Optimize kernel parameters for ML + cat >> /etc/sysctl.conf << EOF +# ML Optimization +net.core.rmem_max = 134217728 +net.core.wmem_max = 134217728 +vm.swappiness = 10 +vm.dirty_ratio = 15 +vm.dirty_background_ratio = 5 +EOF + + sysctl -p + + # Configure GPU persistence mode if NVIDIA available + if command -v nvidia-smi &> /dev/null; then + nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode" + fi + + log_success "System optimized for ML workloads" +} + +install_fetch_ml() { + log_info "Installing Fetch ML..." + + # Clone or copy Fetch ML + cd $FETCH_ML_HOME + + if [[ ! 
-d "fetch_ml" ]]; then + # This would be replaced with actual repository URL + log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml" + log_info "Example: git clone https://github.com/your-org/fetch_ml.git" + return + fi + + cd fetch_ml + + # Build + export PATH=$PATH:/usr/local/go/bin + make build + + # Copy binaries + cp bin/* $FETCH_ML_HOME/bin/ + chmod +x $FETCH_ML_HOME/bin/* + + # Copy configs + mkdir -p $FETCH_ML_HOME/configs + cp configs/config-local.yaml.example $FETCH_ML_HOME/configs/config-local.yaml + + # Set permissions + chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME + + log_success "Fetch ML installed" +} + +main() { + log_info "Starting Fetch ML Ubuntu server setup..." + + check_root + check_ubuntu + + update_system + install_go + install_podman + install_redis + install_nvidia_drivers + install_ml_tools + ensure_user + create_directories + setup_firewall + setup_systemd_services + setup_logrotate + hardening_steps + install_fetch_ml + + log_success "Fetch ML setup complete!" + echo + log_info "Next steps:" + echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml" + echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml" + echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager" + echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager" + echo "5. 
View logs: journalctl -u fetch_ml_worker -f" + echo + log_info "Services will be available at:" + echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080" + echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081" +} + +# Run main function +main "$@" diff --git a/scripts/legacy/test_tools.sh b/scripts/legacy/test_tools.sh new file mode 100755 index 0000000..efd1cfb --- /dev/null +++ b/scripts/legacy/test_tools.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -e + +echo "=== Test Tools Harness ===" + +# Function to check if Redis is running, start temporary instance if needed +ensure_redis() { + if ! redis-cli ping >/dev/null 2>&1; then + echo "Starting temporary Redis instance..." + redis-server --daemonize yes --port 6379 + sleep 2 + if ! redis-cli ping >/dev/null 2>&1; then + echo "Failed to start Redis" + exit 1 + fi + echo "Redis started successfully" + # Set up cleanup trap + trap 'echo "Stopping temporary Redis..."; redis-cli shutdown || true' EXIT + else + echo "Redis is already running" + fi +} + +# Step 1: Build Go binaries +echo "Building Go binaries..." +go build -o bin/api-server ./cmd/api-server +go build -o bin/worker ./cmd/worker +go build -o bin/data_manager ./cmd/data_manager +go build -o bin/user_manager ./cmd/user_manager + +# Step 2: Build Zig CLI +echo "Building Zig CLI..." +cd cli +zig build +cd .. + +# Step 3: Ensure Redis is running +ensure_redis + +# Step 4: Run Go tests +echo "Running Go tests..." +go test ./... + +# Step 5: Run Zig tests +echo "Running Zig CLI tests..." +cd cli +zig test +cd .. + +# Step 6: Run Go E2E tests (Redis is already available) +echo "Running Go E2E tests..." +go test ./tests/e2e/... + +# Step 7: Smoke test API server and CLI +echo "Running smoke test..." +# Start API server in background on different port +./bin/api-server -config configs/config.yaml -port 19101 -no-tls > /tmp/api-server.log 2>&1 & +API_PID=$! 
+sleep 2 + +# Test CLI status +./cli/zig-out/bin/ml status -server http://localhost:19101 + +# Clean up +kill $API_PID 2>/dev/null || true + +echo "=== All tests completed successfully ===" diff --git a/scripts/lib/common.sh b/scripts/lib/common.sh new file mode 100755 index 0000000..66f3b88 --- /dev/null +++ b/scripts/lib/common.sh @@ -0,0 +1,183 @@ +#!/bin/bash +# Common shell functions for FetchML scripts +# Source this file in other scripts: source "$(dirname "$0")/lib/common.sh" + +# Colors for output +export BOLD='\033[1m' +export GREEN='\033[0;32m' +export BLUE='\033[0;34m' +export YELLOW='\033[0;33m' +export RED='\033[0;31m' +export NC='\033[0m' + +################### +# Logging functions +################### + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}✓${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +log_step() { + local step=$1 + local total=$2 + local message=$3 + echo -e "${BLUE}[$step/$total]${NC} $message" +} + +print_header() { + local title=$1 + echo "" + echo -e "${BOLD}=== $title ===${NC}" + echo "" +} + +################### +# System detection +################### + +detect_distro() { + if [ -f /etc/os-release ]; then + . 
/etc/os-release + export DISTRO=$ID + export DISTRO_VERSION=$VERSION_ID + elif [ -f /etc/redhat-release ]; then + export DISTRO="rhel" + export DISTRO_VERSION="unknown" + else + export DISTRO="unknown" + export DISTRO_VERSION="unknown" + fi + + # Detect package manager + if command -v dnf &>/dev/null; then + export PKG_MANAGER="dnf" + elif command -v yum &>/dev/null; then + export PKG_MANAGER="yum" + elif command -v apt-get &>/dev/null; then + export PKG_MANAGER="apt" + elif command -v pacman &>/dev/null; then + export PKG_MANAGER="pacman" + elif command -v zypper &>/dev/null; then + export PKG_MANAGER="zypper" + else + log_warn "No known package manager found" + export PKG_MANAGER="unknown" + fi + + log_info "Detected: $DISTRO $DISTRO_VERSION (using $PKG_MANAGER)" +} + +################### +# Utility functions +################### + +check_command() { + local cmd=$1 + local install_hint=$2 + + if ! command -v "$cmd" &>/dev/null; then + log_error "$cmd not found" + if [ -n "$install_hint" ]; then + log_info "Install with: $install_hint" + fi + return 1 + fi + return 0 +} + +check_root() { + if [ "$EUID" -ne 0 ]; then + log_error "This script must be run with sudo" + exit 1 + fi +} + +confirm() { + local prompt=$1 + local default=${2:-n} + + if [ "$default" = "y" ]; then + read -p "$prompt [Y/n]: " -n 1 -r + else + read -p "$prompt [y/N]: " -n 1 -r + fi + echo + + if [[ $REPLY =~ ^[Yy]$ ]]; then + return 0 + else + return 1 + fi +} + +################### +# Firewall management +################### + +setup_firewall() { + local port=$1 + local comment=${2:-"FetchML"} + + if command -v firewall-cmd &>/dev/null; then + # RHEL/Rocky/Fedora (firewalld) + sudo firewall-cmd --permanent --add-port="${port}/tcp" >/dev/null 2>&1 + log_success "Firewall rule added (firewalld): ${port}/tcp" + return 0 + elif command -v ufw &>/dev/null; then + # Ubuntu/Debian (ufw) + sudo ufw allow "${port}/tcp" comment "$comment" >/dev/null 2>&1 + log_success "Firewall rule added (ufw): 
${port}/tcp" + return 0 + else + log_warn "No firewall detected. Manually open port ${port}/tcp" + return 1 + fi +} + +reload_firewall() { + if command -v firewall-cmd &>/dev/null; then + sudo firewall-cmd --reload >/dev/null 2>&1 + log_success "Firewall reloaded (firewalld)" + elif command -v ufw &>/dev/null; then + sudo ufw reload >/dev/null 2>&1 || true + log_success "Firewall reloaded (ufw)" + fi +} + +################### +# File/directory management +################### + +create_dir() { + local dir=$1 + local owner=${2:-$USER} + local group=${3:-$(id -gn)} + + sudo mkdir -p "$dir" + sudo chown "$owner:$group" "$dir" + sudo chmod 755 "$dir" + log_success "Created: $dir" +} + +check_service() { + local service=$1 + + if systemctl list-unit-files | grep -q "^${service}"; then + return 0 + else + return 1 + fi +} diff --git a/scripts/setup-monitoring-prod.sh b/scripts/setup-monitoring-prod.sh new file mode 100755 index 0000000..bc1319f --- /dev/null +++ b/scripts/setup-monitoring-prod.sh @@ -0,0 +1,275 @@ +#!/bin/bash +# Production Monitoring Stack Setup for Linux +# Deploys Prometheus/Grafana/Loki/Promtail as Podman containers with systemd +# Compatible with: Rocky/RHEL/CentOS, Ubuntu/Debian, Arch, SUSE, etc. + +set -e + +BOLD='\033[1m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[0;33m' +NC='\033[0m' + +echo -e "${BOLD}=== FetchML Monitoring Stack Setup (Linux) ===${NC}\n" + +# Detect Linux distribution and package manager +detect_distro() { + if [ -f /etc/os-release ]; then + . 
/etc/os-release + DISTRO=$ID + DISTRO_VERSION=$VERSION_ID + elif [ -f /etc/redhat-release ]; then + DISTRO="rhel" + else + DISTRO="unknown" + fi + + # Detect package manager + if command -v dnf &>/dev/null; then + PKG_MANAGER="dnf" + elif command -v yum &>/dev/null; then + PKG_MANAGER="yum" + elif command -v apt-get &>/dev/null; then + PKG_MANAGER="apt" + elif command -v pacman &>/dev/null; then + PKG_MANAGER="pacman" + elif command -v zypper &>/dev/null; then + PKG_MANAGER="zypper" + else + echo -e "${YELLOW}Warning: No known package manager found${NC}" + PKG_MANAGER="unknown" + fi + + echo "Detected distribution: $DISTRO (using $PKG_MANAGER)" +} + +detect_distro + +# Configuration +DATA_PATH="${1:-/data/monitoring}" +ML_USER="${2:-ml-user}" +ML_GROUP="${3:-ml-group}" + +echo "Configuration:" +echo " Monitoring data path: $DATA_PATH" +echo " User: $ML_USER" +echo " Group: $ML_GROUP" +echo "" + +# Create pod for monitoring stack +POD_NAME="monitoring" + +# 1. Create directories +echo -e "${BLUE}[1/6]${NC} Creating directory structure..." +sudo mkdir -p "${DATA_PATH}"/{prometheus,grafana,loki,promtail-config} +sudo mkdir -p /etc/fetch_ml/monitoring +sudo mkdir -p /var/lib/grafana/dashboards + +sudo chown -R $ML_USER:$ML_GROUP $DATA_PATH +sudo chmod 755 $DATA_PATH + +echo -e "${GREEN}✓${NC} Directories created" + +# 2. Copy configuration files +echo -e "${BLUE}[2/6]${NC} Copying configuration files..." 
+sudo cp monitoring/prometheus.yml /etc/fetch_ml/monitoring/ +sudo cp monitoring/loki-config.yml /etc/fetch_ml/monitoring/ +sudo cp monitoring/promtail-config.yml /etc/fetch_ml/monitoring/ +sudo cp monitoring/grafana/provisioning /etc/fetch_ml/monitoring/ -r +sudo cp monitoring/grafana-dashboard.json /var/lib/grafana/dashboards/ml-queue.json +sudo cp monitoring/logs-dashboard.json /var/lib/grafana/dashboards/logs.json + +sudo chown -R $ML_USER:$ML_GROUP /etc/fetch_ml/monitoring +sudo chown -R $ML_USER:$ML_GROUP /var/lib/grafana + +echo -e "${GREEN}✓${NC} Configuration copied" + +# 3. Create Podman pod +echo -e "${BLUE}[3/6]${NC} Creating Podman pod..." +sudo -u $ML_USER podman pod create \\ + --name $POD_NAME \\ + -p 3000:3000 \\ + -p 9090:9090 \\ + -p 3100:3100 \\ + || echo "Pod may already exist" + +echo -e "${GREEN}✓${NC} Pod created" + +# 4. Create systemd service for monitoring pod +echo -e "${BLUE}[4/6]${NC} Creating systemd services..." + +# Prometheus service +sudo tee /etc/systemd/system/prometheus.service >/dev/null </dev/null </dev/null </dev/null </dev/null + +sudo systemctl daemon-reload +echo -e "${GREEN}✓${NC} Pod service created" + +# 6. Setup firewall rules +echo -e "${BLUE}[6/6]${NC} Configuring firewall..." +if command -v firewall-cmd &>/dev/null; then + # RHEL/Rocky/Fedora (firewalld) + sudo firewall-cmd --permanent --add-port=3000/tcp # Grafana + sudo firewall-cmd --permanent --add-port=9090/tcp # Prometheus + sudo firewall-cmd --reload + echo -e "${GREEN}✓${NC} Firewall configured (firewalld)" +elif command -v ufw &>/dev/null; then + # Ubuntu/Debian (ufw) + sudo ufw allow 3000/tcp comment 'Grafana' + sudo ufw allow 9090/tcp comment 'Prometheus' + echo -e "${GREEN}✓${NC} Firewall configured (ufw)" +else + echo -e "${YELLOW}!${NC} No firewall detected. You may need to manually open ports 3000 and 9090" +fi + +# Summary +echo "" +echo -e "${BOLD}=== Monitoring Stack Setup Complete! 
===${NC}" +echo "" +echo "Services created:" +echo " - prometheus.service (Metrics collection)" +echo " - loki.service (Log aggregation)" +echo " - grafana.service (Visualization)" +echo " - promtail.service (Log shipping)" +echo "" +echo -e "${BOLD}Next steps:${NC}" +echo "1. Start services:" +echo " sudo systemctl start prometheus" +echo " sudo systemctl start loki" +echo " sudo systemctl start promtail" +echo " sudo systemctl start grafana" +echo "" +echo "2. Enable on boot:" +echo " sudo systemctl enable prometheus loki promtail grafana" +echo "" +echo "3. Access Grafana:" +echo " http://YOUR_SERVER_IP:3000" +echo " Username: admin" +echo " Password: admin (change on first login)" +echo "" +echo "4. Check logs:" +echo " sudo journalctl -u prometheus -f" +echo " sudo journalctl -u grafana -f" +echo "" diff --git a/scripts/setup-prod.sh b/scripts/setup-prod.sh new file mode 100755 index 0000000..56fceb5 --- /dev/null +++ b/scripts/setup-prod.sh @@ -0,0 +1,229 @@ +#!/bin/bash +# Production Setup Script for Rocky Linux (Bare Metal) +# This script sets up the complete FetchML environment on bare metal + +set -e + +BOLD='\033[1m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +NC='\033[0m' + +echo -e "${BOLD}=== FetchML Production Setup (Rocky Linux Bare Metal) ===${NC}\n" + +# Configuration +BASE_PATH="${1:-/data/ml-experiments}" +ML_USER="${2:-ml-user}" +ML_GROUP="${3:-ml-group}" + +echo "Configuration:" +echo " Base path: $BASE_PATH" +echo " ML user: $ML_USER" +echo " ML group: $ML_GROUP" +echo "" + +# 1. Create system user if it doesn't exist +echo -e "${BLUE}[1/8]${NC} Creating system user..." +if id "$ML_USER" &>/dev/null; then + echo " User $ML_USER already exists" +else + sudo useradd -r -s /bin/bash -m -d /home/$ML_USER -c "ML System User" $ML_USER + echo -e "${GREEN}✓${NC} Created user: $ML_USER" +fi + +# 2. Create directory structure +echo -e "${BLUE}[2/8]${NC} Creating directory structure..." 
# --- Step 2 (continued): create the experiment, log and config directories ---
# Reads BASE_PATH, ML_USER, ML_GROUP and the colour variables (GREEN, BLUE, NC)
# set at the top of this script. Everything here runs through sudo and is safe
# to re-run: mkdir -p, chown and chmod are idempotent.
sudo mkdir -p "${BASE_PATH}"/{experiments,pending,running,finished,failed,datasets}
sudo mkdir -p /var/log/fetch_ml
sudo mkdir -p /etc/fetch_ml

echo -e "${GREEN}✓${NC} Created directories:"
echo " $BASE_PATH/experiments/"
echo " $BASE_PATH/pending/"
echo " $BASE_PATH/running/"
echo " $BASE_PATH/finished/"
echo " $BASE_PATH/failed/"
echo " $BASE_PATH/datasets/"
echo " /var/log/fetch_ml/"
echo " /etc/fetch_ml/"

# 3. Set ownership and permissions
echo -e "${BLUE}[3/8]${NC} Setting permissions..."

# Step 1 only runs useradd for ML_USER; nothing creates ML_GROUP explicitly.
# chown to a missing group fails, and with `set -e` that aborts the whole
# setup, so make sure the group exists first.
if ! getent group "$ML_GROUP" >/dev/null; then
    sudo groupadd --system "$ML_GROUP"
fi

# BASE_PATH / ML_USER / ML_GROUP come from positional arguments, so quote them
# to survive spaces or glob characters.
sudo chown -R "$ML_USER:$ML_GROUP" "$BASE_PATH"
sudo chmod 755 "$BASE_PATH"
sudo chmod 700 "$BASE_PATH/experiments" # Restrict experiment data

sudo chown -R "$ML_USER:$ML_GROUP" /var/log/fetch_ml
sudo chmod 755 /var/log/fetch_ml

echo -e "${GREEN}✓${NC} Permissions set"

# 4. Install system dependencies (Rocky Linux)
echo -e "${BLUE}[4/8]${NC} Installing system dependencies..."
# Best-effort by design: a dnf failure is reported but does not stop the setup.
sudo dnf install -y \
    golang \
    podman \
    redis \
    git \
    make \
    gcc \
    || echo "Some packages may already be installed"

echo -e "${GREEN}✓${NC} Dependencies installed"

# 5. Configure Podman for GPU access (if NVIDIA GPU present)
echo -e "${BLUE}[5/8]${NC} Configuring Podman..."
if lspci | grep -i nvidia &>/dev/null; then
    echo " NVIDIA GPU detected, configuring GPU access..."

    # Install nvidia-container-toolkit if not present
    if ! command -v nvidia-container-toolkit &>/dev/null; then
        echo " Installing nvidia-container-toolkit..."
        sudo dnf config-manager --add-repo \
            https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
        sudo dnf install -y nvidia-container-toolkit
    fi

    # Generate the CDI spec that Podman reads for `--device nvidia.com/gpu=...`
    sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
    echo -e "${GREEN}✓${NC} GPU support configured"
else
    echo " No NVIDIA GPU detected, skipping GPU setup"
fi

# 6. Configure Redis
echo -e "${BLUE}[6/8]${NC} Configuring Redis..."
+sudo systemctl enable redis +sudo systemctl start redis || echo "Redis may already be running" + +# Set Redis password if not already configured +if ! sudo grep -q "^requirepass" /etc/redis/redis.conf 2>/dev/null; then + REDIS_PASSWORD=$(openssl rand -base64 32) + echo "requirepass $REDIS_PASSWORD" | sudo tee -a /etc/redis/redis.conf >/dev/null + sudo systemctl restart redis + echo " Generated Redis password: $REDIS_PASSWORD" + echo " Save this password for your configuration!" +else + echo " Redis password already configured" +fi + +echo -e "${GREEN}✓${NC} Redis configured" + +# 7. Setup systemd services +echo -e "${BLUE}[7/8]${NC} Creating systemd services..." + +# API Server service +sudo tee /etc/systemd/system/fetchml-api.service >/dev/null </dev/null </dev/null </dev/null 2>&1 || true + systemctl reload fetchml-worker >/dev/null 2>&1 || true + endscript +} +EOF + +echo -e "${GREEN}✓${NC} Log rotation configured" + +# Summary +echo "" +echo -e "${BOLD}=== Setup Complete! ===${NC}" +echo "" +echo "Directory structure created at: $BASE_PATH" +echo "Logs will be written to: /var/log/fetch_ml/" +echo "Configuration directory: /etc/fetch_ml/" +echo "" +echo -e "${BOLD}Next steps:${NC}" +echo "1. Copy your config files:" +echo " sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml" +echo " sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml" +echo "" +echo "2. Build and install binaries:" +echo " make build" +echo " sudo cp bin/api-server /usr/local/bin/fetchml-api" +echo " sudo cp bin/worker /usr/local/bin/fetchml-worker" +echo "" +echo "3. Update config files with your settings (Redis password, API keys, etc.)" +echo "" +echo "4. Start services:" +echo " sudo systemctl start fetchml-api" +echo " sudo systemctl start fetchml-worker" +echo "" +echo "5. Enable services to start on boot:" +echo " sudo systemctl enable fetchml-api" +echo " sudo systemctl enable fetchml-worker" +echo "" +echo "6. 
Check status:" +echo " sudo systemctl status fetchml-api" +echo " sudo systemctl status fetchml-worker" +echo " sudo journalctl -u fetchml-api -f" +echo "" diff --git a/scripts/setup-production.sh b/scripts/setup-production.sh new file mode 100644 index 0000000..e69de29 diff --git a/scripts/validate-prod-config.sh b/scripts/validate-prod-config.sh new file mode 100755 index 0000000..0e501e6 --- /dev/null +++ b/scripts/validate-prod-config.sh @@ -0,0 +1,204 @@ +#!/bin/bash +# Production Configuration Validator +# Verifies all paths and configs are consistent for experiment lifecycle + +set -e + +BOLD='\033[1m' +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${BOLD}=== FetchML Production Configuration Validator ===${NC}\n" + +# Configuration file paths +API_CONFIG="${1:-configs/config-prod.yaml}" +WORKER_CONFIG="${2:-configs/worker-prod.toml}" + +errors=0 +warnings=0 + +check_pass() { + echo -e "${GREEN}✓${NC} $1" +} + +check_fail() { + echo -e "${RED}✗${NC} $1" + ((errors++)) +} + +check_warn() { + echo -e "${YELLOW}⚠${NC} $1" + ((warnings++)) +} + +# 1. Check API server config exists +echo -e "${BOLD}Checking API Server Configuration${NC}" +if [ ! 
-f "$API_CONFIG" ]; then + check_fail "API config not found: $API_CONFIG" +else + check_pass "API config found: $API_CONFIG" + + # Extract base_path from API config + API_BASE_PATH=$(grep 'base_path:' "$API_CONFIG" | head -1 | awk '{print $2}' | tr -d '"') + echo " Base path: $API_BASE_PATH" + + # Check if path is absolute + if [[ "$API_BASE_PATH" != /* ]]; then + check_fail "base_path must be absolute: $API_BASE_PATH" + else + check_pass "base_path is absolute" + fi + + # Check Redis config + if grep -q 'redis:' "$API_CONFIG"; then + check_pass "Redis configuration present" + else + check_fail "Redis configuration missing" + fi + + # Check auth enabled + if grep -q 'enabled: true' "$API_CONFIG"; then + check_pass "Authentication enabled" + else + check_warn "Authentication disabled (not recommended for production)" + fi +fi + +echo "" + +# 2. Check Worker config (if provided) +if [ -f "$WORKER_CONFIG" ]; then + echo -e "${BOLD}Checking Worker Configuration${NC}" + check_pass "Worker config found: $WORKER_CONFIG" + + # Extract base_path from worker config + WORKER_BASE_PATH=$(grep 'base_path' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "') + echo " Base path: $WORKER_BASE_PATH" + + # Compare paths + if [ "$API_BASE_PATH" = "$WORKER_BASE_PATH" ]; then + check_pass "API and Worker base_path match" + else + check_fail "base_path mismatch! API: $API_BASE_PATH, Worker: $WORKER_BASE_PATH" + fi + + # Check podman_image configured + if grep -q 'podman_image' "$WORKER_CONFIG"; then + PODMAN_IMAGE=$(grep 'podman_image' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "') + check_pass "Podman image configured: $PODMAN_IMAGE" + else + check_fail "podman_image not configured" + fi +else + check_warn "Worker config not found: $WORKER_CONFIG (optional for API server only)" +fi + +echo "" + +# 3. 
Check directory structure (if base_path exists) +if [ -n "$API_BASE_PATH" ] && [ -d "$API_BASE_PATH" ]; then + echo -e "${BOLD}Checking Directory Structure${NC}" + check_pass "Base directory exists: $API_BASE_PATH" + + # Check subdirectories + for dir in experiments pending running finished failed; do + if [ -d "$API_BASE_PATH/$dir" ]; then + check_pass "$dir/ directory exists" + else + check_warn "$dir/ directory missing (will be created automatically)" + fi + done + + # Check permissions + if [ -w "$API_BASE_PATH" ]; then + check_pass "Base directory is writable" + else + check_fail "Base directory is not writable (check permissions)" + fi + +elif [ -n "$API_BASE_PATH" ]; then + check_warn "Base directory does not exist: $API_BASE_PATH (will need to be created)" +fi + +echo "" + +# 4. Check Redis connectivity (if server is running) +echo -e "${BOLD}Checking Redis Connectivity${NC}" +if command -v redis-cli &> /dev/null; then + if redis-cli ping &> /dev/null; then + check_pass "Redis server is running and accessible" + + # Check queue + QUEUE_SIZE=$(redis-cli llen fetchml:tasks:queue 2>/dev/null || echo "0") + echo " Queue size: $QUEUE_SIZE tasks" + else + check_warn "Redis server not accessible (start with: redis-server)" + fi +else + check_warn "redis-cli not installed (cannot verify Redis connectivity)" +fi + +echo "" + +# 5. 
Check Podman (if worker config exists)
+if [ -f "$WORKER_CONFIG" ]; then
+    echo -e "${BOLD}Checking Podman${NC}"
+    if command -v podman &> /dev/null; then
+        check_pass "Podman is installed"
+
+        # Check if image exists (PODMAN_IMAGE is only assigned when podman_image is configured)
+        if [ -n "${PODMAN_IMAGE:-}" ]; then
+            if podman image exists "$PODMAN_IMAGE" 2>/dev/null; then
+                check_pass "Podman image exists: $PODMAN_IMAGE"
+            else
+                check_warn "Podman image not found: $PODMAN_IMAGE (needs to be built)"
+            fi
+        fi
+
+        # Check GPU access (if configured)
+        if grep -q 'gpu_access.*true' "$WORKER_CONFIG" 2>/dev/null; then
+            if podman run --rm --device nvidia.com/gpu=all nvidia/cuda:11.8.0-base nvidia-smi &>/dev/null; then
+                check_pass "GPU access working"
+            else
+                check_warn "GPU access configured but not working (check nvidia-container-toolkit)"
+            fi
+        fi
+    else
+        check_fail "Podman not installed (required for worker)"
+    fi
+fi
+
+echo ""
+
+# 6. Check CLI config consistency
+echo -e "${BOLD}Checking CLI Configuration${NC}"
+CLI_CONFIG="$HOME/.ml/config.toml"
+if [ -f "$CLI_CONFIG" ]; then
+    check_pass "CLI config found: $CLI_CONFIG"
+
+    CLI_BASE=$(grep 'worker_base' "$CLI_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
+    if [ "$CLI_BASE" = "${API_BASE_PATH:-}" ]; then
+        check_pass "CLI worker_base matches server base_path"
+    else
+        check_warn "CLI worker_base ($CLI_BASE) differs from server (${API_BASE_PATH:-})"
+    fi
+else
+    check_warn "CLI config not found (run: ml init)"
+fi
+
+echo ""
+
+# Summary: exit 0 when there are no errors (warnings allowed), exit 1 otherwise
+echo -e "${BOLD}=== Summary ===${NC}"
+if [ "$errors" -eq 0 ] && [ "$warnings" -eq 0 ]; then
+    echo -e "${GREEN}All checks passed! Configuration is ready for production.${NC}"
+    exit 0
+elif [ "$errors" -eq 0 ]; then
+    echo -e "${YELLOW}Configuration has $warnings warning(s). Review before deployment.${NC}"
+    exit 0
+else
+    echo -e "${RED}Configuration has $errors error(s) and $warnings warning(s). Fix before deployment.${NC}"
+    exit 1
+fi
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000..289c23f
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,327 @@
+#!/bin/bash
+
+# Balanced Homelab Setup Script
+# Keeps essential security (Fail2Ban, monitoring) while simplifying complexity
+
+set -euo pipefail
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+print_info() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+print_success() {
+    echo -e "${GREEN}[SUCCESS]${NC} $1"
+}
+
+print_warning() {
+    echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+print_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Simple dependency check: exits 1 listing every required tool that is not on PATH
+check_deps() {
+    print_info "Checking dependencies..."
+
+    local missing=()
+
+    if ! command -v go &> /dev/null; then
+        missing+=("go")
+    fi
+
+    if ! command -v zig &> /dev/null; then
+        missing+=("zig")
+    fi
+
+    if ! command -v redis-server &> /dev/null; then
+        missing+=("redis-server")
+    fi
+
+    if ! command -v docker &> /dev/null; then
+        missing+=("docker")
+    fi
+
+    if [[ ${#missing[@]} -gt 0 ]]; then
+        print_error "Missing dependencies: ${missing[*]}"
+        echo ""
+        echo "Install with:"
+        echo " macOS: brew install ${missing[*]}"
+        echo " Ubuntu: sudo apt-get install ${missing[*]}"
+        exit 1
+    fi
+
+    print_success "Dependencies OK"
+}
+
+# Simple setup: creates working directories, a self-signed cert and configs/config.yaml
+setup_project() {
+    print_info "Setting up project..."
+
+    # Create essential directories
+    mkdir -p ssl logs configs data monitoring
+
+    # Generate simple SSL cert (only when none exists yet)
+    if [[ ! -f "ssl/cert.pem" ]]; then
+        openssl req -x509 -newkey rsa:2048 -keyout ssl/key.pem -out ssl/cert.pem \
+            -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Homelab/CN=localhost" \
+            -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null
+        print_success "SSL certificates generated"
+    fi
+
+    # Create balanced config (overwrites any existing configs/config.yaml)
+    cat > configs/config.yaml << 'EOF'
+base_path: "./data/experiments"
+
+auth:
+  enabled: true
+  api_keys:
+    homelab_user:
+      hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
+      admin: true
+      roles: ["user", "admin"]
+      permissions:
+        read: true
+        write: true
+        delete: true
+
+server:
+  address: ":9101"
+  tls:
+    enabled: true
+    cert_file: "./ssl/cert.pem"
+    key_file: "./ssl/key.pem"
+
+security:
+  rate_limit:
+    enabled: true
+    requests_per_minute: 30
+    burst_size: 5
+  ip_whitelist:
+    - "127.0.0.1"
+    - "::1"
+    - "192.168.0.0/16"
+    - "10.0.0.0/8"
+    - "172.16.0.0/12"
+  failed_login_lockout:
+    enabled: true
+    max_attempts: 3
+    lockout_duration: "15m"
+
+redis:
+  url: "redis://localhost:6379"
+
+logging:
+  level: "info"
+  file: "./logs/app.log"
+  audit_log: "./logs/audit.log"
+  access_log: "./logs/access.log"
+
+monitoring:
+  enabled: true
+  metrics_port: 9090
+  health_check_interval: "30s"
+EOF
+
+    print_success "Configuration created"
+}
+
+# Simple build: Go binaries into bin/, then the Zig CLI
+build_project() {
+    print_info "Building project..."
+
+    # Build Go apps
+    go build -o bin/api-server ./cmd/api-server
+    go build -o bin/worker ./cmd/worker
+    go build -o bin/tui ./cmd/tui
+
+    # Build Zig CLI in a subshell: inside a plain '&&' chain a failing 'zig build'
+    # would not trip 'set -e' and would leave the script stranded in cli/
+    (cd cli && zig build)
+
+    print_success "Build completed"
+}
+
+# Setup Fail2Ban: installs one jail file plus two filters for the API audit log.
+# Skips with a warning when fail2ban is not installed or sudo cannot copy the files.
+setup_fail2ban() {
+    print_info "Setting up Fail2Ban..."
+
+    if ! command -v fail2ban-server &> /dev/null; then
+        print_warning "Fail2Ban not installed, skipping..."
+        return
+    fi
+
+    # Stage the files in a private temp dir instead of predictable /tmp names
+    local staging_dir
+    staging_dir=$(mktemp -d)
+    # fail2ban opens logpath itself, so it has to be absolute
+    local audit_log="$PWD/logs/audit.log"
+
+    # Create Fail2Ban configuration
+    sudo mkdir -p /etc/fail2ban/jail.d 2>/dev/null || true
+
+    # Unquoted delimiter so ${audit_log} expands. The two file-based jails override
+    # the systemd default backend, which does not read logpath.
+    cat > "$staging_dir/ml-experiments-jail.conf" << EOF
+[DEFAULT]
+bantime = 3600
+findtime = 600
+maxretry = 3
+backend = systemd
+
+[sshd]
+enabled = true
+port = ssh
+logpath = /var/log/auth.log
+maxretry = 3
+
+[ml-experiments-api]
+enabled = true
+port = 9101
+filter = ml-experiments-api
+backend = auto
+logpath = ${audit_log}
+maxretry = 5
+bantime = 7200
+
+[ml-experiments-auth]
+enabled = true
+filter = ml-experiments-auth
+backend = auto
+logpath = ${audit_log}
+maxretry = 3
+bantime = 3600
+EOF
+
+    # Create filter definitions; <HOST> marks where fail2ban extracts the client address
+    cat > "$staging_dir/ml-experiments-api.conf" << 'EOF'
+[Definition]
+failregex = ^.*<HOST>.*"status":40[13].*$
+ignoreregex =
+EOF
+
+    cat > "$staging_dir/ml-experiments-auth.conf" << 'EOF'
+[Definition]
+failregex = ^.*"event":"failed_login".*"client_ip":"<HOST>".*$
+ignoreregex =
+EOF
+
+    # Try to install configurations (only the two filters belong in filter.d)
+    if sudo cp "$staging_dir/ml-experiments-jail.conf" /etc/fail2ban/jail.d/ 2>/dev/null; then
+        sudo cp "$staging_dir/ml-experiments-api.conf" "$staging_dir/ml-experiments-auth.conf" \
+            /etc/fail2ban/filter.d/ 2>/dev/null || true
+        sudo systemctl restart fail2ban 2>/dev/null || true
+        print_success "Fail2Ban configured"
+    else
+        print_warning "Could not configure Fail2Ban (requires sudo)"
+    fi
+
+    rm -rf -- "$staging_dir"
+}
+
+# Setup Redis: start a daemonized server on 6379 unless one is already running
+setup_redis() {
+    print_info "Setting up Redis..."
+
+    if ! pgrep -f "redis-server" > /dev/null; then
+        redis-server --daemonize yes --port 6379
+        print_success "Redis started"
+    else
+        print_info "Redis already running"
+    fi
+}
+
+# Create simple management script (writes ./manage.sh and marks it executable)
+create_manage_script() {
+    cat > manage.sh << 'EOF'
+#!/bin/bash
+
+# Simple management script
+
+case "${1:-status}" in
+    "start")
+        echo "Starting services..."
+        redis-server --daemonize yes --port 6379 2>/dev/null || true
+        ./bin/api-server -config configs/config.yaml &
+        echo "Services started"
+        ;;
+    "stop")
+        echo "Stopping services..."
+        pkill -f "api-server" || true
+        redis-cli shutdown 2>/dev/null || true
+        echo "Services stopped"
+        ;;
+    "status")
+        echo "=== Status ==="
+        if pgrep -f "redis-server" > /dev/null; then
+            echo "✅ Redis: Running"
+        else
+            echo "❌ Redis: Stopped"
+        fi
+
+        if pgrep -f "api-server" > /dev/null; then
+            echo "✅ API Server: Running"
+        else
+            echo "❌ API Server: Stopped"
+        fi
+        ;;
+    "logs")
+        echo "=== Recent Logs ==="
+        tail -20 logs/app.log 2>/dev/null || echo "No logs yet"
+        ;;
+    "test")
+        echo "=== Testing ==="
+        curl -k -s https://localhost:9101/health || echo "API server not responding"
+        ;;
+    *)
+        echo "Usage: $0 {start|stop|status|logs|test}"
+        ;;
+esac
+EOF
+
+    chmod +x manage.sh
+    print_success "Management script created"
+}
+
+# Show next steps
+show_next_steps() {
+    print_success "Setup completed!"
+    echo ""
+    echo "🎉 Setup complete!"
+    echo ""
+    echo "Next steps:"
+    echo " 1. Start services: ./tools/manage.sh start"
+    echo " 2. Check status: ./tools/manage.sh status"
+    echo " 3. Test API: curl -k -H 'X-API-Key: password' https://localhost:9101/health"
+    echo ""
+    echo "Configuration: configs/config.yaml"
+    echo "Logs: logs/app.log and logs/audit.log"
+    echo ""
+    print_success "Ready for homelab use!"
+}
+
+# Main setup
+main() {
+    echo "ML Experiment Manager - Homelab Setup"
+    echo "====================================="
+    echo ""
+
+    # NOTE(review): setup_fail2ban is defined above but never called here - confirm this is intended
+    check_deps
+    setup_project
+    build_project
+    setup_redis
+    create_manage_script
+    show_next_steps
+}
+
+main "$@"
diff --git a/tools/manage.sh b/tools/manage.sh
new file mode 100755
index 0000000..5f79e79
--- /dev/null
+++ b/tools/manage.sh
@@ -0,0 +1,411 @@
+#!/bin/bash
+
+# Project Management Script for ML Experiment Manager
+# Provides unified interface for managing all components
+
+set -euo pipefail
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+PURPLE='\033[0;35m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+print_info() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+print_success() {
+    echo -e "${GREEN}[SUCCESS]${NC} $1"
+}
+
+print_warning() {
+    echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+print_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+print_header() {
+    echo -e "${PURPLE}$1${NC}"
+}
+
+print_app() {
+    echo -e "${CYAN}$1${NC}"
+}
+
+# Report build artifacts, service availability and config files relative to the working directory.
+show_status() {
+    print_header "ML Experiment Manager Status"
+    echo "=================================="
+    echo ""
+
+    # Check Go apps
+    print_app "Go Applications:"
+    local app go_apps=("api-server" "worker" "tui" "data_manager" "user_manager")
+    for app in "${go_apps[@]}"; do
+        if [[ -f "bin/$app" ]]; then
+            echo " ✅ $app: Built"
+        else
+            echo " ❌ $app: Not built"
+        fi
+    done
+    echo ""
+
+    # Check Zig CLI
+    print_app "Zig CLI:"
+    if [[ -f "cli/zig-out/bin/ml" ]]; then
+        echo " ✅ CLI: Built"
+    else
+        echo " ❌ CLI: Not built"
+    fi
+    echo ""
+
+    # Check services (redis-cli's own connection error is silenced; the status line covers it)
+    print_app "Services:"
+    if command -v redis-cli &> /dev/null; then
+        if redis-cli ping 2>/dev/null | grep -q "PONG"; then
+            echo " ✅ Redis: Running"
+        else
+            echo " ⚠️ Redis: Not running"
+        fi
+    else
+        echo " ❌ Redis: Not installed"
+    fi
+
+    if command -v docker &> /dev/null; then
+        echo " ✅ Docker: Available"
+    else
+        echo " ❌ Docker: Not installed"
+    fi
+    echo ""
+
+    # Check configuration
+    print_app "Configuration:"
+    if [[ -f "configs/config-local.yaml" ]]; then
+        echo " ✅ Security config: Found"
+    else
+        echo " ⚠️ Security config: Not found"
+    fi
+
+    if [[ -f ".env.dev" ]]; then
+        echo " ✅ Environment: Found"
+    else
+        echo " ⚠️ Environment: Not found"
+    fi
+
+    if [[ -f "ssl/cert.pem" && -f "ssl/key.pem" ]]; then
+        echo " ✅ SSL certificates: Found"
+    else
+        echo " ⚠️ SSL certificates: Not found"
+    fi
+    echo ""
+}
+
+# Build the Go applications via make, plus the Zig CLI when zig is installed.
+build_all() {
+    print_header "Building All Components"
+    echo "============================="
+    echo ""
+
+    print_info "Building Go applications..."
+    make build
+
+    if command -v zig &> /dev/null; then
+        print_info "Building Zig CLI..."
+        make cli-build
+    else
+        print_warning "Zig not found, skipping CLI build"
+    fi
+
+    print_success "Build completed!"
+}
+
+# Run 'make test' followed by 'make test-all'.
+test_all() {
+    print_header "Running All Tests"
+    echo "===================="
+    echo ""
+
+    print_info "Running main test suite..."
+    make test
+
+    print_info "Running comprehensive tests..."
+    make test-all
+
+    print_success "All tests completed!"
+}
+
+# Start Redis (if installed and not already running) and the API server in the background.
+start_services() {
+    print_header "Starting Services"
+    echo "==================="
+    echo ""
+
+    # Start Redis if available
+    if command -v redis-server &> /dev/null; then
+        if ! pgrep -f "redis-server" > /dev/null; then
+            print_info "Starting Redis..."
+            redis-server --daemonize yes --port 6379
+            print_success "Redis started"
+        else
+            print_info "Redis already running"
+        fi
+    fi
+
+    # Start API server if built
+    if [[ -f "bin/api-server" ]]; then
+        print_info "Starting API server..."
+        if [[ -f "configs/config-local.yaml" ]]; then
+            ./bin/api-server --config configs/config-local.yaml &
+        else
+            print_warning "No config found, using defaults"
+            ./bin/api-server &
+        fi
+        print_success "API server started (PID: $!)"
+    else
+        print_error "API server not built. Run 'make build' first."
+    fi
+
+    print_success "Services started!"
+}
+
+# Probe port 9101, then query the /health endpoint; returns 1 when the port is closed.
+check_health() {
+    print_header "API Health Check"
+    echo "=================="
+    echo ""
+
+    print_info "Checking if API port is open..."
+
+    # First check if port 9101 is open
+    if ! nc -z localhost 9101 2>/dev/null; then
+        print_error "API port 9101 not open - is it running?"
+        print_info "Start with: ./tools/manage.sh start"
+        return 1
+    fi
+
+    print_info "Port 9101 is open, checking API health endpoint..."
+
+    # Try the health endpoint. '|| true' keeps 'set -e' from aborting on a curl
+    # failure, so the outcome is reported by the branches below instead.
+    local response
+    response=$(curl -k -s --max-time 3 -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health 2>/dev/null) || true
+
+    if [[ "$response" == "OK" ]]; then
+        print_success "API is healthy: $response"
+    elif [[ "$response" == *"IP not whitelisted"* ]]; then
+        print_warning "API running but IP not whitelisted (expected behavior)"
+        print_info "Try: curl -k -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health"
+    else
+        print_error "Unexpected response: $response"
+    fi
+}
+
+# Stop the API server (pkill) and ask Redis to shut down.
+stop_services() {
+    print_header "Stopping Services"
+    echo "=================="
+    echo ""
+
+    # Stop API server
+    if pgrep -f "api-server" > /dev/null; then
+        print_info "Stopping API server..."
+        pkill -f "api-server"
+        print_success "API server stopped"
+    fi
+
+    # Stop Redis
+    if command -v redis-cli &> /dev/null; then
+        print_info "Stopping Redis..."
+        redis-cli shutdown 2>/dev/null || true
+        print_success "Redis stopped"
+    fi
+
+    print_success "All services stopped!"
+}
+
+# Dispatch a security sub-command ($1, default "check") to the matching make target.
+run_security() {
+    print_header "Security Management"
+    echo "===================="
+    echo ""
+
+    case "${1:-check}" in
+        "check")
+            print_info "Running security checks..."
+            make security-check
+            ;;
+        "monitor")
+            print_info "Starting security monitoring..."
+            make security-monitor
+            ;;
+        "deploy")
+            print_info "Deploying with security..."
+            make security-deploy
+            ;;
+        "audit")
+            print_info "Running security audit..."
+            make security-audit
+            ;;
+        *)
+            echo "Usage: $0 security {check|monitor|deploy|audit}"
+            exit 1
+            ;;
+    esac
+}
+
+# Dispatch a development sub-command ($1, default "setup") to its script or make target.
+run_development() {
+    print_header "Development Environment"
+    echo "========================="
+    echo ""
+
+    case "${1:-setup}" in
+        "setup")
+            print_info "Setting up development environment..."
+            ./scripts/auto_setup.sh
+            ;;
+        "quick")
+            print_info "Running quick start..."
+            ./scripts/quick_start.sh
+            ;;
+        "deps")
+            print_info "Installing dependencies..."
+            make install-deps
+            ;;
+        *)
+            echo "Usage: $0 dev {setup|quick|deps}"
+            exit 1
+            ;;
+    esac
+}
+
+# Print the tail of the application and audit logs, plus the last matching container's logs.
+show_logs() {
+    print_header "Application Logs"
+    echo "=================="
+    echo ""
+
+    # Show application logs
+    if [[ -f "logs/fetch_ml.log" ]]; then
+        print_app "Application Log:"
+        tail -20 logs/fetch_ml.log
+        echo ""
+    fi
+
+    if [[ -f "logs/audit.log" ]]; then
+        print_app "Security Log:"
+        tail -20 logs/audit.log
+        echo ""
+    fi
+
+    # Show Docker logs if running
+    if command -v docker &> /dev/null; then
+        local containers last_container
+        containers=$(docker ps --format "table {{.Names}}" | grep "ml-experiment" || true)
+        if [[ -n "$containers" ]]; then
+            print_app "Docker Logs:"
+            last_container=$(tail -1 <<< "$containers")
+            docker logs --tail=20 "$last_container" 2>/dev/null || true
+        fi
+    fi
+}
+
+# Run 'make clean-all', then stop the services.
+cleanup() {
+    print_header "Cleanup Project"
+    echo "================"
+    echo ""
+
+    print_info "Cleaning project artifacts..."
+    make clean-all
+
+    print_info "Stopping services..."
+    stop_services
+
+    print_success "Cleanup completed!"
+}
+
+# Print usage, the command list and examples.
+show_help() {
+    print_header "Project Management Script"
+    echo "==========================="
+    echo ""
+    echo "Usage: ./tools/manage.sh {status|build|test|start|stop|health|security|dev|logs|cleanup|help}"
+    echo ""
+    echo "Commands:"
+    echo " status - Show project status"
+    echo " build - Build all components"
+    echo " test - Run all tests"
+    echo " start - Start all services"
+    echo " stop - Stop all services"
+    echo " health - Check API health endpoint"
+    echo " security - Security management (check|monitor|deploy|audit)"
+    echo " dev - Development environment (setup|quick|deps)"
+    echo " logs - Show application logs"
+    echo " cleanup - Clean project artifacts and stop services"
+    echo " help - Show this help"
+    echo ""
+    echo "Examples:"
+    echo " $0 status # Show current status"
+    echo " $0 health # Check API health"
+    echo " $0 build && $0 test # Build and test everything"
+    echo " $0 start # Start all services"
+    echo " $0 security monitor # Start security monitoring"
+    echo " $0 dev setup # Setup development environment"
+    echo ""
+    echo "Quick Start:"
+    echo " $0 dev setup && $0 start && $0 status"
+}
+
+# Main function: dispatch on the first argument (defaults to "help")
+main() {
+    case "${1:-help}" in
+        "status")
+            show_status
+            ;;
+        "build")
+            build_all
+            ;;
+        "test")
+            test_all
+            ;;
+        "start")
+            start_services
+            ;;
+        "stop")
+            stop_services
+            ;;
+        "health")
+            check_health
+            ;;
+        "security")
+            run_security "${2:-check}"
+            ;;
+        "dev")
+            run_development "${2:-setup}"
+            ;;
+        "logs")
+            show_logs
+            ;;
+        "cleanup")
+            cleanup
+            ;;
+        "help"|"-h"|"--help")
+            show_help
+            ;;
+        *)
+            print_error "Unknown command: $1"
+            echo "Use '$0 help' for usage information"
+            exit 1
+            ;;
+    esac
+}
+
+# Run main function
+main "$@"