#!/usr/bin/env bash
# Fetch ML Ubuntu Server Setup Script
# Optimized for ML experiments on Ubuntu 20.04/22.04
#
# Usage: run as root on the target server.
#
# NOTE(review): the log_* helpers, secure_download, ensure_user,
# create_directories, setup_systemd_service, setup_logrotate, hardening_steps
# and the FETCH_ML_USER / FETCH_ML_HOME variables are not defined in this file;
# presumably they are provided by setup_common.sh (sourced below) - confirm.

set -euo pipefail

# shellcheck source=scripts/setup_common.sh
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
source "$SCRIPT_DIR/setup_common.sh"

#######################################
# Append a line to a file unless an identical line is already present.
# Keeps repeated runs from piling up duplicate entries.
# Arguments: $1 - exact line to add
#            $2 - file to append to (created if missing)
#######################################
append_line_once() {
  local line=$1
  local file=$2
  if ! grep -qxF -- "$line" "$file" 2>/dev/null; then
    printf '%s\n' "$line" >>"$file"
  fi
}

#######################################
# Abort unless the script is running with root privileges.
#######################################
check_root() {
  if [[ $EUID -ne 0 ]]; then
    log_error "This script must be run as root"
    exit 1
  fi
}

#######################################
# Abort on systems without apt-get; warn when the release is older than 20.04.
#######################################
check_ubuntu() {
  if ! command -v apt-get &>/dev/null; then
    log_error "This script is designed for Ubuntu systems"
    exit 1
  fi

  # lsb_release is only installed later by update_system, so fall back to
  # /etc/os-release when it is not available yet.
  local ubuntu_version=""
  if command -v lsb_release &>/dev/null; then
    ubuntu_version=$(lsb_release -rs)
  elif [[ -r /etc/os-release ]]; then
    # shellcheck disable=SC1091
    ubuntu_version=$(. /etc/os-release && printf '%s' "${VERSION_ID:-}")
  fi
  log_info "Ubuntu version: $ubuntu_version"

  # dpkg is always present alongside apt-get, unlike bc, and compares
  # dotted versions correctly.
  if [[ -z "$ubuntu_version" ]]; then
    log_warning "Could not determine Ubuntu version"
  elif dpkg --compare-versions "$ubuntu_version" lt 20.04; then
    log_warning "Ubuntu version < 20.04 may not support all features"
  fi
}

#######################################
# Refresh package lists, upgrade installed packages and install the base
# tooling the later steps rely on.
#######################################
update_system() {
  log_info "Updating system packages..."
  apt-get update -y
  apt-get upgrade -y
  apt-get install -y curl wget gnupg lsb-release software-properties-common
}

#######################################
# Install Go 1.25.0 into /usr/local/go unless a `go` binary is already on PATH.
# Globals: TMP_FILES (written), PATH (extended for this process)
#######################################
install_go() {
  log_info "Installing Go 1.25..."

  if command -v go &>/dev/null; then
    local go_version
    go_version=$(go version | awk '{print $3}' | sed 's/go//')
    log_info "Go already installed: $go_version"
    return
  fi

  local tarball="/tmp/go1.25.0.linux-amd64.tar.gz"
  # NOTE(review): TMP_FILES is never read in this file; presumably
  # setup_common.sh uses it for cleanup - confirm.
  TMP_FILES="$tarball"
  secure_download \
    "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" \
    "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" \
    "$tarball"
  # Absolute path: extraction must not depend on the current directory.
  tar -C /usr/local -xzf "$tarball"

  # Add to PATH for future login shells (single quotes keep $PATH/$HOME
  # unexpanded so they are evaluated at login time).
  append_line_once 'export PATH=$PATH:/usr/local/go/bin' /etc/profile
  append_line_once 'export PATH=$PATH:$HOME/go/bin' /etc/profile
  export PATH=$PATH:/usr/local/go/bin

  log_success "Go 1.25 installed"
}

#######################################
# Install Podman and podman-compose from the Kubic libcontainers repository
# unless `podman` is already on PATH.
#######################################
install_podman() {
  log_info "Installing Podman..."

  if command -v podman &>/dev/null; then
    log_info "Podman already installed"
    return
  fi

  local release
  release=$(lsb_release -rs)
  local repo_url="https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${release}"

  # Add official Podman repository
  echo "deb ${repo_url}/ /" \
    | tee /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list
  # -f makes curl fail on HTTP errors instead of piping an error page to apt-key.
  # NOTE(review): apt-key is deprecated; consider a signed-by keyring.
  curl -fsSL "${repo_url}/Release.key" | apt-key add -

  apt-get update -y
  apt-get install -y podman podman-compose

  # Configure Podman for rootless operation
  # NOTE(review): these keys are appended at the end of the file without a
  # section header - verify containers.conf actually honours them there.
  append_line_once "user_namespace_enable = 1" /etc/containers/containers.conf
  append_line_once "runtime = \"crun\"" /etc/containers/containers.conf

  log_success "Podman installed"
}

#######################################
# Install Redis, switch it to systemd supervision, restrict it to the IPv4
# loopback address, then enable and start the service.
#######################################
install_redis() {
  log_info "Installing Redis..."

  if command -v redis-server &>/dev/null; then
    log_info "Redis already installed"
    return
  fi

  apt-get install -y redis-server

  # Configure Redis for production
  sed -i 's/supervised no/supervised systemd/' /etc/redis/redis.conf
  sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis/redis.conf

  systemctl enable redis-server
  systemctl start redis-server

  log_success "Redis installed and configured"
}

#######################################
# Install the NVIDIA driver and CUDA toolkit when an NVIDIA device shows up in
# lspci and nvidia-smi is not already available.
# Globals: TMP_FILES (written)
#######################################
install_nvidia_drivers() {
  log_info "Checking for NVIDIA GPU..."

  if command -v nvidia-smi &>/dev/null; then
    log_info "NVIDIA drivers already installed"
    nvidia-smi
    return
  fi

  # Output is discarded rather than using `grep -q`: an early grep exit could
  # SIGPIPE lspci and, with pipefail, turn a match into a failure.
  if ! lspci | grep -i nvidia &>/dev/null; then
    log_info "No NVIDIA GPU detected, skipping driver installation"
    return
  fi

  log_info "NVIDIA GPU detected, installing drivers..."

  # NVIDIA names its repo directories after the release without the dot
  # (e.g. ubuntu2204), so strip the dot instead of keeping only the major part.
  local repo_release
  repo_release=$(lsb_release -rs | tr -d '.')
  local keyring_deb="/tmp/cuda-keyring_1.1-1_all.deb"

  # Add NVIDIA repository
  TMP_FILES="$keyring_deb"
  secure_download \
    "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${repo_release}/x86_64/cuda-keyring_1.1-1_all.deb" \
    "cfa6b4109e7e3d9be060a016b7dc07e8edcd5356c0eabcc0c537a76e6c603d76" \
    "$keyring_deb"
  dpkg -i "$keyring_deb"
  apt-get update -y

  # Install drivers
  apt-get install -y nvidia-driver-535 nvidia-cuda-toolkit

  # Smoke-test GPU access from a container; a failure is only a warning
  # because a reboot may be required before the driver is usable.
  if podman run --rm --device nvidia.com/gpu=all alpine \
    echo "NVIDIA GPU access configured" 2>/dev/null; then
    log_success "NVIDIA drivers installed and GPU access verified"
  else
    log_warning "NVIDIA GPU access test failed, you may need to reboot"
  fi
}

#######################################
# Install Python, native build dependencies and common ML Python packages
# (PyTorch from the CPU wheel index).
#######################################
install_ml_tools() {
  log_info "Installing ML tools and dependencies..."

  local -a apt_packages=(
    # Python and ML packages
    python3 python3-pip python3-venv
    # System dependencies for ML
    build-essential cmake git pkg-config
    libjpeg-dev libpng-dev libtiff-dev
    libavcodec-dev libavformat-dev libswscale-dev
    libgtk2.0-dev libcanberra-gtk-module
    libxvidcore-dev libx264-dev
    libatlas-base-dev gfortran
  )
  apt-get install -y "${apt_packages[@]}"

  # Install common ML libraries
  pip3 install --upgrade pip
  pip3 install numpy scipy scikit-learn pandas
  pip3 install jupyter matplotlib seaborn
  pip3 install torch torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/cpu

  log_success "ML tools installed"
}

#######################################
# Create the service user and its directories via the shared helpers.
# Globals: FETCH_ML_USER (read)
#######################################
create_user() {
  log_info "Creating fetchml user..."
  ensure_user
  create_directories
  log_success "User $FETCH_ML_USER and directories created"
}

#######################################
# Configure UFW: allow SSH and the service ports, then enable the firewall.
#######################################
setup_firewall() {
  log_info "Configuring firewall..."

  if ! command -v ufw &>/dev/null; then
    log_warning "UFW not available, skipping firewall configuration"
    return
  fi

  # Allow SSH before enabling so the firewall never comes up without a rule
  # for the session that is most likely running this script.
  ufw allow ssh
  ufw allow 8080/tcp # Worker API
  ufw allow 8081/tcp # Data manager API
  # NOTE(review): install_redis binds Redis to 127.0.0.1 only; confirm this
  # port really needs to be opened.
  ufw allow 6379/tcp # Redis
  ufw --force enable
  ufw status
}

#######################################
# Register and enable the worker and data manager systemd units.
# Globals: FETCH_ML_HOME (read)
#######################################
setup_systemd_services() {
  log_info "Setting up systemd services..."

  local config_file="$FETCH_ML_HOME/configs/config-local.yaml"
  setup_systemd_service "fetch_ml_worker" \
    "$FETCH_ML_HOME/bin/worker --config $config_file"
  setup_systemd_service "fetch_ml_data_manager" \
    "$FETCH_ML_HOME/bin/data_manager --config $config_file"

  # Enable services
  systemctl daemon-reload
  systemctl enable fetch_ml_worker
  systemctl enable fetch_ml_data_manager

  log_success "Systemd services configured"
}

#######################################
# Configure log rotation via the shared helper.
#######################################
setup_log_rotation() {
  log_info "Setting up log rotation..."
  setup_logrotate
  log_success "Log rotation configured"
}

#######################################
# Apply hardening steps and kernel tuning, and enable GPU persistence mode
# when nvidia-smi is available.
#######################################
optimize_system() {
  log_info "Optimizing system for ML workloads..."
  hardening_steps

  # Optimize kernel parameters for ML. The marker line doubles as a guard so
  # re-running does not append the same settings again.
  if ! grep -qxF '# ML Optimization' /etc/sysctl.conf 2>/dev/null; then
    cat >>/etc/sysctl.conf <<'EOF'
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
  fi

  sysctl -p

  # Configure GPU persistence mode if NVIDIA available
  if command -v nvidia-smi &>/dev/null; then
    nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
  fi

  log_success "System optimized for ML workloads"
}

#######################################
# Build Fetch ML from $FETCH_ML_HOME/fetch_ml and install its binaries and
# example config. Only warns and returns when the checkout is missing.
# Globals: FETCH_ML_HOME (read), FETCH_ML_USER (read), PATH (extended)
#######################################
install_fetch_ml() {
  log_info "Installing Fetch ML..."

  # Clone or copy Fetch ML
  cd -- "$FETCH_ML_HOME" || {
    log_error "Cannot enter $FETCH_ML_HOME"
    exit 1
  }

  if [[ ! -d "fetch_ml" ]]; then
    # This would be replaced with actual repository URL
    log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
    log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
    return
  fi

  cd fetch_ml

  # Build
  export PATH=$PATH:/usr/local/go/bin
  make build

  # Copy binaries
  cp bin/* "$FETCH_ML_HOME/bin/"
  chmod +x "$FETCH_ML_HOME"/bin/*

  # Copy configs; keep an existing config so local edits survive a re-run.
  mkdir -p "$FETCH_ML_HOME/configs"
  if [[ ! -e "$FETCH_ML_HOME/configs/config-local.yaml" ]]; then
    cp configs/config-local.yaml.example \
      "$FETCH_ML_HOME/configs/config-local.yaml"
  fi

  # Set permissions
  chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"

  log_success "Fetch ML installed"
}

#######################################
# Run every setup step in order, then print follow-up instructions.
#######################################
main() {
  log_info "Starting Fetch ML Ubuntu server setup..."

  check_root
  check_ubuntu

  update_system
  install_go
  install_podman
  install_redis
  install_nvidia_drivers
  install_ml_tools
  ensure_user
  create_directories
  setup_firewall
  setup_systemd_services
  setup_logrotate
  hardening_steps
  install_fetch_ml

  log_success "Fetch ML setup complete!"
  echo
  log_info "Next steps:"
  echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml"
  echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml"
  echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager"
  echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager"
  echo "5. View logs: journalctl -u fetch_ml_worker -f"
  echo

  # First address reported by `hostname -I`.
  local host_ip
  host_ip=$(hostname -I | awk '{print $1}')
  log_info "Services will be available at:"
  echo "- Worker API: http://${host_ip}:8080"
  echo "- Data Manager: http://${host_ip}:8081"
}

# Run main function
main "$@"