#!/usr/bin/env bash
# Fetch ML Rocky Linux Setup Script
# Optimized for ML experiments on Rocky Linux 8/9
#
# Must be run as root. Requires scripts/setup_common.sh alongside this file to
# provide: log_info/log_success/log_warning/log_error, secure_download, and the
# FETCH_ML_USER / FETCH_ML_HOME / LOG_DIR / DATA_DIR / SERVICE_DIR variables.
set -euo pipefail

# shellcheck source=scripts/setup_common.sh
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
source "$SCRIPT_DIR/setup_common.sh"

# Abort unless running as root (package installs, systemd, sysctl all need it).
check_root() {
  if [[ $EUID -ne 0 ]]; then
    log_error "This script must be run as root"
    exit 1
  fi
}

# Verify we are on a Rocky-like system and pick the package manager.
# Sets the global PKG_MANAGER: dnf when available (Rocky 9+), else yum (Rocky 8).
check_rocky() {
  if ! command -v dnf &>/dev/null && ! command -v yum &>/dev/null; then
    log_error "This script is designed for Rocky Linux systems"
    exit 1
  fi

  if [[ -r /etc/rocky-release ]]; then
    local rocky_version
    rocky_version=$(grep -oE '[0-9]+\.[0-9]+' /etc/rocky-release)
    log_info "Rocky Linux version: $rocky_version"
  else
    log_warning "/etc/rocky-release not found; continuing anyway"
  fi

  # Use dnf for Rocky 9+, yum for Rocky 8
  if command -v dnf &>/dev/null; then
    PKG_MANAGER="dnf"
  else
    PKG_MANAGER="yum"
  fi
}

# Bring the system up to date and install baseline tooling.
update_system() {
  log_info "Updating system packages..."
  # "update" is a deprecated alias for "upgrade" in dnf; one invocation suffices.
  "$PKG_MANAGER" upgrade -y
  # pciutils provides lspci, used later for GPU detection.
  "$PKG_MANAGER" install -y curl wget gnupg2 pciutils
}

# Enable EPEL plus the builder repository EPEL packages depend on.
enable_epel() {
  log_info "Enabling EPEL repository..."

  if "$PKG_MANAGER" repolist | grep -q "epel"; then
    log_info "EPEL already enabled"
    return
  fi

  "$PKG_MANAGER" install -y epel-release
  # BUG FIX: the builder repo is named "crb" on Rocky 9 and "powertools" on
  # Rocky 8; try the modern name first, fall back to the legacy one.
  "$PKG_MANAGER" config-manager --set-enabled crb 2>/dev/null \
    || "$PKG_MANAGER" config-manager --set-enabled powertools

  log_success "EPEL repository enabled"
}

# Install the Go 1.25 toolchain under /usr/local/go (skipped if go is present).
install_go() {
  log_info "Installing Go 1.25..."

  if command -v go &>/dev/null; then
    local go_version
    go_version=$(go version | awk '{print $3}' | sed 's/go//')
    log_info "Go already installed: $go_version"
    return
  fi

  cd /tmp
  # NOTE(review): presumably consumed by a cleanup trap in setup_common.sh — confirm.
  TMP_FILES="/tmp/go1.25.0.linux-amd64.tar.gz"
  secure_download \
    "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" \
    "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" \
    "/tmp/go1.25.0.linux-amd64.tar.gz"
  tar -C /usr/local -xzf go1.25.0.linux-amd64.tar.gz

  # Add to PATH for future login shells and for the rest of this script.
  echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile
  echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile
  export PATH=$PATH:/usr/local/go/bin

  log_success "Go 1.25 installed"
}

# Install and configure Podman with crun + netavark and user namespaces.
install_podman() {
  log_info "Installing Podman..."

  if command -v podman &>/dev/null; then
    log_info "Podman already installed"
    return
  fi

  # Install Podman and related tools
  "$PKG_MANAGER" install -y podman podman-compose containernetworking-plugins

  # Configure Podman
  mkdir -p /etc/containers
  cat > /etc/containers/containers.conf << EOF
[containers]
user_namespace_enable = 1
runtime = "crun"

[network]
network_backend = "netavark"

[engine]
cgroup_manager = "systemd"
EOF

  # Enable user namespaces (persist + apply immediately).
  echo "user.max_user_namespaces=15000" >> /etc/sysctl.conf
  # BUG FIX: "sysctl -p <arg>" treats <arg> as a settings FILE; -w sets a value.
  sysctl -w user.max_user_namespaces=15000

  log_success "Podman installed"
}

# Install Redis, point it at systemd supervision, keep it loopback-only.
install_redis() {
  log_info "Installing Redis..."

  if command -v redis-server &>/dev/null; then
    log_info "Redis already installed"
    return
  fi

  "$PKG_MANAGER" install -y redis

  # BUG FIX: config lives at /etc/redis.conf on Rocky 8 but
  # /etc/redis/redis.conf on Rocky 9; sed on the wrong path aborts under set -e.
  local redis_conf=/etc/redis.conf
  [[ -f /etc/redis/redis.conf ]] && redis_conf=/etc/redis/redis.conf

  # Configure Redis for production
  sed -i 's/supervised no/supervised systemd/' "$redis_conf"
  sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' "$redis_conf"

  systemctl enable redis
  systemctl start redis

  log_success "Redis installed and configured"
}

# Detect an NVIDIA GPU and, if present, install drivers from NVIDIA's repo.
install_nvidia_drivers() {
  log_info "Checking for NVIDIA GPU..."

  if command -v nvidia-smi &>/dev/null; then
    log_info "NVIDIA drivers already installed"
    nvidia-smi
    return
  fi

  if lspci | grep -qi nvidia; then
    log_info "NVIDIA GPU detected, installing drivers..."

    local rhel_ver
    rhel_ver=$(rpm -E %rhel)
    # BUG FIX: NVIDIA publishes versioned repo files (cuda-rhel8.repo,
    # cuda-rhel9.repo); the unversioned "cuda-rhel.repo" does not exist.
    "$PKG_MANAGER" config-manager --add-repo \
      "https://developer.download.nvidia.com/compute/cuda/repos/rhel${rhel_ver}/x86_64/cuda-rhel${rhel_ver}.repo"

    # Clean and install
    "$PKG_MANAGER" clean all
    "$PKG_MANAGER" module enable -y nvidia-driver:latest-dkms
    # NOTE(review): NVIDIA's RHEL repo names the toolkit "cuda-toolkit";
    # verify "nvidia-cuda-toolkit" resolves on this repo.
    "$PKG_MANAGER" install -y nvidia-driver nvidia-cuda-toolkit

    # Smoke-test container GPU access (only meaningful after driver load).
    if ! podman run --rm --device nvidia.com/gpu=all alpine \
        echo "NVIDIA GPU access configured" 2>/dev/null; then
      log_warning "NVIDIA GPU access test failed, you may need to reboot"
    else
      log_success "NVIDIA drivers installed and GPU access verified"
    fi

    # Reboot required
    log_warning "System reboot required for NVIDIA drivers"
    log_info "Run: reboot"
  else
    log_info "No NVIDIA GPU detected, skipping driver installation"
  fi
}

# Install Python, build toolchain, image/math libraries and common ML packages.
install_ml_tools() {
  log_info "Installing ML tools and dependencies..."

  # Python and ML packages
  "$PKG_MANAGER" install -y python3 python3-pip python3-devel

  # System dependencies for ML
  "$PKG_MANAGER" groupinstall -y "Development Tools"
  "$PKG_MANAGER" install -y cmake git pkgconfig
  "$PKG_MANAGER" install -y libjpeg-turbo-devel libpng-devel libtiff-devel
  "$PKG_MANAGER" install -y mesa-libGL-devel mesa-libGLU-devel
  "$PKG_MANAGER" install -y gtk3-devel
  "$PKG_MANAGER" install -y atlas-devel blas-devel lapack-devel

  # Install common ML libraries (CPU-only torch wheel; GPU wheel needs CUDA).
  pip3 install --upgrade pip
  pip3 install numpy scipy scikit-learn pandas
  pip3 install jupyter matplotlib seaborn
  pip3 install torch torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/cpu

  log_success "ML tools installed"
}

# Create the service account and its directory tree, owned by that account.
create_user() {
  log_info "Creating fetchml user..."

  if id "$FETCH_ML_USER" &>/dev/null; then
    log_info "User $FETCH_ML_USER already exists"
    return
  fi

  useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER"
  # BUG FIX: no "podman" group exists by default (rootless podman needs none);
  # an unconditional usermod would abort the script under set -e.
  if getent group podman &>/dev/null; then
    usermod -aG podman "$FETCH_ML_USER"
  fi

  # Create directories (bin/ is the copy target of install_fetch_ml).
  mkdir -p "$FETCH_ML_HOME/.config/containers"
  mkdir -p "$FETCH_ML_HOME/go/bin"
  mkdir -p "$FETCH_ML_HOME/bin"
  mkdir -p "$LOG_DIR"
  mkdir -p "$DATA_DIR"

  chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"
  chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$LOG_DIR"
  chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$DATA_DIR"

  log_success "User $FETCH_ML_USER created"
}

# Open only the ports the services actually serve externally.
setup_firewall() {
  log_info "Configuring firewall..."

  if command -v firewall-cmd &>/dev/null; then
    systemctl enable firewalld
    systemctl start firewalld

    firewall-cmd --permanent --add-service=ssh
    firewall-cmd --permanent --add-port=8080/tcp # Worker API
    firewall-cmd --permanent --add-port=8081/tcp # Data manager API
    # SECURITY FIX: Redis (6379) is bound to 127.0.0.1 by install_redis, so it
    # is deliberately NOT opened here — exposing it publicly invites abuse.

    firewall-cmd --reload
    firewall-cmd --list-all
  else
    log_warning "Firewalld not available, skipping firewall configuration"
  fi
}

# Write and enable the worker and data-manager systemd units.
setup_systemd_services() {
  log_info "Setting up systemd services..."

  # Fetch ML Worker service
  cat > "$SERVICE_DIR/fetch_ml_worker.service" << EOF
[Unit]
Description=Fetch ML Worker Service
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/worker --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_worker

[Install]
WantedBy=multi-user.target
EOF

  # Fetch ML Data Manager service
  cat > "$SERVICE_DIR/fetch_ml_data_manager.service" << EOF
[Unit]
Description=Fetch ML Data Manager Service
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/data_manager --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_data_manager

[Install]
WantedBy=multi-user.target
EOF

  # Enable services (started manually later; binaries may not exist yet).
  systemctl daemon-reload
  systemctl enable fetch_ml_worker
  systemctl enable fetch_ml_data_manager

  log_success "Systemd services configured"
}

# Rotate service logs daily, keep 30 compressed generations.
setup_log_rotation() {
  log_info "Setting up log rotation..."

  cat > /etc/logrotate.d/fetch_ml << EOF
$LOG_DIR/*.log {
    daily
    missingok
    rotate 30
    compress
    delaycompress
    notifempty
    create 0644 $FETCH_ML_USER $FETCH_ML_USER
    postrotate
        systemctl reload fetch_ml_worker || true
        systemctl reload fetch_ml_data_manager || true
    endscript
}
EOF

  log_success "Log rotation configured"
}

# Raise file limits and tune kernel parameters for ML workloads.
optimize_system() {
  log_info "Optimizing system for ML workloads..."

  # Increase file limits
  echo "* soft nofile 65536" >> /etc/security/limits.conf
  echo "* hard nofile 65536" >> /etc/security/limits.conf

  # Optimize kernel parameters for ML
  cat >> /etc/sysctl.conf << EOF
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
  sysctl -p

  # Configure GPU persistence mode if NVIDIA available
  if command -v nvidia-smi &>/dev/null; then
    nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
  fi

  # Disable SELinux for better container compatibility (optional)
  if [[ -f /etc/selinux/config ]]; then
    log_warning "Consider setting SELinux to permissive mode for better container compatibility"
    log_info "Edit /etc/selinux/config and set SELINUX=permissive"
  fi

  log_success "System optimized for ML workloads"
}

# Build Fetch ML from a pre-cloned checkout and install binaries + configs.
install_fetch_ml() {
  log_info "Installing Fetch ML..."

  # Clone or copy Fetch ML
  cd "$FETCH_ML_HOME"

  if [[ ! -d "fetch_ml" ]]; then
    log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
    log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
    return
  fi

  cd fetch_ml

  # Build
  export PATH=$PATH:/usr/local/go/bin
  make build

  # Copy binaries (ensure target exists; create_user may have been skipped).
  mkdir -p "$FETCH_ML_HOME/bin"
  cp bin/* "$FETCH_ML_HOME/bin/"
  chmod +x "$FETCH_ML_HOME"/bin/*

  # Copy configs
  mkdir -p "$FETCH_ML_HOME/configs"
  cp configs/config-local.yaml.example "$FETCH_ML_HOME/configs/config-local.yaml"

  # Set permissions
  chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"

  log_success "Fetch ML installed"
}

main() {
  log_info "Starting Fetch ML Rocky Linux server setup..."

  check_root
  check_rocky
  update_system
  enable_epel
  install_go
  install_podman
  install_redis
  install_nvidia_drivers
  install_ml_tools
  # BUG FIX: the original called five undefined functions (ensure_user,
  # create_directories, setup_logrotate, hardening_steps, selinux_guidance),
  # which aborted the run under set -e. These are the functions defined above.
  create_user
  setup_firewall
  setup_systemd_services
  setup_log_rotation
  optimize_system
  install_fetch_ml

  log_success "Fetch ML setup complete!"
  echo
  log_info "Next steps:"
  echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml"
  echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml"
  echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager"
  echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager"
  echo "5. View logs: journalctl -u fetch_ml_worker -f"
  echo
  log_info "Services will be available at:"
  echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080"
  echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081"
}

# Run main function
main "$@"