fetch_ml/scripts/legacy/setup_rocky.sh
Jeremie Fraeys bb25743b0f feat: add comprehensive setup scripts and management tools
- Add production setup scripts for automated deployment
- Include monitoring setup and configuration validation
- Add legacy setup scripts for various Linux distributions
- Implement Bitwarden integration for secure credential management
- Add development and production environment setup
- Include comprehensive management tools and utilities
- Add shell script library with common functions

Provides complete automation for setup, deployment, and management
of FetchML platform in development and production environments.
2025-12-04 16:55:04 -05:00

417 lines
11 KiB
Bash
Executable file

#!/usr/bin/env bash
# Fetch ML Rocky Linux Setup Script
# Optimized for ML experiments on Rocky Linux 8/9
#
# Must be run as root (main calls check_root first).
# Helpers used below but not defined in this file (log_info, log_error,
# log_warning, log_success, secure_download) and the FETCH_ML_USER,
# FETCH_ML_HOME, LOG_DIR, DATA_DIR and SERVICE_DIR variables are presumably
# provided by setup_common.sh — TODO confirm.
#
# Strict mode: abort on command failure, unset variables and failed pipeline stages.
set -euo pipefail
# Resolve the directory holding this script so the shared library is found
# regardless of the caller's working directory.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
# The directive must sit directly above the `source` command it annotates.
# shellcheck source=scripts/setup_common.sh
source "$SCRIPT_DIR/setup_common.sh"
#######################################
# Abort unless the script runs with root privileges.
# Globals:   EUID (read)
# Arguments: none
# Outputs:   error message via log_error when not root
# Returns:   0 when running as root; exits 1 otherwise
#######################################
check_root() {
  # Effective UID 0 is root; nothing else may continue.
  if (( EUID == 0 )); then
    return 0
  fi

  log_error "This script must be run as root"
  exit 1
}
#######################################
# Verify a dnf/yum based host, report the Rocky Linux version and select the
# package manager used by the rest of the script.
# Globals:   PKG_MANAGER (written) - "dnf" when available, otherwise "yum"
# Arguments: $1 - release file to read the version from
#                 (optional, defaults to /etc/rocky-release)
# Outputs:   detected version via log_info; a warning when it cannot be read
# Returns:   0 on success; exits 1 when neither dnf nor yum is available
#######################################
check_rocky() {
  local release_file=${1:-/etc/rocky-release}
  local rocky_version=""

  if ! command -v dnf &> /dev/null && ! command -v yum &> /dev/null; then
    log_error "This script is designed for Rocky Linux systems"
    exit 1
  fi

  # Declaration and assignment are kept separate so a failing grep is handled
  # explicitly instead of being masked by `local`. grep reads the file itself.
  if [[ -r "$release_file" ]]; then
    rocky_version=$(grep -oE '[0-9]+\.[0-9]+' "$release_file") || rocky_version=""
  fi
  if [[ -z "$rocky_version" ]]; then
    # Version detection is informational only, so keep going.
    log_warning "Could not determine Rocky Linux version from $release_file"
    rocky_version="unknown"
  fi
  log_info "Rocky Linux version: $rocky_version"

  # Prefer dnf whenever it is installed; fall back to yum otherwise.
  if command -v dnf &> /dev/null; then
    PKG_MANAGER="dnf"
  else
    PKG_MANAGER="yum"
  fi
}
#######################################
# Bring installed packages up to date and install basic download tooling.
# Globals:   PKG_MANAGER (read)
# Arguments: none
# Outputs:   progress via log_info plus package manager output
#######################################
update_system() {
  local action

  log_info "Updating system packages..."

  # Run both refresh actions, in the same order as before.
  for action in update upgrade; do
    $PKG_MANAGER "$action" -y
  done

  $PKG_MANAGER install -y curl wget gnupg2
}
#######################################
# Enable EPEL and the companion repository it depends on.
# Globals:   PKG_MANAGER (read)
# Arguments: none
# Outputs:   progress via log_info / log_success
# Returns:   0 on success (also when EPEL is already enabled)
#######################################
enable_epel() {
  local repo_list rhel_major
  local extras_repo="powertools"

  log_info "Enabling EPEL repository..."

  # Capture the listing first: piping straight into `grep -q` lets grep exit
  # early, which can fail the producer under `set -o pipefail`.
  repo_list=$("$PKG_MANAGER" repolist) || repo_list=""
  if grep -q "epel" <<< "$repo_list"; then
    log_info "EPEL already enabled"
    return
  fi

  "$PKG_MANAGER" install -y epel-release

  # Release 9 and later name the companion repository "crb"; release 8 uses
  # "powertools". Fall back to the old name when the release cannot be read.
  rhel_major=$(rpm -E '%rhel' 2> /dev/null) || rhel_major=""
  if [[ "$rhel_major" =~ ^[0-9]+$ ]] && (( rhel_major >= 9 )); then
    extras_repo="crb"
  fi
  "$PKG_MANAGER" config-manager --set-enabled "$extras_repo"

  log_success "EPEL repository enabled"
}
#######################################
# Install the Go 1.25 toolchain under /usr/local/go unless `go` already exists.
# Globals:   TMP_FILES (written), PATH (extended and exported)
# Arguments: none
# Outputs:   progress via log_info / log_success
# Returns:   0 on success (also when Go is already installed)
#######################################
install_go() {
  local go_version profile_line
  local -r tarball="go1.25.0.linux-amd64.tar.gz"
  local -r tarball_path="/tmp/${tarball}"
  local -r tarball_sha256="b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345"

  log_info "Installing Go 1.25..."

  if command -v go &> /dev/null; then
    # Third field of `go version` looks like "go1.22.1"; drop the "go" prefix.
    go_version=$(go version | awk '{print $3}') || go_version=""
    log_info "Go already installed: ${go_version#go}"
    return
  fi

  # Presumably read by a cleanup handler in setup_common.sh — TODO confirm.
  TMP_FILES="$tarball_path"
  secure_download "https://go.dev/dl/${tarball}" "$tarball_sha256" "$tarball_path"

  # Absolute paths are used so the caller's working directory stays unchanged.
  tar -C /usr/local -xzf "$tarball_path"

  # Add to PATH for login shells, once only, so re-runs do not pile up
  # duplicate lines in /etc/profile.
  for profile_line in \
    'export PATH=$PATH:/usr/local/go/bin' \
    'export PATH=$PATH:$HOME/go/bin'; do
    grep -qxF -- "$profile_line" /etc/profile 2> /dev/null \
      || echo "$profile_line" >> /etc/profile
  done
  export PATH=$PATH:/usr/local/go/bin

  log_success "Go 1.25 installed"
}
#######################################
# Install Podman, write a system-wide containers.conf and raise the user
# namespace limit.
# Globals:   PKG_MANAGER (read)
# Arguments: none
# Outputs:   progress via log_info / log_success
# Returns:   0 on success (also when Podman is already installed)
#######################################
install_podman() {
  local -r userns_setting="user.max_user_namespaces=15000"

  log_info "Installing Podman..."

  if command -v podman &> /dev/null; then
    log_info "Podman already installed"
    return
  fi

  # Install Podman and related tools
  "$PKG_MANAGER" install -y podman podman-compose containernetworking-plugins

  # Configure Podman. This replaces any existing containers.conf.
  # NOTE(review): confirm these keys and their sections against
  # containers.conf(5), e.g. whether `runtime` belongs under [engine].
  mkdir -p /etc/containers
  cat > /etc/containers/containers.conf << 'EOF'
[containers]
user_namespace_enable = 1
runtime = "crun"
[network]
network_backend = "netavark"
[engine]
cgroup_manager = "systemd"
EOF

  # Enable user namespaces: persist the setting once, then apply it now.
  # `sysctl -p` expects a file name, so a key=value pair must go through -w.
  grep -qxF -- "$userns_setting" /etc/sysctl.conf 2> /dev/null \
    || echo "$userns_setting" >> /etc/sysctl.conf
  sysctl -w "$userns_setting"

  log_success "Podman installed"
}
#######################################
# Install Redis, adjust its configuration and enable the service.
# Globals:   PKG_MANAGER (read)
# Arguments: none
# Outputs:   progress via log_info / log_warning / log_success
# Returns:   0 on success (also when Redis is already installed)
#######################################
install_redis() {
  local candidate
  local redis_conf=""

  log_info "Installing Redis..."

  if command -v redis-server &> /dev/null; then
    log_info "Redis already installed"
    return
  fi

  "$PKG_MANAGER" install -y redis

  # The packaged config file location differs between releases, so probe the
  # known paths instead of letting sed abort the run on a missing file.
  for candidate in /etc/redis.conf /etc/redis/redis.conf; do
    if [[ -f "$candidate" ]]; then
      redis_conf=$candidate
      break
    fi
  done

  # Configure Redis for production
  if [[ -n "$redis_conf" ]]; then
    sed -i 's/supervised no/supervised systemd/' "$redis_conf"
    sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' "$redis_conf"
  else
    log_warning "Redis configuration file not found, leaving package defaults in place"
  fi

  systemctl enable redis
  systemctl start redis

  log_success "Redis installed and configured"
}
#######################################
# Install NVIDIA drivers and the CUDA toolkit when an NVIDIA GPU is present.
# Globals:   PKG_MANAGER (read)
# Arguments: none
# Outputs:   progress via log_info / log_warning / log_success
# Returns:   0 when drivers exist already, no GPU is found, or install succeeds
#######################################
install_nvidia_drivers() {
  local rhel_major repo_url

  log_info "Checking for NVIDIA GPU..."

  if command -v nvidia-smi &> /dev/null; then
    log_info "NVIDIA drivers already installed"
    nvidia-smi
    return
  fi

  if ! lspci | grep -i nvidia &> /dev/null; then
    log_info "No NVIDIA GPU detected, skipping driver installation"
    return
  fi

  log_info "NVIDIA GPU detected, installing drivers..."

  # Enable NVIDIA repository. The repo file name carries the release number
  # (cuda-rhel8.repo, cuda-rhel9.repo, ...), same as the directory.
  rhel_major=$(rpm -E '%rhel')
  repo_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel${rhel_major}/x86_64/cuda-rhel${rhel_major}.repo"
  "$PKG_MANAGER" config-manager --add-repo="$repo_url"

  # Clean and install. The toolkit package in NVIDIA's repository is named
  # cuda-toolkit.
  "$PKG_MANAGER" clean all
  "$PKG_MANAGER" module enable -y nvidia-driver:latest-dkms
  "$PKG_MANAGER" install -y nvidia-driver cuda-toolkit

  # Best-effort check that containers can reach the GPU. A failure is expected
  # until the system has been rebooted, so it only produces a warning.
  if ! podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2> /dev/null; then
    log_warning "NVIDIA GPU access test failed, you may need to reboot"
  else
    log_success "NVIDIA drivers installed and GPU access verified"
  fi

  # Reboot required
  log_warning "System reboot required for NVIDIA drivers"
  log_info "Run: reboot"
}
#######################################
# Install Python, native build dependencies and common ML Python packages.
# Globals:   PKG_MANAGER (read)
# Arguments: none
# Outputs:   progress via log_info / log_success plus installer output
#######################################
install_ml_tools() {
  local -a python_pkgs=(python3 python3-pip python3-devel)
  local -a build_pkgs=(cmake git pkgconfig)
  local -a image_pkgs=(libjpeg-turbo-devel libpng-devel libtiff-devel)
  local -a opengl_pkgs=(mesa-libGL-devel mesa-libGLU-devel)
  local -a linalg_pkgs=(atlas-devel blas-devel lapack-devel)
  local -a pip_core=(numpy scipy scikit-learn pandas)
  local -a pip_notebook=(jupyter matplotlib seaborn)
  local -a pip_torch=(torch torchvision torchaudio)

  log_info "Installing ML tools and dependencies..."

  # Python interpreter and headers
  $PKG_MANAGER install -y "${python_pkgs[@]}"

  # Compilers and native libraries needed to build ML wheels
  $PKG_MANAGER groupinstall -y "Development Tools"
  $PKG_MANAGER install -y "${build_pkgs[@]}"
  $PKG_MANAGER install -y "${image_pkgs[@]}"
  $PKG_MANAGER install -y "${opengl_pkgs[@]}"
  $PKG_MANAGER install -y gtk3-devel
  $PKG_MANAGER install -y "${linalg_pkgs[@]}"

  # Python packages; PyTorch is pulled from the CPU-only wheel index
  pip3 install --upgrade pip
  pip3 install "${pip_core[@]}"
  pip3 install "${pip_notebook[@]}"
  pip3 install "${pip_torch[@]}" --index-url https://download.pytorch.org/whl/cpu

  log_success "ML tools installed"
}
#######################################
# Create the service account and the directories it owns.
# Globals:   FETCH_ML_USER, FETCH_ML_HOME, LOG_DIR, DATA_DIR (read)
# Arguments: none
# Outputs:   progress via log_info / log_warning / log_success
# Returns:   0 on success (also when the user already exists)
#######################################
create_user() {
  log_info "Creating fetchml user..."

  if id "$FETCH_ML_USER" &> /dev/null; then
    log_info "User $FETCH_ML_USER already exists"
    return
  fi

  useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER"

  # A "podman" group is not guaranteed to exist; only add the membership when
  # it does, instead of aborting the whole setup.
  if getent group podman > /dev/null; then
    usermod -aG podman "$FETCH_ML_USER"
  else
    log_warning "Group 'podman' does not exist, skipping group membership"
  fi

  # Create directories (quoted so paths containing whitespace stay intact)
  mkdir -p -- \
    "$FETCH_ML_HOME/.config/containers" \
    "$FETCH_ML_HOME/go/bin" \
    "$LOG_DIR" \
    "$DATA_DIR"
  chown -R -- "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR"

  log_success "User $FETCH_ML_USER created"
}
#######################################
# Enable firewalld and open the ports used by Fetch ML.
# Globals:   none
# Arguments: none
# Outputs:   progress via log_info / log_warning and the resulting rule list
#######################################
setup_firewall() {
  local port

  log_info "Configuring firewall..."

  if ! command -v firewall-cmd &> /dev/null; then
    log_warning "Firewalld not available, skipping firewall configuration"
    return
  fi

  systemctl enable firewalld
  systemctl start firewalld

  firewall-cmd --permanent --add-service=ssh
  # 8080: Worker API, 8081: Data manager API, 6379: Redis
  # NOTE(review): install_redis binds Redis to 127.0.0.1, so exposing 6379
  # may be unnecessary — confirm before relying on it.
  for port in 8080 8081 6379; do
    firewall-cmd --permanent --add-port="${port}/tcp"
  done

  firewall-cmd --reload
  firewall-cmd --list-all
}
#######################################
# Write one Fetch ML systemd unit file into SERVICE_DIR.
# Globals:   SERVICE_DIR, FETCH_ML_USER, FETCH_ML_HOME (read)
# Arguments: $1 - unit name without ".service" (also the syslog identifier)
#            $2 - human readable description
#            $3 - binary name under $FETCH_ML_HOME/bin
#######################################
_write_fetch_ml_unit() {
  local unit_name=$1
  local description=$2
  local binary=$3

  # The redirect target is quoted so a SERVICE_DIR containing whitespace does
  # not turn into an "ambiguous redirect".
  cat > "$SERVICE_DIR/${unit_name}.service" << EOF
[Unit]
Description=$description
After=network.target redis.service
Wants=redis.service
[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/$binary --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=$unit_name
[Install]
WantedBy=multi-user.target
EOF
}

#######################################
# Install and enable the worker and data manager systemd services.
# Globals:   SERVICE_DIR, FETCH_ML_USER, FETCH_ML_HOME (read)
# Arguments: none
# Outputs:   progress via log_info / log_success
#######################################
setup_systemd_services() {
  log_info "Setting up systemd services..."

  # Both units share one template and differ only in name, description and binary.
  _write_fetch_ml_unit fetch_ml_worker "Fetch ML Worker Service" worker
  _write_fetch_ml_unit fetch_ml_data_manager "Fetch ML Data Manager Service" data_manager

  # Enable services (they are enabled here, not started)
  systemctl daemon-reload
  systemctl enable fetch_ml_worker
  systemctl enable fetch_ml_data_manager

  log_success "Systemd services configured"
}
#######################################
# Write the logrotate policy for Fetch ML log files.
# Globals:   LOG_DIR, FETCH_ML_USER (read)
# Arguments: none
# Outputs:   progress via log_info / log_success;
#            writes /etc/logrotate.d/fetch_ml
#######################################
setup_log_rotation() {
  log_info "Setting up log rotation..."

  # One argument per output line. The quoted glob is written literally so that
  # logrotate, not the shell, expands it. The postrotate reloads are
  # best-effort (`|| true`).
  printf '%s\n' \
    "$LOG_DIR/*.log {" \
    "daily" \
    "missingok" \
    "rotate 30" \
    "compress" \
    "delaycompress" \
    "notifempty" \
    "create 0644 $FETCH_ML_USER $FETCH_ML_USER" \
    "postrotate" \
    "systemctl reload fetch_ml_worker || true" \
    "systemctl reload fetch_ml_data_manager || true" \
    "endscript" \
    "}" \
    > /etc/logrotate.d/fetch_ml

  log_success "Log rotation configured"
}
#######################################
# Apply file-descriptor limits and kernel tuning for ML workloads.
# Safe to re-run: settings already present are not appended again.
# Globals:   none
# Arguments: none
# Outputs:   progress via log_info / log_warning / log_success
#######################################
optimize_system() {
  local limit_line
  local -r limits_file="/etc/security/limits.conf"
  local -r sysctl_file="/etc/sysctl.conf"
  local -r sysctl_marker="# ML Optimization"

  log_info "Optimizing system for ML workloads..."

  # Increase file limits (each line is appended only if it is missing)
  for limit_line in "* soft nofile 65536" "* hard nofile 65536"; do
    grep -qxF -- "$limit_line" "$limits_file" 2> /dev/null \
      || echo "$limit_line" >> "$limits_file"
  done

  # Optimize kernel parameters for ML. The marker comment doubles as the
  # "already applied" flag so repeated runs do not duplicate the block.
  if ! grep -qxF -- "$sysctl_marker" "$sysctl_file" 2> /dev/null; then
    cat >> "$sysctl_file" << EOF
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
  fi
  sysctl -p

  # Configure GPU persistence mode if NVIDIA available
  if command -v nvidia-smi &> /dev/null; then
    nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
  fi

  # SELinux is not modified here; the operator only gets a hint.
  if [[ -f /etc/selinux/config ]]; then
    log_warning "Consider setting SELinux to permissive mode for better container compatibility"
    log_info "Edit /etc/selinux/config and set SELINUX=permissive"
  fi

  log_success "System optimized for ML workloads"
}
#######################################
# Build Fetch ML from a checkout under $FETCH_ML_HOME/fetch_ml and install its
# binaries and default configuration.
# Globals:   FETCH_ML_HOME, FETCH_ML_USER (read); PATH (extended and exported)
# Arguments: none
# Outputs:   progress via log_info / log_warning / log_success
# Returns:   0 on success, and also when the checkout is missing (a warning
#            with cloning instructions is logged instead)
#######################################
install_fetch_ml() {
  local -r repo_dir="$FETCH_ML_HOME/fetch_ml"
  local -r config_target="$FETCH_ML_HOME/configs/config-local.yaml"

  log_info "Installing Fetch ML..."

  # The repository is not fetched here; it has to be present already.
  if [[ ! -d "$repo_dir" ]]; then
    log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
    log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
    return
  fi

  # Build. `make -C` avoids changing the caller's working directory.
  export PATH=$PATH:/usr/local/go/bin
  make -C "$repo_dir" build

  # Copy binaries. The destination is created first; nothing else in this
  # file creates $FETCH_ML_HOME/bin.
  mkdir -p -- "$FETCH_ML_HOME/bin" "$FETCH_ML_HOME/configs"
  cp -- "$repo_dir"/bin/* "$FETCH_ML_HOME/bin/"
  chmod +x -- "$FETCH_ML_HOME"/bin/*

  # Copy configs, but never clobber a configuration the operator has edited.
  if [[ -e "$config_target" ]]; then
    log_info "Keeping existing configuration: $config_target"
  else
    cp -- "$repo_dir/configs/config-local.yaml.example" "$config_target"
  fi

  # Set permissions
  chown -R -- "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"

  log_success "Fetch ML installed"
}
#######################################
# Run every setup step in order, then print follow-up instructions.
# Globals:   FETCH_ML_HOME (read)
# Arguments: none used (any arguments passed in are ignored)
# Outputs:   progress via log_info / log_success, next steps on stdout
#######################################
main() {
  local step
  # Steps in execution order. Several are not defined in this file and are
  # presumably supplied by setup_common.sh — TODO confirm.
  local -a steps=(
    check_root
    check_rocky
    update_system
    enable_epel
    install_go
    install_podman
    install_redis
    install_nvidia_drivers
    install_ml_tools
    ensure_user
    create_directories
    setup_firewall
    setup_systemd_services
    setup_logrotate
    hardening_steps
    selinux_guidance
    install_fetch_ml
  )

  log_info "Starting Fetch ML Rocky Linux server setup..."

  for step in "${steps[@]}"; do
    "$step"
  done

  log_success "Fetch ML setup complete!"
  echo
  log_info "Next steps:"
  cat << EOF
1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml
2. Configure: $FETCH_ML_HOME/configs/config-local.yaml
3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager
4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager
5. View logs: journalctl -u fetch_ml_worker -f

EOF
  log_info "Services will be available at:"
  # The first address reported by `hostname -I` is used for both URLs.
  printf '%s\n' \
    "- Worker API: http://$(hostname -I | awk '{print $1}'):8080" \
    "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081"
}
# Entry point: run the full setup, forwarding the script's command-line
# arguments to main (which does not currently read them).
main "$@"