- Add production setup scripts for automated deployment - Include monitoring setup and configuration validation - Add legacy setup scripts for various Linux distributions - Implement Bitwarden integration for secure credential management - Add development and production environment setup - Include comprehensive management tools and utilities - Add shell script library with common functions Provides complete automation for setup, deployment, and management of FetchML platform in development and production environments.
417 lines
11 KiB
Bash
Executable file
417 lines
11 KiB
Bash
Executable file
#!/usr/bin/env bash

# Fetch ML Rocky Linux Setup Script
# Optimized for ML experiments on Rocky Linux 8/9
#
# Sources setup_common.sh from the directory this script lives in.
# NOTE(review): the log_* helpers, secure_download and the FETCH_ML_* /
# LOG_DIR / DATA_DIR / SERVICE_DIR variables used below are not defined in
# this file; presumably they come from setup_common.sh — confirm.

set -euo pipefail

# Resolve this script's own directory so the helper library is found
# regardless of the caller's working directory.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)

# The directive must sit directly above the command it annotates.
# shellcheck source=scripts/setup_common.sh
source "$SCRIPT_DIR/setup_common.sh"
|
|
|
|
# Guard: refuse to continue unless the effective user is root.
# Logs an error and exits with status 1 when EUID is non-zero.
check_root() {
    if (( EUID == 0 )); then
        return 0
    fi

    log_error "This script must be run as root"
    exit 1
}
|
|
|
|
# Sanity-check the host and pick the package manager used by later steps.
# Globals:   PKG_MANAGER (written) - "dnf" when available, otherwise "yum"
# Arguments: $1 - optional path to the release file
#                 (default: /etc/rocky-release)
# Exits:     1 when neither dnf nor yum is installed
check_rocky() {
    local release_file="${1:-/etc/rocky-release}"
    local release_text=""
    local rocky_version=""

    if ! command -v dnf &> /dev/null && ! command -v yum &> /dev/null; then
        log_error "This script is designed for Rocky Linux systems"
        exit 1
    fi

    # Read the file directly and match with a bash regex, so a missing file
    # or a non-matching line cannot abort the script or be silently masked.
    if [[ -r "$release_file" ]]; then
        release_text=$(< "$release_file")
        if [[ "$release_text" =~ [0-9]+\.[0-9]+ ]]; then
            rocky_version="${BASH_REMATCH[0]}"
        fi
    fi

    if [[ -n "$rocky_version" ]]; then
        log_info "Rocky Linux version: $rocky_version"
    else
        log_warning "Could not determine Rocky Linux version from $release_file"
    fi

    # Prefer dnf whenever it is installed; fall back to yum otherwise.
    if command -v dnf &> /dev/null; then
        PKG_MANAGER="dnf"
    else
        PKG_MANAGER="yum"
    fi
}
|
|
|
|
# Bring installed packages up to date and install the basic download tools.
# Globals: PKG_MANAGER (read) - package manager command chosen by check_rocky
update_system() {
    log_info "Updating system packages..."

    # In dnf, `update` is only an alias of `upgrade`, so a single pass is
    # enough; running both just repeats the same transaction.
    "$PKG_MANAGER" upgrade -y
    "$PKG_MANAGER" install -y curl wget gnupg2
}
|
|
|
|
# Enable EPEL plus the matching "builder" repository.
# Globals: PKG_MANAGER (read)
# Returns early (without touching anything) when a repo whose listing
# contains "epel" is already present.
enable_epel() {
    local enabled_repos rhel_major builder_repo

    log_info "Enabling EPEL repository..."

    # Capture the listing first: with pipefail, `repolist | grep -q` can be
    # reported as a failure when grep exits early and the producer gets
    # SIGPIPE.
    enabled_repos=$("$PKG_MANAGER" repolist) || enabled_repos=""
    if grep -q "epel" <<< "$enabled_repos"; then
        log_info "EPEL already enabled"
        return
    fi

    "$PKG_MANAGER" install -y epel-release

    # The repo is called "powertools" on EL8 and "crb" on EL9 and later;
    # enabling the wrong name fails and would abort the run under set -e.
    rhel_major=$(rpm -E '%{rhel}') || rhel_major=""
    if [[ "$rhel_major" =~ ^[0-9]+$ ]] && (( rhel_major >= 9 )); then
        builder_repo="crb"
    else
        builder_repo="powertools"
    fi
    "$PKG_MANAGER" config-manager --set-enabled "$builder_repo"

    log_success "EPEL repository enabled"
}
|
|
|
|
# Install the Go 1.25.0 toolchain under /usr/local/go (linux-amd64 build).
# Globals:   TMP_FILES (written) - path of the downloaded tarball
#            PATH (modified and exported)
# Skips everything when a `go` command is already on PATH.
install_go() {
    local go_tarball="go1.25.0.linux-amd64.tar.gz"
    local go_sha256="b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345"
    local tarball_path="/tmp/${go_tarball}"
    local installed_version profile_line

    log_info "Installing Go 1.25..."

    if command -v go &> /dev/null; then
        # Declared separately so a failing pipeline is not hidden by `local`.
        installed_version=$(go version | awk '{print $3}') || installed_version=""
        log_info "Go already installed: ${installed_version#go}"
        return
    fi

    # NOTE(review): TMP_FILES is presumably consumed by cleanup logic in
    # setup_common.sh — confirm.
    TMP_FILES="$tarball_path"
    secure_download "https://go.dev/dl/${go_tarball}" "$go_sha256" "$tarball_path"

    # Remove any stale tree first; unpacking a release on top of an older
    # one can leave a mixed, broken installation. The absolute tarball path
    # avoids changing the working directory for the rest of the script.
    rm -rf /usr/local/go
    tar -C /usr/local -xzf "$tarball_path"

    # Add to PATH for future login shells, once only (safe on re-runs).
    # shellcheck disable=SC2016 # the literal $PATH/$HOME must reach the file
    for profile_line in \
        'export PATH=$PATH:/usr/local/go/bin' \
        'export PATH=$PATH:$HOME/go/bin'; do
        grep -qxF -- "$profile_line" /etc/profile 2> /dev/null \
            || printf '%s\n' "$profile_line" >> /etc/profile
    done
    export PATH=$PATH:/usr/local/go/bin

    log_success "Go 1.25 installed"
}
|
|
|
|
# Install Podman, write a system-wide containers.conf and raise the user
# namespace limit.
# Globals: PKG_MANAGER (read)
# Skips everything when a `podman` command is already on PATH.
install_podman() {
    local userns_setting="user.max_user_namespaces=15000"

    log_info "Installing Podman..."

    if command -v podman &> /dev/null; then
        log_info "Podman already installed"
        return
    fi

    # Install Podman and related tools
    "$PKG_MANAGER" install -y podman podman-compose containernetworking-plugins

    # Configure Podman (overwrites any existing system-wide file).
    # NOTE(review): containers.conf(5) documents `runtime` under [engine],
    # and `user_namespace_enable` does not look like a documented key —
    # confirm these settings take effect as intended.
    mkdir -p /etc/containers
    cat > /etc/containers/containers.conf << 'EOF'
[containers]
user_namespace_enable = 1
runtime = "crun"

[network]
network_backend = "netavark"

[engine]
cgroup_manager = "systemd"
EOF

    # Enable user namespaces: persist the setting once, then apply it now.
    # `sysctl -p` expects a file name, so the live value must be set with -w.
    grep -qxF -- "$userns_setting" /etc/sysctl.conf 2> /dev/null \
        || printf '%s\n' "$userns_setting" >> /etc/sysctl.conf
    sysctl -w "$userns_setting"

    log_success "Podman installed"
}
|
|
|
|
# Install Redis, restrict it to the IPv4 loopback and start it via systemd.
# Globals: PKG_MANAGER (read)
# Skips everything when `redis-server` is already on PATH.
install_redis() {
    local redis_conf=""
    local candidate

    log_info "Installing Redis..."

    if command -v redis-server &> /dev/null; then
        log_info "Redis already installed"
        return
    fi

    "$PKG_MANAGER" install -y redis

    # The config lives at /etc/redis.conf with older packages and at
    # /etc/redis/redis.conf with newer ones; pointing sed at a missing file
    # would abort the whole setup under set -e.
    for candidate in /etc/redis/redis.conf /etc/redis.conf; do
        if [[ -f "$candidate" ]]; then
            redis_conf="$candidate"
            break
        fi
    done

    # Configure Redis for production
    if [[ -n "$redis_conf" ]]; then
        sed -i \
            -e 's/supervised no/supervised systemd/' \
            -e 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' \
            "$redis_conf"
    else
        log_warning "Redis configuration file not found, keeping package defaults"
    fi

    systemctl enable redis
    systemctl start redis

    log_success "Redis installed and configured"
}
|
|
|
|
# Install NVIDIA drivers and the CUDA toolkit when an NVIDIA GPU is present.
# Globals: PKG_MANAGER (read)
# Does nothing beyond logging when drivers are already installed
# (nvidia-smi on PATH) or when lspci lists no NVIDIA device.
install_nvidia_drivers() {
    local rhel_major pci_listing cuda_repo_base

    log_info "Checking for NVIDIA GPU..."

    if command -v nvidia-smi &> /dev/null; then
        log_info "NVIDIA drivers already installed"
        nvidia-smi
        return
    fi

    # Without lspci the probe below cannot see any device; say so instead of
    # silently reporting "no GPU".
    if ! command -v lspci &> /dev/null; then
        log_warning "lspci not found (install pciutils), cannot probe for NVIDIA GPUs"
    fi
    pci_listing=$(lspci 2> /dev/null) || pci_listing=""

    if ! grep -qi nvidia <<< "$pci_listing"; then
        log_info "No NVIDIA GPU detected, skipping driver installation"
        return
    fi

    log_info "NVIDIA GPU detected, installing drivers..."

    # Enable NVIDIA repository. The repo file name carries the EL major
    # version as well (e.g. rhel9/x86_64/cuda-rhel9.repo).
    rhel_major=$(rpm -E '%{rhel}')
    cuda_repo_base="https://developer.download.nvidia.com/compute/cuda/repos/rhel${rhel_major}/x86_64"
    "$PKG_MANAGER" config-manager --add-repo="${cuda_repo_base}/cuda-rhel${rhel_major}.repo"

    # Clean and install. The toolkit package in NVIDIA's EL repository is
    # `cuda-toolkit`; `nvidia-cuda-toolkit` is the Debian/Ubuntu name.
    "$PKG_MANAGER" clean all
    "$PKG_MANAGER" module enable -y nvidia-driver:latest-dkms
    "$PKG_MANAGER" install -y nvidia-driver cuda-toolkit

    # Probe GPU access from a container; failure is expected until reboot.
    if ! podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2> /dev/null; then
        log_warning "NVIDIA GPU access test failed, you may need to reboot"
    else
        log_success "NVIDIA drivers installed and GPU access verified"
    fi

    # Reboot required
    log_warning "System reboot required for NVIDIA drivers"
    log_info "Run: reboot"
}
|
|
|
|
# Install Python, native build dependencies and common ML Python packages.
# Globals: PKG_MANAGER (read)
install_ml_tools() {
    # Python runtime, build tooling and the native libraries (imaging,
    # OpenGL, GTK, linear algebra) that ML packages link against.
    local -a rpm_packages=(
        python3 python3-pip python3-devel
        cmake git pkgconfig
        libjpeg-turbo-devel libpng-devel libtiff-devel
        mesa-libGL-devel mesa-libGLU-devel
        gtk3-devel
        atlas-devel blas-devel lapack-devel
    )

    log_info "Installing ML tools and dependencies..."

    # One transaction instead of six: a single dependency resolution.
    "$PKG_MANAGER" install -y "${rpm_packages[@]}"
    "$PKG_MANAGER" groupinstall -y "Development Tools"

    # Install common ML libraries. `python3 -m pip` pins pip to the
    # interpreter installed above rather than whichever pip3 is on PATH.
    python3 -m pip install --upgrade pip
    python3 -m pip install numpy scipy scikit-learn pandas
    python3 -m pip install jupyter matplotlib seaborn
    # CPU-only PyTorch wheels.
    python3 -m pip install torch torchvision torchaudio \
        --index-url https://download.pytorch.org/whl/cpu

    log_success "ML tools installed"
}
|
|
|
|
# Create the Fetch ML service account and the directories it owns.
# Globals: FETCH_ML_USER, FETCH_ML_HOME, LOG_DIR, DATA_DIR (read)
# Returns early, changing nothing, when the account already exists.
create_user() {
    log_info "Creating fetchml user..."

    if id "$FETCH_ML_USER" &> /dev/null; then
        log_info "User $FETCH_ML_USER already exists"
        return
    fi

    useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER"

    # Only join the `podman` group when it actually exists: usermod fails on
    # an unknown group, which would abort the whole setup under set -e.
    if getent group podman &> /dev/null; then
        usermod -aG podman "$FETCH_ML_USER"
    fi

    # Create directories
    mkdir -p -- \
        "$FETCH_ML_HOME/.config/containers" \
        "$FETCH_ML_HOME/go/bin" \
        "$LOG_DIR" \
        "$DATA_DIR"

    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR"

    log_success "User $FETCH_ML_USER created"
}
|
|
|
|
# Open the SSH service and the Fetch ML ports in firewalld.
# When firewall-cmd is not installed, only a warning is logged.
setup_firewall() {
    local tcp_port

    log_info "Configuring firewall..."

    if ! command -v firewall-cmd &> /dev/null; then
        log_warning "Firewalld not available, skipping firewall configuration"
        return
    fi

    systemctl enable firewalld
    systemctl start firewalld

    firewall-cmd --permanent --add-service=ssh

    # 8080 = Worker API, 8081 = Data manager API, 6379 = Redis.
    # NOTE(review): install_redis binds Redis to 127.0.0.1, so opening
    # 6379/tcp looks unnecessary — confirm remote Redis access is intended.
    for tcp_port in 8080 8081 6379; do
        firewall-cmd --permanent --add-port="${tcp_port}/tcp"
    done
    firewall-cmd --reload

    # Print the resulting configuration for the operator.
    firewall-cmd --list-all
}
|
|
|
|
# Write one Fetch ML systemd unit file into $SERVICE_DIR.
# Arguments: $1 - unit name without ".service" (also the syslog identifier)
#            $2 - human-readable Description= text
#            $3 - binary name under $FETCH_ML_HOME/bin
_write_fetch_ml_unit() {
    local unit_name=$1
    local description=$2
    local binary=$3

    cat > "$SERVICE_DIR/${unit_name}.service" << EOF
[Unit]
Description=$description
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/$binary --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=$unit_name

[Install]
WantedBy=multi-user.target
EOF
}

# Generate and enable the worker and data-manager systemd units.
# Globals: SERVICE_DIR, FETCH_ML_USER, FETCH_ML_HOME (read)
# The units are enabled but not started here.
setup_systemd_services() {
    log_info "Setting up systemd services..."

    # Both units share one template; only name, description and binary vary.
    _write_fetch_ml_unit fetch_ml_worker "Fetch ML Worker Service" worker
    _write_fetch_ml_unit fetch_ml_data_manager "Fetch ML Data Manager Service" data_manager

    # Enable services
    systemctl daemon-reload
    systemctl enable fetch_ml_worker
    systemctl enable fetch_ml_data_manager

    log_success "Systemd services configured"
}
|
|
|
|
# Install a logrotate policy for $LOG_DIR/*.log: daily rotation, 30 files
# kept, compression delayed by one cycle.
# Globals: LOG_DIR, FETCH_ML_USER (read)
setup_log_rotation() {
    local policy_file=/etc/logrotate.d/fetch_ml

    log_info "Setting up log rotation..."

    # One argument per output line; the file is replaced on every call.
    printf '%s\n' \
        "$LOG_DIR/*.log {" \
        "daily" \
        "missingok" \
        "rotate 30" \
        "compress" \
        "delaycompress" \
        "notifempty" \
        "create 0644 $FETCH_ML_USER $FETCH_ML_USER" \
        "postrotate" \
        "systemctl reload fetch_ml_worker || true" \
        "systemctl reload fetch_ml_data_manager || true" \
        "endscript" \
        "}" > "$policy_file"

    log_success "Log rotation configured"
}
|
|
|
|
# Apply file-descriptor limits and kernel tuning for ML workloads.
# Arguments: $1 - optional limits file (default: /etc/security/limits.conf)
#            $2 - optional sysctl file (default: /etc/sysctl.conf)
# Safe to re-run: entries already present are not appended a second time.
optimize_system() {
    local limits_conf="${1:-/etc/security/limits.conf}"
    local sysctl_conf="${2:-/etc/sysctl.conf}"
    local limit_line

    log_info "Optimizing system for ML workloads..."

    # Increase file limits (append each line only if it is not there yet)
    for limit_line in "* soft nofile 65536" "* hard nofile 65536"; do
        grep -qxF -- "$limit_line" "$limits_conf" 2> /dev/null \
            || printf '%s\n' "$limit_line" >> "$limits_conf"
    done

    # Optimize kernel parameters for ML; the marker comment doubles as the
    # "already applied" check so repeated runs do not stack duplicates.
    if ! grep -qxF -- "# ML Optimization" "$sysctl_conf" 2> /dev/null; then
        cat >> "$sysctl_conf" << 'EOF'
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
    fi

    # Load the settings from the file that was just updated.
    sysctl -p "$sysctl_conf"

    # Configure GPU persistence mode if NVIDIA available
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
    fi

    # SELinux is not modified here; only guidance is printed.
    if [[ -f /etc/selinux/config ]]; then
        log_warning "Consider setting SELinux to permissive mode for better container compatibility"
        log_info "Edit /etc/selinux/config and set SELINUX=permissive"
    fi

    log_success "System optimized for ML workloads"
}
|
|
|
|
# Build Fetch ML from an existing checkout and install binaries and config.
# Globals: FETCH_ML_HOME, FETCH_ML_USER (read); PATH (modified and exported)
# Expects the repository at $FETCH_ML_HOME/fetch_ml; when it is missing,
# instructions are logged and nothing is installed.
install_fetch_ml() {
    local config_dest="$FETCH_ML_HOME/configs/config-local.yaml"

    log_info "Installing Fetch ML..."

    # Clone or copy Fetch ML
    cd "$FETCH_ML_HOME"

    if [[ ! -d "fetch_ml" ]]; then
        log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
        log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
        return
    fi

    cd fetch_ml

    # Build
    export PATH=$PATH:/usr/local/go/bin
    make build

    # Copy binaries; the destination may not exist on a fresh host.
    mkdir -p "$FETCH_ML_HOME/bin" "$FETCH_ML_HOME/configs"
    cp bin/* "$FETCH_ML_HOME/bin/"
    chmod +x "$FETCH_ML_HOME"/bin/*

    # Copy configs: seed from the example only when no config exists yet, so
    # re-running the installer never clobbers local edits.
    if [[ -e "$config_dest" ]]; then
        log_info "Keeping existing configuration: $config_dest"
    else
        cp configs/config-local.yaml.example "$config_dest"
    fi

    # Set permissions
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"

    log_success "Fetch ML installed"
}
|
|
|
|
# Orchestrate the full server setup, then print follow-up instructions.
# Globals: FETCH_ML_HOME (read)
# NOTE(review): ensure_user, create_directories, setup_logrotate,
# hardening_steps and selinux_guidance are not defined in this file;
# presumably they come from setup_common.sh — confirm.
main() {
    local step
    # Setup steps, run strictly in this order.
    local -a setup_steps=(
        check_root
        check_rocky
        update_system
        enable_epel
        install_go
        install_podman
        install_redis
        install_nvidia_drivers
        install_ml_tools
        ensure_user
        create_directories
        setup_firewall
        setup_systemd_services
        setup_logrotate
        hardening_steps
        selinux_guidance
        install_fetch_ml
    )

    log_info "Starting Fetch ML Rocky Linux server setup..."

    for step in "${setup_steps[@]}"; do
        "$step"
    done

    log_success "Fetch ML setup complete!"
    echo
    log_info "Next steps:"
    printf '%s\n' \
        "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml" \
        "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml" \
        "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager" \
        "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager" \
        "5. View logs: journalctl -u fetch_ml_worker -f"
    echo
    log_info "Services will be available at:"
    # The first address reported by `hostname -I` is used in both URLs.
    echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080"
    echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081"
}
|
|
|
|
# Run main function, forwarding all command-line arguments unchanged
|
|
main "$@"
|