- Add production setup scripts for automated deployment
- Include monitoring setup and configuration validation
- Add legacy setup scripts for various Linux distributions
- Implement Bitwarden integration for secure credential management
- Add development and production environment setup
- Include comprehensive management tools and utilities
- Add shell script library with common functions

Provides complete automation for setup, deployment, and management of the FetchML platform in development and production environments.
294 lines
8.7 KiB
Bash
Executable file
294 lines
8.7 KiB
Bash
Executable file
#!/usr/bin/env bash
#
# Fetch ML Ubuntu Server Setup Script
# Optimized for ML experiments on Ubuntu 20.04/22.04
#
# Helpers used below but not defined in this file (log_info, log_error,
# secure_download, ensure_user, ...) are presumably provided by
# setup_common.sh, which is sourced from the directory holding this script.

set -euo pipefail

# Resolve the directory containing this script so the helper library is found
# regardless of the caller's working directory.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)

# The directive must sit directly above the `source` command it annotates;
# ShellCheck applies a directive only to the command that follows it.
# shellcheck source=scripts/setup_common.sh
source "$SCRIPT_DIR/setup_common.sh"

#######################################
# Abort the script unless it is running with an effective UID of 0.
# Globals:   EUID (read)
# Outputs:   error message via log_error when not root
# Returns:   0 when root; otherwise exits the shell with status 1
#######################################
check_root() {
  if (( EUID == 0 )); then
    return 0
  fi

  log_error "This script must be run as root"
  exit 1
}

#######################################
# Verify the host looks like a supported Ubuntu system.
#
# Exits with status 1 when apt-get is unavailable. Otherwise logs the detected
# release and warns when it is older than 20.04. The release is taken from
# lsb_release when present, falling back to VERSION_ID in /etc/os-release, so
# the check does not depend on optional packages (lsb-release, bc).
# Outputs:   messages via log_info / log_warning / log_error
# Returns:   0 on a system with apt-get; exits 1 otherwise
#######################################
check_ubuntu() {
  if ! command -v apt-get &>/dev/null; then
    log_error "This script is designed for Ubuntu systems"
    exit 1
  fi

  # Declaration is kept separate from the command substitutions so a failing
  # probe is handled explicitly instead of being masked by `local`.
  local ubuntu_version=""
  if command -v lsb_release &>/dev/null; then
    ubuntu_version=$(lsb_release -rs) || ubuntu_version=""
  fi
  if [[ -z "$ubuntu_version" && -r /etc/os-release ]]; then
    # Sourced inside the command-substitution subshell so os-release
    # variables do not leak into this script.
    # shellcheck disable=SC1091
    ubuntu_version=$(. /etc/os-release && printf '%s' "${VERSION_ID:-}") \
      || ubuntu_version=""
  fi
  log_info "Ubuntu version: $ubuntu_version"

  if [[ "$ubuntu_version" =~ ^([0-9]+)\.([0-9]+) ]]; then
    # Force base 10 so components such as "04" or "08" are not read as octal.
    local major=$(( 10#${BASH_REMATCH[1]} ))
    local minor=$(( 10#${BASH_REMATCH[2]} ))
    if (( major < 20 || (major == 20 && minor < 4) )); then
      log_warning "Ubuntu version < 20.04 may not support all features"
    fi
  else
    log_warning "Could not determine Ubuntu version; skipping version check"
  fi
}

#######################################
# Refresh the apt package index, upgrade installed packages, and install the
# base tooling used by the later installation steps.
# Outputs:   progress via log_info plus apt-get output
# Returns:   status of the last apt-get invocation
#######################################
update_system() {
  local -a base_packages=(
    curl
    wget
    gnupg
    lsb-release
    software-properties-common
  )
  local apt_action

  log_info "Updating system packages..."

  for apt_action in update upgrade; do
    apt-get "$apt_action" -y
  done

  apt-get install -y "${base_packages[@]}"
}

#######################################
# Install Go 1.25.0 (linux-amd64) into /usr/local/go unless a `go` command is
# already available on PATH.
#
# Globals:   TMP_FILES (written), PATH (extended and exported)
# Outputs:   progress via log_info / log_success
# Returns:   0 on success or when Go is already present; any failing step
#            aborts under the script's `set -e`
#######################################
install_go() {
  local -r go_tarball="go1.25.0.linux-amd64.tar.gz"
  local -r go_tarball_path="/tmp/${go_tarball}"
  local -r go_sha256="b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345"
  local -r profile_file="/etc/profile"

  log_info "Installing Go 1.25..."

  if command -v go &>/dev/null; then
    # Assign separately from `local` so a failing `go version` is not masked,
    # and never abort merely because the version string could not be read.
    local go_version
    go_version=$(go version | awk '{print $3}') || go_version="unknown"
    log_info "Go already installed: ${go_version#go}"
    return
  fi

  # NOTE(review): TMP_FILES is a global that is not read in this file;
  # presumably a cleanup handler in setup_common.sh consumes it - confirm.
  TMP_FILES="$go_tarball_path"
  secure_download "https://go.dev/dl/${go_tarball}" "$go_sha256" "$go_tarball_path"

  # Go's install instructions require removing any previous tree first:
  # extracting over an existing /usr/local/go can leave a broken toolchain.
  rm -rf /usr/local/go
  # Use the absolute tarball path rather than `cd /tmp`, so the caller's
  # working directory is left untouched.
  tar -C /usr/local -xzf "$go_tarball_path"

  # Add to PATH for login shells, but only once so reruns do not pile up
  # duplicate lines. The single quotes are intentional: $PATH and $HOME must
  # be expanded when the profile is read, not now.
  local profile_line
  # shellcheck disable=SC2016
  for profile_line in \
    'export PATH=$PATH:/usr/local/go/bin' \
    'export PATH=$PATH:$HOME/go/bin'; do
    if ! grep -qxF -- "$profile_line" "$profile_file" 2>/dev/null; then
      printf '%s\n' "$profile_line" >>"$profile_file"
    fi
  done

  # Make Go usable for the remainder of this script run as well.
  export PATH=$PATH:/usr/local/go/bin

  log_success "Go 1.25 installed"
}

#######################################
# Install Podman and podman-compose from the Kubic libcontainers "stable"
# repository unless a `podman` command is already on PATH.
#
# The repository key is stored as a dedicated keyring referenced through
# `signed-by` rather than added with the deprecated `apt-key`, so the key is
# trusted for this repository only.
# Outputs:   progress via log_info / log_success
# Returns:   0 on success or when Podman is already present; any failing step
#            aborts under the script's `set -e`
#######################################
install_podman() {
  log_info "Installing Podman..."

  if command -v podman &>/dev/null; then
    log_info "Podman already installed"
    return
  fi

  # Query the release once; assigned separately from `local` so a failure of
  # lsb_release is not masked.
  local release
  release=$(lsb_release -rs)

  local -r repo_url="https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${release}"
  local -r keyring="/etc/apt/keyrings/devel_kubic_libcontainers_stable.gpg"
  local -r sources_list="/etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list"

  # Add official Podman repository.
  # /etc/apt/keyrings may be absent, so create it explicitly.
  install -d -m 0755 /etc/apt/keyrings
  # -f makes curl fail on HTTP errors instead of piping an error page to gpg.
  curl -fsSL "${repo_url}/Release.key" | gpg --batch --yes --dearmor -o "$keyring"
  chmod 0644 "$keyring"
  printf 'deb [signed-by=%s] %s/ /\n' "$keyring" "$repo_url" >"$sources_list"

  apt-get update -y
  apt-get install -y podman podman-compose

  # Configure Podman for rootless operation.
  # NOTE(review): these keys are appended without a TOML table header, so they
  # land in whichever table the file currently ends with - confirm that
  # containers.conf actually honours them in that position.
  mkdir -p /etc/containers
  echo "user_namespace_enable = 1" >> /etc/containers/containers.conf
  echo "runtime = \"crun\"" >> /etc/containers/containers.conf

  log_success "Podman installed"
}

#######################################
# Install redis-server, adjust its configuration, and make sure the running
# service picks the new configuration up.
#
# Skipped entirely when a `redis-server` command is already on PATH.
# Outputs:   progress via log_info / log_success
# Returns:   0 on success or when Redis is already present; any failing step
#            aborts under the script's `set -e`
#######################################
install_redis() {
  local -r redis_conf="/etc/redis/redis.conf"

  log_info "Installing Redis..."

  if command -v redis-server &>/dev/null; then
    log_info "Redis already installed"
    return
  fi

  apt-get install -y redis-server

  # Configure Redis for production: hand supervision to systemd and listen on
  # the IPv4 loopback address only.
  sed -i \
    -e 's/supervised no/supervised systemd/' \
    -e 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' \
    "$redis_conf"

  systemctl enable redis-server
  # Package installation normally starts the service right away, and `start`
  # is a no-op on a running unit. Restart so the edited configuration is
  # actually loaded.
  systemctl restart redis-server

  log_success "Redis installed and configured"
}

#######################################
# Install NVIDIA drivers and the CUDA toolkit when an NVIDIA device is listed
# by lspci and no `nvidia-smi` command is present yet.
#
# Globals:   TMP_FILES (written)
# Outputs:   progress via log_info / log_warning / log_success; nvidia-smi
#            output when drivers are already installed
# Returns:   0 when nothing needs doing or installation completed; any
#            failing install step aborts under the script's `set -e`
#######################################
install_nvidia_drivers() {
  log_info "Checking for NVIDIA GPU..."

  if command -v nvidia-smi &>/dev/null; then
    log_info "NVIDIA drivers already installed"
    nvidia-smi
    return
  fi

  # grep deliberately consumes all of lspci's output (no -q): with pipefail,
  # an early-exiting grep could make lspci die of SIGPIPE and fail the test.
  if ! lspci | grep -i nvidia &>/dev/null; then
    log_info "No NVIDIA GPU detected, skipping driver installation"
    return
  fi

  log_info "NVIDIA GPU detected, installing drivers..."

  # NVIDIA's repository directories are named after the full release with the
  # dot removed (22.04 -> ubuntu2204), not after the major version alone.
  local release
  release=$(lsb_release -rs)
  local -r repo_id="ubuntu${release//./}"
  local -r keyring_deb="/tmp/cuda-keyring_1.1-1_all.deb"

  # Add NVIDIA repository.
  # NOTE(review): TMP_FILES is a global that is not read in this file;
  # presumably a cleanup handler in setup_common.sh consumes it - confirm.
  TMP_FILES="$keyring_deb"
  secure_download \
    "https://developer.download.nvidia.com/compute/cuda/repos/${repo_id}/x86_64/cuda-keyring_1.1-1_all.deb" \
    "cfa6b4109e7e3d9be060a016b7dc07e8edcd5356c0eabcc0c537a76e6c603d76" \
    "$keyring_deb"
  dpkg -i "$keyring_deb"
  apt-get update -y

  # Install drivers
  apt-get install -y nvidia-driver-535 nvidia-cuda-toolkit

  # Smoke-test GPU access from a container. A failure is only a warning,
  # because a freshly installed driver may need a reboot before it is usable.
  if podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then
    log_success "NVIDIA drivers installed and GPU access verified"
  else
    log_warning "NVIDIA GPU access test failed, you may need to reboot"
  fi
}

#######################################
# Install Python, native build dependencies, and common Python ML libraries.
#
# Each entry in the group lists below becomes one `apt-get install -y` or one
# `pip3 install` invocation, executed in the listed order.
# Outputs:   progress via log_info / log_success plus installer output
# Returns:   0 on success; any failing install aborts under `set -e`
#######################################
install_ml_tools() {
  # Python first, then system dependencies for ML.
  local -a apt_groups=(
    "python3 python3-pip python3-venv"
    "build-essential cmake git pkg-config"
    "libjpeg-dev libpng-dev libtiff-dev"
    "libavcodec-dev libavformat-dev libswscale-dev"
    "libgtk2.0-dev libcanberra-gtk-module"
    "libxvidcore-dev libx264-dev"
    "libatlas-base-dev gfortran"
  )
  # Common ML libraries; torch is pulled from the index URL given below.
  local -a pip_groups=(
    "--upgrade pip"
    "numpy scipy scikit-learn pandas"
    "jupyter matplotlib seaborn"
    "torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu"
  )
  local group
  local -a words

  log_info "Installing ML tools and dependencies..."

  for group in "${apt_groups[@]}"; do
    # Split on spaces explicitly so the result does not depend on the
    # ambient IFS.
    IFS=' ' read -ra words <<<"$group"
    apt-get install -y "${words[@]}"
  done

  for group in "${pip_groups[@]}"; do
    IFS=' ' read -ra words <<<"$group"
    pip3 install "${words[@]}"
  done

  log_success "ML tools installed"
}

#######################################
# Create the service account and its directories by delegating to the
# ensure_user and create_directories helpers (defined outside this file).
# Globals:   FETCH_ML_USER (read, for the success message)
# Outputs:   progress via log_info / log_success
#######################################
create_user() {
  log_info "Creating fetchml user..."

  local helper
  for helper in ensure_user create_directories; do
    "$helper"
  done

  log_success "User ${FETCH_ML_USER} and directories created"
}

#######################################
# Configure UFW: allow SSH and the service ports, then enable the firewall.
#
# The allow rules are registered BEFORE the firewall is enabled, so an
# administrator connected over SSH is never left without a matching rule
# while the firewall is coming up.
# Outputs:   progress via log_info / log_warning, plus `ufw status` output
# Returns:   0 when UFW is missing (a warning is logged) or once configured
#######################################
setup_firewall() {
  log_info "Configuring firewall..."

  if ! command -v ufw &>/dev/null; then
    log_warning "UFW not available, skipping firewall configuration"
    return
  fi

  ufw allow ssh
  ufw allow 8080/tcp # Worker API
  ufw allow 8081/tcp # Data manager API
  # NOTE(review): install_redis in this file rewrites the Redis bind line to
  # 127.0.0.1 only, so opening 6379 externally may be unnecessary - confirm.
  ufw allow 6379/tcp # Redis

  ufw --force enable
  ufw status
}

#######################################
# Register the worker and data-manager services through the
# setup_systemd_service helper (defined outside this file), reload systemd,
# and enable both units. The units are enabled but not started here.
# Globals:   FETCH_ML_HOME (read)
# Outputs:   progress via log_info / log_success
#######################################
setup_systemd_services() {
  local -a components=(worker data_manager)
  local -r config_path="$FETCH_ML_HOME/configs/config-local.yaml"
  local component

  log_info "Setting up systemd services..."

  for component in "${components[@]}"; do
    setup_systemd_service "fetch_ml_${component}" \
      "$FETCH_ML_HOME/bin/${component} --config ${config_path}"
  done

  # Enable services
  systemctl daemon-reload
  for component in "${components[@]}"; do
    systemctl enable "fetch_ml_${component}"
  done

  log_success "Systemd services configured"
}

#######################################
# Logging wrapper around the setup_logrotate helper (defined outside this
# file): announces the step, runs the helper, then reports success.
# Outputs:   progress via log_info / log_success
#######################################
setup_log_rotation() {
  log_info "Setting up log rotation..."

  setup_logrotate

  log_success "Log rotation configured"
}

#######################################
# Apply system hardening and kernel tuning for ML workloads.
#
# Runs hardening_steps (defined outside this file), appends the ML sysctl
# settings to the sysctl configuration file if they are not already there,
# loads that file, and enables GPU persistence mode when nvidia-smi exists.
# Arguments: $1 - sysctl configuration file to extend and load
#                 (optional, default: /etc/sysctl.conf)
# Outputs:   progress via log_info / log_warning / log_success
# Returns:   0 on success; a failing sysctl aborts under `set -e`
#######################################
optimize_system() {
  local -r sysctl_conf=${1:-/etc/sysctl.conf}
  local -r marker="# ML Optimization"

  log_info "Optimizing system for ML workloads..."
  hardening_steps

  # Optimize kernel parameters for ML. The marker comment makes this
  # idempotent: rerunning the setup must not append the block again.
  if ! grep -qxF -- "$marker" "$sysctl_conf" 2>/dev/null; then
    cat >>"$sysctl_conf" <<'EOF'

# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
  fi

  sysctl -p "$sysctl_conf"

  # Configure GPU persistence mode if NVIDIA available; best-effort only.
  if command -v nvidia-smi &>/dev/null; then
    nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
  fi

  log_success "System optimized for ML workloads"
}

#######################################
# Build Fetch ML from a checkout at $FETCH_ML_HOME/fetch_ml and install its
# binaries and default configuration under $FETCH_ML_HOME.
#
# When the checkout is missing, only a hint on how to clone it is logged.
# An existing config-local.yaml is kept, so rerunning the setup never
# discards local configuration changes.
# Globals:   FETCH_ML_HOME (read), FETCH_ML_USER (read), PATH (extended)
# Outputs:   progress via log_info / log_warning / log_success
# Returns:   0 on success or when the checkout is missing; a failing build
#            or copy step aborts under the script's `set -e`
#######################################
install_fetch_ml() {
  local -r repo_dir="$FETCH_ML_HOME/fetch_ml"
  local -r bin_dir="$FETCH_ML_HOME/bin"
  local -r config_dir="$FETCH_ML_HOME/configs"
  local -r config_file="$config_dir/config-local.yaml"

  log_info "Installing Fetch ML..."

  if [[ ! -d "$repo_dir" ]]; then
    # This would be replaced with actual repository URL
    log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
    log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
    return
  fi

  # Build. `make -C` is used instead of `cd`, so the caller's working
  # directory is left untouched.
  export PATH=$PATH:/usr/local/go/bin
  make -C "$repo_dir" build

  # Copy binaries (all paths quoted so directories containing spaces work).
  mkdir -p -- "$bin_dir" "$config_dir"
  cp -- "$repo_dir"/bin/* "$bin_dir"/
  chmod +x -- "$bin_dir"/*

  # Copy the example config only on first install.
  if [[ -e "$config_file" ]]; then
    log_info "Keeping existing configuration: $config_file"
  else
    cp -- "$repo_dir/configs/config-local.yaml.example" "$config_file"
  fi

  # Set permissions
  chown -R -- "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"

  log_success "Fetch ML installed"
}

#######################################
# Run the complete server setup in order, then print follow-up instructions
# and the URLs at which the services are expected to listen.
#
# NOTE(review): the wrappers create_user, setup_log_rotation and
# optimize_system defined in this file are not invoked here; the underlying
# helpers (ensure_user, create_directories, setup_logrotate,
# hardening_steps) are called directly instead - confirm that is intended.
# Globals:   FETCH_ML_HOME (read)
# Arguments: none used
# Outputs:   progress via log_info / log_success and plain text on stdout
#######################################
main() {
  local -a setup_steps=(
    check_root
    check_ubuntu
    update_system
    install_go
    install_podman
    install_redis
    install_nvidia_drivers
    install_ml_tools
    ensure_user
    create_directories
    setup_firewall
    setup_systemd_services
    setup_logrotate
    hardening_steps
    install_fetch_ml
  )
  local step

  log_info "Starting Fetch ML Ubuntu server setup..."

  for step in "${setup_steps[@]}"; do
    "$step"
  done

  log_success "Fetch ML setup complete!"
  echo
  log_info "Next steps:"
  printf '%s\n' \
    "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml" \
    "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml" \
    "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager" \
    "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager" \
    "5. View logs: journalctl -u fetch_ml_worker -f"
  echo
  log_info "Services will be available at:"
  # The first address reported by `hostname -I` is used for both URLs.
  printf '%s\n' "- Worker API: http://$(hostname -I | awk '{print $1}'):8080"
  printf '%s\n' "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081"
}

# Script entry point: hand every command-line argument over to main.
main "$@"