- Reorganize configs into environments/, workers/, deprecated/ folders
- Reorganize scripts into testing/, deployment/, maintenance/, benchmarks/ folders
- Add comprehensive testing guide documentation
- Add new Makefile targets: test-full, test-auth, test-status
- Update script paths in Makefile to match new organization
- Create testing protocol documentation
- Add cleanup status checking functionality

Testing framework now includes:
- Quick authentication tests (make test-auth)
- Full test suite runner (make test-full)
- Cleanup status monitoring (make test-status)
- Comprehensive documentation and troubleshooting guides
229 lines
6.5 KiB
Bash
Executable file
229 lines
6.5 KiB
Bash
Executable file
#!/bin/bash
# Production Setup Script for Rocky Linux (Bare Metal)
#
# Sets up the complete FetchML environment on a bare-metal host.

# Stop at the first command that exits non-zero.
set -e

# ANSI escape sequences used to decorate progress output. They are stored
# as literal backslash escapes and only interpreted at print time.
NC='\033[0m'
BOLD='\033[1m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'

# Banner, followed by one empty line.
printf '%b\n' "${BOLD}=== FetchML Production Setup (Rocky Linux Bare Metal) ===${NC}\n"

# Configuration
#
# Optional positional arguments:
#   $1  base directory for experiment data (default: /data/ml-experiments)
#   $2  account that owns the data and runs the services (default: ml-user)
#   $3  group that owns the data and runs the services (default: ml-group)
BASE_PATH="${1:-/data/ml-experiments}"
ML_USER="${2:-ml-user}"
ML_GROUP="${3:-ml-group}"

# BASE_PATH is later handed to `chown -R` and written into the systemd
# ReadWritePaths= setting, which is a space-separated list of absolute
# paths. Reject values that would break either of those uses.
case "$BASE_PATH" in
  /)
    echo "Error: refusing to use / as the base path" >&2
    exit 1
    ;;
  *[[:space:]]*)
    echo "Error: base path must not contain whitespace: '$BASE_PATH'" >&2
    exit 1
    ;;
  /*) ;;
  *)
    echo "Error: base path must be an absolute path: '$BASE_PATH'" >&2
    exit 1
    ;;
esac

echo "Configuration:"
echo " Base path: $BASE_PATH"
echo " ML user: $ML_USER"
echo " ML group: $ML_GROUP"
echo ""

# 1. Create system group and user if they don't exist
echo -e "${BLUE}[1/8]${NC} Creating system user..."

# The group has to exist before ownership is assigned to
# $ML_USER:$ML_GROUP in step 3. By default useradd creates at most a group
# named after the user, never one named $ML_GROUP.
if getent group "$ML_GROUP" &>/dev/null; then
  echo " Group $ML_GROUP already exists"
else
  sudo groupadd -r "$ML_GROUP"
  echo -e "${GREEN}✓${NC} Created group: $ML_GROUP"
fi

if id "$ML_USER" &>/dev/null; then
  echo " User $ML_USER already exists"
  # A pre-existing account may not be in the group yet; -a keeps its
  # other supplementary groups intact.
  sudo usermod -a -G "$ML_GROUP" "$ML_USER"
else
  sudo useradd -r -s /bin/bash -m -d "/home/$ML_USER" -g "$ML_GROUP" \
    -c "ML System User" "$ML_USER"
  echo -e "${GREEN}✓${NC} Created user: $ML_USER"
fi

# 2. Create directory structure
echo -e "${BLUE}[2/8]${NC} Creating directory structure..."

# Per-state data folders under the base path, plus the fixed system-wide
# log and configuration directories.
data_subdirs=(experiments pending running finished failed datasets)
system_dirs=(/var/log/fetch_ml /etc/fetch_ml)

for subdir in "${data_subdirs[@]}"; do
  sudo mkdir -p "${BASE_PATH}/${subdir}"
done
for dir in "${system_dirs[@]}"; do
  sudo mkdir -p "$dir"
done

echo -e "${GREEN}✓${NC} Created directories:"
for subdir in "${data_subdirs[@]}"; do
  echo " $BASE_PATH/${subdir}/"
done
for dir in "${system_dirs[@]}"; do
  echo " ${dir}/"
done

# 3. Set ownership and permissions
echo -e "${BLUE}[3/8]${NC} Setting permissions..."

# Every expansion is quoted so a path or name containing whitespace or
# glob characters cannot be split into several chown/chmod arguments.
owner="${ML_USER}:${ML_GROUP}"

sudo chown -R -- "$owner" "$BASE_PATH"
sudo chmod 755 -- "$BASE_PATH"
sudo chmod 700 -- "$BASE_PATH/experiments" # Restrict experiment data

sudo chown -R -- "$owner" /var/log/fetch_ml
sudo chmod 755 /var/log/fetch_ml

echo -e "${GREEN}✓${NC} Permissions set"

# 4. Install system dependencies (Rocky Linux)
echo -e "${BLUE}[4/8]${NC} Installing system dependencies..."

# `dnf install` already exits 0 when the packages are installed, so a
# non-zero status is a real failure (unknown package, no repository
# access, ...). Report it instead of carrying on and announcing success.
if ! sudo dnf install -y \
  golang \
  podman \
  redis \
  git \
  make \
  gcc; then
  echo "Error: failed to install system dependencies" >&2
  exit 1
fi

echo -e "${GREEN}✓${NC} Dependencies installed"

# 5. Configure Podman for GPU access (if NVIDIA GPU present)
echo -e "${BLUE}[5/8]${NC} Configuring Podman..."
if ! command -v lspci &>/dev/null; then
  # Without lspci the detection pipeline below would fail and be reported
  # as "no GPU"; say what actually happened instead.
  echo " Warning: lspci not found (install pciutils); cannot detect GPUs, skipping GPU setup" >&2
elif lspci | grep -i nvidia &>/dev/null; then
  echo " NVIDIA GPU detected, configuring GPU access..."

  # Install nvidia-container-toolkit if not present. Probe for nvidia-ctk
  # because that is the binary invoked below.
  if ! command -v nvidia-ctk &>/dev/null; then
    echo " Installing nvidia-container-toolkit..."
    sudo dnf config-manager --add-repo \
      https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
    sudo dnf install -y nvidia-container-toolkit
  fi

  # Configure Podman CDI; make sure the output directory exists first.
  sudo mkdir -p /etc/cdi
  sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
  echo -e "${GREEN}✓${NC} GPU support configured"
else
  echo " No NVIDIA GPU detected, skipping GPU setup"
fi

# 6. Configure Redis
echo -e "${BLUE}[6/8]${NC} Configuring Redis..."
sudo systemctl enable redis
# Starting an already-running unit succeeds, so a failure here is a real
# problem. Stay best-effort, but report it accurately on stderr.
sudo systemctl start redis ||
  echo " Warning: could not start redis; check 'systemctl status redis'" >&2

# Locate the Redis configuration file. The path differs between redis
# package versions, and appending to a path that does not exist would
# either abort the script or leave a stray file that Redis never reads.
redis_conf=""
for candidate in /etc/redis/redis.conf /etc/redis.conf; do
  # Checked via sudo because the directory may not be readable by the
  # invoking user.
  if sudo test -f "$candidate"; then
    redis_conf="$candidate"
    break
  fi
done
if [[ -z "$redis_conf" ]]; then
  echo "Error: could not find redis.conf in /etc/redis/ or /etc/" >&2
  exit 1
fi

# Set Redis password if not already configured
if ! sudo grep -q "^requirepass" "$redis_conf"; then
  REDIS_PASSWORD=$(openssl rand -base64 32)
  echo "requirepass $REDIS_PASSWORD" | sudo tee -a "$redis_conf" >/dev/null
  sudo systemctl restart redis
  # NOTE: the password is shown on the terminal on purpose so the operator
  # can copy it; avoid capturing this script's output in shared logs.
  echo " Generated Redis password: $REDIS_PASSWORD"
  echo " Save this password for your configuration!"
else
  echo " Redis password already configured"
fi

echo -e "${GREEN}✓${NC} Redis configured"

# 7. Setup systemd services
echo -e "${BLUE}[7/8]${NC} Creating systemd services..."

# Both units use WorkingDirectory=/opt/fetch_ml; systemd refuses to start
# a service whose working directory is missing, so make sure it exists.
sudo mkdir -p /opt/fetch_ml

#######################################
# Write one FetchML systemd unit file.
# Globals:   ML_USER, ML_GROUP, BASE_PATH (read)
# Arguments: $1 - unit name without the .service suffix
#            $2 - value for Description=
#            $3 - value for After=
#            $4 - value for ExecStart=
#            $5 - log file stem under /var/log/fetch_ml
# Outputs:   /etc/systemd/system/<unit name>.service (written via sudo)
#######################################
write_fetchml_unit() {
  local unit_name=$1
  local description=$2
  local after=$3
  local exec_start=$4
  local log_stem=$5

  sudo tee "/etc/systemd/system/${unit_name}.service" >/dev/null <<EOF
[Unit]
Description=${description}
After=${after}
Wants=redis.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
WorkingDirectory=/opt/fetch_ml
ExecStart=${exec_start}
Restart=always
RestartSec=10
StandardOutput=append:/var/log/fetch_ml/${log_stem}.log
StandardError=append:/var/log/fetch_ml/${log_stem}-error.log

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$BASE_PATH /var/log/fetch_ml

[Install]
WantedBy=multi-user.target
EOF
}

# API Server service
write_fetchml_unit \
  "fetchml-api" \
  "FetchML API Server" \
  "network.target redis.service" \
  "/usr/local/bin/fetchml-api -config /etc/fetch_ml/config.yaml" \
  "api"

# Worker service (ordered after the API server)
write_fetchml_unit \
  "fetchml-worker" \
  "FetchML Worker" \
  "network.target redis.service fetchml-api.service" \
  "/usr/local/bin/fetchml-worker -config /etc/fetch_ml/worker.toml" \
  "worker"

# Make systemd pick up the new unit files.
sudo systemctl daemon-reload
echo -e "${GREEN}✓${NC} Systemd services created"

# 8. Setup logrotate
echo -e "${BLUE}[8/8]${NC} Configuring log rotation..."

# The fetchml units log through StandardOutput=/StandardError=append: and
# define no ExecReload=, so `systemctl reload` cannot make them reopen
# their log files after a rename-based rotation. copytruncate rotates the
# files in place instead, so the services keep writing to the live log.
# (The `create` directive has no effect with copytruncate and is omitted.)
sudo tee /etc/logrotate.d/fetchml >/dev/null <<'EOF'
/var/log/fetch_ml/*.log {
    daily
    rotate 14
    compress
    delaycompress
    notifempty
    missingok
    copytruncate
}
EOF

echo -e "${GREEN}✓${NC} Log rotation configured"

# Summary: report where things were created, then list the manual
# follow-up steps for the operator.
printf '\n%b\n\n' "${BOLD}=== Setup Complete! ===${NC}"

cat <<EOF
Directory structure created at: $BASE_PATH
Logs will be written to: /var/log/fetch_ml/
Configuration directory: /etc/fetch_ml/

EOF

printf '%b\n' "${BOLD}Next steps:${NC}"

# Quoted delimiter: the instructions are printed literally.
cat <<'EOF'
1. Copy your config files:
 sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml
 sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml

2. Build and install binaries:
 make build
 sudo cp bin/api-server /usr/local/bin/fetchml-api
 sudo cp bin/worker /usr/local/bin/fetchml-worker

3. Update config files with your settings (Redis password, API keys, etc.)

4. Start services:
 sudo systemctl start fetchml-api
 sudo systemctl start fetchml-worker

5. Enable services to start on boot:
 sudo systemctl enable fetchml-api
 sudo systemctl enable fetchml-worker

6. Check status:
 sudo systemctl status fetchml-api
 sudo systemctl status fetchml-worker
 sudo journalctl -u fetchml-api -f

EOF