fetch_ml/docs/_site/operations/index.html
Jeremie Fraeys 385d2cf386 docs: add comprehensive documentation with MkDocs site
- Add complete API documentation and architecture guides
- Include quick start, installation, and deployment guides
- Add troubleshooting and security documentation
- Include CLI reference and configuration schema docs
- Add production monitoring and operations guides
- Implement MkDocs configuration with search functionality
- Include comprehensive user and developer documentation

Provides complete documentation for users and developers
covering all aspects of the FetchML platform.
2025-12-04 16:54:57 -05:00

2105 lines
No EOL
47 KiB
HTML

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="description" content="Secure Machine Learning Platform">
<link rel="prev" href="../production-monitoring/">
<link rel="next" href="../redis-ha/">
<link rel="icon" href="../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.0">
<title>Operations Runbook - Fetch ML Documentation</title>
<link rel="stylesheet" href="../assets/stylesheets/main.618322db.min.css">
<link rel="stylesheet" href="../assets/stylesheets/palette.ab4e12ef.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="blue" data-md-color-accent="blue">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#operations-runbook" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href=".." title="Fetch ML Documentation" class="md-header__button md-logo" aria-label="Fetch ML Documentation" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
Fetch ML Documentation
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Operations Runbook
</span>
</div>
</div>
</div>
<form class="md-header__option" data-md-component="palette">
<input class="md-option" data-md-color-media="" data-md-color-scheme="default" data-md-color-primary="blue" data-md-color-accent="blue" aria-hidden="true" type="radio" name="__palette" id="__palette_0">
<input class="md-option" data-md-color-media="" data-md-color-scheme="slate" data-md-color-primary="blue" data-md-color-accent="blue" aria-hidden="true" type="radio" name="__palette" id="__palette_1">
</form>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</label>
<nav class="md-search__options" aria-label="Search">
<a href="javascript:void(0)" class="md-search__icon md-icon" title="Share" aria-label="Share" data-clipboard data-clipboard-text="" data-md-component="search-share" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M18 16.08c-.76 0-1.44.3-1.96.77L8.91 12.7c.05-.23.09-.46.09-.7s-.04-.47-.09-.7l7.05-4.11c.54.5 1.25.81 2.04.81a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3c0 .24.04.47.09.7L8.04 9.81C7.5 9.31 6.79 9 6 9a3 3 0 0 0-3 3 3 3 0 0 0 3 3c.79 0 1.5-.31 2.04-.81l7.12 4.15c-.05.21-.08.43-.08.66 0 1.61 1.31 2.91 2.92 2.91s2.92-1.3 2.92-2.91A2.92 2.92 0 0 0 18 16.08"/></svg>
</a>
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
</button>
</nav>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
Initializing search
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
<div class="md-header__source">
<a href="https://github.com/jfraeys/fetch_ml" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
GitHub
</div>
</a>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
<div class="md-grid">
<ul class="md-tabs__list">
<li class="md-tabs__item">
<a href=".." class="md-tabs__link">
Home
</a>
</li>
<li class="md-tabs__item">
<a href="../quick-start/" class="md-tabs__link">
Getting Started
</a>
</li>
<li class="md-tabs__item">
<a href="../development-setup/" class="md-tabs__link">
Development
</a>
</li>
<li class="md-tabs__item md-tabs__item--active">
<a href="../deployment/" class="md-tabs__link">
Operations & Production
</a>
</li>
<li class="md-tabs__item">
<a href="../security/" class="md-tabs__link">
Security
</a>
</li>
<li class="md-tabs__item">
<a href="../configuration-schema/" class="md-tabs__link">
Reference
</a>
</li>
</ul>
</div>
</nav>
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary md-nav--lifted md-nav--integrated" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href=".." title="Fetch ML Documentation" class="md-nav__button md-logo" aria-label="Fetch ML Documentation" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
Fetch ML Documentation
</label>
<div class="md-nav__source">
<a href="https://github.com/jfraeys/fetch_ml" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
GitHub
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href=".." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Getting Started
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Getting Started
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../quick-start/" class="md-nav__link">
<span class="md-ellipsis">
Quick Start
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../installation/" class="md-nav__link">
<span class="md-ellipsis">
Simple Installation Guide
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../first-experiment/" class="md-nav__link">
<span class="md-ellipsis">
First Experiment
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Development
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Development
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../development-setup/" class="md-nav__link">
<span class="md-ellipsis">
Development Setup
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../testing/" class="md-nav__link">
<span class="md-ellipsis">
Testing Guide
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../architecture/" class="md-nav__link">
<span class="md-ellipsis">
Homelab Architecture
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../cli-reference/" class="md-nav__link">
<span class="md-ellipsis">
CLI Reference
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../zig-cli/" class="md-nav__link">
<span class="md-ellipsis">
Zig CLI Guide
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../queue/" class="md-nav__link">
<span class="md-ellipsis">
Task Queue Architecture
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../smart-defaults/" class="md-nav__link">
<span class="md-ellipsis">
Smart Defaults
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../cicd/" class="md-nav__link">
<span class="md-ellipsis">
CI/CD Pipeline
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" checked>
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="">
<span class="md-ellipsis">
Operations & Production
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Operations & Production
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../deployment/" class="md-nav__link">
<span class="md-ellipsis">
ML Experiment Manager - Deployment Guide
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../environment-variables/" class="md-nav__link">
<span class="md-ellipsis">
Environment Variables
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../production-monitoring/" class="md-nav__link">
<span class="md-ellipsis">
Production Monitoring Deployment Guide (Linux)
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Operations Runbook
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Operations Runbook
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#task-queue-operations" class="md-nav__link">
<span class="md-ellipsis">
Task Queue Operations
</span>
</a>
<nav class="md-nav" aria-label="Task Queue Operations">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#monitoring-queue-health" class="md-nav__link">
<span class="md-ellipsis">
Monitoring Queue Health
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#handling-stuck-tasks" class="md-nav__link">
<span class="md-ellipsis">
Handling Stuck Tasks
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#dead-letter-queue-management" class="md-nav__link">
<span class="md-ellipsis">
Dead Letter Queue Management
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#worker-crashes" class="md-nav__link">
<span class="md-ellipsis">
Worker Crashes
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#worker-operations" class="md-nav__link">
<span class="md-ellipsis">
Worker Operations
</span>
</a>
<nav class="md-nav" aria-label="Worker Operations">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#graceful-shutdown" class="md-nav__link">
<span class="md-ellipsis">
Graceful Shutdown
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#force-shutdown" class="md-nav__link">
<span class="md-ellipsis">
Force Shutdown
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#worker-heartbeat-monitoring" class="md-nav__link">
<span class="md-ellipsis">
Worker Heartbeat Monitoring
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#redis-operations" class="md-nav__link">
<span class="md-ellipsis">
Redis Operations
</span>
</a>
<nav class="md-nav" aria-label="Redis Operations">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#backup" class="md-nav__link">
<span class="md-ellipsis">
Backup
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#restore" class="md-nav__link">
<span class="md-ellipsis">
Restore
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#memory-management" class="md-nav__link">
<span class="md-ellipsis">
Memory Management
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#common-issues" class="md-nav__link">
<span class="md-ellipsis">
Common Issues
</span>
</a>
<nav class="md-nav" aria-label="Common Issues">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#issue-queue-growing-unbounded" class="md-nav__link">
<span class="md-ellipsis">
Issue: Queue Growing Unbounded
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#issue-high-retry-rate" class="md-nav__link">
<span class="md-ellipsis">
Issue: High Retry Rate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#issue-leases-expiring-prematurely" class="md-nav__link">
<span class="md-ellipsis">
Issue: Leases Expiring Prematurely
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#performance-tuning" class="md-nav__link">
<span class="md-ellipsis">
Performance Tuning
</span>
</a>
<nav class="md-nav" aria-label="Performance Tuning">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#worker-concurrency" class="md-nav__link">
<span class="md-ellipsis">
Worker Concurrency
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#redis-configuration" class="md-nav__link">
<span class="md-ellipsis">
Redis Configuration
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#alerting-rules" class="md-nav__link">
<span class="md-ellipsis">
Alerting Rules
</span>
</a>
<nav class="md-nav" aria-label="Alerting Rules">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#critical-alerts" class="md-nav__link">
<span class="md-ellipsis">
Critical Alerts
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#warning-alerts" class="md-nav__link">
<span class="md-ellipsis">
Warning Alerts
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#health-checks" class="md-nav__link">
<span class="md-ellipsis">
Health Checks
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#runbook-checklist" class="md-nav__link">
<span class="md-ellipsis">
Runbook Checklist
</span>
</a>
<nav class="md-nav" aria-label="Runbook Checklist">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#daily-operations" class="md-nav__link">
<span class="md-ellipsis">
Daily Operations
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#weekly-operations" class="md-nav__link">
<span class="md-ellipsis">
Weekly Operations
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#monthly-operations" class="md-nav__link">
<span class="md-ellipsis">
Monthly Operations
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../redis-ha/" class="md-nav__link">
<span class="md-ellipsis">
Redis High Availability (Optional)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../release-checklist/" class="md-nav__link">
<span class="md-ellipsis">
Release Checklist
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_5" >
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
<span class="md-ellipsis">
Security
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5">
<span class="md-nav__icon md-icon"></span>
Security
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../security/" class="md-nav__link">
<span class="md-ellipsis">
Security Guide
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../api-key-process/" class="md-nav__link">
<span class="md-ellipsis">
FetchML API Key Process
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../user-permissions/" class="md-nav__link">
<span class="md-ellipsis">
User Permissions in Fetch ML
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_6" >
<label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
<span class="md-ellipsis">
Reference
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_6">
<span class="md-nav__icon md-icon"></span>
Reference
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../configuration-schema/" class="md-nav__link">
<span class="md-ellipsis">
Configuration Schema
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../troubleshooting/" class="md-nav__link">
<span class="md-ellipsis">
Troubleshooting
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="operations-runbook">Operations Runbook<a class="headerlink" href="#operations-runbook" title="Permanent link">&para;</a></h1>
<p>Operational guide for troubleshooting and maintaining the ML experiment system.</p>
<h2 id="task-queue-operations">Task Queue Operations<a class="headerlink" href="#task-queue-operations" title="Permanent link">&para;</a></h2>
<h3 id="monitoring-queue-health">Monitoring Queue Health<a class="headerlink" href="#monitoring-queue-health" title="Permanent link">&para;</a></h3>
<div class="highlight"><pre><span></span><code># Check queue depth
ZCARD task:queue
# List pending tasks
ZRANGE task:queue 0 -1 WITHSCORES
# Check dead letter queue
KEYS task:dlq:*
</code></pre></div>
<h3 id="handling-stuck-tasks">Handling Stuck Tasks<a class="headerlink" href="#handling-stuck-tasks" title="Permanent link">&para;</a></h3>
<p><strong>Symptom:</strong> Tasks stuck in "running" status</p>
<p><strong>Diagnosis:</strong></p>
<div class="highlight"><pre><span></span><code><span class="c1"># Check for expired leases</span>
redis-cli<span class="w"> </span>GET<span class="w"> </span>task:<span class="o">{</span>task-id<span class="o">}</span>
<span class="c1"># Look for a LeaseExpiry timestamp in the past</span>
</code></pre></div>
<p><strong>Remediation:</strong>
Tasks with expired leases are automatically reclaimed every minute. To force immediate reclamation:</p>
<div class="highlight"><pre><span></span><code><span class="c1"># Restart worker to trigger reclaim cycle</span>
systemctl<span class="w"> </span>restart<span class="w"> </span>ml-worker
</code></pre></div>
<h3 id="dead-letter-queue-management">Dead Letter Queue Management<a class="headerlink" href="#dead-letter-queue-management" title="Permanent link">&para;</a></h3>
<p><strong>View failed tasks:</strong></p>
<div class="highlight"><pre><span></span><code>KEYS task:dlq:*
</code></pre></div>
<p><strong>Inspect failed task:</strong></p>
<div class="highlight"><pre><span></span><code>GET task:dlq:{task-id}
</code></pre></div>
<p><strong>Retry from DLQ:</strong></p>
<div class="highlight"><pre><span></span><code><span class="c1"># Manual retry (requires custom script)</span>
<span class="c1"># 1. Get task from DLQ</span>
<span class="c1"># 2. Reset retry count</span>
<span class="c1"># 3. Re-queue task</span>
</code></pre></div>
<h3 id="worker-crashes">Worker Crashes<a class="headerlink" href="#worker-crashes" title="Permanent link">&para;</a></h3>
<p><strong>Symptom:</strong> Worker disappeared mid-task</p>
<p><strong>What Happens:</strong></p>
<ol>
<li>Lease expires after 30 minutes (default)</li>
<li>Background reclaim job detects expired lease</li>
<li>Task is retried (up to 3 attempts)</li>
<li>After max retries → Dead Letter Queue</li>
</ol>
<p><strong>Prevention:</strong></p>
<ul>
<li>Monitor worker heartbeats</li>
<li>Set up alerts for worker down</li>
<li>Use a process manager (systemd, supervisor)</li>
</ul>
<h2 id="worker-operations">Worker Operations<a class="headerlink" href="#worker-operations" title="Permanent link">&para;</a></h2>
<h3 id="graceful-shutdown">Graceful Shutdown<a class="headerlink" href="#graceful-shutdown" title="Permanent link">&para;</a></h3>
<div class="highlight"><pre><span></span><code><span class="c1"># Send SIGTERM for graceful shutdown</span>
<span class="nb">kill</span><span class="w"> </span>-TERM<span class="w"> </span><span class="k">$(</span>pgrep<span class="w"> </span>ml-worker<span class="k">)</span>
<span class="c1"># Worker will:</span>
<span class="c1"># 1. Stop accepting new tasks</span>
<span class="c1"># 2. Finish active tasks (up to 5min timeout)</span>
<span class="c1"># 3. Release all leases</span>
<span class="c1"># 4. Exit cleanly</span>
</code></pre></div>
<h3 id="force-shutdown">Force Shutdown<a class="headerlink" href="#force-shutdown" title="Permanent link">&para;</a></h3>
<div class="highlight"><pre><span></span><code><span class="c1"># Force kill (leases will be reclaimed automatically)</span>
<span class="nb">kill</span><span class="w"> </span>-9<span class="w"> </span><span class="k">$(</span>pgrep<span class="w"> </span>ml-worker<span class="k">)</span>
</code></pre></div>
<h3 id="worker-heartbeat-monitoring">Worker Heartbeat Monitoring<a class="headerlink" href="#worker-heartbeat-monitoring" title="Permanent link">&para;</a></h3>
<div class="highlight"><pre><span></span><code># Check worker heartbeats
HGETALL worker:heartbeat
# Example output:
# worker-abc123 1701234567
# worker-def456 1701234580
</code></pre></div>
<p><strong>Alert if:</strong> Heartbeat timestamp &gt; 5 minutes old</p>
<h2 id="redis-operations">Redis Operations<a class="headerlink" href="#redis-operations" title="Permanent link">&para;</a></h2>
<h3 id="backup">Backup<a class="headerlink" href="#backup" title="Permanent link">&para;</a></h3>
<div class="highlight"><pre><span></span><code><span class="c1"># Manual backup</span>
redis-cli<span class="w"> </span>SAVE
cp<span class="w"> </span>/var/lib/redis/dump.rdb<span class="w"> </span>/backup/redis-<span class="k">$(</span>date<span class="w"> </span>+%Y%m%d<span class="k">)</span>.rdb
</code></pre></div>
<h3 id="restore">Restore<a class="headerlink" href="#restore" title="Permanent link">&para;</a></h3>
<div class="highlight"><pre><span></span><code><span class="c1"># Stop Redis</span>
systemctl<span class="w"> </span>stop<span class="w"> </span>redis
<span class="c1"># Restore snapshot</span>
cp<span class="w"> </span>/backup/redis-20231201.rdb<span class="w"> </span>/var/lib/redis/dump.rdb
<span class="c1"># Start Redis</span>
systemctl<span class="w"> </span>start<span class="w"> </span>redis
</code></pre></div>
<h3 id="memory-management">Memory Management<a class="headerlink" href="#memory-management" title="Permanent link">&para;</a></h3>
<div class="highlight"><pre><span></span><code># Check memory usage
INFO memory
# Last resort only: FLUSHDB does not evict old data, it wipes the database
FLUSHDB # DANGER: Deletes every key in the current database, including queued tasks!
</code></pre></div>
<h2 id="common-issues">Common Issues<a class="headerlink" href="#common-issues" title="Permanent link">&para;</a></h2>
<h3 id="issue-queue-growing-unbounded">Issue: Queue Growing Unbounded<a class="headerlink" href="#issue-queue-growing-unbounded" title="Permanent link">&para;</a></h3>
<p><strong>Symptoms:</strong></p>
<ul>
<li><code>ZCARD task:queue</code> keeps increasing</li>
<li>No workers processing tasks</li>
</ul>
<p><strong>Diagnosis:</strong></p>
<div class="highlight"><pre><span></span><code><span class="c1"># Check worker status</span>
systemctl<span class="w"> </span>status<span class="w"> </span>ml-worker
<span class="c1"># Check logs</span>
journalctl<span class="w"> </span>-u<span class="w"> </span>ml-worker<span class="w"> </span>-n<span class="w"> </span><span class="m">100</span>
</code></pre></div>
<p><strong>Resolution:</strong></p>
<ol>
<li>Verify workers are running</li>
<li>Check Redis connectivity</li>
<li>Verify lease configuration</li>
</ol>
<h3 id="issue-high-retry-rate">Issue: High Retry Rate<a class="headerlink" href="#issue-high-retry-rate" title="Permanent link">&para;</a></h3>
<p><strong>Symptoms:</strong></p>
<ul>
<li>Many tasks in DLQ</li>
<li><code>retry_count</code> field high on tasks</li>
</ul>
<p><strong>Diagnosis:</strong></p>
<div class="highlight"><pre><span></span><code><span class="c1"># Check worker logs for errors</span>
journalctl<span class="w"> </span>-u<span class="w"> </span>ml-worker<span class="w"> </span><span class="p">|</span><span class="w"> </span>grep<span class="w"> </span><span class="s2">&quot;retry&quot;</span>
<span class="c1"># Look for patterns (network issues, resource limits, etc)</span>
</code></pre></div>
<p><strong>Resolution:</strong></p>
<ul>
<li>Fix the underlying issue (network, resources, etc.)</li>
<li>Adjust retry limits if the failures are permanent</li>
<li>Increase task timeout if jobs are slow</li>
</ul>
<h3 id="issue-leases-expiring-prematurely">Issue: Leases Expiring Prematurely<a class="headerlink" href="#issue-leases-expiring-prematurely" title="Permanent link">&para;</a></h3>
<p><strong>Symptoms:</strong></p>
<ul>
<li>Tasks retried even though worker is healthy</li>
<li>Logs show "lease expired" frequently</li>
</ul>
<p><strong>Diagnosis:</strong></p>
<div class="highlight"><pre><span></span><code><span class="c1"># Check worker config</span>
<span class="l l-Scalar l-Scalar-Plain">cat configs/worker-config.yaml | grep -A3 &quot;lease&quot;</span>
<span class="l l-Scalar l-Scalar-Plain">task_lease_duration</span><span class="p p-Indicator">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">30m</span><span class="w"> </span><span class="c1"># Too short?</span>
<span class="nt">heartbeat_interval</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">1m</span><span class="w"> </span><span class="c1"># Too infrequent?</span>
</code></pre></div>
<p><strong>Resolution:</strong></p>
<div class="highlight"><pre><span></span><code><span class="c1"># Increase lease duration for long-running jobs</span>
<span class="nt">task_lease_duration</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">60m</span>
<span class="nt">heartbeat_interval</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">30s</span><span class="w"> </span><span class="c1"># More frequent heartbeats</span>
</code></pre></div>
<h2 id="performance-tuning">Performance Tuning<a class="headerlink" href="#performance-tuning" title="Permanent link">&para;</a></h2>
<h3 id="worker-concurrency">Worker Concurrency<a class="headerlink" href="#worker-concurrency" title="Permanent link">&para;</a></h3>
<div class="highlight"><pre><span></span><code><span class="c1"># worker-config.yaml</span>
<span class="nt">max_workers</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">4</span><span class="w"> </span><span class="c1"># Number of parallel tasks</span>
<span class="c1"># Adjust based on:</span>
<span class="c1"># - CPU cores available</span>
<span class="c1"># - Memory per task</span>
<span class="c1"># - GPU availability</span>
</code></pre></div>
<h3 id="redis-configuration">Redis Configuration<a class="headerlink" href="#redis-configuration" title="Permanent link">&para;</a></h3>
<div class="highlight"><pre><span></span><code># /etc/redis/redis.conf
# Persistence
save 900 1
save 300 10
# Memory
maxmemory 2gb
maxmemory-policy noeviction
# Performance
tcp-keepalive 300
timeout 0
</code></pre></div>
<h2 id="alerting-rules">Alerting Rules<a class="headerlink" href="#alerting-rules" title="Permanent link">&para;</a></h2>
<h3 id="critical-alerts">Critical Alerts<a class="headerlink" href="#critical-alerts" title="Permanent link">&para;</a></h3>
<ol>
<li><strong>Worker Down</strong> (no heartbeat &gt; 5min)</li>
<li><strong>Queue Depth</strong> &gt; 1000 tasks</li>
<li><strong>DLQ Growth</strong> &gt; 100 tasks/hour</li>
<li><strong>Redis Down</strong> (connection failed)</li>
</ol>
<h3 id="warning-alerts">Warning Alerts<a class="headerlink" href="#warning-alerts" title="Permanent link">&para;</a></h3>
<ol>
<li><strong>High Retry Rate</strong> &gt; 10% of tasks</li>
<li><strong>Slow Queue Drain</strong> (depth increasing over 1 hour)</li>
<li><strong>Worker Memory</strong> &gt; 80% usage</li>
</ol>
<h2 id="health-checks">Health Checks<a class="headerlink" href="#health-checks" title="Permanent link">&para;</a></h2>
<div class="highlight"><pre><span></span><code><span class="ch">#!/bin/bash</span>
<span class="c1"># health-check.sh</span>
<span class="c1"># Check Redis</span>
redis-cli<span class="w"> </span>PING<span class="w"> </span><span class="o">||</span><span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;Redis DOWN&quot;</span>
<span class="c1"># Check worker heartbeat</span>
<span class="nv">WORKER_ID</span><span class="o">=</span><span class="k">$(</span>cat<span class="w"> </span>/var/run/ml-worker.pid<span class="k">)</span>
<span class="nv">LAST_HB</span><span class="o">=</span><span class="k">$(</span>redis-cli<span class="w"> </span>HGET<span class="w"> </span>worker:heartbeat<span class="w"> </span><span class="s2">&quot;</span><span class="nv">$WORKER_ID</span><span class="s2">&quot;</span><span class="k">)</span>
<span class="nv">NOW</span><span class="o">=</span><span class="k">$(</span>date<span class="w"> </span>+%s<span class="k">)</span>
<span class="k">if</span><span class="w"> </span><span class="o">[</span><span class="w"> </span><span class="k">$((</span><span class="nv">NOW</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="nv">LAST_HB</span><span class="k">))</span><span class="w"> </span>-gt<span class="w"> </span><span class="m">300</span><span class="w"> </span><span class="o">]</span><span class="p">;</span><span class="w"> </span><span class="k">then</span>
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;Worker heartbeat stale&quot;</span>
<span class="k">fi</span>
<span class="c1"># Check queue depth</span>
<span class="nv">DEPTH</span><span class="o">=</span><span class="k">$(</span>redis-cli<span class="w"> </span>ZCARD<span class="w"> </span>task:queue<span class="k">)</span>
<span class="k">if</span><span class="w"> </span><span class="o">[</span><span class="w"> </span><span class="s2">&quot;</span><span class="nv">$DEPTH</span><span class="s2">&quot;</span><span class="w"> </span>-gt<span class="w"> </span><span class="m">1000</span><span class="w"> </span><span class="o">]</span><span class="p">;</span><span class="w"> </span><span class="k">then</span>
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;Queue depth critical: </span><span class="nv">$DEPTH</span><span class="s2">&quot;</span>
<span class="k">fi</span>
</code></pre></div>
<h2 id="runbook-checklist">Runbook Checklist<a class="headerlink" href="#runbook-checklist" title="Permanent link">&para;</a></h2>
<h3 id="daily-operations">Daily Operations<a class="headerlink" href="#daily-operations" title="Permanent link">&para;</a></h3>
<ol>
<li>Check queue depth</li>
<li>Verify worker heartbeats</li>
<li>Review DLQ for patterns</li>
<li>Check Redis memory usage</li>
</ol>
<h3 id="weekly-operations">Weekly Operations<a class="headerlink" href="#weekly-operations" title="Permanent link">&para;</a></h3>
<ol>
<li>Review retry rates</li>
<li>Analyze failed task patterns</li>
<li>Backup Redis snapshot</li>
<li>Review worker logs</li>
</ol>
<h3 id="monthly-operations">Monthly Operations<a class="headerlink" href="#monthly-operations" title="Permanent link">&para;</a></h3>
<ol>
<li>Performance tuning review</li>
<li>Capacity planning</li>
<li>Update documentation</li>
<li>Test disaster recovery</li>
</ol>
<hr />
<p><strong>For homelab setups:</strong>
Most of these operations can be simplified. Focus on:</p>
<ul>
<li>Basic monitoring (queue depth, worker status)</li>
<li>Periodic Redis backups</li>
<li>Graceful shutdowns for maintenance</li>
</ul>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "..", "features": ["navigation.instant", "navigation.tracking", "navigation.tabs", "navigation.sections", "navigation.expand", "navigation.indexes", "toc.integrate", "search.highlight", "search.share"], "search": "../assets/javascripts/workers/search.7a47a382.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../assets/javascripts/bundle.e71a0d61.min.js"></script>
</body>
</html>