- Add complete API documentation and architecture guides - Include quick start, installation, and deployment guides - Add troubleshooting and security documentation - Include CLI reference and configuration schema docs - Add production monitoring and operations guides - Implement MkDocs configuration with search functionality - Include comprehensive user and developer documentation Provides complete documentation for users and developers covering all aspects of the FetchML platform.
2105 lines
No EOL
47 KiB
HTML
2105 lines
No EOL
47 KiB
HTML
|
|
<!doctype html>
|
|
<html lang="en" class="no-js">
|
|
<head>
|
|
|
|
<meta charset="utf-8">
|
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
|
|
|
<meta name="description" content="Secure Machine Learning Platform">
|
|
|
|
|
|
|
|
|
|
<link rel="prev" href="../production-monitoring/">
|
|
|
|
|
|
<link rel="next" href="../redis-ha/">
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="icon" href="../assets/images/favicon.png">
|
|
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.0">
|
|
|
|
|
|
|
|
<title>Operations Runbook - Fetch ML Documentation</title>
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../assets/stylesheets/main.618322db.min.css">
|
|
|
|
|
|
<link rel="stylesheet" href="../assets/stylesheets/palette.ab4e12ef.min.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
|
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
|
|
|
|
|
|
|
<script>
  /* URL obtained by resolving ".." against the current location; its pathname prefixes every storage key below. */
  __md_scope = new URL("..", location);

  /* Fold a string into an integer: for each character, acc = (acc << 5) - acc + charCode, starting from 0. */
  __md_hash = text => [...text].reduce((acc, ch) => (acc << 5) - acc + ch.charCodeAt(0), 0);

  /* Read the entry stored under "<scope pathname>.<key>" and JSON-decode it. */
  __md_get = (key, storage = localStorage, scope = __md_scope) =>
    JSON.parse(storage.getItem(scope.pathname + "." + key));

  /* JSON-encode a value and store it under "<scope pathname>.<key>"; any error thrown while storing is ignored. */
  __md_set = (key, value, storage = localStorage, scope = __md_scope) => {
    try {
      storage.setItem(scope.pathname + "." + key, JSON.stringify(value));
    } catch (err) {}
  };
</script>
|
|
|
|
|
|
|
|
|
|
|
|
</head>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="blue" data-md-color-accent="blue">
|
|
|
|
|
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
|
<label class="md-overlay" for="__drawer"></label>
|
|
<div data-md-component="skip">
|
|
|
|
|
|
<a href="#operations-runbook" class="md-skip">
|
|
Skip to content
|
|
</a>
|
|
|
|
</div>
|
|
<div data-md-component="announce">
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<header class="md-header" data-md-component="header">
|
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
|
<a href=".." title="Fetch ML Documentation" class="md-header__button md-logo" aria-label="Fetch ML Documentation" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
|
|
|
</a>
|
|
<label class="md-header__button md-icon" for="__drawer">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
|
</label>
|
|
<div class="md-header__title" data-md-component="header-title">
|
|
<div class="md-header__ellipsis">
|
|
<div class="md-header__topic">
|
|
<span class="md-ellipsis">
|
|
Fetch ML Documentation
|
|
</span>
|
|
</div>
|
|
<div class="md-header__topic" data-md-component="header-topic">
|
|
<span class="md-ellipsis">
|
|
|
|
Operations Runbook
|
|
|
|
</span>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<form class="md-header__option" data-md-component="palette">
|
|
|
|
|
|
|
|
|
|
<input class="md-option" data-md-color-media="" data-md-color-scheme="default" data-md-color-primary="blue" data-md-color-accent="blue" aria-hidden="true" type="radio" name="__palette" id="__palette_0">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-option" data-md-color-media="" data-md-color-scheme="slate" data-md-color-primary="blue" data-md-color-accent="blue" aria-hidden="true" type="radio" name="__palette" id="__palette_1">
|
|
|
|
|
|
</form>
|
|
|
|
|
|
|
|
<script>
  /* Re-apply the colour palette stored under the "__palette" key to <body>. */
  var palette = __md_get("__palette");
  if (palette && palette.color) {
    /* A stored "(prefers-color-scheme)" media value is replaced by the attributes of the
       palette input whose media query matches the current light/dark preference. */
    if ("(prefers-color-scheme)" === palette.color.media) {
      var media = matchMedia("(prefers-color-scheme: light)"),
          input;
      if (media.matches) {
        input = document.querySelector("[data-md-color-media='(prefers-color-scheme: light)']");
      } else {
        input = document.querySelector("[data-md-color-media='(prefers-color-scheme: dark)']");
      }
      palette.color.media = input.getAttribute("data-md-color-media");
      palette.color.scheme = input.getAttribute("data-md-color-scheme");
      palette.color.primary = input.getAttribute("data-md-color-primary");
      palette.color.accent = input.getAttribute("data-md-color-accent");
    }
    /* Copy every palette.color entry onto <body> as a data-md-color-* attribute. */
    for (var [key, value] of Object.entries(palette.color)) {
      document.body.setAttribute("data-md-color-" + key, value);
    }
  }
</script>
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-header__button md-icon" for="__search">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
|
|
</label>
|
|
<div class="md-search" data-md-component="search" role="dialog">
|
|
<label class="md-search__overlay" for="__search"></label>
|
|
<div class="md-search__inner" role="search">
|
|
<form class="md-search__form" name="search">
|
|
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
|
<label class="md-search__icon md-icon" for="__search">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
|
|
</label>
|
|
<nav class="md-search__options" aria-label="Search">
|
|
|
|
<a href="javascript:void(0)" class="md-search__icon md-icon" title="Share" aria-label="Share" data-clipboard data-clipboard-text="" data-md-component="search-share" tabindex="-1">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M18 16.08c-.76 0-1.44.3-1.96.77L8.91 12.7c.05-.23.09-.46.09-.7s-.04-.47-.09-.7l7.05-4.11c.54.5 1.25.81 2.04.81a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3c0 .24.04.47.09.7L8.04 9.81C7.5 9.31 6.79 9 6 9a3 3 0 0 0-3 3 3 3 0 0 0 3 3c.79 0 1.5-.31 2.04-.81l7.12 4.15c-.05.21-.08.43-.08.66 0 1.61 1.31 2.91 2.92 2.91s2.92-1.3 2.92-2.91A2.92 2.92 0 0 0 18 16.08"/></svg>
|
|
</a>
|
|
|
|
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
|
|
</button>
|
|
</nav>
|
|
|
|
</form>
|
|
<div class="md-search__output">
|
|
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
|
|
<div class="md-search-result" data-md-component="search-result">
|
|
<div class="md-search-result__meta">
|
|
Initializing search
|
|
</div>
|
|
<ol class="md-search-result__list" role="presentation"></ol>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<div class="md-header__source">
|
|
<a href="https://github.com/jfraeys/fetch_ml" title="Go to repository" class="md-source" data-md-component="source">
|
|
<div class="md-source__icon md-icon">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
|
|
</div>
|
|
<div class="md-source__repository">
|
|
GitHub
|
|
</div>
|
|
</a>
|
|
</div>
|
|
|
|
</nav>
|
|
|
|
</header>
|
|
|
|
<div class="md-container" data-md-component="container">
|
|
|
|
|
|
|
|
|
|
|
|
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
|
|
<div class="md-grid">
|
|
<ul class="md-tabs__list">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href=".." class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Home
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../quick-start/" class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Getting Started
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../development-setup/" class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Development
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item md-tabs__item--active">
|
|
<a href="../deployment/" class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Operations & Production
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../security/" class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Security
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../configuration-schema/" class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Reference
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</div>
|
|
</nav>
|
|
|
|
|
|
|
|
<main class="md-main" data-md-component="main">
|
|
<div class="md-main__inner md-grid">
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--primary md-nav--lifted md-nav--integrated" aria-label="Navigation" data-md-level="0">
|
|
<label class="md-nav__title" for="__drawer">
|
|
<a href=".." title="Fetch ML Documentation" class="md-nav__button md-logo" aria-label="Fetch ML Documentation" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
|
|
|
</a>
|
|
Fetch ML Documentation
|
|
</label>
|
|
|
|
<div class="md-nav__source">
|
|
<a href="https://github.com/jfraeys/fetch_ml" title="Go to repository" class="md-source" data-md-component="source">
|
|
<div class="md-source__icon md-icon">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
|
|
</div>
|
|
<div class="md-source__repository">
|
|
GitHub
|
|
</div>
|
|
</a>
|
|
</div>
|
|
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href=".." class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Home
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_2" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Getting Started
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_2">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Getting Started
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../quick-start/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Quick Start
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../installation/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Simple Installation Guide
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../first-experiment/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
First Experiment
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_3" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Development
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_3">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Development
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../development-setup/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Development Setup
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../testing/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Testing Guide
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../architecture/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Homelab Architecture
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../cli-reference/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
CLI Reference
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../zig-cli/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Zig CLI Guide
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../queue/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Task Queue Architecture
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../smart-defaults/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Smart Defaults
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../cicd/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
CI/CD Pipeline
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" checked>
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_4" id="__nav_4_label">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Operations & Production
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="true">
|
|
<label class="md-nav__title" for="__nav_4">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Operations & Production
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../deployment/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
ML Experiment Manager - Deployment Guide
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../environment-variables/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Environment Variables
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../production-monitoring/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Production Monitoring Deployment Guide (Linux)
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--active">
|
|
|
|
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Operations Runbook
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<a href="./" class="md-nav__link md-nav__link--active">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Operations Runbook
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__title" for="__toc">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Table of contents
|
|
</label>
|
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#task-queue-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Task Queue Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Task Queue Operations">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#monitoring-queue-health" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Monitoring Queue Health
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#handling-stuck-tasks" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Handling Stuck Tasks
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#dead-letter-queue-management" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Dead Letter Queue Management
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#worker-crashes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Worker Crashes
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#worker-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Worker Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Worker Operations">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#graceful-shutdown" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Graceful Shutdown
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#force-shutdown" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Force Shutdown
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#worker-heartbeat-monitoring" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Worker Heartbeat Monitoring
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#redis-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Redis Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Redis Operations">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#backup" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Backup
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#restore" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Restore
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#memory-management" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Memory Management
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#common-issues" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Common Issues
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Common Issues">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#issue-queue-growing-unbounded" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Issue: Queue Growing Unbounded
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#issue-high-retry-rate" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Issue: High Retry Rate
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#issue-leases-expiring-prematurely" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Issue: Leases Expiring Prematurely
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#performance-tuning" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Performance Tuning
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Performance Tuning">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#worker-concurrency" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Worker Concurrency
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#redis-configuration" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Redis Configuration
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#alerting-rules" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Alerting Rules
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Alerting Rules">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#critical-alerts" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Critical Alerts
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#warning-alerts" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Warning Alerts
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#health-checks" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Health Checks
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#runbook-checklist" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Runbook Checklist
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Runbook Checklist">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#daily-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Daily Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#weekly-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Weekly Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#monthly-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Monthly Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../redis-ha/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Redis High Availability (Optional)
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../release-checklist/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Release Checklist
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_5" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Security
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_5">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Security
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../security/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Security Guide
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../api-key-process/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
FetchML API Key Process
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../user-permissions/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
User Permissions in Fetch ML
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_6" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Reference
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_6">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Reference
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../configuration-schema/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Configuration Schema
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../troubleshooting/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Troubleshooting
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<div class="md-content" data-md-component="content">
|
|
|
|
<article class="md-content__inner md-typeset">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<h1 id="operations-runbook">Operations Runbook<a class="headerlink" href="#operations-runbook" title="Permanent link">¶</a></h1>
|
|
<p>Operational guide for troubleshooting and maintaining the ML experiment system.</p>
|
|
<h2 id="task-queue-operations">Task Queue Operations<a class="headerlink" href="#task-queue-operations" title="Permanent link">¶</a></h2>
|
|
<h3 id="monitoring-queue-health">Monitoring Queue Health<a class="headerlink" href="#monitoring-queue-health" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code># Check queue depth
|
|
ZCARD task:queue
|
|
|
|
# List pending tasks
|
|
ZRANGE task:queue 0 -1 WITHSCORES
|
|
|
|
# Check dead letter queue
|
|
KEYS task:dlq:*
|
|
</code></pre></div>
|
|
<h3 id="handling-stuck-tasks">Handling Stuck Tasks<a class="headerlink" href="#handling-stuck-tasks" title="Permanent link">¶</a></h3>
|
|
<p><strong>Symptom:</strong> Tasks stuck in "running" status</p>
|
|
<p><strong>Diagnosis:</strong></p>
<div class="highlight"><pre><span></span><code><span class="c1"># Check for expired leases</span>
redis-cli<span class="w"> </span>GET<span class="w"> </span>task:<span class="o">{</span>task-id<span class="o">}</span>
<span class="c1"># Look for LeaseExpiry in past</span>
</code></pre></div>
|
|
<p><strong>Remediation:</strong>
Tasks with expired leases are automatically reclaimed every 1 minute. To force immediate reclamation:</p>
<div class="highlight"><pre><span></span><code><span class="c1"># Restart worker to trigger reclaim cycle</span>
systemctl<span class="w"> </span>restart<span class="w"> </span>ml-worker
</code></pre></div>
|
|
<h3 id="dead-letter-queue-management">Dead Letter Queue Management<a class="headerlink" href="#dead-letter-queue-management" title="Permanent link">¶</a></h3>
|
|
<p><strong>View failed tasks:</strong></p>
<div class="highlight"><pre><span></span><code>KEYS task:dlq:*
</code></pre></div>
|
|
<p><strong>Inspect failed task:</strong></p>
<div class="highlight"><pre><span></span><code>GET task:dlq:{task-id}
</code></pre></div>
|
|
<p><strong>Retry from DLQ:</strong></p>
<div class="highlight"><pre><span></span><code><span class="c1"># Manual retry (requires custom script)</span>
<span class="c1"># 1. Get task from DLQ</span>
<span class="c1"># 2. Reset retry count</span>
<span class="c1"># 3. Re-queue task</span>
</code></pre></div>
|
|
<h3 id="worker-crashes">Worker Crashes<a class="headerlink" href="#worker-crashes" title="Permanent link">¶</a></h3>
|
|
<p><strong>Symptom:</strong> Worker disappeared mid-task</p>
|
|
<p><strong>What Happens:</strong></p>
|
|
<ol>
|
|
<li>Lease expires after 30 minutes (default)</li>
|
|
<li>Background reclaim job detects expired lease</li>
|
|
<li>Task is retried (up to 3 attempts)</li>
|
|
<li>After max retries → Dead Letter Queue</li>
|
|
</ol>
|
|
<p><strong>Prevention:</strong></p>
|
|
<ul>
|
|
<li>Monitor worker heartbeats</li>
|
|
<li>Set up alerts for worker down</li>
|
|
<li>Use process manager (systemd, supervisor)</li>
|
|
</ul>
|
|
<h2 id="worker-operations">Worker Operations<a class="headerlink" href="#worker-operations" title="Permanent link">¶</a></h2>
|
|
<h3 id="graceful-shutdown">Graceful Shutdown<a class="headerlink" href="#graceful-shutdown" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Send SIGTERM for graceful shutdown</span>
|
|
<span class="nb">kill</span><span class="w"> </span>-TERM<span class="w"> </span><span class="k">$(</span>pgrep<span class="w"> </span>ml-worker<span class="k">)</span>
|
|
|
|
<span class="c1"># Worker will:</span>
|
|
<span class="c1"># 1. Stop accepting new tasks</span>
|
|
<span class="c1"># 2. Finish active tasks (up to 5min timeout)</span>
|
|
<span class="c1"># 3. Release all leases</span>
|
|
<span class="c1"># 4. Exit cleanly</span>
|
|
</code></pre></div>
|
|
<h3 id="force-shutdown">Force Shutdown<a class="headerlink" href="#force-shutdown" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Force kill (leases will be reclaimed automatically)</span>
|
|
<span class="nb">kill</span><span class="w"> </span>-9<span class="w"> </span><span class="k">$(</span>pgrep<span class="w"> </span>ml-worker<span class="k">)</span>
|
|
</code></pre></div>
|
|
<h3 id="worker-heartbeat-monitoring">Worker Heartbeat Monitoring<a class="headerlink" href="#worker-heartbeat-monitoring" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code># Check worker heartbeats
|
|
HGETALL worker:heartbeat
|
|
|
|
# Example output:
|
|
# worker-abc123 1701234567
|
|
# worker-def456 1701234580
|
|
</code></pre></div>
|
|
<p><strong>Alert if:</strong> Heartbeat timestamp is more than 5 minutes old</p>
|
|
<h2 id="redis-operations">Redis Operations<a class="headerlink" href="#redis-operations" title="Permanent link">¶</a></h2>
|
|
<h3 id="backup">Backup<a class="headerlink" href="#backup" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Manual backup</span>
|
|
redis-cli<span class="w"> </span>SAVE
|
|
cp<span class="w"> </span>/var/lib/redis/dump.rdb<span class="w"> </span>/backup/redis-<span class="k">$(</span>date<span class="w"> </span>+%Y%m%d<span class="k">)</span>.rdb
|
|
</code></pre></div>
|
|
<h3 id="restore">Restore<a class="headerlink" href="#restore" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Stop Redis</span>
|
|
systemctl<span class="w"> </span>stop<span class="w"> </span>redis
|
|
|
|
<span class="c1"># Restore snapshot</span>
|
|
cp<span class="w"> </span>/backup/redis-20231201.rdb<span class="w"> </span>/var/lib/redis/dump.rdb
|
|
|
|
<span class="c1"># Start Redis</span>
|
|
systemctl<span class="w"> </span>start<span class="w"> </span>redis
|
|
</code></pre></div>
|
|
<h3 id="memory-management">Memory Management<a class="headerlink" href="#memory-management" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code># Check memory usage
|
|
INFO memory
|
|
|
|
# Evict old data if needed (last resort)
|
|
# DANGER: FLUSHDB clears all data in the current database, including queued tasks!
|
|
FLUSHDB
|
|
</code></pre></div>
|
|
<h2 id="common-issues">Common Issues<a class="headerlink" href="#common-issues" title="Permanent link">¶</a></h2>
|
|
<h3 id="issue-queue-growing-unbounded">Issue: Queue Growing Unbounded<a class="headerlink" href="#issue-queue-growing-unbounded" title="Permanent link">¶</a></h3>
|
|
<p><strong>Symptoms:</strong></p>
|
|
<ul>
|
|
<li><code>ZCARD task:queue</code> keeps increasing</li>
|
|
<li>No workers processing tasks</li>
|
|
</ul>
|
|
<p><strong>Diagnosis:</strong>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Check worker status</span>
|
|
systemctl<span class="w"> </span>status<span class="w"> </span>ml-worker
|
|
|
|
<span class="c1"># Check logs</span>
|
|
journalctl<span class="w"> </span>-u<span class="w"> </span>ml-worker<span class="w"> </span>-n<span class="w"> </span><span class="m">100</span>
|
|
</code></pre></div></p>
|
|
<p><strong>Resolution:</strong></p>
|
|
<ol>
|
|
<li>Verify workers are running</li>
|
|
<li>Check Redis connectivity</li>
|
|
<li>Verify lease configuration</li>
|
|
</ol>
|
|
<h3 id="issue-high-retry-rate">Issue: High Retry Rate<a class="headerlink" href="#issue-high-retry-rate" title="Permanent link">¶</a></h3>
|
|
<p><strong>Symptoms:</strong></p>
|
|
<ul>
|
|
<li>Many tasks in DLQ</li>
|
|
<li><code>retry_count</code> field high on tasks</li>
|
|
</ul>
|
|
<p><strong>Diagnosis:</strong>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Check worker logs for errors</span>
|
|
journalctl<span class="w"> </span>-u<span class="w"> </span>ml-worker<span class="w"> </span><span class="p">|</span><span class="w"> </span>grep<span class="w"> </span><span class="s2">"retry"</span>
|
|
|
|
<span class="c1"># Look for patterns (network issues, resource limits, etc)</span>
|
|
</code></pre></div></p>
|
|
<p><strong>Resolution:</strong></p>
|
|
<ul>
|
|
<li>Fix the underlying issue (network, resources, etc.)</li>
|
|
<li>Adjust retry limits if failures are permanent</li>
|
|
<li>Increase task timeout if jobs are slow</li>
|
|
</ul>
|
|
<h3 id="issue-leases-expiring-prematurely">Issue: Leases Expiring Prematurely<a class="headerlink" href="#issue-leases-expiring-prematurely" title="Permanent link">¶</a></h3>
|
|
<p><strong>Symptoms:</strong></p>
|
|
<ul>
|
|
<li>Tasks retried even though worker is healthy</li>
|
|
<li>Logs show "lease expired" frequently</li>
|
|
</ul>
|
|
<p><strong>Diagnosis:</strong>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Check worker config</span>
|
|
<span class="l l-Scalar l-Scalar-Plain">cat configs/worker-config.yaml | grep -A3 "lease"</span>
|
|
|
|
<span class="l l-Scalar l-Scalar-Plain">task_lease_duration</span><span class="p p-Indicator">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">30m</span><span class="w"> </span><span class="c1"># Too short?</span>
|
|
<span class="nt">heartbeat_interval</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">1m</span><span class="w"> </span><span class="c1"># Too infrequent?</span>
|
|
</code></pre></div></p>
|
|
<p><strong>Resolution:</strong>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Increase lease duration for long-running jobs</span>
|
|
<span class="nt">task_lease_duration</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">60m</span>
|
|
<span class="nt">heartbeat_interval</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">30s</span><span class="w"> </span><span class="c1"># More frequent heartbeats</span>
|
|
</code></pre></div></p>
|
|
<h2 id="performance-tuning">Performance Tuning<a class="headerlink" href="#performance-tuning" title="Permanent link">¶</a></h2>
|
|
<h3 id="worker-concurrency">Worker Concurrency<a class="headerlink" href="#worker-concurrency" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># worker-config.yaml</span>
|
|
<span class="nt">max_workers</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">4</span><span class="w"> </span><span class="c1"># Number of parallel tasks</span>
|
|
|
|
<span class="c1"># Adjust based on:</span>
|
|
<span class="c1"># - CPU cores available</span>
|
|
<span class="c1"># - Memory per task</span>
|
|
<span class="c1"># - GPU availability</span>
|
|
</code></pre></div>
|
|
<h3 id="redis-configuration">Redis Configuration<a class="headerlink" href="#redis-configuration" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code># /etc/redis/redis.conf
|
|
|
|
# Persistence
|
|
save 900 1
|
|
save 300 10
|
|
|
|
# Memory
|
|
maxmemory 2gb
|
|
maxmemory-policy noeviction
|
|
|
|
# Performance
|
|
tcp-keepalive 300
|
|
timeout 0
|
|
</code></pre></div>
|
|
<h2 id="alerting-rules">Alerting Rules<a class="headerlink" href="#alerting-rules" title="Permanent link">¶</a></h2>
|
|
<h3 id="critical-alerts">Critical Alerts<a class="headerlink" href="#critical-alerts" title="Permanent link">¶</a></h3>
|
|
<ol>
|
|
<li><strong>Worker Down</strong> (no heartbeat > 5min)</li>
|
|
<li><strong>Queue Depth</strong> > 1000 tasks</li>
|
|
<li><strong>DLQ Growth</strong> > 100 tasks/hour</li>
|
|
<li><strong>Redis Down</strong> (connection failed)</li>
|
|
</ol>
|
|
<h3 id="warning-alerts">Warning Alerts<a class="headerlink" href="#warning-alerts" title="Permanent link">¶</a></h3>
|
|
<ol>
|
|
<li><strong>High Retry Rate</strong> > 10% of tasks</li>
|
|
<li><strong>Slow Queue Drain</strong> (depth increasing over 1 hour)</li>
|
|
<li><strong>Worker Memory</strong> > 80% usage</li>
|
|
</ol>
|
|
<h2 id="health-checks">Health Checks<a class="headerlink" href="#health-checks" title="Permanent link">¶</a></h2>
|
|
<div class="highlight"><pre><span></span><code><span class="ch">#!/bin/bash</span>
|
|
<span class="c1"># health-check.sh</span>
|
|
|
|
<span class="c1"># Check Redis</span>
|
|
redis-cli<span class="w"> </span>PING<span class="w"> </span><span class="o">||</span><span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"Redis DOWN"</span>
|
|
|
|
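<span class="c1"># Check worker heartbeat (NOTE: worker:heartbeat appears to be keyed by worker ID, e.g. worker-abc123 — confirm the file below holds that ID, not a PID)</span>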
|
|
<span class="nv">WORKER_ID</span><span class="o">=</span><span class="k">$(</span>cat<span class="w"> </span>/var/run/ml-worker.pid<span class="k">)</span>
|
|
<span class="nv">LAST_HB</span><span class="o">=</span><span class="k">$(</span>redis-cli<span class="w"> </span>HGET<span class="w"> </span>worker:heartbeat<span class="w"> </span><span class="s2">"</span><span class="nv">$WORKER_ID</span><span class="s2">"</span><span class="k">)</span>
|
|
<span class="nv">NOW</span><span class="o">=</span><span class="k">$(</span>date<span class="w"> </span>+%s<span class="k">)</span>
|
|
<span class="k">if</span><span class="w"> </span><span class="o">[</span><span class="w"> </span><span class="k">$((</span><span class="nv">NOW</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="nv">LAST_HB</span><span class="k">))</span><span class="w"> </span>-gt<span class="w"> </span><span class="m">300</span><span class="w"> </span><span class="o">]</span><span class="p">;</span><span class="w"> </span><span class="k">then</span>
|
|
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"Worker heartbeat stale"</span>
|
|
<span class="k">fi</span>
|
|
|
|
<span class="c1"># Check queue depth</span>
|
|
<span class="nv">DEPTH</span><span class="o">=</span><span class="k">$(</span>redis-cli<span class="w"> </span>ZCARD<span class="w"> </span>task:queue<span class="k">)</span>
|
|
<span class="k">if</span><span class="w"> </span><span class="o">[</span><span class="w"> </span><span class="s2">"</span><span class="nv">$DEPTH</span><span class="s2">"</span><span class="w"> </span>-gt<span class="w"> </span><span class="m">1000</span><span class="w"> </span><span class="o">]</span><span class="p">;</span><span class="w"> </span><span class="k">then</span>
|
|
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"Queue depth critical: </span><span class="nv">$DEPTH</span><span class="s2">"</span>
|
|
<span class="k">fi</span>
|
|
</code></pre></div>
|
|
<h2 id="runbook-checklist">Runbook Checklist<a class="headerlink" href="#runbook-checklist" title="Permanent link">¶</a></h2>
|
|
<h3 id="daily-operations">Daily Operations<a class="headerlink" href="#daily-operations" title="Permanent link">¶</a></h3>
|
|
<ol>
|
|
<li>Check queue depth</li>
|
|
<li>Verify worker heartbeats</li>
|
|
<li>Review DLQ for patterns</li>
|
|
<li>Check Redis memory usage</li>
|
|
</ol>
|
|
<h3 id="weekly-operations">Weekly Operations<a class="headerlink" href="#weekly-operations" title="Permanent link">¶</a></h3>
|
|
<ol>
|
|
<li>Review retry rates</li>
|
|
<li>Analyze failed task patterns</li>
|
|
<li>Backup Redis snapshot</li>
|
|
<li>Review worker logs</li>
|
|
</ol>
|
|
<h3 id="monthly-operations">Monthly Operations<a class="headerlink" href="#monthly-operations" title="Permanent link">¶</a></h3>
|
|
<ol>
|
|
<li>Performance tuning review</li>
|
|
<li>Capacity planning</li>
|
|
<li>Update documentation</li>
|
|
<li>Test disaster recovery</li>
|
|
</ol>
|
|
<hr />
|
|
<p><strong>For homelab setups:</strong>
|
|
Most of these operations can be simplified. Focus on:</p>
|
|
<ul>
|
|
<li>Basic monitoring (queue depth, worker status)</li>
|
|
<li>Periodic Redis backups</li>
|
|
<li>Graceful shutdowns for maintenance</li>
|
|
</ul>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</article>
|
|
</div>
|
|
|
|
|
|
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
|
</div>
|
|
|
|
</main>
|
|
|
|
<footer class="md-footer">
|
|
|
|
<div class="md-footer-meta md-typeset">
|
|
<div class="md-footer-meta__inner md-grid">
|
|
<div class="md-copyright">
|
|
|
|
|
|
Made with
|
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
|
Material for MkDocs
|
|
</a>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
</footer>
|
|
|
|
</div>
|
|
<div class="md-dialog" data-md-component="dialog">
|
|
<div class="md-dialog__inner md-typeset"></div>
|
|
</div>
|
|
|
|
|
|
|
|
|
|
|
|
<script id="__config" type="application/json">{"annotate": null, "base": "..", "features": ["navigation.instant", "navigation.tracking", "navigation.tabs", "navigation.sections", "navigation.expand", "navigation.indexes", "toc.integrate", "search.highlight", "search.share"], "search": "../assets/javascripts/workers/search.7a47a382.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
|
|
|
|
|
<script src="../assets/javascripts/bundle.e71a0d61.min.js"></script>
|
|
|
|
|
|
</body>
|
|
</html> |