- Fix YAML tags in auth config struct (json -> yaml) - Update CLI configs to use pre-hashed API keys - Remove double hashing in WebSocket client - Fix port mapping (9102 -> 9103) in CLI commands - Update permission keys to use jobs:read, jobs:create, etc. - Clean up all debug logging from CLI and server - All user roles now authenticate correctly: * Admin: Can queue jobs and see all jobs * Researcher: Can queue jobs and see own jobs * Analyst: Can see status (read-only access) Multi-user authentication is now fully functional.
2293 lines
No EOL
50 KiB
HTML
2293 lines
No EOL
50 KiB
HTML
|
|
<!doctype html>
|
|
<html lang="en" class="no-js">
|
|
<head>
|
|
|
|
<meta charset="utf-8">
|
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
|
|
|
<meta name="description" content="Secure Machine Learning Platform">
|
|
|
|
|
|
|
|
|
|
<link rel="prev" href="../production-monitoring/">
|
|
|
|
|
|
<link rel="next" href="../redis-ha/">
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="icon" href="../assets/images/favicon.png">
|
|
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.0">
|
|
|
|
|
|
|
|
<title>Operations Runbook - Fetch ML Documentation</title>
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../assets/stylesheets/main.618322db.min.css">
|
|
|
|
|
|
<link rel="stylesheet" href="../assets/stylesheets/palette.ab4e12ef.min.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
|
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
|
|
|
|
|
|
|
<script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
|
|
|
|
|
|
|
|
|
|
|
</head>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="blue" data-md-color-accent="blue">
|
|
|
|
|
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
|
<label class="md-overlay" for="__drawer"></label>
|
|
<div data-md-component="skip">
|
|
|
|
|
|
<a href="#operations-runbook" class="md-skip">
|
|
Skip to content
|
|
</a>
|
|
|
|
</div>
|
|
<div data-md-component="announce">
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<header class="md-header" data-md-component="header">
|
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
|
<a href=".." title="Fetch ML Documentation" class="md-header__button md-logo" aria-label="Fetch ML Documentation" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
|
|
|
</a>
|
|
<label class="md-header__button md-icon" for="__drawer">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
|
</label>
|
|
<div class="md-header__title" data-md-component="header-title">
|
|
<div class="md-header__ellipsis">
|
|
<div class="md-header__topic">
|
|
<span class="md-ellipsis">
|
|
Fetch ML Documentation
|
|
</span>
|
|
</div>
|
|
<div class="md-header__topic" data-md-component="header-topic">
|
|
<span class="md-ellipsis">
|
|
|
|
Operations Runbook
|
|
|
|
</span>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<form class="md-header__option" data-md-component="palette">
|
|
|
|
|
|
|
|
|
|
<input class="md-option" data-md-color-media="" data-md-color-scheme="default" data-md-color-primary="blue" data-md-color-accent="blue" aria-hidden="true" type="radio" name="__palette" id="__palette_0">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-option" data-md-color-media="" data-md-color-scheme="slate" data-md-color-primary="blue" data-md-color-accent="blue" aria-hidden="true" type="radio" name="__palette" id="__palette_1">
|
|
|
|
|
|
</form>
|
|
|
|
|
|
|
|
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-header__button md-icon" for="__search">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
|
|
</label>
|
|
<div class="md-search" data-md-component="search" role="dialog">
|
|
<label class="md-search__overlay" for="__search"></label>
|
|
<div class="md-search__inner" role="search">
|
|
<form class="md-search__form" name="search">
|
|
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
|
<label class="md-search__icon md-icon" for="__search">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
|
|
</label>
|
|
<nav class="md-search__options" aria-label="Search">
|
|
|
|
<a href="javascript:void(0)" class="md-search__icon md-icon" title="Share" aria-label="Share" data-clipboard data-clipboard-text="" data-md-component="search-share" tabindex="-1">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M18 16.08c-.76 0-1.44.3-1.96.77L8.91 12.7c.05-.23.09-.46.09-.7s-.04-.47-.09-.7l7.05-4.11c.54.5 1.25.81 2.04.81a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3c0 .24.04.47.09.7L8.04 9.81C7.5 9.31 6.79 9 6 9a3 3 0 0 0-3 3 3 3 0 0 0 3 3c.79 0 1.5-.31 2.04-.81l7.12 4.15c-.05.21-.08.43-.08.66 0 1.61 1.31 2.91 2.92 2.91s2.92-1.3 2.92-2.91A2.92 2.92 0 0 0 18 16.08"/></svg>
|
|
</a>
|
|
|
|
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
|
|
</button>
|
|
</nav>
|
|
|
|
</form>
|
|
<div class="md-search__output">
|
|
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
|
|
<div class="md-search-result" data-md-component="search-result">
|
|
<div class="md-search-result__meta">
|
|
Initializing search
|
|
</div>
|
|
<ol class="md-search-result__list" role="presentation"></ol>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<div class="md-header__source">
|
|
<a href="https://github.com/jfraeys/fetch_ml" title="Go to repository" class="md-source" data-md-component="source">
|
|
<div class="md-source__icon md-icon">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
|
|
</div>
|
|
<div class="md-source__repository">
|
|
GitHub
|
|
</div>
|
|
</a>
|
|
</div>
|
|
|
|
</nav>
|
|
|
|
</header>
|
|
|
|
<div class="md-container" data-md-component="container">
|
|
|
|
|
|
|
|
|
|
|
|
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
|
|
<div class="md-grid">
|
|
<ul class="md-tabs__list">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href=".." class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Home
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../quick-start/" class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Getting Started
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../development-setup/" class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Development
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item md-tabs__item--active">
|
|
<a href="../deployment/" class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Operations & Production
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../security/" class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Security
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../configuration-schema/" class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Reference
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-tabs__item">
|
|
<a href="../adr/" class="md-tabs__link">
|
|
|
|
|
|
|
|
|
|
|
|
Architecture Decisions
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</div>
|
|
</nav>
|
|
|
|
|
|
|
|
<main class="md-main" data-md-component="main">
|
|
<div class="md-main__inner md-grid">
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--primary md-nav--lifted md-nav--integrated" aria-label="Navigation" data-md-level="0">
|
|
<label class="md-nav__title" for="__drawer">
|
|
<a href=".." title="Fetch ML Documentation" class="md-nav__button md-logo" aria-label="Fetch ML Documentation" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
|
|
|
</a>
|
|
Fetch ML Documentation
|
|
</label>
|
|
|
|
<div class="md-nav__source">
|
|
<a href="https://github.com/jfraeys/fetch_ml" title="Go to repository" class="md-source" data-md-component="source">
|
|
<div class="md-source__icon md-icon">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
|
|
</div>
|
|
<div class="md-source__repository">
|
|
GitHub
|
|
</div>
|
|
</a>
|
|
</div>
|
|
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href=".." class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Home
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_2" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Getting Started
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_2">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Getting Started
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../quick-start/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Quick Start
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../installation/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Simple Installation Guide
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../first-experiment/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
First Experiment
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_3" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Development
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_3">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Development
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../development-setup/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Development Setup
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../testing/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Testing Guide
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../architecture/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Homelab Architecture
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../cli-reference/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
CLI Reference
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../zig-cli/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Zig CLI Guide
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../queue/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Task Queue Architecture
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../smart-defaults/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Smart Defaults
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../cicd/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
CI/CD Pipeline
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" checked>
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Operations & Production
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="true">
|
|
<label class="md-nav__title" for="__nav_4">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Operations & Production
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../deployment/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
ML Experiment Manager - Deployment Guide
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../environment-variables/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Environment Variables
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../production-monitoring/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Production Monitoring Deployment Guide (Linux)
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--active">
|
|
|
|
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Operations Runbook
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<a href="./" class="md-nav__link md-nav__link--active">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Operations Runbook
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__title" for="__toc">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Table of contents
|
|
</label>
|
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#task-queue-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Task Queue Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Task Queue Operations">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#monitoring-queue-health" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Monitoring Queue Health
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#handling-stuck-tasks" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Handling Stuck Tasks
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#dead-letter-queue-management" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Dead Letter Queue Management
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#worker-crashes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Worker Crashes
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#worker-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Worker Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Worker Operations">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#graceful-shutdown" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Graceful Shutdown
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#force-shutdown" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Force Shutdown
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#worker-heartbeat-monitoring" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Worker Heartbeat Monitoring
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#redis-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Redis Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Redis Operations">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#backup" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Backup
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#restore" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Restore
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#memory-management" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Memory Management
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#common-issues" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Common Issues
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Common Issues">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#issue-queue-growing-unbounded" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Issue: Queue Growing Unbounded
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#issue-high-retry-rate" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Issue: High Retry Rate
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#issue-leases-expiring-prematurely" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Issue: Leases Expiring Prematurely
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#performance-tuning" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Performance Tuning
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Performance Tuning">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#worker-concurrency" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Worker Concurrency
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#redis-configuration" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Redis Configuration
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#alerting-rules" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Alerting Rules
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Alerting Rules">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#critical-alerts" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Critical Alerts
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#warning-alerts" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Warning Alerts
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#health-checks" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Health Checks
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#runbook-checklist" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Runbook Checklist
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Runbook Checklist">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#daily-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Daily Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#weekly-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Weekly Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#monthly-operations" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Monthly Operations
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../redis-ha/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Redis High Availability (Optional)
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../release-checklist/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Release Checklist
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_5" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Security
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_5">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Security
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../security/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Security Guide
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../api-key-process/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
FetchML API Key Process
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../user-permissions/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
User Permissions in Fetch ML
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_6" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Reference
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_6">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Reference
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../configuration-schema/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Configuration Schema
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../troubleshooting/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Troubleshooting
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_7" >
|
|
|
|
|
|
<div class="md-nav__link md-nav__container">
|
|
<a href="../adr/" class="md-nav__link ">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Architecture Decisions
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
|
|
|
|
<label class="md-nav__link " for="__nav_7" id="__nav_7_label" tabindex="0">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
</div>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_7">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Architecture Decisions
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../adr/ADR-001-use-go-for-api-server/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
ADR-001: Use Go for API Server
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../adr/ADR-002-use-sqlite-for-local-development/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
ADR-002: Use SQLite for Local Development
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../adr/ADR-003-use-redis-for-job-queue/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
ADR-003: Use Redis for Job Queue
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<div class="md-content" data-md-component="content">
|
|
|
|
<article class="md-content__inner md-typeset">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<h1 id="operations-runbook">Operations Runbook<a class="headerlink" href="#operations-runbook" title="Permanent link">¶</a></h1>
|
|
<p>Operational guide for troubleshooting and maintaining the ML experiment system.</p>
|
|
<h2 id="task-queue-operations">Task Queue Operations<a class="headerlink" href="#task-queue-operations" title="Permanent link">¶</a></h2>
|
|
<h3 id="monitoring-queue-health">Monitoring Queue Health<a class="headerlink" href="#monitoring-queue-health" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code># Check queue depth
|
|
ZCARD task:queue
|
|
|
|
# List pending tasks
|
|
ZRANGE task:queue 0 -1 WITHSCORES
|
|
|
|
# Check dead letter queue
|
|
KEYS task:dlq:*
|
|
</code></pre></div>
|
|
<h3 id="handling-stuck-tasks">Handling Stuck Tasks<a class="headerlink" href="#handling-stuck-tasks" title="Permanent link">¶</a></h3>
|
|
<p><strong>Symptom:</strong> Tasks stuck in "running" status</p>
|
|
<p><strong>Diagnosis:</strong>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Check for expired leases</span>
|
|
redis-cli<span class="w"> </span>GET<span class="w"> </span>task:<span class="o">{</span>task-id<span class="o">}</span>
|
|
<span class="c1"># Look for LeaseExpiry in past</span>
|
|
</code></pre></div></p>
|
|
<p>**Rem</p>
|
|
<p>ediation:**
|
|
Tasks with expired leases are automatically reclaimed every 1 minute. To force immediate reclamation:
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Restart worker to trigger reclaim cycle</span>
|
|
systemctl<span class="w"> </span>restart<span class="w"> </span>ml-worker
|
|
</code></pre></div></p>
|
|
<h3 id="dead-letter-queue-management">Dead Letter Queue Management<a class="headerlink" href="#dead-letter-queue-management" title="Permanent link">¶</a></h3>
|
|
<p><strong>View failed tasks:</strong>
|
|
<div class="highlight"><pre><span></span><code>KEYS task:dlq:*
|
|
</code></pre></div></p>
|
|
<p><strong>Inspect failed task:</strong>
|
|
<div class="highlight"><pre><span></span><code>GET task:dlq:{task-id}
|
|
</code></pre></div></p>
|
|
<p><strong>Retry from DLQ:</strong>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Manual retry (requires custom script)</span>
|
|
<span class="c1"># 1. Get task from DLQ</span>
|
|
<span class="c1"># 2. Reset retry count</span>
|
|
<span class="c1"># 3. Re-queue task</span>
|
|
</code></pre></div></p>
|
|
<h3 id="worker-crashes">Worker Crashes<a class="headerlink" href="#worker-crashes" title="Permanent link">¶</a></h3>
|
|
<p><strong>Symptom:</strong> Worker disappeared mid-task</p>
|
|
<p><strong>What Happens:</strong>
|
|
1. Lease expires after 30 minutes (default)
|
|
2. Background reclaim job detects expired lease
|
|
3. Task is retried (up to 3 attempts)
|
|
4. After max retries → Dead Letter Queue</p>
|
|
<p><strong>Prevention:</strong>
|
|
- Monitor worker heartbeats
|
|
- Set up alerts for worker down
|
|
- Use process manager (systemd, supervisor)</p>
|
|
<h2 id="worker-operations">Worker Operations<a class="headerlink" href="#worker-operations" title="Permanent link">¶</a></h2>
|
|
<h3 id="graceful-shutdown">Graceful Shutdown<a class="headerlink" href="#graceful-shutdown" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Send SIGTERM for graceful shutdown</span>
|
|
<span class="nb">kill</span><span class="w"> </span>-TERM<span class="w"> </span><span class="k">$(</span>pgrep<span class="w"> </span>ml-worker<span class="k">)</span>
|
|
|
|
<span class="c1"># Worker will:</span>
|
|
<span class="c1"># 1. Stop accepting new tasks</span>
|
|
<span class="c1"># 2. Finish active tasks (up to 5min timeout)</span>
|
|
<span class="c1"># 3. Release all leases</span>
|
|
<span class="c1"># 4. Exit cleanly</span>
|
|
</code></pre></div>
|
|
<h3 id="force-shutdown">Force Shutdown<a class="headerlink" href="#force-shutdown" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Force kill (leases will be reclaimed automatically)</span>
|
|
<span class="nb">kill</span><span class="w"> </span>-9<span class="w"> </span><span class="k">$(</span>pgrep<span class="w"> </span>ml-worker<span class="k">)</span>
|
|
</code></pre></div>
|
|
<h3 id="worker-heartbeat-monitoring">Worker Heartbeat Monitoring<a class="headerlink" href="#worker-heartbeat-monitoring" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code># Check worker heartbeats
|
|
HGETALL worker:heartbeat
|
|
|
|
# Example output:
|
|
# worker-abc123 1701234567
|
|
# worker-def456 1701234580
|
|
</code></pre></div>
|
|
<p><strong>Alert if:</strong> Heartbeat timestamp > 5 minutes old</p>
|
|
<h2 id="redis-operations">Redis Operations<a class="headerlink" href="#redis-operations" title="Permanent link">¶</a></h2>
|
|
<h3 id="backup">Backup<a class="headerlink" href="#backup" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Manual backup</span>
|
|
redis-cli<span class="w"> </span>SAVE
|
|
cp<span class="w"> </span>/var/lib/redis/dump.rdb<span class="w"> </span>/backup/redis-<span class="k">$(</span>date<span class="w"> </span>+%Y%m%d<span class="k">)</span>.rdb
|
|
</code></pre></div>
|
|
<h3 id="restore">Restore<a class="headerlink" href="#restore" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Stop Redis</span>
|
|
systemctl<span class="w"> </span>stop<span class="w"> </span>redis
|
|
|
|
<span class="c1"># Restore snapshot</span>
|
|
cp<span class="w"> </span>/backup/redis-20231201.rdb<span class="w"> </span>/var/lib/redis/dump.rdb
|
|
|
|
<span class="c1"># Start Redis</span>
|
|
systemctl<span class="w"> </span>start<span class="w"> </span>redis
|
|
</code></pre></div>
|
|
<h3 id="memory-management">Memory Management<a class="headerlink" href="#memory-management" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code># Check memory usage
|
|
INFO memory
|
|
|
|
# Evict old data if needed
|
|
FLUSHDB # DANGER: Clears all data!
|
|
</code></pre></div>
|
|
<h2 id="common-issues">Common Issues<a class="headerlink" href="#common-issues" title="Permanent link">¶</a></h2>
|
|
<h3 id="issue-queue-growing-unbounded">Issue: Queue Growing Unbounded<a class="headerlink" href="#issue-queue-growing-unbounded" title="Permanent link">¶</a></h3>
|
|
<p><strong>Symptoms:</strong>
|
|
- <code>ZCARD task:queue</code> keeps increasing
|
|
- No workers processing tasks</p>
|
|
<p><strong>Diagnosis:</strong>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Check worker status</span>
|
|
systemctl<span class="w"> </span>status<span class="w"> </span>ml-worker
|
|
|
|
<span class="c1"># Check logs</span>
|
|
journalctl<span class="w"> </span>-u<span class="w"> </span>ml-worker<span class="w"> </span>-n<span class="w"> </span><span class="m">100</span>
|
|
</code></pre></div></p>
|
|
<p><strong>Resolution:</strong>
|
|
1. Verify workers are running
|
|
2. Check Redis connectivity
|
|
3. Verify lease configuration</p>
|
|
<h3 id="issue-high-retry-rate">Issue: High Retry Rate<a class="headerlink" href="#issue-high-retry-rate" title="Permanent link">¶</a></h3>
|
|
<p><strong>Symptoms:</strong>
|
|
- Many tasks in DLQ
|
|
- <code>retry_count</code> field high on tasks</p>
|
|
<p><strong>Diagnosis:</strong>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Check worker logs for errors</span>
|
|
journalctl<span class="w"> </span>-u<span class="w"> </span>ml-worker<span class="w"> </span><span class="p">|</span><span class="w"> </span>grep<span class="w"> </span><span class="s2">"retry"</span>
|
|
|
|
<span class="c1"># Look for patterns (network issues, resource limits, etc)</span>
|
|
</code></pre></div></p>
|
|
<p><strong>Resolution:</strong>
|
|
- Fix underlying issue (network, resources, etc)
|
|
- Adjust retry limits if permanent failures
|
|
- Increase task timeout if jobs are slow</p>
|
|
<h3 id="issue-leases-expiring-prematurely">Issue: Leases Expiring Prematurely<a class="headerlink" href="#issue-leases-expiring-prematurely" title="Permanent link">¶</a></h3>
|
|
<p><strong>Symptoms:</strong>
|
|
- Tasks retried even though worker is healthy
|
|
- Logs show "lease expired" frequently</p>
|
|
<p><strong>Diagnosis:</strong>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Check worker config</span>
|
|
<span class="l l-Scalar l-Scalar-Plain">cat configs/worker-config.yaml | grep -A3 "lease"</span>
|
|
|
|
<span class="l l-Scalar l-Scalar-Plain">task_lease_duration</span><span class="p p-Indicator">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">30m</span><span class="w"> </span><span class="c1"># Too short?</span>
|
|
<span class="nt">heartbeat_interval</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">1m</span><span class="w"> </span><span class="c1"># Too infrequent?</span>
|
|
</code></pre></div></p>
|
|
<p><strong>Resolution:</strong>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Increase lease duration for long-running jobs</span>
|
|
<span class="nt">task_lease_duration</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">60m</span>
|
|
<span class="nt">heartbeat_interval</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">30s</span><span class="w"> </span><span class="c1"># More frequent heartbeats</span>
|
|
</code></pre></div></p>
|
|
<h2 id="performance-tuning">Performance Tuning<a class="headerlink" href="#performance-tuning" title="Permanent link">¶</a></h2>
|
|
<h3 id="worker-concurrency">Worker Concurrency<a class="headerlink" href="#worker-concurrency" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># worker-config.yaml</span>
|
|
<span class="nt">max_workers</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">4</span><span class="w"> </span><span class="c1"># Number of parallel tasks</span>
|
|
|
|
<span class="c1"># Adjust based on:</span>
|
|
<span class="c1"># - CPU cores available</span>
|
|
<span class="c1"># - Memory per task</span>
|
|
<span class="c1"># - GPU availability</span>
|
|
</code></pre></div>
|
|
<h3 id="redis-configuration">Redis Configuration<a class="headerlink" href="#redis-configuration" title="Permanent link">¶</a></h3>
|
|
<div class="highlight"><pre><span></span><code># /etc/redis/redis.conf
|
|
|
|
# Persistence
|
|
save 900 1
|
|
save 300 10
|
|
|
|
# Memory
|
|
maxmemory 2gb
|
|
maxmemory-policy noeviction
|
|
|
|
# Performance
|
|
tcp-keepalive 300
|
|
timeout 0
|
|
</code></pre></div>
|
|
<h2 id="alerting-rules">Alerting Rules<a class="headerlink" href="#alerting-rules" title="Permanent link">¶</a></h2>
|
|
<h3 id="critical-alerts">Critical Alerts<a class="headerlink" href="#critical-alerts" title="Permanent link">¶</a></h3>
|
|
<ol>
|
|
<li><strong>Worker Down</strong> (no heartbeat > 5min)</li>
|
|
<li><strong>Queue Depth</strong> > 1000 tasks</li>
|
|
<li><strong>DLQ Growth</strong> > 100 tasks/hour</li>
|
|
<li><strong>Redis Down</strong> (connection failed)</li>
|
|
</ol>
|
|
<h3 id="warning-alerts">Warning Alerts<a class="headerlink" href="#warning-alerts" title="Permanent link">¶</a></h3>
|
|
<ol>
|
|
<li><strong>High Retry Rate</strong> > 10% of tasks</li>
|
|
<li><strong>Slow Queue Drain</strong> (depth increasing over 1 hour)</li>
|
|
<li><strong>Worker Memory</strong> > 80% usage</li>
|
|
</ol>
|
|
<h2 id="health-checks">Health Checks<a class="headerlink" href="#health-checks" title="Permanent link">¶</a></h2>
|
|
<div class="highlight"><pre><span></span><code><span class="ch">#!/bin/bash</span>
|
|
<span class="c1"># health-check.sh</span>
|
|
|
|
<span class="c1"># Check Redis</span>
|
|
redis-cli<span class="w"> </span>PING<span class="w"> </span><span class="o">||</span><span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"Redis DOWN"</span>
|
|
|
|
<span class="c1"># Check worker heartbeat</span>
|
|
<span class="nv">WORKER_ID</span><span class="o">=</span><span class="k">$(</span>cat<span class="w"> </span>/var/run/ml-worker.pid<span class="k">)</span>
|
|
<span class="nv">LAST_HB</span><span class="o">=</span><span class="k">$(</span>redis-cli<span class="w"> </span>HGET<span class="w"> </span>worker:heartbeat<span class="w"> </span><span class="s2">"</span><span class="nv">$WORKER_ID</span><span class="s2">"</span><span class="k">)</span>
|
|
<span class="nv">NOW</span><span class="o">=</span><span class="k">$(</span>date<span class="w"> </span>+%s<span class="k">)</span>
|
|
<span class="k">if</span><span class="w"> </span><span class="o">[</span><span class="w"> </span><span class="k">$((</span><span class="nv">NOW</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="nv">LAST_HB</span><span class="k">))</span><span class="w"> </span>-gt<span class="w"> </span><span class="m">300</span><span class="w"> </span><span class="o">]</span><span class="p">;</span><span class="w"> </span><span class="k">then</span>
|
|
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"Worker heartbeat stale"</span>
|
|
<span class="k">fi</span>
|
|
|
|
<span class="c1"># Check queue depth</span>
|
|
<span class="nv">DEPTH</span><span class="o">=</span><span class="k">$(</span>redis-cli<span class="w"> </span>ZCARD<span class="w"> </span>task:queue<span class="k">)</span>
|
|
<span class="k">if</span><span class="w"> </span><span class="o">[</span><span class="w"> </span><span class="s2">"</span><span class="nv">$DEPTH</span><span class="s2">"</span><span class="w"> </span>-gt<span class="w"> </span><span class="m">1000</span><span class="w"> </span><span class="o">]</span><span class="p">;</span><span class="w"> </span><span class="k">then</span>
|
|
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"Queue depth critical: </span><span class="nv">$DEPTH</span><span class="s2">"</span>
|
|
<span class="k">fi</span>
|
|
</code></pre></div>
|
|
<h2 id="runbook-checklist">Runbook Checklist<a class="headerlink" href="#runbook-checklist" title="Permanent link">¶</a></h2>
|
|
<h3 id="daily-operations">Daily Operations<a class="headerlink" href="#daily-operations" title="Permanent link">¶</a></h3>
|
|
<ol>
|
|
<li>Check queue depth</li>
|
|
<li>Verify worker heartbeats</li>
|
|
<li>Review DLQ for patterns</li>
|
|
<li>Check Redis memory usage</li>
|
|
</ol>
|
|
<h3 id="weekly-operations">Weekly Operations<a class="headerlink" href="#weekly-operations" title="Permanent link">¶</a></h3>
|
|
<ol>
|
|
<li>Review retry rates</li>
|
|
<li>Analyze failed task patterns</li>
|
|
<li>Backup Redis snapshot</li>
|
|
<li>Review worker logs</li>
|
|
</ol>
|
|
<h3 id="monthly-operations">Monthly Operations<a class="headerlink" href="#monthly-operations" title="Permanent link">¶</a></h3>
|
|
<ol>
|
|
<li>Performance tuning review</li>
|
|
<li>Capacity planning</li>
|
|
<li>Update documentation</li>
|
|
<li>Test disaster recovery</li>
|
|
</ol>
|
|
<hr />
|
|
<p><strong>For homelab setups:</strong>
|
|
Most of these operations can be simplified. Focus on:
|
|
- Basic monitoring (queue depth, worker status)
|
|
- Periodic Redis backups
|
|
- Graceful shutdowns for maintenance</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</article>
|
|
</div>
|
|
|
|
|
|
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
|
</div>
|
|
|
|
</main>
|
|
|
|
<footer class="md-footer">
|
|
|
|
<div class="md-footer-meta md-typeset">
|
|
<div class="md-footer-meta__inner md-grid">
|
|
<div class="md-copyright">
|
|
|
|
|
|
Made with
|
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
|
Material for MkDocs
|
|
</a>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
</footer>
|
|
|
|
</div>
|
|
<div class="md-dialog" data-md-component="dialog">
|
|
<div class="md-dialog__inner md-typeset"></div>
|
|
</div>
|
|
|
|
|
|
|
|
|
|
|
|
<script id="__config" type="application/json">{"annotate": null, "base": "..", "features": ["navigation.instant", "navigation.tracking", "navigation.tabs", "navigation.sections", "navigation.expand", "navigation.indexes", "toc.integrate", "search.highlight", "search.share"], "search": "../assets/javascripts/workers/search.7a47a382.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
|
|
|
|
|
<script src="../assets/javascripts/bundle.e71a0d61.min.js"></script>
|
|
|
|
|
|
</body>
|
|
</html> |