Option B: stop unused services; add restricted SSH scripts
Some checks failed
Deploy / deploy (push) Failing after 7s

This commit is contained in:
Jeremie Fraeys 2026-01-20 12:18:09 -05:00
parent 4cd7e72e2b
commit 0291800ef5
No known key found for this signature in database
14 changed files with 350 additions and 7 deletions

View file

@ -49,7 +49,8 @@ jobs:
set -euo pipefail
APP_NAME="${{ github.event.repository.name }}"
echo "Registering app $APP_NAME with infra-controller..."
test -f .infra.toml
ssh -i ~/.ssh/id_ed25519 "$SERVICE_USER@$SERVICE_HOST" \
"cat > /var/run/active-apps/$APP_NAME.toml.tmp && mv /var/run/active-apps/$APP_NAME.toml.tmp /var/run/active-apps/$APP_NAME.toml" \
< .infra.toml
if [[ -f .infra.toml ]]; then
ssh -i ~/.ssh/id_ed25519 "$SERVICE_USER@$SERVICE_HOST" infra-register-stdin "$APP_NAME" < .infra.toml
else
ssh -i ~/.ssh/id_ed25519 "$SERVICE_USER@$SERVICE_HOST" infra-deregister "$APP_NAME"
fi

View file

@ -0,0 +1,11 @@
# Reusable workflow: it has no triggers of its own and only runs when another
# workflow invokes it through `workflow_call`.
name: Reusable
on:
  workflow_call:
jobs:
  test:
    # Executes on the runner labelled `docker`.
    runs-on: docker
    steps:
      # Single smoke-test step that prints a greeting.
      - name: Hello
        run: echo "Hello from reusable"

15
.gitignore vendored Normal file
View file

@ -0,0 +1,15 @@
__pycache__/
*.py[cod]
*$py.class
.pytest_cache/
.ruff_cache/
*.egg-info/
.eggs/
build/
dist/
.venv/
venv/

1
.python-version Normal file
View file

@ -0,0 +1 @@
3.11

View file

@ -34,6 +34,8 @@ To avoid running a daemon or polling timer, you can trigger a one-shot run whene
- enable path trigger: `sudo systemctl enable --now infra-controller.path`
- view logs: `journalctl -u infra-controller-once.service -f`
Services that are no longer required are stopped after `grace_period_minutes` (see config) using `docker compose down`.
## Remote app registration
Run `infra-controller` on the service server. When you deploy, create/update a registration file in `/var/run/active-apps/` (this triggers the path unit).
@ -51,3 +53,15 @@ ssh infractl@service-host \
"cat > /var/run/active-apps/$APP_NAME.toml.tmp && mv /var/run/active-apps/$APP_NAME.toml.tmp /var/run/active-apps/$APP_NAME.toml" \
< .infra.toml
```
## Restricted SSH keys (recommended)
If you want to avoid giving CI a general shell on the services server, install the helper scripts to `/usr/local/sbin` (see `install.sh`) and restrict the runner key in `authorized_keys`.
Example (services server, `~infractl/.ssh/authorized_keys`):
```text
command="/usr/local/sbin/infra-register-stdin",no-pty,no-agent-forwarding,no-port-forwarding,no-X11-forwarding ssh-ed25519 AAAA... runner
```
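For deregistration, use a separate key restricted to `/usr/local/sbin/infra-deregister`. Note that with a forced `command=`, sshd runs that command without the arguments the client sent; the client's original command line (including the app name) is only available to the script through the `SSH_ORIGINAL_COMMAND` environment variable.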

View file

@ -27,6 +27,13 @@ sudo python3 -m venv /opt/infra-controller/venv
sudo /opt/infra-controller/venv/bin/pip install --upgrade pip
sudo /opt/infra-controller/venv/bin/pip install -e .
echo "Installing helper scripts..."
sudo install -d /usr/local/sbin
sudo install -m 0755 scripts/deploy-app /usr/local/sbin/deploy-app
sudo install -m 0755 scripts/sync-infra /usr/local/sbin/sync-infra
sudo install -m 0755 scripts/infra-register-stdin /usr/local/sbin/infra-register-stdin
sudo install -m 0755 scripts/infra-deregister /usr/local/sbin/infra-deregister
if [ ! -f /etc/infra-controller/config.toml ]; then
echo "Installing default configuration..."
sudo cp config/controller.toml.example /etc/infra-controller/config.toml

View file

@ -37,6 +37,7 @@ infra-controller = "infra_controller.__main__:main"
infra-register = "infra_controller.cli:register"
infra-deregister = "infra_controller.cli:deregister"
infra-status = "infra_controller.cli:status"
infra-ensure = "infra_controller.cli:ensure_service_cli"
[tool.setuptools]
package-dir = {"" = "src"}
@ -46,3 +47,6 @@ where = ["src"]
[tool.ruff]
line-length = 100
[tool.pytest.ini_options]
asyncio_default_fixture_loop_scope = "function"

43
scripts/deploy-app Normal file
View file

@ -0,0 +1,43 @@
#!/usr/bin/env bash
# deploy-app: check out a specific commit of an already-cloned app under
# /srv/apps/<app_name>, run its deploy.sh, then hand over to sync-infra.
#
# Usage: deploy-app <app_name> <git_ref>
#   app_name  letters, digits, '.', '_' and '-' only; "." and ".." are rejected
#   git_ref   hexadecimal commit id, 7 to 40 digits
#
# Exit status: 2 for usage/validation errors, 1 when the repository or an
# executable deploy.sh is missing. Any other failing command aborts the
# script because of `set -e`.
set -euo pipefail

APP_NAME="${1:-}"
GIT_REF="${2:-}"

if [[ -z "$APP_NAME" || -z "$GIT_REF" ]]; then
  echo "usage: deploy-app <app_name> <git_ref>" >&2
  exit 2
fi

# "." and ".." satisfy the character class but would make APP_DIR resolve to
# /srv/apps itself or to /srv, so they are rejected explicitly.
if ! [[ "$APP_NAME" =~ ^[A-Za-z0-9._-]+$ ]] || [[ "$APP_NAME" == "." || "$APP_NAME" == ".." ]]; then
  echo "invalid app name: $APP_NAME" >&2
  exit 2
fi

# Only hexadecimal commit ids are accepted, so the ref can never be mistaken
# for a command-line option by git.
if ! [[ "$GIT_REF" =~ ^[0-9a-fA-F]{7,40}$ ]]; then
  echo "invalid git ref: $GIT_REF" >&2
  exit 2
fi

APP_DIR="/srv/apps/$APP_NAME"

if [[ ! -d "$APP_DIR/.git" ]]; then
  echo "app repo not present at $APP_DIR; clone it first (or extend deploy-app to clone)" >&2
  exit 1
fi

cd "$APP_DIR"

git fetch --all --prune
# -f discards local modifications in the working tree.
git checkout -f "$GIT_REF"
git submodule update --init --recursive

# deploy.sh is checked after the checkout so the version belonging to the
# requested commit is the one that runs.
if [[ -x "./deploy.sh" ]]; then
  ./deploy.sh
else
  echo "ERROR: deploy.sh missing or not executable" >&2
  exit 1
fi

/usr/local/sbin/sync-infra "$APP_NAME" "$APP_DIR"

16
scripts/infra-deregister Normal file
View file

@ -0,0 +1,16 @@
#!/usr/bin/env bash
# infra-deregister: remove an app's registration files (.toml, .yml, .yaml)
# from /var/run/active-apps.
#
# Usage: infra-deregister <app_name>
#
# The app name is normally the first argument. When the script is installed as
# an SSH forced command (command="..." in authorized_keys), sshd starts it
# without arguments and puts the command line requested by the client into
# SSH_ORIGINAL_COMMAND, so the name is read from there instead.
#
# Exit status: 2 for a missing or invalid app name.
set -euo pipefail

APP_NAME="${1:-}"

if [[ -z "$APP_NAME" && -n "${SSH_ORIGINAL_COMMAND:-}" ]]; then
  # Plain word splitting only: nothing sent by the client is evaluated.
  read -r -a requested <<< "$SSH_ORIGINAL_COMMAND"
  # Drop a leading script name, e.g. "infra-deregister myapp" -> "myapp".
  if [[ "${requested[0]:-}" == "infra-deregister" || "${requested[0]:-}" == */infra-deregister ]]; then
    requested=("${requested[@]:1}")
  fi
  # Exactly one word must remain; otherwise fall through to the usage error.
  if [[ ${#requested[@]} -eq 1 ]]; then
    APP_NAME="${requested[0]}"
  fi
fi

if [[ -z "$APP_NAME" ]]; then
  echo "usage: infra-deregister <app_name>" >&2
  exit 2
fi

# The character class excludes "/" so the name cannot leave the directory.
if ! [[ "$APP_NAME" =~ ^[A-Za-z0-9._-]+$ ]]; then
  echo "invalid app name: $APP_NAME" >&2
  exit 2
fi

# -f: a registration that is already absent is not an error.
rm -f "/var/run/active-apps/$APP_NAME.toml" "/var/run/active-apps/$APP_NAME.yml" "/var/run/active-apps/$APP_NAME.yaml"

View file

@ -0,0 +1,24 @@
#!/usr/bin/env bash
# infra-register-stdin: write the registration read from stdin to
# /var/run/active-apps/<app_name>.toml, replacing any previous file via a
# rename so readers never see a half-written registration.
#
# Usage: infra-register-stdin <app_name> < registration.toml
#
# The app name is normally the first argument. When the script is installed as
# an SSH forced command (command="..." in authorized_keys), sshd starts it
# without arguments and puts the command line requested by the client into
# SSH_ORIGINAL_COMMAND, so the name is read from there instead.
#
# Exit status: 2 for a missing or invalid app name.
set -euo pipefail

APP_NAME="${1:-}"

if [[ -z "$APP_NAME" && -n "${SSH_ORIGINAL_COMMAND:-}" ]]; then
  # Plain word splitting only: nothing sent by the client is evaluated.
  read -r -a requested <<< "$SSH_ORIGINAL_COMMAND"
  # Drop a leading script name, e.g. "infra-register-stdin myapp" -> "myapp".
  if [[ "${requested[0]:-}" == "infra-register-stdin" || "${requested[0]:-}" == */infra-register-stdin ]]; then
    requested=("${requested[@]:1}")
  fi
  # Exactly one word must remain; otherwise fall through to the usage error.
  if [[ ${#requested[@]} -eq 1 ]]; then
    APP_NAME="${requested[0]}"
  fi
fi

if [[ -z "$APP_NAME" ]]; then
  echo "usage: infra-register-stdin <app_name>" >&2
  exit 2
fi

# The character class excludes "/" so the name cannot leave the directory.
if ! [[ "$APP_NAME" =~ ^[A-Za-z0-9._-]+$ ]]; then
  echo "invalid app name: $APP_NAME" >&2
  exit 2
fi

DST_DIR="/var/run/active-apps"
DST="$DST_DIR/$APP_NAME.toml"
# DST already ends in ".toml", so the temp file is "<app_name>.toml.tmp".
TMP="$DST.tmp"

mkdir -p "$DST_DIR"

# Remove the partial temp file if reading stdin or the rename fails.
trap 'rm -f "$TMP"' EXIT
cat > "$TMP"
mv "$TMP" "$DST"
trap - EXIT

23
scripts/sync-infra Normal file
View file

@ -0,0 +1,23 @@
#!/usr/bin/env bash
# sync-infra: mirror an app's infra registration to the services server.
# If <app_dir>/.infra.toml exists it is sent to infra-register-stdin on the
# remote side; otherwise the app is deregistered with infra-deregister.
#
# Usage: sync-infra <app_name> <app_dir>
#
# Environment:
#   INFRA_SSH_TARGET  ssh destination (default: infra@services-server)
#
# Exit status: 2 for usage/validation errors; otherwise the status of ssh.
set -euo pipefail

APP_NAME="${1:-}"
APP_DIR="${2:-}"

if [[ -z "$APP_NAME" || -z "$APP_DIR" ]]; then
  echo "usage: sync-infra <app_name> <app_dir>" >&2
  exit 2
fi

# The name is passed to the remote command line, so restrict it to characters
# that are inert in a shell.
if ! [[ "$APP_NAME" =~ ^[A-Za-z0-9._-]+$ ]]; then
  echo "invalid app name: $APP_NAME" >&2
  exit 2
fi

# Defaults to the previously hard-coded destination.
SSH_TARGET="${INFRA_SSH_TARGET:-infra@services-server}"
INFRA_FILE="$APP_DIR/.infra.toml"

if [[ -f "$INFRA_FILE" ]]; then
  ssh "$SSH_TARGET" infra-register-stdin "$APP_NAME" < "$INFRA_FILE"
else
  # -n: deregistration sends no payload, so ssh must not consume our stdin.
  ssh -n "$SSH_TARGET" infra-deregister "$APP_NAME"
fi

View file

@ -1,7 +1,10 @@
from __future__ import annotations
import json
import logging
import time
from pathlib import Path
from infra_controller.config import ControllerConfig
from infra_controller.discovery import AppRegistration, DiscoveryManager
from infra_controller.service_manager import ServiceManager
@ -10,10 +13,15 @@ from infra_controller.service_manager import ServiceManager
logger = logging.getLogger(__name__)
class InfraController:
def __init__(self, cfg: ControllerConfig):
def __init__(
self,
cfg: ControllerConfig,
discovery: DiscoveryManager | None = None,
services: ServiceManager | None = None,
):
self._cfg = cfg
self._discovery = DiscoveryManager(cfg.discovery)
self._services = ServiceManager(cfg.docker)
self._discovery = discovery or DiscoveryManager(cfg.discovery)
self._services = services or ServiceManager(cfg.docker)
def run(self) -> None:
while True:
@ -23,10 +31,55 @@ class InfraController:
def run_once(self) -> None:
    """Reconcile running services with what the discovered apps require.

    Every required service is ensured. A service that was known from an
    earlier run but is no longer required first gets a grace period of
    ``grace_period_minutes`` (fractions of a minute are honoured) and is
    stopped on a later run once that period has elapsed.

    The bookkeeping (``unused_since`` timestamps and ``known_services``) is
    read from the configured state file and is always written back, even
    when ensuring or stopping a service fails, so grace periods recorded
    during a failing run are not lost.

    Raises:
        RuntimeError: if stopping a service returns a non-zero exit code.
        Exceptions raised by ``ensure_service`` propagate unchanged.
    """
    discovered = self._discovery.discover_all()
    required = self._required_services(discovered)

    state_file = self._cfg.services.state_file
    state = self._load_state(state_file)

    # Tolerate a state file with missing or wrongly typed entries.
    unused_since = state.get("unused_since")
    if not isinstance(unused_since, dict):
        unused_since = {}

    known_services_val = state.get("known_services")
    if isinstance(known_services_val, list):
        known_services = {s for s in known_services_val if isinstance(s, str) and s.strip()}
    else:
        known_services = set()

    now = time.time()

    try:
        for service in sorted(required):
            logger.info("Ensuring service: %s", service)
            self.ensure_service(service)
            # A required service is in use again: clear any pending grace period.
            unused_since.pop(service, None)
            known_services.add(service)

        # Anything with a pending grace period is also a stop candidate.
        known_services |= set(unused_since)

        # float() rather than int() so that e.g. 0.5 minutes is not truncated to 0.
        grace_seconds = float(self._cfg.services.grace_period_minutes) * 60

        for service in sorted(known_services - set(required)):
            since = unused_since.get(service)
            if since is None:
                unused_since[service] = now
                logger.info("Service no longer required (grace period started): %s", service)
                continue

            try:
                since_ts = float(since)
            except (TypeError, ValueError, OverflowError):
                # Unusable timestamp in the state file: restart the grace period.
                logger.warning("Invalid unused_since value for %s; restarting grace period", service)
                unused_since[service] = now
                continue

            if (now - since_ts) < grace_seconds:
                continue

            logger.info("Stopping unused service: %s", service)
            res = self._services.stop_service(service)
            if res.returncode != 0:
                # The service stays in the state, so the next run retries the stop.
                raise RuntimeError(res.stderr or res.stdout)

            unused_since.pop(service, None)
            known_services.discard(service)
    finally:
        # Persist whatever was decided so far, including on failure.
        state["unused_since"] = unused_since
        state["known_services"] = sorted(known_services)
        self._save_state(state_file, state)
def ensure_service(self, service_name: str) -> None:
res = self._services.apply_service(service_name)
@ -50,3 +103,22 @@ class InfraController:
required.add(services.strip())
return required
def _load_state(self, path: Path) -> dict:
try:
if not path.exists():
return {}
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
if isinstance(data, dict):
return data
return {}
except Exception:
return {}
def _save_state(self, path: Path, state: dict) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
tmp = path.with_suffix(path.suffix + ".tmp")
with open(tmp, "w", encoding="utf-8") as f:
json.dump(state, f)
tmp.replace(path)

View file

@ -64,3 +64,21 @@ class ServiceManager:
proc = subprocess.run(cmd, capture_output=True, text=True, cwd=str(service_dir))
return ServiceResult(returncode=proc.returncode, stdout=proc.stdout, stderr=proc.stderr)
def stop_service(self, service_name: str) -> ServiceResult:
    """Run ``docker compose down`` for the compose file of *service_name*.

    The command is executed with the service directory as working
    directory and its output is captured as text.

    Returns:
        A ``ServiceResult`` carrying the exit code, stdout and stderr of
        the ``docker compose`` invocation.

    Raises:
        FileNotFoundError: if the service directory does not exist.
    """
    service_dir = self.service_dir_for_service(service_name)
    if not service_dir.exists():
        raise FileNotFoundError(f"Service directory not found: {service_dir}")

    compose_path = str(self._resolve_compose_file(service_dir))
    completed = subprocess.run(
        ["docker", "compose", "-f", compose_path, "down"],
        capture_output=True,
        text=True,
        cwd=str(service_dir),
    )
    return ServiceResult(
        returncode=completed.returncode,
        stdout=completed.stdout,
        stderr=completed.stderr,
    )

94
tests/test_controller.py Normal file
View file

@ -0,0 +1,94 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
import pytest
from infra_controller.config import ControllerConfig
from infra_controller.controller import InfraController
from infra_controller.discovery import AppRegistration, InfraMetadata
@dataclass
class FakeServiceResult:
    """Test double for a service command result; the defaults describe success."""

    # Exit code of the simulated command (0 means success).
    returncode: int = 0
    # Captured standard output of the simulated command.
    stdout: str = ""
    # Captured standard error of the simulated command.
    stderr: str = ""
class FakeServiceManager:
    """Test double that records apply/stop requests and always reports success."""

    def __init__(self):
        # Service names in the order they were applied / stopped.
        self.applied: list[str] = []
        self.stopped: list[str] = []

    def apply_service(self, service_name: str) -> FakeServiceResult:
        """Record that *service_name* was applied and return a successful result."""
        outcome = FakeServiceResult()
        self.applied.append(service_name)
        return outcome

    def stop_service(self, service_name: str) -> FakeServiceResult:
        """Record that *service_name* was stopped and return a successful result."""
        outcome = FakeServiceResult()
        self.stopped.append(service_name)
        return outcome
class FakeDiscoveryManager:
    """Test double that serves a caller-supplied set of app registrations."""

    def __init__(self, apps: dict[str, AppRegistration]):
        self._apps = apps

    def set_apps(self, apps: dict[str, AppRegistration]) -> None:
        """Replace the registrations returned by later discovery calls."""
        self._apps = apps

    def discover_all(self) -> dict[str, AppRegistration]:
        """Return a shallow copy of the current registrations."""
        snapshot = dict(self._apps)
        return snapshot
def _app(name: str, services: list[str]) -> AppRegistration:
    """Build an ``AppRegistration`` for project *name* that requires *services*."""
    return AppRegistration(
        name=name,
        metadata=InfraMetadata(project=name, requires={"services": services}),
        last_seen=datetime.now(),
        discovery_method="test",
    )
def test_controller_stops_unused_services_after_grace_period(tmp_path: Path, monkeypatch):
    """With a zero-minute grace period, a service is stopped on the run after it is first seen unused."""
    cfg = ControllerConfig()
    cfg.services.grace_period_minutes = 0
    cfg.services.state_file = tmp_path / "state.json"

    discovery = FakeDiscoveryManager({"a": _app("a", ["svc1"])})
    services = FakeServiceManager()
    controller = InfraController(cfg, discovery=discovery, services=services)

    def run_at(timestamp: float) -> None:
        # Freeze the controller's clock at *timestamp* for one reconciliation.
        monkeypatch.setattr("infra_controller.controller.time.time", lambda: timestamp)
        controller.run_once()

    # Required service is applied, nothing is stopped.
    run_at(0.0)
    assert services.applied == ["svc1"]
    assert services.stopped == []

    # First run without the app only starts the grace period.
    discovery.set_apps({})
    run_at(10.0)
    assert services.stopped == []

    # The following run finds the grace period elapsed and stops the service.
    run_at(20.0)
    assert services.stopped == ["svc1"]
def test_controller_does_not_stop_service_within_grace_period(tmp_path: Path, monkeypatch):
    """A service unused for less than the one-minute grace period must keep running."""
    cfg = ControllerConfig()
    cfg.services.grace_period_minutes = 1
    cfg.services.state_file = tmp_path / "state.json"

    discovery = FakeDiscoveryManager({"a": _app("a", ["svc1"])})
    services = FakeServiceManager()
    controller = InfraController(cfg, discovery=discovery, services=services)

    def run_at(timestamp: float) -> None:
        # Freeze the controller's clock at *timestamp* for one reconciliation.
        monkeypatch.setattr("infra_controller.controller.time.time", lambda: timestamp)
        controller.run_once()

    run_at(0.0)

    # The app disappears; 10 s and 20 s later the 60 s grace period is still running.
    discovery.set_apps({})
    run_at(10.0)
    run_at(20.0)

    assert services.stopped == []
assert services.stopped == []