feat: Add auto-start for stopped containers in watchdog

- Added AUTO_START option (default: true) to automatically start stopped containers, not just restart unhealthy ones - Added handle_stopped_container() function - Updated documentation with new configuration option Set AUTO_START=false to disable automatic starting of stopped containers. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-28 17:42:45 +01:00 · 2026-01-31 14:05:51 +01:00
parent bb292b1a1d
commit aa788d7a0b
3 changed files with 52 additions and 2 deletions
--- a/scripts/watchdog/install.sh
+++ b/scripts/watchdog/install.sh
@@ -97,6 +97,7 @@ Environment=MCWEBUI_DIR=${MCWEBUI_DIR}
 Environment=CHECK_INTERVAL=30
 Environment=LOG_FILE=${LOG_FILE}
 Environment=HTTP_PORT=5051
+Environment=AUTO_START=true
 ExecStart=/usr/bin/python3 -u ${SCRIPT_DIR}/watchdog.py
 Restart=always
 RestartSec=10
--- a/scripts/watchdog/watchdog.py
+++ b/scripts/watchdog/watchdog.py
@@ -7,6 +7,7 @@ Designed to run as a systemd service on the host.

 Features:
 - Monitors container health status
+- Automatically starts stopped containers (configurable)
 - Captures logs before restart for diagnostics
 - Logs all events to file
 - HTTP endpoint for status check
@@ -16,6 +17,7 @@ Configuration via environment variables:
 - CHECK_INTERVAL: Seconds between checks (default: 30)
 - LOG_FILE: Path to log file (default: /var/log/mc-webui-watchdog.log)
 - HTTP_PORT: Port for status endpoint (default: 5051, 0 to disable)
+- AUTO_START: Start stopped containers (default: true, set to 'false' to disable)
 """

 import os
@@ -33,6 +35,7 @@ MCWEBUI_DIR = os.environ.get('MCWEBUI_DIR', os.path.expanduser('~/mc-webui'))
 CHECK_INTERVAL = int(os.environ.get('CHECK_INTERVAL', '30'))
 LOG_FILE = os.environ.get('LOG_FILE', '/var/log/mc-webui-watchdog.log')
 HTTP_PORT = int(os.environ.get('HTTP_PORT', '5051'))
+AUTO_START = os.environ.get('AUTO_START', 'true').lower() != 'false'

 # Containers to monitor
 CONTAINERS = ['meshcore-bridge', 'mc-webui']
@@ -150,6 +153,45 @@ def restart_container(container_name: str) -> bool:
        return False


+def start_container(container_name: str) -> bool:
+    """Start a stopped container using docker compose."""
+    log(f"Starting container: {container_name}", 'WARN')
+
+    success, stdout, stderr = run_compose_command([
+        'start', container_name
+    ], timeout=120)
+
+    if success:
+        log(f"Container {container_name} started successfully")
+        return True
+    else:
+        log(f"Failed to start {container_name}: {stderr}", 'ERROR')
+        return False
+
+
+def handle_stopped_container(container_name: str, status: dict):
+    """Handle a stopped container - log and start it."""
+    global restart_history
+
+    log(f"Container {container_name} is stopped! Status: {status['status']}", 'WARN')
+
+    # Start the container
+    start_success = start_container(container_name)
+
+    # Record in history
+    restart_history.append({
+        'timestamp': datetime.now().isoformat(),
+        'container': container_name,
+        'action': 'start',
+        'status_before': status,
+        'success': start_success
+    })
+
+    # Keep only last 50 entries
+    if len(restart_history) > 50:
+        restart_history = restart_history[-50:]
+
+
 def handle_unhealthy_container(container_name: str, status: dict):
    """Handle an unhealthy container - log details and restart."""
    global restart_history
@@ -206,7 +248,10 @@ def check_containers():
        if not status['exists']:
            log(f"Container {container_name} not found", 'WARN')
        elif status['status'] != 'running':
-            log(f"Container {container_name} is not running (status: {status['status']})", 'WARN')
+            if AUTO_START:
+                handle_stopped_container(container_name, status)
+            else:
+                log(f"Container {container_name} is not running (status: {status['status']}), AUTO_START disabled", 'WARN')
        elif status['health'] == 'unhealthy':
            handle_unhealthy_container(container_name, status)

@@ -273,6 +318,7 @@ def main():
    log(f"  Check interval: {CHECK_INTERVAL}s")
    log(f"  Log file: {LOG_FILE}")
    log(f"  HTTP port: {HTTP_PORT if HTTP_PORT > 0 else 'disabled'}")
+    log(f"  Auto-start stopped containers: {AUTO_START}")
    log(f"  Monitoring containers: {', '.join(CONTAINERS)}")
    log("=" * 60)