1
0
forked from iarv/mc-webui

feat: Add auto-start for stopped containers in watchdog

- Added AUTO_START option (default: true) to automatically start
  stopped containers, not just restart unhealthy ones
- Added handle_stopped_container() function
- Updated documentation with new configuration option

Set AUTO_START=false to disable automatic starting of stopped containers.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
MarekWo
2026-01-31 14:05:51 +01:00
parent bb292b1a1d
commit aa788d7a0b
3 changed files with 52 additions and 2 deletions

View File

@@ -1,11 +1,12 @@
# Container Watchdog
The Container Watchdog is a systemd service that monitors Docker containers and automatically restarts unhealthy ones. This is useful for ensuring reliability, especially on resource-constrained systems.
The Container Watchdog is a systemd service that monitors Docker containers and automatically restarts unhealthy or stopped ones. This is useful for ensuring reliability, especially on resource-constrained systems.
## Features
- **Health monitoring** - Checks container status every 30 seconds by default (configurable via `CHECK_INTERVAL`)
- **Automatic restart** - Restarts containers that become unhealthy
- **Auto-start stopped containers** - Starts containers that have stopped (configurable)
- **Diagnostic logging** - Captures container logs before restart for troubleshooting
- **HTTP status endpoint** - Query container status via HTTP API
- **Restart history** - Tracks all automatic restarts with timestamps
@@ -80,6 +81,7 @@ If you need to customize the behavior, the service supports these environment va
| `CHECK_INTERVAL` | `30` | Seconds between health checks |
| `LOG_FILE` | `/var/log/mc-webui-watchdog.log` | Path to log file |
| `HTTP_PORT` | `5051` | HTTP status port (0 to disable) |
| `AUTO_START` | `true` | Start stopped containers (set to `false` to disable) |
To modify defaults, create an override file:
```bash
@@ -90,6 +92,7 @@ Then add your overrides, for example:
```ini
[Service]
Environment=CHECK_INTERVAL=60
Environment=AUTO_START=false
```
## Uninstall

View File

@@ -97,6 +97,7 @@ Environment=MCWEBUI_DIR=${MCWEBUI_DIR}
Environment=CHECK_INTERVAL=30
Environment=LOG_FILE=${LOG_FILE}
Environment=HTTP_PORT=5051
Environment=AUTO_START=true
ExecStart=/usr/bin/python3 -u ${SCRIPT_DIR}/watchdog.py
Restart=always
RestartSec=10

View File

@@ -7,6 +7,7 @@ Designed to run as a systemd service on the host.
Features:
- Monitors container health status
- Automatically starts stopped containers (configurable)
- Captures logs before restart for diagnostics
- Logs all events to file
- HTTP endpoint for status check
@@ -16,6 +17,7 @@ Configuration via environment variables:
- CHECK_INTERVAL: Seconds between checks (default: 30)
- LOG_FILE: Path to log file (default: /var/log/mc-webui-watchdog.log)
- HTTP_PORT: Port for status endpoint (default: 5051, 0 to disable)
- AUTO_START: Start stopped containers (default: true, set to 'false' to disable)
"""
import os
@@ -33,6 +35,7 @@ MCWEBUI_DIR = os.environ.get('MCWEBUI_DIR', os.path.expanduser('~/mc-webui'))
# Seconds to wait between two consecutive container checks.
CHECK_INTERVAL = int(os.environ.get('CHECK_INTERVAL', '30'))
# File that receives the watchdog's log output.
LOG_FILE = os.environ.get('LOG_FILE', '/var/log/mc-webui-watchdog.log')
# Port for the HTTP status endpoint; 0 disables it.
HTTP_PORT = int(os.environ.get('HTTP_PORT', '5051'))
# Whether stopped containers are started automatically. Enabled unless the
# variable is set to a recognised "off" value. Surrounding whitespace and
# letter case are ignored so that values such as "False", " false " or "0"
# do not silently leave auto-start enabled.
AUTO_START = (
    os.environ.get('AUTO_START', 'true').strip().lower()
    not in ('false', '0', 'no', 'off')
)
# Containers to monitor
CONTAINERS = ['meshcore-bridge', 'mc-webui']
@@ -150,6 +153,45 @@ def restart_container(container_name: str) -> bool:
return False
def start_container(container_name: str) -> bool:
    """Issue a compose ``start`` for a stopped container and log the outcome.

    Args:
        container_name: Name passed to the compose ``start`` command.

    Returns:
        True if ``run_compose_command`` reported success, otherwise False.
    """
    log(f"Starting container: {container_name}", 'WARN')

    compose_args = ['start', container_name]
    ok, _out, err = run_compose_command(compose_args, timeout=120)

    # Failure path first: report the captured stderr and bail out.
    if not ok:
        log(f"Failed to start {container_name}: {err}", 'ERROR')
        return False

    log(f"Container {container_name} started successfully")
    return True
def handle_stopped_container(container_name: str, status: dict):
    """React to a container that is not running: log it, start it, record it.

    Args:
        container_name: Name of the container that was found stopped.
        status: Status mapping for the container. Its ``'status'`` entry is
            logged, and the whole mapping is stored in the history record.
    """
    global restart_history

    log(f"Container {container_name} is stopped! Status: {status['status']}", 'WARN')

    # Attempt the start first so the record reflects the real outcome.
    started = start_container(container_name)

    entry = {
        'timestamp': datetime.now().isoformat(),
        'container': container_name,
        'action': 'start',
        'status_before': status,
        'success': started,
    }
    restart_history.append(entry)

    # Cap the history at the 50 most recent entries.
    overflow = len(restart_history) - 50
    if overflow > 0:
        restart_history = restart_history[overflow:]
def handle_unhealthy_container(container_name: str, status: dict):
"""Handle an unhealthy container - log details and restart."""
global restart_history
@@ -206,7 +248,10 @@ def check_containers():
if not status['exists']:
log(f"Container {container_name} not found", 'WARN')
elif status['status'] != 'running':
log(f"Container {container_name} is not running (status: {status['status']})", 'WARN')
if AUTO_START:
handle_stopped_container(container_name, status)
else:
log(f"Container {container_name} is not running (status: {status['status']}), AUTO_START disabled", 'WARN')
elif status['health'] == 'unhealthy':
handle_unhealthy_container(container_name, status)
@@ -273,6 +318,7 @@ def main():
log(f" Check interval: {CHECK_INTERVAL}s")
log(f" Log file: {LOG_FILE}")
log(f" HTTP port: {HTTP_PORT if HTTP_PORT > 0 else 'disabled'}")
log(f" Auto-start stopped containers: {AUTO_START}")
log(f" Monitoring containers: {', '.join(CONTAINERS)}")
log("=" * 60)