forked from iarv/mc-webui
feat: Add auto-start for stopped containers in watchdog
- Added AUTO_START option (default: true) to automatically start stopped containers, not just restart unhealthy ones - Added handle_stopped_container() function - Updated documentation with new configuration option Set AUTO_START=false to disable automatic starting of stopped containers. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,12 @@
|
||||
# Container Watchdog
|
||||
|
||||
The Container Watchdog is a systemd service that monitors Docker containers and automatically restarts unhealthy ones. This is useful for ensuring reliability, especially on resource-constrained systems.
|
||||
The Container Watchdog is a systemd service that monitors Docker containers and automatically restarts unhealthy or stopped ones. This is useful for ensuring reliability, especially on resource-constrained systems.
|
||||
|
||||
## Features
|
||||
|
||||
- **Health monitoring** - Checks container status every 30 seconds
|
||||
- **Automatic restart** - Restarts containers that become unhealthy
|
||||
- **Auto-start stopped containers** - Starts containers that have stopped (configurable)
|
||||
- **Diagnostic logging** - Captures container logs before restart for troubleshooting
|
||||
- **HTTP status endpoint** - Query container status via HTTP API
|
||||
- **Restart history** - Tracks all automatic restarts with timestamps
|
||||
@@ -80,6 +81,7 @@ If you need to customize the behavior, the service supports these environment va
|
||||
| `CHECK_INTERVAL` | `30` | Seconds between health checks |
|
||||
| `LOG_FILE` | `/var/log/mc-webui-watchdog.log` | Path to log file |
|
||||
| `HTTP_PORT` | `5051` | HTTP status port (0 to disable) |
|
||||
| `AUTO_START` | `true` | Start stopped containers (set to `false` to disable) |
|
||||
|
||||
To modify defaults, create an override file:
|
||||
```bash
|
||||
@@ -90,6 +92,7 @@ Then add your overrides, for example:
|
||||
```ini
|
||||
[Service]
|
||||
Environment=CHECK_INTERVAL=60
|
||||
Environment=AUTO_START=false
|
||||
```
|
||||
|
||||
## Uninstall
|
||||
|
||||
@@ -97,6 +97,7 @@ Environment=MCWEBUI_DIR=${MCWEBUI_DIR}
|
||||
Environment=CHECK_INTERVAL=30
|
||||
Environment=LOG_FILE=${LOG_FILE}
|
||||
Environment=HTTP_PORT=5051
|
||||
Environment=AUTO_START=true
|
||||
ExecStart=/usr/bin/python3 -u ${SCRIPT_DIR}/watchdog.py
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
@@ -7,6 +7,7 @@ Designed to run as a systemd service on the host.
|
||||
|
||||
Features:
|
||||
- Monitors container health status
|
||||
- Automatically starts stopped containers (configurable)
|
||||
- Captures logs before restart for diagnostics
|
||||
- Logs all events to file
|
||||
- HTTP endpoint for status check
|
||||
@@ -16,6 +17,7 @@ Configuration via environment variables:
|
||||
- CHECK_INTERVAL: Seconds between checks (default: 30)
|
||||
- LOG_FILE: Path to log file (default: /var/log/mc-webui-watchdog.log)
|
||||
- HTTP_PORT: Port for status endpoint (default: 5051, 0 to disable)
|
||||
- AUTO_START: Start stopped containers (default: true, set to 'false' to disable)
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -33,6 +35,7 @@ MCWEBUI_DIR = os.environ.get('MCWEBUI_DIR', os.path.expanduser('~/mc-webui'))
|
||||
CHECK_INTERVAL = int(os.environ.get('CHECK_INTERVAL', '30'))
|
||||
LOG_FILE = os.environ.get('LOG_FILE', '/var/log/mc-webui-watchdog.log')
|
||||
HTTP_PORT = int(os.environ.get('HTTP_PORT', '5051'))
|
||||
AUTO_START = os.environ.get('AUTO_START', 'true').lower() != 'false'
|
||||
|
||||
# Containers to monitor
|
||||
CONTAINERS = ['meshcore-bridge', 'mc-webui']
|
||||
@@ -150,6 +153,45 @@ def restart_container(container_name: str) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def start_container(container_name: str) -> bool:
    """Issue a compose ``start`` for the named container.

    Args:
        container_name: Name passed to the compose ``start`` subcommand.

    Returns:
        True when ``run_compose_command`` reports success, False otherwise.
    """
    log(f"Starting container: {container_name}", 'WARN')

    # Allow up to 120 seconds for the compose command to complete.
    ok, _out, err = run_compose_command(['start', container_name], timeout=120)

    # Guard clause: report the failure and bail out early.
    if not ok:
        log(f"Failed to start {container_name}: {err}", 'ERROR')
        return False

    log(f"Container {container_name} started successfully")
    return True
|
||||
|
||||
|
||||
def handle_stopped_container(container_name: str, status: dict):
    """React to a container that is not running: log it, start it, record it.

    Args:
        container_name: Name of the container found in a non-running state.
        status: Status mapping for the container; its ``'status'`` entry is
            logged and the whole mapping is stored in the history record.
    """
    global restart_history

    log(f"Container {container_name} is stopped! Status: {status['status']}", 'WARN')

    # Attempt the start first so the history entry can carry the outcome.
    started = start_container(container_name)

    entry = {
        'timestamp': datetime.now().isoformat(),
        'container': container_name,
        'action': 'start',
        'status_before': status,
        'success': started,
    }
    restart_history.append(entry)

    # Bound the history to the 50 most recent entries.
    if len(restart_history) > 50:
        restart_history = restart_history[-50:]
|
||||
|
||||
|
||||
def handle_unhealthy_container(container_name: str, status: dict):
|
||||
"""Handle an unhealthy container - log details and restart."""
|
||||
global restart_history
|
||||
@@ -206,7 +248,10 @@ def check_containers():
|
||||
if not status['exists']:
|
||||
log(f"Container {container_name} not found", 'WARN')
|
||||
elif status['status'] != 'running':
|
||||
log(f"Container {container_name} is not running (status: {status['status']})", 'WARN')
|
||||
if AUTO_START:
|
||||
handle_stopped_container(container_name, status)
|
||||
else:
|
||||
log(f"Container {container_name} is not running (status: {status['status']}), AUTO_START disabled", 'WARN')
|
||||
elif status['health'] == 'unhealthy':
|
||||
handle_unhealthy_container(container_name, status)
|
||||
|
||||
@@ -273,6 +318,7 @@ def main():
|
||||
log(f" Check interval: {CHECK_INTERVAL}s")
|
||||
log(f" Log file: {LOG_FILE}")
|
||||
log(f" HTTP port: {HTTP_PORT if HTTP_PORT > 0 else 'disabled'}")
|
||||
log(f" Auto-start stopped containers: {AUTO_START}")
|
||||
log(f" Monitoring containers: {', '.join(CONTAINERS)}")
|
||||
log("=" * 60)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user