feat(watchdog): Hardware USB bus reset for stuck LoRa devices

Implement a smart auto-detection and low-level fcntl ioctl reset mechanism for LoRa USB devices. This 'last resort' recovery is triggered if the meshcore-bridge container fails to recover after 3 restarts within an 8-minute window. Includes updates to the installer, systemd service, and newly added README.

Co-Authored-By: Gemini CLI <noreply@google.com>
This commit is contained in:
MarekWo
2026-02-22 20:15:27 +00:00
parent f1e5f39a4e
commit 2f82c589c7
5 changed files with 136 additions and 0 deletions
+2
View File
@@ -7,6 +7,7 @@ The Container Watchdog is a systemd service that monitors Docker containers and
- **Health monitoring** - Checks container status every 30 seconds
- **Automatic restart** - Restarts containers that become unhealthy
- **Auto-start stopped containers** - Starts containers that have stopped (configurable)
- **Hardware USB reset** - Performs a low-level USB bus reset if the LoRa device freezes (detected after 3 failed container restarts within 8 minutes)
- **Diagnostic logging** - Captures container logs before restart for troubleshooting
- **HTTP status endpoint** - Query container status via HTTP API
- **Restart history** - Tracks all automatic restarts with timestamps
@@ -82,6 +83,7 @@ If you need to customize the behavior, the service supports these environment va
| `LOG_FILE` | `/var/log/mc-webui-watchdog.log` | Path to log file |
| `HTTP_PORT` | `5051` | HTTP status port (0 to disable) |
| `AUTO_START` | `true` | Start stopped containers (set to `false` to disable) |
| `USB_DEVICE_PATH` | *(auto-detected)* | USB bus device node of the LoRa device (e.g., `/dev/bus/usb/001/002`, not the `/dev/tty*` serial port) used for the hardware USB bus reset; leave empty to auto-detect |
To modify defaults, create an override file:
```bash
+21
View File
@@ -0,0 +1,21 @@
# mc-webui Container Watchdog
The `watchdog` service is a utility designed to run on the host machine running the Docker containers for the `mc-webui` project. Its primary purpose is to continuously monitor the health of the application's containers, specifically the `meshcore-bridge` container, which handles the physical connection to the LoRa device (like Heltec V3 or V4).
## Key Capabilities
- **Automated Restarts:** If a container becomes `unhealthy` or crashes, the watchdog automatically restarts it to restore service without human intervention.
- **Hardware USB Bus Reset:** If the `meshcore-bridge` container still fails to recover after three restarts within 8 minutes (e.g., due to a hardware freeze on the LoRa device itself), the watchdog performs a low-level USB bus reset as a last resort. This simulates physically unplugging and re-plugging the device and is intended to clear hardware lockups that a container restart alone cannot fix.
## Installation / Update
You can easily install or update the watchdog by running the provided installer script with root privileges:
```bash
cd ~/mc-webui/scripts/watchdog
sudo ./install.sh
```
## Detailed Documentation
For full details on configuration, logs, troubleshooting, and more advanced features, please refer to the main [Container Watchdog Documentation](../../docs/watchdog.md) located in the `docs` folder.
+2
View File
@@ -98,6 +98,7 @@ Environment=CHECK_INTERVAL=30
Environment=LOG_FILE=${LOG_FILE}
Environment=HTTP_PORT=5051
Environment=AUTO_START=true
Environment=USB_DEVICE_PATH=${USB_DEVICE_PATH}
ExecStart=/usr/bin/python3 -u ${SCRIPT_DIR}/watchdog.py
Restart=always
RestartSec=10
@@ -144,6 +145,7 @@ echo "Features:"
echo " - Checks container health every 30 seconds"
echo " - Automatically restarts unhealthy containers"
echo " - Saves diagnostic logs before restart"
echo " - Performs hardware USB bus reset if LoRa device is stuck"
echo ""
echo "Useful commands:"
echo " systemctl status $SERVICE_NAME # Check service status"
@@ -11,6 +11,7 @@ Environment=MCWEBUI_DIR=/home/marek/mc-webui
Environment=CHECK_INTERVAL=30
Environment=LOG_FILE=/var/log/mc-webui-watchdog.log
Environment=HTTP_PORT=5051
Environment=USB_DEVICE_PATH=
ExecStart=/usr/bin/python3 -u /home/marek/mc-webui/scripts/watchdog/watchdog.py
Restart=always
RestartSec=10
+110
View File
@@ -26,6 +26,7 @@ import json
import subprocess
import threading
import time
import fcntl
from datetime import datetime
from http.server import HTTPServer, BaseHTTPRequestHandler
from pathlib import Path
@@ -59,6 +60,107 @@ def log(message: str, level: str = 'INFO'):
print(f"[{timestamp}] [ERROR] Failed to write to log file: {e}")
# ioctl request number for a USB device reset on Linux (usbdevfs).
# Encoded as _IO('U', 20): type byte 'U' in bits 8-15, command number 20 in bits 0-7.
USBDEVFS_RESET = (ord('U') << 8) | 20  # == 0x5514 == 21780
def _find_usb_device_dir(start_dir: str) -> "str | None":
    """Return the nearest directory at or above *start_dir* that has busnum and devnum files."""
    current = start_dir
    while True:
        has_busnum = os.path.isfile(os.path.join(current, "busnum"))
        has_devnum = os.path.isfile(os.path.join(current, "devnum"))
        if has_busnum and has_devnum:
            return current
        parent = os.path.dirname(current)
        if parent == current:
            # Reached the filesystem root without finding a USB device node
            return None
        current = parent


def auto_detect_usb_device() -> "str | None":
    """Attempt to auto-detect the physical USB device path (e.g., /dev/bus/usb/001/002) for LoRa.

    Resolution steps:
      1. Read MC_SERIAL_PORT from the .env file in MCWEBUI_DIR (defaults to 'auto').
      2. If the value is 'auto', use the single entry in /dev/serial/by-id
         (zero or several entries make detection fail).
      3. Resolve the serial port to its tty name and look the tty up in sysfs.
      4. Walk up the sysfs device tree to the first ancestor exposing busnum
         and devnum, and build the /dev/bus/usb/BBB/DDD path from them.

    Returns:
        The USB bus device node path, or None if it could not be determined.
        Every failure is logged; no exception is raised for lookup failures.
    """
    env_file = os.path.join(MCWEBUI_DIR, '.env')
    serial_port = 'auto'

    if os.path.exists(env_file):
        try:
            with open(env_file, 'r') as f:
                for line in f:
                    if line.startswith('MC_SERIAL_PORT='):
                        serial_port = line.split('=', 1)[1].strip().strip('"\'')
                        break
        except Exception as e:
            log(f"Failed to read .env file for serial port: {e}", "WARN")

    if serial_port.lower() == 'auto':
        by_id_path = Path('/dev/serial/by-id')
        if not by_id_path.exists():
            log("/dev/serial/by-id does not exist", "WARN")
            return None
        devices = list(by_id_path.iterdir())
        if len(devices) > 1:
            log("Multiple serial devices found, cannot auto-detect USB device for reset", "WARN")
            return None
        if not devices:
            log("No serial devices found in /dev/serial/by-id", "WARN")
            return None
        serial_port = str(devices[0])

    if not serial_port or not os.path.exists(serial_port):
        log(f"Serial port {serial_port} not found", "WARN")
        return None

    try:
        # Resolve symlink to get actual tty device (e.g., /dev/ttyACM0)
        real_tty = os.path.realpath(serial_port)
        tty_name = os.path.basename(real_tty)

        # Find USB bus and dev number via sysfs
        sysfs_path = f"/sys/class/tty/{tty_name}/device"
        if not os.path.exists(sysfs_path):
            log(f"Sysfs path {sysfs_path} not found", "WARN")
            return None

        # The depth of the tty's "device" link below the USB device node is not
        # fixed (USB-serial bridge ports are expected to sit one level deeper
        # than CDC-ACM ones), so search upwards instead of assuming one level.
        start_dir = os.path.dirname(os.path.realpath(sysfs_path))
        usb_dev_dir = _find_usb_device_dir(start_dir)
        if usb_dev_dir is None:
            log("Could not find busnum/devnum files in sysfs", "WARN")
            return None

        with open(os.path.join(usb_dev_dir, "busnum")) as f:
            busnum = int(f.read().strip())
        with open(os.path.join(usb_dev_dir, "devnum")) as f:
            devnum = int(f.read().strip())
        return f"/dev/bus/usb/{busnum:03d}/{devnum:03d}"
    except (OSError, ValueError) as e:
        # OSError: sysfs entry vanished or unreadable; ValueError: non-numeric content
        log(f"Error during USB device auto-detection: {e}", "ERROR")
        return None
def reset_usb_device() -> bool:
    """Perform a hardware USB bus reset on the LoRa device.

    The device node is taken from the USB_DEVICE_PATH environment variable;
    when that is unset or empty, auto_detect_usb_device() is used instead.

    Returns:
        True if the USBDEVFS_RESET ioctl succeeded, False if the device path
        could not be determined or the reset failed (the reason is logged).
    """
    device_path = os.environ.get('USB_DEVICE_PATH') or auto_detect_usb_device()
    if not device_path:
        log("Cannot perform USB reset: device path could not be determined", "WARN")
        return False

    log(f"Performing hardware USB bus reset on {device_path}", "WARN")
    try:
        # Open without O_CREAT/O_TRUNC: a stale path (the device number can
        # change after re-enumeration) must fail here rather than create a
        # stray regular file under /dev.
        fd = os.open(device_path, os.O_WRONLY)
        try:
            fcntl.ioctl(fd, USBDEVFS_RESET, 0)
        finally:
            os.close(fd)
    except (OSError, ValueError) as e:
        log(f"USB reset failed: {e}", "ERROR")
        return False

    log("USB bus reset successful", "INFO")
    return True
def count_recent_restarts(container_name: str, minutes: int = 8) -> int:
    """Count restart attempts recorded for a container in the last N minutes.

    An entry of the module-level restart_history is counted when its
    'container' equals *container_name*, it carries a 'restart_success' key
    (whatever its value), and its ISO-8601 'timestamp' is within the window.
    Entries whose timestamp is missing, not a string, or unparsable are skipped.

    Args:
        container_name: Name of the container to count restarts for.
        minutes: Size of the look-back window in minutes.

    Returns:
        Number of matching history entries inside the window.
    """
    cutoff_time = time.time() - (minutes * 60)
    count = 0
    for entry in restart_history:
        if entry.get('container') != container_name or 'restart_success' not in entry:
            continue
        try:
            restarted_at = datetime.fromisoformat(entry['timestamp']).timestamp()
        except (KeyError, TypeError, ValueError):
            # A malformed history entry must not abort the recovery path
            continue
        if restarted_at >= cutoff_time:
            count += 1
    return count
def run_docker_command(args: list, timeout: int = 30) -> tuple:
"""Run docker command and return (success, stdout, stderr)."""
try:
@@ -216,6 +318,14 @@ def handle_unhealthy_container(container_name: str, status: dict):
except Exception as e:
log(f"Failed to save diagnostic info: {e}", 'ERROR')
# Check if we should do a USB reset for meshcore-bridge
if container_name == 'meshcore-bridge':
recent_restarts = count_recent_restarts(container_name, minutes=8)
if recent_restarts >= 3:
log(f"{container_name} has been restarted {recent_restarts} times in the last 8 minutes. Attempting hardware USB reset.", "WARN")
if reset_usb_device():
time.sleep(2) # Give OS time to re-enumerate the device before Docker brings it back
# Restart the container
restart_success = restart_container(container_name)