mirror of
https://github.com/MarekWo/mc-webui.git
synced 2026-07-04 17:01:34 +02:00
feat(watchdog): Hardware USB bus reset for stuck LoRa devices
Implement a smart auto-detection and low-level fcntl ioctl reset mechanism for LoRa USB devices. This 'last resort' recovery is triggered if the meshcore-bridge container fails to recover after 3 restarts within an 8-minute window. Includes updates to the installer, systemd service, and newly added README. Co-Authored-By: Gemini CLI <noreply@google.com>
This commit is contained in:
@@ -7,6 +7,7 @@ The Container Watchdog is a systemd service that monitors Docker containers and
|
||||
- **Health monitoring** - Checks container status every 30 seconds
|
||||
- **Automatic restart** - Restarts containers that become unhealthy
|
||||
- **Auto-start stopped containers** - Starts containers that have stopped (configurable)
|
||||
- **Hardware USB reset** - Performs a low-level USB bus reset of the LoRa device as a last resort, triggered when the `meshcore-bridge` container has already been restarted 3 times within 8 minutes and is still unhealthy
|
||||
- **Diagnostic logging** - Captures container logs before restart for troubleshooting
|
||||
- **HTTP status endpoint** - Query container status via HTTP API
|
||||
- **Restart history** - Tracks all automatic restarts with timestamps
|
||||
@@ -82,6 +83,7 @@ If you need to customize the behavior, the service supports these environment va
|
||||
| `LOG_FILE` | `/var/log/mc-webui-watchdog.log` | Path to log file |
|
||||
| `HTTP_PORT` | `5051` | HTTP status port (0 to disable) |
|
||||
| `AUTO_START` | `true` | Start stopped containers (set to `false` to disable) |
|
||||
| `USB_DEVICE_PATH` | *(auto-detected)* | Path to the LoRa device (e.g., `/dev/bus/usb/001/002`) for hardware USB bus reset |
|
||||
|
||||
To modify defaults, create an override file:
|
||||
```bash
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
# mc-webui Container Watchdog
|
||||
|
||||
The `watchdog` service is a utility designed to run on the host machine running the Docker containers for the `mc-webui` project. Its primary purpose is to continuously monitor the health of the application's containers, specifically the `meshcore-bridge` container, which handles the physical connection to the LoRa device (like Heltec V3 or V4).
|
||||
|
||||
## Key Capabilities
|
||||
|
||||
- **Automated Restarts:** If a container becomes `unhealthy` or crashes, the watchdog automatically restarts it to restore service without human intervention.
|
||||
- **Hardware USB Bus Reset:** If the `meshcore-bridge` container is still unhealthy after three restarts within an 8-minute window (e.g., due to a hardware freeze on the LoRa device itself), the watchdog performs a low-level USB bus reset on the device — the software equivalent of physically unplugging and re-plugging it — before restarting the container again. This is a last-resort recovery intended to clear hardware lockups.
|
||||
|
||||
## Installation / Update
|
||||
|
||||
You can easily install or update the watchdog by running the provided installer script with root privileges:
|
||||
|
||||
```bash
|
||||
cd ~/mc-webui/scripts/watchdog
|
||||
sudo ./install.sh
|
||||
```
|
||||
|
||||
## Detailed Documentation
|
||||
|
||||
For full details on configuration, logs, troubleshooting, and more advanced features, please refer to the main [Container Watchdog Documentation](../../docs/watchdog.md) located in the `docs` folder.
|
||||
@@ -98,6 +98,7 @@ Environment=CHECK_INTERVAL=30
|
||||
Environment=LOG_FILE=${LOG_FILE}
|
||||
Environment=HTTP_PORT=5051
|
||||
Environment=AUTO_START=true
|
||||
Environment=USB_DEVICE_PATH=${USB_DEVICE_PATH}
|
||||
ExecStart=/usr/bin/python3 -u ${SCRIPT_DIR}/watchdog.py
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
@@ -144,6 +145,7 @@ echo "Features:"
|
||||
echo " - Checks container health every 30 seconds"
|
||||
echo " - Automatically restarts unhealthy containers"
|
||||
echo " - Saves diagnostic logs before restart"
|
||||
echo " - Performs hardware USB bus reset if LoRa device is stuck"
|
||||
echo ""
|
||||
echo "Useful commands:"
|
||||
echo " systemctl status $SERVICE_NAME # Check service status"
|
||||
|
||||
@@ -11,6 +11,7 @@ Environment=MCWEBUI_DIR=/home/marek/mc-webui
|
||||
Environment=CHECK_INTERVAL=30
|
||||
Environment=LOG_FILE=/var/log/mc-webui-watchdog.log
|
||||
Environment=HTTP_PORT=5051
|
||||
Environment=USB_DEVICE_PATH=
|
||||
ExecStart=/usr/bin/python3 -u /home/marek/mc-webui/scripts/watchdog/watchdog.py
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
@@ -26,6 +26,7 @@ import json
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
import fcntl
|
||||
from datetime import datetime
|
||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||
from pathlib import Path
|
||||
@@ -59,6 +60,107 @@ def log(message: str, level: str = 'INFO'):
|
||||
print(f"[{timestamp}] [ERROR] Failed to write to log file: {e}")
|
||||
|
||||
|
||||
# USB Device Reset Constant
|
||||
USBDEVFS_RESET = 21780  # 0x5514 == _IO('U', 20), the usbdevfs "reset device" ioctl from <linux/usbdevice_fs.h>
|
||||
|
||||
def auto_detect_usb_device() -> "str | None":
    """Locate the usbfs node (e.g. /dev/bus/usb/001/002) of the LoRa serial device.

    Resolution steps:
      1. Read MC_SERIAL_PORT from ``<MCWEBUI_DIR>/.env`` (defaults to 'auto').
      2. If the value is 'auto', use the single entry in /dev/serial/by-id;
         give up when there are zero or several entries.
      3. Resolve the serial port to its tty name and look it up in sysfs.
      4. Walk up the sysfs device tree until a directory exposing both
         ``busnum`` and ``devnum`` is found and build the usbfs path from them.

    Returns:
        The usbfs device path, or None when the device cannot be determined.
        Every failure is logged; no exception is propagated to the caller.
    """
    env_file = os.path.join(MCWEBUI_DIR, '.env')
    serial_port = 'auto'

    if os.path.exists(env_file):
        try:
            with open(env_file, 'r') as f:
                for line in f:
                    if line.startswith('MC_SERIAL_PORT='):
                        serial_port = line.split('=', 1)[1].strip().strip('"\'')
                        break
        except Exception as e:
            log(f"Failed to read .env file for serial port: {e}", "WARN")

    if serial_port.lower() == 'auto':
        by_id_path = Path('/dev/serial/by-id')
        if not by_id_path.exists():
            log("/dev/serial/by-id does not exist", "WARN")
            return None

        devices = list(by_id_path.iterdir())
        if len(devices) > 1:
            log("Multiple serial devices found, cannot auto-detect USB device for reset", "WARN")
            return None
        if not devices:
            log("No serial devices found in /dev/serial/by-id", "WARN")
            return None
        serial_port = str(devices[0])

    if not serial_port or not os.path.exists(serial_port):
        log(f"Serial port {serial_port} not found", "WARN")
        return None

    try:
        # Resolve symlink to get actual tty device (e.g., /dev/ttyACM0)
        real_tty = os.path.realpath(serial_port)
        tty_name = os.path.basename(real_tty)

        # Find USB bus and dev number via sysfs
        sysfs_path = f"/sys/class/tty/{tty_name}/device"
        if not os.path.exists(sysfs_path):
            log(f"Sysfs path {sysfs_path} not found", "WARN")
            return None

        # The distance between the tty's "device" link and the USB device
        # directory depends on the driver: for ttyACM* the link points at the
        # USB interface (one level below the device), while for ttyUSB*
        # adapters it points at a per-port node below the interface (two
        # levels below). Walk up instead of assuming a fixed depth; the first
        # ancestor exposing busnum/devnum is the device itself (hubs are
        # further up the tree).
        node = os.path.dirname(os.path.realpath(sysfs_path))
        while True:
            busnum_file = os.path.join(node, "busnum")
            devnum_file = os.path.join(node, "devnum")

            if os.path.exists(busnum_file) and os.path.exists(devnum_file):
                with open(busnum_file) as f:
                    busnum = int(f.read().strip())
                with open(devnum_file) as f:
                    devnum = int(f.read().strip())
                return f"/dev/bus/usb/{busnum:03d}/{devnum:03d}"

            parent = os.path.dirname(node)
            if parent == node:
                # Reached the filesystem root without finding a USB device.
                break
            node = parent

        log("Could not find busnum/devnum files in sysfs", "WARN")
        return None
    except Exception as e:
        # Auto-detection is best-effort; never let it crash the watchdog.
        log(f"Error during USB device auto-detection: {e}", "ERROR")
        return None
|
||||
|
||||
def reset_usb_device() -> bool:
    """Perform a hardware USB bus reset on the LoRa device.

    The target usbfs node is taken from the USB_DEVICE_PATH environment
    variable; when that is unset or empty, auto_detect_usb_device() is used.

    Returns:
        True when the USBDEVFS_RESET ioctl succeeded, False when the device
        path could not be determined or the reset failed. Failures are logged
        and never raised.
    """
    device_path = (os.environ.get('USB_DEVICE_PATH') or '').strip()
    if not device_path:
        device_path = auto_detect_usb_device()

    if not device_path:
        log("Cannot perform USB reset: device path could not be determined", "WARN")
        return False

    log(f"Performing hardware USB bus reset on {device_path}", "WARN")
    try:
        # Use os.open without O_CREAT: the builtin open(path, 'w') would
        # create (and truncate) a regular file if the node does not exist,
        # e.g. when a configured path is stale or mistyped.
        fd = os.open(device_path, os.O_WRONLY)
        try:
            fcntl.ioctl(fd, USBDEVFS_RESET, 0)
        finally:
            os.close(fd)
        log("USB bus reset successful", "INFO")
        return True
    except Exception as e:
        # Last-resort recovery must not take the watchdog down with it.
        log(f"USB reset failed: {e}", "ERROR")
        return False
|
||||
|
||||
def count_recent_restarts(container_name: str, minutes: int = 8) -> int:
    """Count restart attempts recorded for a container in the last N minutes.

    An entry of the module-level ``restart_history`` is counted when its
    'container' equals ``container_name``, it carries a 'restart_success' key
    (whatever its value) and its ISO-format 'timestamp' is not older than
    ``minutes`` minutes.

    Entries whose timestamp is missing, not a string, or not parseable are
    skipped rather than raising, so one malformed record cannot break the
    unhealthy-container handling that calls this function.
    """
    cutoff_time = time.time() - (minutes * 60)
    count = 0
    for entry in restart_history:
        if entry.get('container') != container_name or 'restart_success' not in entry:
            continue
        try:
            recorded_at = datetime.fromisoformat(entry['timestamp']).timestamp()
        except (KeyError, TypeError, ValueError):
            # Malformed history record: ignore it.
            continue
        if recorded_at >= cutoff_time:
            count += 1
    return count
|
||||
|
||||
|
||||
def run_docker_command(args: list, timeout: int = 30) -> tuple:
|
||||
"""Run docker command and return (success, stdout, stderr)."""
|
||||
try:
|
||||
@@ -216,6 +318,14 @@ def handle_unhealthy_container(container_name: str, status: dict):
|
||||
except Exception as e:
|
||||
log(f"Failed to save diagnostic info: {e}", 'ERROR')
|
||||
|
||||
# Check if we should do a USB reset for meshcore-bridge
|
||||
if container_name == 'meshcore-bridge':
|
||||
recent_restarts = count_recent_restarts(container_name, minutes=8)
|
||||
if recent_restarts >= 3:
|
||||
log(f"{container_name} has been restarted {recent_restarts} times in the last 8 minutes. Attempting hardware USB reset.", "WARN")
|
||||
if reset_usb_device():
|
||||
time.sleep(2) # Give OS time to re-enumerate the device before Docker brings it back
|
||||
|
||||
# Restart the container
|
||||
restart_success = restart_container(container_name)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user