diff --git a/docs/watchdog.md b/docs/watchdog.md index da45e16..2bd5e42 100644 --- a/docs/watchdog.md +++ b/docs/watchdog.md @@ -7,6 +7,7 @@ The Container Watchdog is a systemd service that monitors Docker containers and - **Health monitoring** - Checks container status every 30 seconds - **Automatic restart** - Restarts containers that become unhealthy - **Auto-start stopped containers** - Starts containers that have stopped (configurable) +- **Hardware USB reset** - Performs a low-level USB bus reset if the LoRa device freezes (detected after 3 failed container restarts within 8 minutes) - **Diagnostic logging** - Captures container logs before restart for troubleshooting - **HTTP status endpoint** - Query container status via HTTP API - **Restart history** - Tracks all automatic restarts with timestamps @@ -82,6 +83,7 @@ If you need to customize the behavior, the service supports these environment va | `LOG_FILE` | `/var/log/mc-webui-watchdog.log` | Path to log file | | `HTTP_PORT` | `5051` | HTTP status port (0 to disable) | | `AUTO_START` | `true` | Start stopped containers (set to `false` to disable) | +| `USB_DEVICE_PATH` | *(auto-detected)* | Path to the LoRa device (e.g., `/dev/bus/usb/001/002`) for hardware USB bus reset | To modify defaults, create an override file: ```bash diff --git a/scripts/watchdog/README.md b/scripts/watchdog/README.md new file mode 100644 index 0000000..7a40ea0 --- /dev/null +++ b/scripts/watchdog/README.md @@ -0,0 +1,21 @@ +# mc-webui Container Watchdog + +The `watchdog` service is a utility designed to run on the host machine running the Docker containers for the `mc-webui` project. Its primary purpose is to continuously monitor the health of the application's containers, specifically the `meshcore-bridge` container, which handles the physical connection to the LoRa device (like Heltec V3 or V4). 
+ +## Key Capabilities + +- **Automated Restarts:** If a container becomes `unhealthy` or crashes, the watchdog automatically restarts it to restore service without human intervention. +- **Hardware USB Bus Reset:** If the `meshcore-bridge` container fails to recover after three successive restarts (e.g., due to a hardware freeze on the LoRa device itself), the watchdog will intelligently simulate a physical disconnection and reconnection of the device via a low-level USB bus reset, completely resolving hardware lockups. + +## Installation / Update + +You can easily install or update the watchdog by running the provided installer script with root privileges: + +```bash +cd ~/mc-webui/scripts/watchdog +sudo ./install.sh +``` + +## Detailed Documentation + +For full details on configuration, logs, troubleshooting, and more advanced features, please refer to the main [Container Watchdog Documentation](../../docs/watchdog.md) located in the `docs` folder. diff --git a/scripts/watchdog/install.sh b/scripts/watchdog/install.sh index 8c1a564..baa38e9 100755 --- a/scripts/watchdog/install.sh +++ b/scripts/watchdog/install.sh @@ -98,6 +98,7 @@ Environment=CHECK_INTERVAL=30 Environment=LOG_FILE=${LOG_FILE} Environment=HTTP_PORT=5051 Environment=AUTO_START=true +Environment=USB_DEVICE_PATH=${USB_DEVICE_PATH} ExecStart=/usr/bin/python3 -u ${SCRIPT_DIR}/watchdog.py Restart=always RestartSec=10 @@ -144,6 +145,7 @@ echo "Features:" echo " - Checks container health every 30 seconds" echo " - Automatically restarts unhealthy containers" echo " - Saves diagnostic logs before restart" +echo " - Performs hardware USB bus reset if LoRa device is stuck" echo "" echo "Useful commands:" echo " systemctl status $SERVICE_NAME # Check service status" diff --git a/scripts/watchdog/mc-webui-watchdog.service b/scripts/watchdog/mc-webui-watchdog.service index 5265aee..86657b3 100644 --- a/scripts/watchdog/mc-webui-watchdog.service +++ b/scripts/watchdog/mc-webui-watchdog.service @@ -11,6 +11,7 
# ioctl request number for USBDEVFS_RESET, i.e. _IO('U', 20) == 0x5514
# (defined in the Linux kernel header linux/usbdevice_fs.h).
USBDEVFS_RESET = 21780


def _find_usb_device_dir(start_dir: str) -> 'str | None':
    """Return the nearest directory at or above *start_dir* that has both
    a ``busnum`` and a ``devnum`` file, or None if no ancestor has them.

    The tty's sysfs ``device`` link does not always point directly below the
    USB device node: for ``ttyACM*`` it resolves to the USB interface (one
    level below the device), while for ``ttyUSB*`` adapters it resolves to a
    port node one level deeper still. Walking upwards handles both layouts.
    """
    current = start_dir
    while True:
        has_busnum = os.path.isfile(os.path.join(current, "busnum"))
        has_devnum = os.path.isfile(os.path.join(current, "devnum"))
        if has_busnum and has_devnum:
            return current
        parent = os.path.dirname(current)
        if parent == current:
            # Reached the filesystem root (or an empty path): not a USB tty.
            return None
        current = parent


def auto_detect_usb_device(env_file: 'str | None' = None,
                           sysfs_tty_root: str = '/sys/class/tty') -> 'str | None':
    """Attempt to auto-detect the USB device node of the LoRa serial adapter.

    The serial port is taken from ``MC_SERIAL_PORT`` in the project's ``.env``
    file. When it is missing or set to ``auto``, the single entry of
    ``/dev/serial/by-id`` is used. The port is then mapped through sysfs to
    its USB bus and device number.

    Args:
        env_file: Path of the ``.env`` file to read. Defaults to
            ``<MCWEBUI_DIR>/.env``.
        sysfs_tty_root: Directory containing one sub-directory per tty with a
            ``device`` link. Only overridden in tests.

    Returns:
        A path such as ``/dev/bus/usb/001/002``, or None when the device
        cannot be determined (the reason is written to the watchdog log).
    """
    if env_file is None:
        env_file = os.path.join(MCWEBUI_DIR, '.env')
    serial_port = 'auto'

    if os.path.exists(env_file):
        try:
            with open(env_file, 'r') as f:
                for line in f:
                    if line.startswith('MC_SERIAL_PORT='):
                        serial_port = line.split('=', 1)[1].strip().strip('"\'')
                        break
        except (OSError, UnicodeError) as e:
            log(f"Failed to read .env file for serial port: {e}", "WARN")

    if serial_port.lower() == 'auto':
        by_id_path = Path('/dev/serial/by-id')
        if not by_id_path.exists():
            log("/dev/serial/by-id does not exist", "WARN")
            return None
        devices = list(by_id_path.iterdir())
        if len(devices) > 1:
            log("Multiple serial devices found, cannot auto-detect USB device for reset", "WARN")
            return None
        if not devices:
            log("No serial devices found in /dev/serial/by-id", "WARN")
            return None
        serial_port = str(devices[0])

    if not serial_port or not os.path.exists(serial_port):
        log(f"Serial port {serial_port} not found", "WARN")
        return None

    try:
        # Resolve symlink to get the actual tty device (e.g., /dev/ttyACM0)
        real_tty = os.path.realpath(serial_port)
        tty_name = os.path.basename(real_tty)

        # Find USB bus and dev number via sysfs
        sysfs_path = os.path.join(sysfs_tty_root, tty_name, 'device')
        if not os.path.exists(sysfs_path):
            log(f"Sysfs path {sysfs_path} not found", "WARN")
            return None

        usb_dev_dir = _find_usb_device_dir(os.path.realpath(sysfs_path))
        if usb_dev_dir is None:
            log("Could not find busnum/devnum files in sysfs", "WARN")
            return None

        with open(os.path.join(usb_dev_dir, "busnum")) as f:
            busnum = int(f.read().strip())
        with open(os.path.join(usb_dev_dir, "devnum")) as f:
            devnum = int(f.read().strip())
        return f"/dev/bus/usb/{busnum:03d}/{devnum:03d}"
    except (OSError, ValueError) as e:
        log(f"Error during USB device auto-detection: {e}", "ERROR")
        return None


def reset_usb_device() -> bool:
    """Perform a hardware USB bus reset on the LoRa device.

    The device node is taken from the ``USB_DEVICE_PATH`` environment
    variable; when that is unset or empty it is auto-detected.

    Returns:
        True when the reset ioctl succeeded, False when the device could not
        be determined or the reset failed (details are logged).
    """
    # Imported locally so this block carries its own dependency; fcntl is
    # only available on Unix-like systems.
    import fcntl

    device_path = (os.environ.get('USB_DEVICE_PATH') or '').strip()
    if not device_path:
        device_path = auto_detect_usb_device()

    if not device_path:
        log("Cannot perform USB reset: device path could not be determined", "WARN")
        return False

    log(f"Performing hardware USB bus reset on {device_path}", "WARN")
    try:
        # O_WRONLY without O_CREAT/O_TRUNC: a stale path must fail with
        # ENOENT instead of creating a regular file under /dev/bus/usb.
        fd = os.open(device_path, os.O_WRONLY)
        try:
            fcntl.ioctl(fd, USBDEVFS_RESET, 0)
        finally:
            os.close(fd)
    except OSError as e:
        log(f"USB reset failed: {e}", "ERROR")
        return False

    log("USB bus reset successful", "INFO")
    return True


def count_recent_restarts(container_name: str, minutes: int = 8,
                          history: 'list | None' = None) -> int:
    """Count restart attempts recorded for a container in the last N minutes.

    Only entries that carry a ``restart_success`` key (whatever its value)
    and whose ``timestamp`` parses as an ISO-8601 string are counted; entries
    with a missing or malformed timestamp are skipped.

    Args:
        container_name: Value matched against each entry's ``container`` key.
        minutes: Size of the look-back window.
        history: Entries to inspect. Defaults to the module-level
            ``restart_history`` list.

    Returns:
        Number of matching entries whose timestamp lies inside the window.
    """
    if history is None:
        history = restart_history
    cutoff_time = time.time() - (minutes * 60)
    count = 0
    for entry in history:
        if entry.get('container') != container_name or 'restart_success' not in entry:
            continue
        try:
            restarted_at = datetime.fromisoformat(entry['timestamp']).timestamp()
        except (KeyError, TypeError, ValueError):
            # Malformed history entry: ignore it rather than abort the check.
            continue
        if restarted_at >= cutoff_time:
            count += 1
    return count