From f352ccd968dadfdf9af28f459ae109177bce197b Mon Sep 17 00:00:00 2001
From: MarekWo
Date: Sun, 5 Apr 2026 13:37:33 +0200
Subject: [PATCH] fix(ble): add keepalive and robust reconnection for BLE zombie connections

BLE connections can enter a "zombie" state where notifications (reads)
still arrive but writes silently fail. This went undetected until the
user tried to send a message, at which point the connection was already
dead.

Additionally, after an abnormal BLE disconnect, BlueZ retains stale GATT
notification handles, causing reconnection to fail with
"[org.bluez.Error.NotPermitted] Notify acquired".

Changes:
- Add BLE keepalive loop (60s interval) that sends get_bat() to detect
  zombie connections proactively and trigger reconnection automatically
- Add adapter power-cycle (hci0 off/on via D-Bus) during BLE reconnection
  to clear stale GATT notification state
- Dedicated _ble_reconnect() with 5 attempts + adapter reset between each
- Health endpoint returns 503 when BLE permanently fails, triggering
  Docker container restart via healthcheck
- Guard against concurrent reconnection attempts
- Run dbus-send in the default executor (no event-loop stall) and check
  its exit status before reporting a successful power-cycle
- Reset the reconnect guard in a finally block and run keepalive-triggered
  reconnects in their own task, so cancelling the keepalive task can
  neither abort a reconnect nor leave the guard stuck

Co-Authored-By: Claude Opus 4.6
---
 app/device_manager.py | 173 +++++++++++++++++++++++++++++++++++++++++-
 app/routes/views.py   |  10 ++-
 2 files changed, 180 insertions(+), 3 deletions(-)

diff --git a/app/device_manager.py b/app/device_manager.py
index 6583faf..6eee7e5 100644
--- a/app/device_manager.py
+++ b/app/device_manager.py
@@ -118,6 +118,10 @@ class DeviceManager:
         self._pending_acks = {}  # {ack_code_hex: dm_id} — maps retry acks to DM
         self._retry_tasks = {}  # {dm_id: asyncio.Task} — active retry coroutines
         self._retry_context = {}  # {dm_id: {attempt, max_attempts, path}} — for _on_ack
+        self._ble_keepalive_task = None  # asyncio.Task for BLE keepalive
+        self._ble_reconnect_task = None  # asyncio.Task for keepalive-triggered reconnect
+        self._ble_permanently_failed = False  # True when all reconnect attempts exhausted
+        self._ble_reconnecting = False  # Guard against concurrent reconnect attempts
 
     @property
     def is_connected(self) -> bool:
@@ -227,6 +231,119 @@ class DeviceManager:
         except Exception as e:
             logger.debug(f"BLE force-disconnect check skipped: {e}")
 
+    @staticmethod
+    async def _ble_set_adapter_powered(adapter_path, powered):
+        """Set the adapter's ``Powered`` property via dbus-send.
+
+        The blocking ``subprocess.run`` call (up to 5 s) is executed in the
+        default executor so the event loop keeps servicing other tasks.
+
+        Returns True when dbus-send exits with status 0, False otherwise.
+        Launch errors and timeouts propagate to the caller.
+        """
+        import subprocess
+        cmd = [
+            'dbus-send', '--system', '--print-reply', '--dest=org.bluez',
+            adapter_path, 'org.freedesktop.DBus.Properties.Set',
+            'string:org.bluez.Adapter1', 'string:Powered',
+            'variant:boolean:true' if powered else 'variant:boolean:false',
+        ]
+        result = await asyncio.get_running_loop().run_in_executor(
+            None,
+            lambda: subprocess.run(cmd, capture_output=True, text=True, timeout=5),
+        )
+        if result.returncode != 0:
+            logger.warning(
+                f"dbus-send Powered={powered} failed "
+                f"(rc={result.returncode}): {result.stderr.strip()}"
+            )
+            return False
+        return True
+
+    @staticmethod
+    async def _ble_power_cycle_adapter(adapter='hci0'):
+        """Power-cycle the Bluetooth adapter via D-Bus to clear all stale state.
+
+        This clears stale GATT notification handles ('Notify acquired' error)
+        that persist after an abnormal bleak disconnect. A simple
+        Device1.Disconnect is not enough — the notification subscriptions are
+        per-adapter, not per-device.
+
+        Args:
+            adapter: BlueZ adapter name under /org/bluez (default 'hci0').
+
+        Never raises: any failure is logged as a warning (best effort).
+        """
+        adapter_path = f'/org/bluez/{adapter}'
+        set_powered = DeviceManager._ble_set_adapter_powered
+        try:
+            logger.info("Power-cycling Bluetooth adapter to clear stale GATT state...")
+            off_ok = await set_powered(adapter_path, False)
+            await asyncio.sleep(2)
+            # Always attempt power ON, even when OFF reported a failure
+            on_ok = await set_powered(adapter_path, True)
+            await asyncio.sleep(3)  # BlueZ needs time to re-init the adapter
+            if off_ok and on_ok:
+                logger.info("Bluetooth adapter power-cycled successfully")
+            else:
+                logger.warning("Bluetooth adapter power-cycle did not complete cleanly")
+        except Exception as e:
+            logger.warning(f"Bluetooth adapter power-cycle failed: {e}")
+
+    async def _ble_reconnect(self):
+        """Reconnect BLE with adapter power-cycle to clear stale GATT state.
+
+        Uses aggressive cleanup (adapter power-cycle) between attempts to
+        avoid the 'Notify acquired' error that blocks reconnection after
+        an abnormal disconnect.
+
+        Makes up to 5 attempts with a linearly growing delay (5 s, 10 s, ...).
+        When all attempts fail, sets ``_ble_permanently_failed`` so the
+        /health endpoint can report 503.
+        """
+        if self._ble_reconnecting:
+            logger.debug("BLE reconnect already in progress, skipping")
+            return
+        self._ble_reconnecting = True
+
+        MAX_ATTEMPTS = 5
+        try:
+            for attempt in range(1, MAX_ATTEMPTS + 1):
+                delay = 5 * attempt
+                logger.info(f"BLE reconnecting in {delay}s (attempt {attempt}/{MAX_ATTEMPTS})...")
+                await asyncio.sleep(delay)
+
+                try:
+                    # Clean up old mc instance
+                    if self.mc:
+                        try:
+                            await self.mc.disconnect()
+                        except Exception as e:
+                            logger.debug(f"BLE cleanup disconnect failed: {e}")
+                        self.mc = None
+
+                    # Power-cycle adapter to clear stale notification handles
+                    await self._ble_power_cycle_adapter()
+
+                    await self._connect()
+                    if self._connected:
+                        logger.info("BLE reconnected successfully")
+                        if self.socketio:
+                            self.socketio.emit('device_status', {
+                                'connected': True,
+                            }, namespace='/chat')
+                        return
+                except Exception as e:
+                    logger.error(f"BLE reconnect attempt {attempt} failed: {e}")
+
+            logger.error(f"BLE reconnection failed after {MAX_ATTEMPTS} attempts — "
+                         "marking permanently failed (healthcheck will trigger restart)")
+            self._ble_permanently_failed = True
+        finally:
+            # Reset on every exit path (success, exhaustion, cancellation) so a
+            # cancelled reconnect can never leave the guard stuck at True.
+            self._ble_reconnecting = False
+
     async def _connect(self):
         """Connect to device via BLE, TCP, or serial and subscribe to events."""
         from meshcore import MeshCore
@@ -325,6 +442,13 @@ class DeviceManager:
             # Start auto message fetching (events fire on new messages)
             await self.mc.start_auto_message_fetching()
 
+            # Start BLE keepalive to detect zombie connections
+            if self.config.use_ble:
+                if self._ble_keepalive_task and not self._ble_keepalive_task.done():
+                    self._ble_keepalive_task.cancel()
+                self._ble_keepalive_task = asyncio.ensure_future(self._ble_keepalive_loop())
+                self._ble_permanently_failed = False
+
         except Exception as e:
             logger.error(f"Device connection failed: {e}")
             self._connected = False
@@ -389,6 +513,38 @@ class DeviceManager:
             self._subscriptions.append(sub)
             logger.debug(f"Subscribed to {event_type.value}")
 
+    async def _ble_keepalive_loop(self):
+        """Periodically send a lightweight command to detect BLE zombie connections.
+
+        BLE connections can enter a state where notifications (reads) still
+        arrive but writes silently fail. A periodic write detects this early
+        and triggers reconnection before the user encounters the problem.
+        """
+        BLE_KEEPALIVE_INTERVAL = 60  # seconds
+        while True:
+            await asyncio.sleep(BLE_KEEPALIVE_INTERVAL)
+            if not self._connected or not self.mc:
+                return  # stop if disconnected by other means
+            try:
+                await asyncio.wait_for(
+                    self.mc.commands.get_bat(),
+                    timeout=10,
+                )
+                logger.debug("BLE keepalive OK")
+            except Exception as e:
+                logger.warning(f"BLE keepalive failed: {e} — triggering reconnect")
+                # Synthesize a disconnect event to reuse existing reconnection logic
+                self._connected = False
+                if self.socketio:
+                    self.socketio.emit('device_status', {
+                        'connected': False,
+                    }, namespace='/chat')
+                # Reconnect in a separate task: _connect() (and the disconnect
+                # handler) cancel the keepalive task, which must not abort the
+                # reconnect that is still in flight.
+                self._ble_reconnect_task = asyncio.ensure_future(self._ble_reconnect())
+                return  # reconnect task takes over
+
     def _sync_contacts_to_db(self):
         """Sync device contacts to database (bidirectional).
 
@@ -438,6 +594,11 @@ class DeviceManager:
         """Disconnect from device and stop the background thread."""
         logger.info("Stopping DeviceManager...")
 
+        # Cancel BLE keepalive
+        if self._ble_keepalive_task and not self._ble_keepalive_task.done():
+            self._ble_keepalive_task.cancel()
+        self._ble_keepalive_task = None
+
         if self.mc and self._loop and self._loop.is_running():
             try:
                 future = asyncio.run_coroutine_threadsafe(
@@ -1183,12 +1344,22 @@ class DeviceManager:
         logger.warning("Device disconnected")
         self._connected = False
 
+        # Cancel BLE keepalive task
+        if self._ble_keepalive_task and not self._ble_keepalive_task.done():
+            self._ble_keepalive_task.cancel()
+        self._ble_keepalive_task = None
+
         if self.socketio:
             self.socketio.emit('device_status', {
                 'connected': False,
             }, namespace='/chat')
 
-        # Auto-reconnect with backoff
+        # BLE needs adapter power-cycle to clear stale GATT state
+        if self.config.use_ble:
+            await self._ble_reconnect()
+            return
+
+        # Serial/TCP: simple reconnect with backoff
         for attempt in range(1, 4):
             delay = 5 * attempt
             logger.info(f"Reconnecting in {delay}s (attempt {attempt}/3)...")
diff --git a/app/routes/views.py b/app/routes/views.py
index 0e0daf2..18db095 100644
--- a/app/routes/views.py
+++ b/app/routes/views.py
@@ -104,7 +104,13 @@ def logs():
 
 @views_bp.route('/health')
 def health():
+    """Health check endpoint for monitoring.
+
+    Returns 503 when BLE reconnection has permanently failed so Docker's
+    healthcheck triggers a container restart (which clears all BLE state).
     """
-    Health check endpoint for monitoring.
-    """
+    from flask import current_app
+    dm = getattr(current_app, 'device_manager', None)
+    if dm and getattr(dm, '_ble_permanently_failed', False):
+        return 'BLE connection permanently failed', 503
     return 'OK', 200