fix(ble): add keepalive and robust reconnection for BLE zombie connections

BLE connections can enter a "zombie" state where notifications (reads) still
arrive but writes silently fail.  This went undetected until the user tried
to send a message, at which point the connection was already dead.

Additionally, after an abnormal BLE disconnect, BlueZ retains stale GATT
notification handles, causing reconnection to fail with
"[org.bluez.Error.NotPermitted] Notify acquired".

Changes:
- Add BLE keepalive loop (60s interval) that sends get_bat() to detect
  zombie connections proactively and trigger reconnection automatically
- Add adapter power-cycle (hci0 off/on via D-Bus) during BLE reconnection
  to clear stale GATT notification state
- Dedicated _ble_reconnect() with 5 attempts (linear backoff, 5s per attempt
  number) and an adapter power-cycle before each attempt
- Health endpoint returns 503 when BLE permanently fails, triggering
  Docker container restart via healthcheck
- Guard against concurrent reconnection attempts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
MarekWo
2026-04-05 13:37:33 +02:00
parent 61d60ea4b6
commit f352ccd968
2 changed files with 145 additions and 3 deletions

View File

@@ -118,6 +118,9 @@ class DeviceManager:
self._pending_acks = {} # {ack_code_hex: dm_id} — maps retry acks to DM
self._retry_tasks = {} # {dm_id: asyncio.Task} — active retry coroutines
self._retry_context = {} # {dm_id: {attempt, max_attempts, path}} — for _on_ack
self._ble_keepalive_task = None # asyncio.Task for BLE keepalive
self._ble_permanently_failed = False # True when all reconnect attempts exhausted
self._ble_reconnecting = False # Guard against concurrent reconnect attempts
@property
def is_connected(self) -> bool:
@@ -227,6 +230,88 @@ class DeviceManager:
except Exception as e:
logger.debug(f"BLE force-disconnect check skipped: {e}")
@staticmethod
async def _ble_power_cycle_adapter():
    """Power-cycle the Bluetooth adapter via D-Bus to clear all stale state.

    This clears stale GATT notification handles ('Notify acquired' error)
    that persist after an abnormal bleak disconnect. A simple
    Device1.Disconnect is not enough — the notification subscriptions are
    per-adapter, not per-device.

    The ``dbus-send`` calls are executed in the default executor so the
    event loop is not blocked while they run (each may take up to its
    5 second timeout). The operation is best-effort: failures are logged
    as warnings and never raised to the caller.
    """
    import functools
    import subprocess

    # NOTE: the adapter is hard-coded to hci0.
    adapter_path = '/org/bluez/hci0'

    async def _set_powered(powered):
        """Set Adapter1.Powered; return True when dbus-send exits with 0."""
        state = 'true' if powered else 'false'
        run_dbus_send = functools.partial(
            subprocess.run,
            ['dbus-send', '--system', '--print-reply', '--dest=org.bluez',
             adapter_path, 'org.freedesktop.DBus.Properties.Set',
             'string:org.bluez.Adapter1', 'string:Powered',
             f'variant:boolean:{state}'],
            capture_output=True, text=True, timeout=5,
        )
        # subprocess.run blocks; keep it off the event loop thread.
        result = await asyncio.get_running_loop().run_in_executor(
            None, run_dbus_send
        )
        if result.returncode != 0:
            logger.warning(
                f"dbus-send Powered={state} exited with {result.returncode}: "
                f"{(result.stderr or '').strip()}"
            )
            return False
        return True

    try:
        logger.info("Power-cycling Bluetooth adapter to clear stale GATT state...")
        powered_off = await _set_powered(False)
        await asyncio.sleep(2)
        # Always try to power the adapter back on, even if the power-off
        # command reported an error, so it is not left disabled.
        powered_on = await _set_powered(True)
        await asyncio.sleep(3)  # BlueZ needs time to re-init the adapter
        if powered_off and powered_on:
            logger.info("Bluetooth adapter power-cycled successfully")
        else:
            logger.warning("Bluetooth adapter power-cycle did not complete cleanly")
    except Exception as e:
        logger.warning(f"Bluetooth adapter power-cycle failed: {e}")
async def _ble_reconnect(self):
    """Reconnect BLE with adapter power-cycle to clear stale GATT state.

    Uses aggressive cleanup (adapter power-cycle) before every attempt to
    avoid the 'Notify acquired' error that blocks reconnection after
    an abnormal disconnect.

    Makes up to 5 attempts, waiting ``5 * attempt`` seconds before each.
    Returns as soon as ``self._connected`` is true after ``_connect()``.
    If every attempt fails, ``self._ble_permanently_failed`` is set to True.

    Only one reconnect runs at a time: a call made while another is in
    progress returns immediately. The in-progress guard is always released
    on exit, including when this coroutine is cancelled, so a cancelled
    reconnect cannot block all future reconnect attempts.
    """
    if self._ble_reconnecting:
        logger.debug("BLE reconnect already in progress, skipping")
        return

    self._ble_reconnecting = True
    MAX_ATTEMPTS = 5
    try:
        for attempt in range(1, MAX_ATTEMPTS + 1):
            delay = 5 * attempt
            logger.info(f"BLE reconnecting in {delay}s (attempt {attempt}/{MAX_ATTEMPTS})...")
            await asyncio.sleep(delay)

            try:
                # Clean up old mc instance (best-effort; the link is
                # already considered dead at this point).
                if self.mc:
                    try:
                        await self.mc.disconnect()
                    except Exception as e:
                        logger.debug(f"Ignoring error while disconnecting old BLE client: {e}")
                    self.mc = None

                # Power-cycle adapter to clear stale notification handles
                await self._ble_power_cycle_adapter()

                await self._connect()

                if self._connected:
                    logger.info("BLE reconnected successfully")
                    if self.socketio:
                        self.socketio.emit('device_status', {
                            'connected': True,
                        }, namespace='/chat')
                    return
            except Exception as e:
                logger.error(f"BLE reconnect attempt {attempt} failed: {e}")
    finally:
        # Release the guard on every exit path (success, exhaustion,
        # cancellation) so later reconnect requests are not skipped.
        self._ble_reconnecting = False

    logger.error(f"BLE reconnection failed after {MAX_ATTEMPTS} attempts — "
                 "marking permanently failed (healthcheck will trigger restart)")
    self._ble_permanently_failed = True
async def _connect(self):
"""Connect to device via BLE, TCP, or serial and subscribe to events."""
from meshcore import MeshCore
@@ -325,6 +410,13 @@ class DeviceManager:
# Start auto message fetching (events fire on new messages)
await self.mc.start_auto_message_fetching()
# Start BLE keepalive to detect zombie connections
if self.config.use_ble:
if self._ble_keepalive_task and not self._ble_keepalive_task.done():
self._ble_keepalive_task.cancel()
self._ble_keepalive_task = asyncio.ensure_future(self._ble_keepalive_loop())
self._ble_permanently_failed = False
except Exception as e:
logger.error(f"Device connection failed: {e}")
self._connected = False
@@ -389,6 +481,35 @@ class DeviceManager:
self._subscriptions.append(sub)
logger.debug(f"Subscribed to {event_type.value}")
async def _ble_keepalive_loop(self):
    """Periodically send a lightweight command to detect BLE zombie connections.

    BLE connections can enter a state where notifications (reads) still
    arrive but writes silently fail. A periodic write detects this early
    and triggers reconnection before the user encounters the problem.

    Every 60 seconds the loop awaits ``self.mc.commands.get_bat()`` with a
    10 second timeout. The loop ends silently when the device is no longer
    connected. On the first failed probe it marks the device disconnected,
    emits a ``device_status`` event and runs ``_ble_reconnect()``.
    """
    BLE_KEEPALIVE_INTERVAL = 60  # seconds

    while True:
        await asyncio.sleep(BLE_KEEPALIVE_INTERVAL)

        if not self._connected or not self.mc:
            return  # stop if disconnected by other means

        try:
            # NOTE(review): only failures that surface as an exception or
            # a timeout are detected here; if get_bat() reports errors via
            # its return value, that result should be inspected as well —
            # confirm against the meshcore API.
            await asyncio.wait_for(
                self.mc.commands.get_bat(),
                timeout=10,
            )
        except asyncio.CancelledError:
            # Cancellation means "stop the keepalive", not "link is dead".
            raise
        except Exception as e:
            logger.warning(f"BLE keepalive failed: {e} — triggering reconnect")
            break
        else:
            logger.debug("BLE keepalive OK")

    # Synthesize a disconnect event to reuse existing reconnection logic
    self._connected = False
    if self.socketio:
        self.socketio.emit('device_status', {
            'connected': False,
        }, namespace='/chat')

    # The reconnect runs inside this task. Release the keepalive handle
    # first so code that cancels "the keepalive task" while reconnecting
    # does not cancel the reconnect itself halfway through.
    if self._ble_keepalive_task is asyncio.current_task():
        self._ble_keepalive_task = None

    await self._ble_reconnect()
    # A successful reconnect starts a fresh keepalive task; this one ends.
def _sync_contacts_to_db(self):
"""Sync device contacts to database (bidirectional).
@@ -438,6 +559,11 @@ class DeviceManager:
"""Disconnect from device and stop the background thread."""
logger.info("Stopping DeviceManager...")
# Cancel BLE keepalive
if self._ble_keepalive_task and not self._ble_keepalive_task.done():
self._ble_keepalive_task.cancel()
self._ble_keepalive_task = None
if self.mc and self._loop and self._loop.is_running():
try:
future = asyncio.run_coroutine_threadsafe(
@@ -1183,12 +1309,22 @@ class DeviceManager:
logger.warning("Device disconnected")
self._connected = False
# Cancel BLE keepalive task
if self._ble_keepalive_task and not self._ble_keepalive_task.done():
self._ble_keepalive_task.cancel()
self._ble_keepalive_task = None
if self.socketio:
self.socketio.emit('device_status', {
'connected': False,
}, namespace='/chat')
# Auto-reconnect with backoff
# BLE needs adapter power-cycle to clear stale GATT state
if self.config.use_ble:
await self._ble_reconnect()
return
# Serial/TCP: simple reconnect with backoff
for attempt in range(1, 4):
delay = 5 * attempt
logger.info(f"Reconnecting in {delay}s (attempt {attempt}/3)...")

View File

@@ -104,7 +104,13 @@ def logs():
@views_bp.route('/health')
def health():
    """Health check endpoint for monitoring.

    Returns 503 when BLE reconnection has permanently failed so Docker's
    healthcheck triggers a container restart (which clears all BLE state).
    Returns 200 with body 'OK' otherwise, including when no device manager
    is attached to the application.
    """
    from flask import current_app

    dm = getattr(current_app, 'device_manager', None)
    if dm and getattr(dm, '_ble_permanently_failed', False):
        return 'BLE connection permanently failed', 503
    return 'OK', 200