Fix stuck post-connect failure state

This commit is contained in:
Jack Kingsman
2026-02-23 23:12:53 -08:00
parent 559935e3d5
commit 4f3d8a7838
3 changed files with 100 additions and 3 deletions

View File

@@ -133,6 +133,7 @@ class RadioManager:
self._operation_lock: asyncio.Lock | None = None
self._setup_lock: asyncio.Lock | None = None
self._setup_in_progress: bool = False
self._setup_complete: bool = False
async def _acquire_operation_lock(
self,
@@ -247,6 +248,7 @@ class RadioManager:
if not self._meshcore:
return
self._setup_in_progress = True
self._setup_complete = False
mc = self._meshcore
try:
register_event_handlers(mc)
@@ -284,6 +286,8 @@ class RadioManager:
# Start periodic message polling as fallback (idempotent)
start_message_polling()
self._setup_complete = True
finally:
self._setup_in_progress = False
@@ -309,6 +313,10 @@ class RadioManager:
def is_setup_in_progress(self) -> bool:
    """Return the flag that tracks whether post-connect setup is running.

    The flag is raised when setup starts and lowered again in that
    routine's ``finally`` clause, so it is cleared on failure as well as
    on success.
    """
    return self._setup_in_progress
@property
def is_setup_complete(self) -> bool:
    """Return whether the last post-connect setup ran to completion.

    The underlying flag is set only after setup reaches its final step,
    and is cleared when setup starts, when a new serial/TCP/BLE
    connection is established, and on disconnect. A connected transport
    with this value ``False`` therefore means setup still needs to run
    (or be retried).
    """
    return self._setup_complete
async def connect(self) -> None:
"""Connect to the radio using the configured transport."""
if self._meshcore is not None:
@@ -346,6 +354,7 @@ class RadioManager:
)
self._connection_info = f"Serial: {port}"
self._last_connected = True
self._setup_complete = False
logger.debug("Serial connection established")
async def _connect_tcp(self) -> None:
@@ -362,6 +371,7 @@ class RadioManager:
)
self._connection_info = f"TCP: {host}:{port}"
self._last_connected = True
self._setup_complete = False
logger.debug("TCP connection established")
async def _connect_ble(self) -> None:
@@ -378,6 +388,7 @@ class RadioManager:
)
self._connection_info = f"BLE: {address}"
self._last_connected = True
self._setup_complete = False
logger.debug("BLE connection established")
async def disconnect(self) -> None:
@@ -386,6 +397,7 @@ class RadioManager:
logger.debug("Disconnecting from radio")
await self._meshcore.disconnect()
self._meshcore = None
self._setup_complete = False
logger.debug("Radio disconnected")
async def reconnect(self, *, broadcast_on_success: bool = True) -> bool:
@@ -475,6 +487,12 @@ class RadioManager:
broadcast_health(True, self._connection_info)
self._last_connected = True
elif current_connected and not self._setup_complete:
# Transport connected but setup incomplete — retry
logger.info("Retrying post-connect setup...")
await self.post_connect_setup()
broadcast_health(True, self._connection_info)
except asyncio.CancelledError:
# Task is being cancelled, exit cleanly
break

View File

@@ -206,7 +206,14 @@ async def reboot_radio() -> dict:
success = await radio_manager.reconnect()
if success:
await radio_manager.post_connect_setup()
try:
await radio_manager.post_connect_setup()
except Exception as e:
logger.exception("Post-connect setup failed after reconnect")
raise HTTPException(
status_code=503,
detail=f"Radio connected but setup failed: {e}",
) from e
return {"status": "ok", "message": "Reconnected successfully", "connected": True}
else:
@@ -224,7 +231,20 @@ async def reconnect_radio() -> dict:
or power-cycled.
"""
if radio_manager.is_connected:
return {"status": "ok", "message": "Already connected", "connected": True}
if radio_manager.is_setup_complete:
return {"status": "ok", "message": "Already connected", "connected": True}
# Connected but setup incomplete — retry setup
logger.info("Radio connected but setup incomplete, retrying setup")
try:
await radio_manager.post_connect_setup()
return {"status": "ok", "message": "Setup completed", "connected": True}
except Exception as e:
logger.exception("Post-connect setup failed")
raise HTTPException(
status_code=503,
detail=f"Radio connected but setup failed: {e}",
) from e
if radio_manager.is_reconnecting:
return {
@@ -237,7 +257,14 @@ async def reconnect_radio() -> dict:
success = await radio_manager.reconnect()
if success:
await radio_manager.post_connect_setup()
try:
await radio_manager.post_connect_setup()
except Exception as e:
logger.exception("Post-connect setup failed after reconnect")
raise HTTPException(
status_code=503,
detail=f"Radio connected but setup failed: {e}",
) from e
return {"status": "ok", "message": "Reconnected successfully", "connected": True}
else:

View File

@@ -221,6 +221,58 @@ class TestConnectionMonitor:
assert healthy_calls == []
assert rm._last_connected is False
@pytest.mark.asyncio
async def test_monitor_retries_setup_when_connected_but_incomplete(self):
    """A connected transport with unfinished setup makes the monitor retry setup.

    The first setup attempt raises; a later attempt succeeds and marks
    setup complete. The monitor must have called setup at least twice and
    broadcast a healthy status with the stored connection info.
    """
    from app.radio import RadioManager

    # Transport reports itself connected.
    transport = MagicMock()
    transport.is_connected = True

    # Manager state after a connection whose setup failed: the manager
    # already believes it is connected, but setup never completed.
    rm = RadioManager()
    rm._connection_info = "TCP: test:4000"
    rm._meshcore = transport
    rm._last_connected = True
    rm._setup_complete = False

    setup_calls = 0

    async def _flaky_setup():
        # Fails on the first attempt only; later attempts complete setup.
        nonlocal setup_calls
        setup_calls += 1
        if setup_calls != 1:
            rm._setup_complete = True
            return
        raise RuntimeError("setup failed")

    rm.post_connect_setup = AsyncMock(side_effect=_flaky_setup)

    naps = 0

    async def _fake_sleep(_seconds: float):
        # Let the monitor loop spin a few times, then cancel it from
        # inside its own sleep so the task ends.
        nonlocal naps
        naps += 1
        if naps < 4:
            return
        raise asyncio.CancelledError()

    with (
        patch("app.radio.asyncio.sleep", side_effect=_fake_sleep),
        patch("app.websocket.broadcast_health") as mock_broadcast,
    ):
        await rm.start_connection_monitor()
        try:
            await rm._reconnect_task
        finally:
            await rm.stop_connection_monitor()

    # Setup was attempted again after the initial failure...
    assert setup_calls >= 2
    # ...a healthy status went out once it succeeded...
    mock_broadcast.assert_any_call(True, "TCP: test:4000")
    # ...and the manager now records setup as complete.
    assert rm._setup_complete is True
class TestReconnectLock:
"""Tests for reconnect() lock serialization — no duplicate reconnections."""