From 4f3d8a7838c08d98795b45f2b5e4c65f9f78b425 Mon Sep 17 00:00:00 2001 From: Jack Kingsman Date: Mon, 23 Feb 2026 23:12:53 -0800 Subject: [PATCH] Fix stuck post-connect failure state --- app/radio.py | 18 +++++++++++++++ app/routers/radio.py | 33 +++++++++++++++++++++++++--- tests/test_radio.py | 52 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 3 deletions(-) diff --git a/app/radio.py b/app/radio.py index 7c1ac1a..14eba3a 100644 --- a/app/radio.py +++ b/app/radio.py @@ -133,6 +133,7 @@ class RadioManager: self._operation_lock: asyncio.Lock | None = None self._setup_lock: asyncio.Lock | None = None self._setup_in_progress: bool = False + self._setup_complete: bool = False async def _acquire_operation_lock( self, @@ -247,6 +248,7 @@ class RadioManager: if not self._meshcore: return self._setup_in_progress = True + self._setup_complete = False mc = self._meshcore try: register_event_handlers(mc) @@ -284,6 +286,8 @@ class RadioManager: # Start periodic message polling as fallback (idempotent) start_message_polling() + + self._setup_complete = True finally: self._setup_in_progress = False @@ -309,6 +313,10 @@ class RadioManager: def is_setup_in_progress(self) -> bool: return self._setup_in_progress + @property + def is_setup_complete(self) -> bool: + return self._setup_complete + async def connect(self) -> None: """Connect to the radio using the configured transport.""" if self._meshcore is not None: @@ -346,6 +354,7 @@ class RadioManager: ) self._connection_info = f"Serial: {port}" self._last_connected = True + self._setup_complete = False logger.debug("Serial connection established") async def _connect_tcp(self) -> None: @@ -362,6 +371,7 @@ class RadioManager: ) self._connection_info = f"TCP: {host}:{port}" self._last_connected = True + self._setup_complete = False logger.debug("TCP connection established") async def _connect_ble(self) -> None: @@ -378,6 +388,7 @@ class RadioManager: ) self._connection_info = f"BLE: {address}" self._last_connected = True + self._setup_complete = False logger.debug("BLE connection established") async def disconnect(self) -> None: @@ -386,6 +397,7 @@ class RadioManager: logger.debug("Disconnecting from radio") await self._meshcore.disconnect() self._meshcore = None + self._setup_complete = False logger.debug("Radio disconnected") async def reconnect(self, *, broadcast_on_success: bool = True) -> bool: @@ -475,6 +487,12 @@ class RadioManager: broadcast_health(True, self._connection_info) self._last_connected = True + elif current_connected and not self._setup_complete: + # Transport connected but setup incomplete — retry + logger.info("Retrying post-connect setup...") + await self.post_connect_setup() + broadcast_health(True, self._connection_info) + except asyncio.CancelledError: # Task is being cancelled, exit cleanly break diff --git a/app/routers/radio.py b/app/routers/radio.py index ffc57e4..cc5174b 100644 --- a/app/routers/radio.py +++ b/app/routers/radio.py @@ -206,7 +206,14 @@ async def reboot_radio() -> dict: success = await radio_manager.reconnect() if success: - await radio_manager.post_connect_setup() + try: + await radio_manager.post_connect_setup() + except Exception as e: + logger.exception("Post-connect setup failed after reconnect") + raise HTTPException( + status_code=503, + detail=f"Radio connected but setup failed: {e}", + ) from e return {"status": "ok", "message": "Reconnected successfully", "connected": True} else: @@ -224,7 +231,20 @@ async def reconnect_radio() -> dict: or power-cycled. """ if radio_manager.is_connected: - return {"status": "ok", "message": "Already connected", "connected": True} + if radio_manager.is_setup_complete: + return {"status": "ok", "message": "Already connected", "connected": True} + + # Connected but setup incomplete — retry setup + logger.info("Radio connected but setup incomplete, retrying setup") + try: + await radio_manager.post_connect_setup() + return {"status": "ok", "message": "Setup completed", "connected": True} + except Exception as e: + logger.exception("Post-connect setup failed") + raise HTTPException( + status_code=503, + detail=f"Radio connected but setup failed: {e}", + ) from e if radio_manager.is_reconnecting: return { @@ -237,7 +257,14 @@ async def reconnect_radio() -> dict: success = await radio_manager.reconnect() if success: - await radio_manager.post_connect_setup() + try: + await radio_manager.post_connect_setup() + except Exception as e: + logger.exception("Post-connect setup failed after reconnect") + raise HTTPException( + status_code=503, + detail=f"Radio connected but setup failed: {e}", + ) from e return {"status": "ok", "message": "Reconnected successfully", "connected": True} else: diff --git a/tests/test_radio.py b/tests/test_radio.py index 25af620..b363508 100644 --- a/tests/test_radio.py +++ b/tests/test_radio.py @@ -221,6 +221,58 @@ class TestConnectionMonitor: assert healthy_calls == [] assert rm._last_connected is False + @pytest.mark.asyncio + async def test_monitor_retries_setup_when_connected_but_incomplete(self): + """Monitor retries setup when transport is connected but setup previously failed.""" + from app.radio import RadioManager + + rm = RadioManager() + rm._connection_info = "TCP: test:4000" + + # Simulate: transport connected, _last_connected=True (set by _connect_*), + # but setup failed so _setup_complete=False. + mock_mc = MagicMock() + mock_mc.is_connected = True + rm._meshcore = mock_mc + rm._last_connected = True + rm._setup_complete = False + + setup_calls = 0 + + async def _mock_setup(): + nonlocal setup_calls + setup_calls += 1 + if setup_calls == 1: + raise RuntimeError("setup failed") + # Second call succeeds + rm._setup_complete = True + + rm.post_connect_setup = AsyncMock(side_effect=_mock_setup) + + sleep_count = 0 + + async def _sleep(_seconds: float): + nonlocal sleep_count + sleep_count += 1 + if sleep_count >= 4: + raise asyncio.CancelledError() + + with ( + patch("app.radio.asyncio.sleep", side_effect=_sleep), + patch("app.websocket.broadcast_health") as mock_broadcast, + ): + await rm.start_connection_monitor() + try: + await rm._reconnect_task + finally: + await rm.stop_connection_monitor() + + # Setup should have been retried and eventually succeeded + assert setup_calls >= 2 + # Should broadcast healthy after setup succeeds + mock_broadcast.assert_any_call(True, "TCP: test:4000") + assert rm._setup_complete is True + class TestReconnectLock: """Tests for reconnect() lock serialization — no duplicate reconnections."""