From 7e1f941760ab35118c62a3015992cf8e9117a2d2 Mon Sep 17 00:00:00 2001 From: Jack Kingsman Date: Thu, 12 Mar 2026 16:57:22 -0700 Subject: [PATCH] Add documentation and force-lock-acquisition mode for channel management --- AGENTS.md | 5 +-- README.md | 17 ++++++++-- app/AGENTS.md | 3 +- app/config.py | 1 + app/radio.py | 6 ++++ app/radio_sync.py | 58 ++++++++++++++++++++++++++++++++ tests/test_radio_sync.py | 67 +++++++++++++++++++++++++++++++++++++ tests/test_send_messages.py | 25 ++++++++++++++ 8 files changed, 177 insertions(+), 5 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 9c0b0b7..bb5ce46 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -437,11 +437,12 @@ mc.subscribe(EventType.ACK, handler) | `MESHCORE_LOG_LEVEL` | `INFO` | Logging level (`DEBUG`/`INFO`/`WARNING`/`ERROR`) | | `MESHCORE_DATABASE_PATH` | `data/meshcore.db` | SQLite database location | | `MESHCORE_DISABLE_BOTS` | `false` | Disable bot system entirely (blocks execution and config) | -| `MESHCORE_ENABLE_MESSAGE_POLL_FALLBACK` | `false` | Switch the always-on message audit task from hourly checks to aggressive 10-second `get_msg()` fallback polling | | `MESHCORE_BASIC_AUTH_USERNAME` | *(none)* | Optional app-wide HTTP Basic auth username; must be set together with `MESHCORE_BASIC_AUTH_PASSWORD` | | `MESHCORE_BASIC_AUTH_PASSWORD` | *(none)* | Optional app-wide HTTP Basic auth password; must be set together with `MESHCORE_BASIC_AUTH_USERNAME` | +| `MESHCORE_ENABLE_MESSAGE_POLL_FALLBACK` | `false` | Switch the always-on radio audit task from hourly checks to aggressive 10-second polling; the audit checks both missed message drift and channel-slot cache drift | +| `MESHCORE_FORCE_CHANNEL_SLOT_RECONFIGURE` | `false` | Disable channel-slot reuse and force `set_channel(...)` before every channel send, even on serial/BLE | -**Note:** Runtime app settings are stored in the database (`app_settings` table), not environment variables. 
These include `max_radio_contacts`, `auto_decrypt_dm_on_advert`, `sidebar_sort_order`, `advert_interval`, `last_advert_time`, `favorites`, `last_message_times`, `flood_scope`, `blocked_keys`, and `blocked_names`. `max_radio_contacts` is the configured radio contact capacity baseline used by background maintenance: favorites reload first, non-favorite fill targets about 80% of that value, and full offload/reload triggers around 95% occupancy. They are configured via `GET/PATCH /api/settings`. MQTT, bot, webhook, Apprise, and SQS configs are stored in the `fanout_configs` table, managed via `/api/fanout`. +**Note:** Runtime app settings are stored in the database (`app_settings` table), not environment variables. These include `max_radio_contacts`, `auto_decrypt_dm_on_advert`, `sidebar_sort_order`, `advert_interval`, `last_advert_time`, `favorites`, `last_message_times`, `flood_scope`, `blocked_keys`, and `blocked_names`. `max_radio_contacts` is the configured radio contact capacity baseline used by background maintenance: favorites reload first, non-favorite fill targets about 80% of that value, and full offload/reload triggers around 95% occupancy. They are configured via `GET/PATCH /api/settings`. MQTT, bot, webhook, Apprise, and SQS configs are stored in the `fanout_configs` table, managed via `/api/fanout`. If the radio's channel slots appear unstable or another client is mutating them underneath this app, operators can force the old always-reconfigure send path with `MESHCORE_FORCE_CHANNEL_SLOT_RECONFIGURE=true`. Byte-perfect channel retries are user-triggered via `POST /api/messages/channel/{message_id}/resend` and are allowed for 30 seconds after the original send. 
diff --git a/README.md b/README.md index a141f1c..9e89013 100644 --- a/README.md +++ b/README.md @@ -224,7 +224,6 @@ npm run build # build the frontend | `MESHCORE_LOG_LEVEL` | INFO | DEBUG, INFO, WARNING, ERROR | | `MESHCORE_DATABASE_PATH` | data/meshcore.db | SQLite database path | | `MESHCORE_DISABLE_BOTS` | false | Disable bot system entirely (blocks execution and config) | -| `MESHCORE_ENABLE_MESSAGE_POLL_FALLBACK` | false | Run aggressive 10-second `get_msg()` fallback polling instead of the default hourly audit task | | `MESHCORE_BASIC_AUTH_USERNAME` | | Optional app-wide HTTP Basic auth username; must be set together with `MESHCORE_BASIC_AUTH_PASSWORD` | | `MESHCORE_BASIC_AUTH_PASSWORD` | | Optional app-wide HTTP Basic auth password; must be set together with `MESHCORE_BASIC_AUTH_USERNAME` | @@ -232,7 +231,21 @@ Only one transport may be active at a time. If multiple are set, the server will If you enable Basic Auth, protect the app with HTTPS. HTTP Basic credentials are not safe on plain HTTP. -By default the app relies on radio events plus MeshCore auto-fetch for incoming messages, and also runs a low-frequency hourly audit poll. If that audit ever finds radio data that was not surfaced through event subscription, the backend logs an error and the UI shows a toast telling the operator to check the logs. If you see that warning, or if messages on the radio never show up in the app, try `MESHCORE_ENABLE_MESSAGE_POLL_FALLBACK=true` to switch that task into a more aggressive 10-second `get_msg()` safety net. +### Remediation Environment Variables + +These are intended for diagnosing or working around radios that behave oddly. 
+ +| Variable | Default | Description | +|----------|---------|-------------| +| `MESHCORE_ENABLE_MESSAGE_POLL_FALLBACK` | false | Run aggressive 10-second `get_msg()` fallback polling instead of the default hourly sanity check | +| `MESHCORE_FORCE_CHANNEL_SLOT_RECONFIGURE` | false | Disable channel-slot reuse and force `set_channel(...)` before every channel send | + +By default the app relies on radio events plus MeshCore auto-fetch for incoming messages, and also runs a low-frequency hourly audit poll. That audit checks both: + +- whether messages were left on the radio without reaching the app through event subscription +- whether the app's channel-slot expectations still match the radio's actual channel listing + +If the audit finds a mismatch, you'll see an error in the application UI and your logs. If you see that error, or if messages on the radio never show up in the app, try `MESHCORE_ENABLE_MESSAGE_POLL_FALLBACK=true` to switch that task into a more aggressive 10-second safety net. If room sends appear to be using the wrong channel slot or another client is changing slots underneath this app, try `MESHCORE_FORCE_CHANNEL_SLOT_RECONFIGURE=true` to make the app rewrite the channel slot with `set_channel(...)` before every send instead of trusting its cached slot assignment (this delays each send by roughly 500ms). ## Additional Setup diff --git a/app/AGENTS.md b/app/AGENTS.md index 80165ee..183fa07 100644 --- a/app/AGENTS.md +++ b/app/AGENTS.md @@ -89,7 +89,7 @@ app/ - `RadioManager.post_connect_setup()` delegates to `services/radio_lifecycle.py`. - Routers, startup/lifespan code, fanout helpers, and `radio_sync.py` should reach radio state through `services/radio_runtime.py`, not by importing `app.radio.radio_manager` directly. - Shared reconnect/setup helpers in `services/radio_lifecycle.py` are used by startup, the monitor, and manual reconnect/reboot flows before broadcasting healthy state. -- Setup still includes handler registration, key export, time sync, contact/channel sync, and advertisement tasks. 
The message-poll task always starts: by default it runs as a low-frequency hourly audit, and `MESHCORE_ENABLE_MESSAGE_POLL_FALLBACK=true` switches it to aggressive 10-second polling. +- Setup still includes handler registration, key export, time sync, contact/channel sync, and advertisement tasks. The message-poll task always starts: by default it runs as a low-frequency hourly audit, and `MESHCORE_ENABLE_MESSAGE_POLL_FALLBACK=true` switches it to aggressive 10-second polling. That audit checks both missed-radio-message drift and channel-slot cache drift; on a cache mismatch the backend logs an error, broadcasts an `error` toast, and resets the send-slot cache. - Post-connect setup is timeout-bounded. If initial radio offload/setup hangs too long, the backend logs the failure and broadcasts an `error` toast telling the operator to reboot the radio and restart the server. ## Important Behaviors @@ -101,6 +101,7 @@ app/ - Channel slot count comes from firmware-reported `DEVICE_INFO.max_channels`; do not hardcode `40` when scanning/offloading channel slots. - Channel sends use a session-local LRU slot cache after startup channel offload clears the radio. Repeated sends to the same room reuse the loaded slot; new rooms fill free slots up to the discovered channel capacity, then evict the least recently used cached room. - TCP radios do not reuse cached slot contents. For TCP, channel sends still force `set_channel(...)` before every send because this backend does not have exclusive device access. +- `MESHCORE_FORCE_CHANNEL_SLOT_RECONFIGURE=true` disables slot reuse on all transports and forces the old always-`set_channel(...)` behavior before every channel send. - Contacts persist `out_path_hash_mode` in the database so contact sync and outbound DM routing reuse the exact stored mode instead of inferring from path bytes. - Contacts may also persist `route_override_path`, `route_override_len`, and `route_override_hash_mode`. 
`Contact.to_radio_dict()` gives these override fields precedence over learned `last_path*`, while advert processing still updates the learned route for telemetry/fallback. - `contact_advert_paths` identity is `(public_key, path_hex, path_len)` because the same hex bytes can represent different routes at different hop widths. diff --git a/app/config.py b/app/config.py index db6406c..6525389 100644 --- a/app/config.py +++ b/app/config.py @@ -19,6 +19,7 @@ class Settings(BaseSettings): database_path: str = "data/meshcore.db" disable_bots: bool = False enable_message_poll_fallback: bool = False + force_channel_slot_reconfigure: bool = False basic_auth_username: str = "" basic_auth_password: str = "" diff --git a/app/radio.py b/app/radio.py index 768b762..89590c2 100644 --- a/app/radio.py +++ b/app/radio.py @@ -234,6 +234,8 @@ class RadioManager: def channel_slot_reuse_enabled(self) -> bool: """Return whether this transport can safely reuse cached channel slots.""" + if settings.force_channel_slot_reconfigure: + return False if self._connection_info: return not self._connection_info.startswith("TCP:") return settings.connection_type != "tcp" @@ -304,6 +306,10 @@ class RadioManager: self._channel_slot_by_key.move_to_end(normalized_key) self._channel_key_by_slot[slot] = normalized_key + def get_channel_send_cache_snapshot(self) -> list[tuple[str, int]]: + """Return the current channel send cache contents in LRU order.""" + return list(self._channel_slot_by_key.items()) + def _find_first_free_channel_slot(self, capacity: int, preferred_slot: int) -> int: """Pick the first unclaimed app-managed slot, preferring the requested slot.""" if preferred_slot < capacity and preferred_slot not in self._channel_key_by_slot: diff --git a/app/radio_sync.py b/app/radio_sync.py index 20e6299..b2cee5f 100644 --- a/app/radio_sync.py +++ b/app/radio_sync.py @@ -466,6 +466,63 @@ async def poll_for_messages(mc: MeshCore) -> int: return count +def _normalize_channel_secret(payload: dict) -> 
bytes: + """Return a normalized bytes representation of a radio channel secret.""" + secret = payload.get("channel_secret", b"") + if isinstance(secret, bytes): + return secret + return bytes(secret) + + +async def audit_channel_send_cache(mc: MeshCore) -> bool: + """Verify cached send-slot expectations still match radio channel contents. + + If a mismatch is detected, the app's send-slot cache is reset so future sends + fall back to reloading channels before reuse resumes. + """ + if not radio_manager.channel_slot_reuse_enabled(): + return True + + cached_slots = radio_manager.get_channel_send_cache_snapshot() + if not cached_slots: + return True + + mismatches: list[str] = [] + for channel_key, slot in cached_slots: + result = await mc.commands.get_channel(slot) + if result.type != EventType.CHANNEL_INFO: + mismatches.append( + f"slot {slot}: expected {channel_key[:8]} but radio returned {result.type}" + ) + continue + + observed_name = result.payload.get("channel_name") or "" + observed_key = _normalize_channel_secret(result.payload).hex().upper() + expected_channel = await ChannelRepository.get_by_key(channel_key) + expected_name = expected_channel.name if expected_channel is not None else None + + if observed_key != channel_key or expected_name is None or observed_name != expected_name: + mismatches.append( + f"slot {slot}: expected {expected_name or '(missing db row)'} " + f"{channel_key[:8]}, got {observed_name or '(empty)'} {observed_key[:8]}" + ) + + if not mismatches: + return True + + logger.error( + "A periodic radio audit discovered that the channel send-slot cache fell out of sync with radio state. This indicates that some other system, internal or external to the radio, has updated the channel slots on the radio (which the app assumes it has exclusive rights to, except on TCP-linked devices). 
The cache is resetting now, but you should review the README.md and consider using the environment variable MESHCORE_FORCE_CHANNEL_SLOT_RECONFIGURE=true to make the radio use non-optimistic channel management and force-write the channel to radio before each send. This is a minor performance hit, but guarantees consistency. Mismatches found: %s", + "; ".join(mismatches), + ) + radio_manager.reset_channel_send_cache() + broadcast_error( + "A periodic poll task has discovered radio inconsistencies.", + "Please check the logs for recommendations (search " + "'MESHCORE_FORCE_CHANNEL_SLOT_RECONFIGURE').", + ) + return False + + async def _message_poll_loop(): """Background task that periodically polls for messages.""" while True: @@ -483,6 +540,7 @@ async def _message_poll_loop(): suspend_auto_fetch=True, ) as mc: count = await poll_for_messages(mc) + await audit_channel_send_cache(mc) if count > 0: if aggressive_fallback: logger.warning( diff --git a/tests/test_radio_sync.py b/tests/test_radio_sync.py index 075a335..c499ab3 100644 --- a/tests/test_radio_sync.py +++ b/tests/test_radio_sync.py @@ -16,6 +16,7 @@ from app.radio_sync import ( _message_poll_loop, _periodic_advert_loop, _periodic_sync_loop, + audit_channel_send_cache, ensure_contact_on_radio, is_polling_paused, pause_polling, @@ -38,6 +39,7 @@ def reset_sync_state(): prev_mc = radio_manager._meshcore prev_lock = radio_manager._operation_lock prev_max_channels = radio_manager.max_channels + prev_connection_info = radio_manager._connection_info prev_slot_by_key = radio_manager._channel_slot_by_key.copy() prev_key_by_slot = radio_manager._channel_key_by_slot.copy() @@ -49,6 +51,7 @@ def reset_sync_state(): radio_manager._meshcore = prev_mc radio_manager._operation_lock = prev_lock radio_manager.max_channels = prev_max_channels + radio_manager._connection_info = prev_connection_info radio_manager._channel_slot_by_key = prev_slot_by_key radio_manager._channel_key_by_slot = prev_key_by_slot @@ -1341,6 +1344,70 @@ 
class TestMessagePollLoopRaces: mock_broadcast_error.assert_not_called() +class TestChannelSendCacheAudit: + """Verify session-local channel-slot reuse state is audited against the radio.""" + + @pytest.mark.asyncio + async def test_audit_channel_send_cache_accepts_matching_radio_state(self, test_db): + chan_key = "ab" * 16 + await ChannelRepository.upsert(key=chan_key, name="#flightless") + radio_manager.note_channel_slot_loaded(chan_key, 0) + + ok_result = MagicMock() + ok_result.type = EventType.CHANNEL_INFO + ok_result.payload = { + "channel_name": "#flightless", + "channel_secret": bytes.fromhex(chan_key), + } + + mock_mc = MagicMock() + mock_mc.commands.get_channel = AsyncMock(return_value=ok_result) + + with patch("app.radio_sync.broadcast_error") as mock_broadcast_error: + assert await audit_channel_send_cache(mock_mc) is True + + mock_mc.commands.get_channel.assert_awaited_once_with(0) + mock_broadcast_error.assert_not_called() + assert radio_manager.get_cached_channel_slot(chan_key) == 0 + + @pytest.mark.asyncio + async def test_audit_channel_send_cache_resets_and_toasts_on_mismatch(self, test_db): + chan_key = "cd" * 16 + await ChannelRepository.upsert(key=chan_key, name="#flightless") + radio_manager.note_channel_slot_loaded(chan_key, 0) + + mismatch_result = MagicMock() + mismatch_result.type = EventType.CHANNEL_INFO + mismatch_result.payload = { + "channel_name": "#elsewhere", + "channel_secret": bytes.fromhex("ef" * 16), + } + + mock_mc = MagicMock() + mock_mc.commands.get_channel = AsyncMock(return_value=mismatch_result) + + with ( + patch("app.radio_sync.logger") as mock_logger, + patch("app.radio_sync.broadcast_error") as mock_broadcast_error, + ): + assert await audit_channel_send_cache(mock_mc) is False + + mock_logger.error.assert_called_once() + mock_broadcast_error.assert_called_once() + assert radio_manager.get_cached_channel_slot(chan_key) is None + + @pytest.mark.asyncio + async def 
test_audit_channel_send_cache_skips_when_reuse_forced_off(self, test_db): + chan_key = "ef" * 16 + radio_manager.note_channel_slot_loaded(chan_key, 0) + mock_mc = MagicMock() + + with patch("app.radio.settings.force_channel_slot_reconfigure", True): + assert await audit_channel_send_cache(mock_mc) is True + + mock_mc.commands.get_channel.assert_not_called() + + class TestPeriodicAdvertLoopRaces: """Regression tests for disconnect/reconnect race paths in _periodic_advert_loop.""" diff --git a/tests/test_send_messages.py b/tests/test_send_messages.py index 9ce51af..2aacd1e 100644 --- a/tests/test_send_messages.py +++ b/tests/test_send_messages.py @@ -450,6 +450,31 @@ class TestOutgoingChannelBroadcast: assert mc.commands.send_chan_msg.await_count == 2 assert radio_manager.get_cached_channel_slot(chan_key) is None + @pytest.mark.asyncio + async def test_send_channel_msg_force_reconfigure_env_disables_reuse(self, test_db): + mc = _make_mc(name="MyNode") + chan_key = "e1" * 16 + await ChannelRepository.upsert(key=chan_key, name="#forced") + radio_manager.max_channels = 4 + radio_manager._connection_info = "Serial: /dev/ttyUSB0" + + with ( + patch("app.routers.messages.require_connected", return_value=mc), + patch.object(radio_manager, "_meshcore", mc), + patch("app.decoder.calculate_channel_hash", return_value="abcd"), + patch("app.routers.messages.broadcast_event"), + patch("app.radio.settings.force_channel_slot_reconfigure", True), + ): + await send_channel_message( + SendChannelMessageRequest(channel_key=chan_key, text="first send") + ) + await send_channel_message( + SendChannelMessageRequest(channel_key=chan_key, text="second send") + ) + + assert mc.commands.set_channel.await_count == 2 + assert radio_manager.get_cached_channel_slot(chan_key) is None + class TestResendChannelMessage: """Test the user-triggered resend endpoint."""