diff --git a/CLAUDE.md b/CLAUDE.md index 8ef90d9..978d926 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -16,6 +16,8 @@ Always edit the source templates, then regenerate with `python scripts/render_si ## Running Commands +**IMPORTANT: Always activate the virtual environment before running any Python commands.** + ```bash cd /path/to/meshcore-stats source .venv/bin/activate @@ -354,11 +356,17 @@ All configuration via `meshcore.conf` or environment variables. The config file ### Timeouts & Retry - `REMOTE_TIMEOUT_S`: Minimum timeout for LoRa requests (default: 10) -- `REMOTE_RETRY_ATTEMPTS`: Number of retry attempts (default: 5) +- `REMOTE_RETRY_ATTEMPTS`: Number of retry attempts (default: 2) - `REMOTE_RETRY_BACKOFF_S`: Seconds between retries (default: 4) - `REMOTE_CB_FAILS`: Failures before circuit breaker opens (default: 6) - `REMOTE_CB_COOLDOWN_S`: Circuit breaker cooldown (default: 3600) +### Telemetry Collection +- `TELEMETRY_ENABLED`: Enable environmental telemetry collection from repeater (0/1, default: 0) +- `TELEMETRY_TIMEOUT_S`: Timeout for telemetry requests (default: 10) +- `TELEMETRY_RETRY_ATTEMPTS`: Retry attempts for telemetry (default: 2) +- `TELEMETRY_RETRY_BACKOFF_S`: Backoff between telemetry retries (default: 4) + ### Intervals - `COMPANION_STEP`: Collection interval for companion (default: 60s) - `REPEATER_STEP`: Collection interval for repeater (default: 900s / 15min) @@ -410,6 +418,12 @@ Metrics are classified as either **gauge** or **counter** in `src/meshmon/metric Counter metrics are converted to rates during chart rendering by calculating deltas between consecutive readings. 
+- **TELEMETRY**: Environmental sensor data (when `TELEMETRY_ENABLED=1`): + - Stored with `telemetry.` prefix: `telemetry.temperature.0`, `telemetry.humidity.0`, `telemetry.barometer.0` + - Channel number distinguishes multiple sensors of the same type + - Compound values (e.g., GPS) stored as: `telemetry.gps.0.latitude`, `telemetry.gps.0.longitude` + - Telemetry collection does NOT affect circuit breaker state + ## Database Schema Metrics are stored in a SQLite database at `data/state/metrics.db` with WAL mode enabled for concurrent access. diff --git a/docs/firmware-responses.md b/docs/firmware-responses.md index ec1c4c7..3e3a1a9 100644 --- a/docs/firmware-responses.md +++ b/docs/firmware-responses.md @@ -102,6 +102,84 @@ Returns a single dict with all status fields. --- +## Telemetry Data + +Environmental telemetry is requested via `req_telemetry_sync(contact)` and returns +Cayenne LPP formatted sensor data. This requires `TELEMETRY_ENABLED=1` and a sensor +board attached to the repeater. + +### Payload Format + +Both `req_telemetry_sync()` and `get_self_telemetry()` return a dict containing the +LPP data list and a public key prefix: + +```python +{ + 'pubkey_pre': 'a5c14f5244d6', + 'lpp': [ + {'channel': 0, 'type': 'temperature', 'value': 23.5}, + {'channel': 0, 'type': 'humidity', 'value': 45.2}, + ] +} +``` + +The `extract_lpp_from_payload()` helper in `src/meshmon/telemetry.py` handles +extracting the `lpp` list from this wrapper format. 
+ +### `req_telemetry_sync(contact)` + +Returns sensor readings from a remote node. The `lpp` list inside the returned payload is in Cayenne LPP format: + +```python +[ + {'channel': 0, 'type': 'temperature', 'value': 23.5}, + {'channel': 0, 'type': 'humidity', 'value': 45.2}, + {'channel': 0, 'type': 'barometer', 'value': 1013.25}, + {'channel': 1, 'type': 'gps', 'value': {'latitude': 51.5, 'longitude': -0.1, 'altitude': 10}}, +] +``` + +**Common sensor types:** + +| Type | Unit | Description | +|------|------|-------------| +| `temperature` | Celsius | Temperature reading | +| `humidity` | % | Relative humidity | +| `barometer` | hPa/mbar | Barometric pressure | +| `voltage` | V | Voltage reading | +| `gps` | compound | GPS with `latitude`, `longitude`, `altitude` | + +**Stored as:** +- `telemetry.temperature.0` - Temperature on channel 0 +- `telemetry.humidity.0` - Humidity on channel 0 +- `telemetry.gps.1.latitude` - GPS latitude on channel 1 + +**Notes:** +- Requires environmental sensor board (BME280, BME680, etc.) on repeater +- Channel number distinguishes multiple sensors of the same type +- Not all repeaters have environmental sensors attached +- Telemetry collection does not affect circuit breaker state +- Telemetry failures are logged as warnings and do not block status collection + +### `get_self_telemetry()` + +Returns self telemetry from the companion node's attached sensors. +Same Cayenne LPP format as `req_telemetry_sync()`. 
+ +```python +[ + {'channel': 0, 'type': 'temperature', 'value': 23.5}, + {'channel': 0, 'type': 'humidity', 'value': 45.2}, +] +``` + +**Notes:** +- Requires environmental sensor board attached to companion +- Returns empty list if no sensors attached +- Uses same format as repeater telemetry + +--- + ## Derived Metrics These are computed at query time, not stored: diff --git a/meshcore.conf.example b/meshcore.conf.example index f13f3c9..ed0c540 100644 --- a/meshcore.conf.example +++ b/meshcore.conf.example @@ -113,6 +113,23 @@ RADIO_CODING_RATE=CR8 # REMOTE_CB_FAILS=6 # REMOTE_CB_COOLDOWN_S=3600 +# ============================================================================= +# Telemetry Collection (Environmental Sensors) +# ============================================================================= +# Enable telemetry collection from repeater's environmental sensors +# (temperature, humidity, barometric pressure, etc.) +# Requires sensor board attached to repeater (e.g., BME280, BME680) +# Default: 0 (disabled) +# TELEMETRY_ENABLED=1 + +# Telemetry-specific timeout and retry settings +# Defaults match status settings. Separate config allows tuning if telemetry +# proves problematic (e.g., firmware doesn't support it, sensor board missing). +# You can reduce these if telemetry collection is causing issues. 
+# TELEMETRY_TIMEOUT_S=10 +# TELEMETRY_RETRY_ATTEMPTS=2 +# TELEMETRY_RETRY_BACKOFF_S=4 + # ============================================================================= # Paths (Native installation only) # ============================================================================= diff --git a/scripts/collect_companion.py b/scripts/collect_companion.py index 2b649c3..8118f18 100755 --- a/scripts/collect_companion.py +++ b/scripts/collect_companion.py @@ -27,6 +27,7 @@ from meshmon.env import get_config from meshmon import log from meshmon.meshcore_client import connect_with_lock, run_command from meshmon.db import init_db, insert_metrics +from meshmon.telemetry import extract_lpp_from_payload, extract_telemetry_metrics async def collect_companion() -> int: @@ -93,15 +94,26 @@ async def collect_companion() -> int: else: log.error(f"get_time failed: {err}") - # get_self_telemetry + # get_self_telemetry - collect environmental sensor data + # Note: The call happens regardless of telemetry_enabled for device query completeness, + # but we only extract and store metrics if the feature is enabled. 
ok, evt_type, payload, err = await run_command( mc, cmd.get_self_telemetry(), "get_self_telemetry" ) if ok: commands_succeeded += 1 log.debug(f"get_self_telemetry: {payload}") + # Extract and store telemetry if enabled + if cfg.telemetry_enabled: + lpp_data = extract_lpp_from_payload(payload) + if lpp_data is not None: + telemetry_metrics = extract_telemetry_metrics(lpp_data) + if telemetry_metrics: + metrics.update(telemetry_metrics) + log.debug(f"Extracted {len(telemetry_metrics)} telemetry metrics") else: - log.error(f"get_self_telemetry failed: {err}") + # Debug level because not all devices have sensors attached - this is expected + log.debug(f"get_self_telemetry failed: {err}") # get_custom_vars ok, evt_type, payload, err = await run_command( @@ -176,6 +188,10 @@ async def collect_companion() -> int: summary_parts.append(f"rx={int(metrics['recv'])}") if "sent" in metrics: summary_parts.append(f"tx={int(metrics['sent'])}") + # Add telemetry count to summary if present + telemetry_count = sum(1 for k in metrics if k.startswith("telemetry.")) + if telemetry_count > 0: + summary_parts.append(f"telem={telemetry_count}") log.info(f"Companion: {', '.join(summary_parts)}") diff --git a/scripts/collect_repeater.py b/scripts/collect_repeater.py index d69106b..a658c51 100755 --- a/scripts/collect_repeater.py +++ b/scripts/collect_repeater.py @@ -32,10 +32,10 @@ from meshmon.meshcore_client import ( get_contact_by_name, get_contact_by_key_prefix, extract_contact_info, - list_contacts_summary, ) from meshmon.db import init_db, insert_metrics from meshmon.retry import get_repeater_circuit_breaker, with_retries +from meshmon.telemetry import extract_lpp_from_payload, extract_telemetry_metrics async def find_repeater_contact(mc: Any) -> Optional[Any]: @@ -143,8 +143,10 @@ async def query_repeater_with_retry( async def collect_repeater() -> int: - """ - Collect data from remote repeater node. + """Collect data from remote repeater node. 
+ + Collects status metrics (battery, uptime, packet counters, etc.) and + optionally telemetry data (temperature, humidity, pressure) if enabled. Returns: Exit code (0 = success, 1 = error) @@ -162,7 +164,8 @@ async def collect_repeater() -> int: return 0 # Metrics to insert (firmware field names from req_status_sync) - metrics: dict[str, float] = {} + status_metrics: dict[str, float] = {} + telemetry_metrics: dict[str, float] = {} node_name = "unknown" status_ok = False @@ -213,7 +216,7 @@ async def collect_repeater() -> int: except Exception as e: log.debug(f"Login not supported: {e}") - # Query status (using _sync version which returns payload directly) + # Phase 1: Status collection (affects circuit breaker) # Use timeout=0 to let the device suggest timeout, with min_timeout as floor log.debug("Querying repeater status...") success, payload, err = await query_repeater_with_retry( @@ -227,12 +230,12 @@ async def collect_repeater() -> int: # Insert all numeric fields from status response for key, value in payload.items(): if isinstance(value, (int, float)): - metrics[key] = float(value) + status_metrics[key] = float(value) log.debug(f"req_status_sync: {payload}") else: log.warn(f"req_status_sync failed: {err}") - # Update circuit breaker + # Update circuit breaker based on status result if status_ok: cb.record_success() log.debug("Circuit breaker: recorded success") @@ -240,6 +243,51 @@ async def collect_repeater() -> int: cb.record_failure(cfg.remote_cb_fails, cfg.remote_cb_cooldown_s) log.debug(f"Circuit breaker: recorded failure ({cb.consecutive_failures}/{cfg.remote_cb_fails})") + # CRITICAL: Store status metrics immediately before attempting telemetry + # This ensures critical data is saved even if telemetry fails + if status_ok and status_metrics: + try: + inserted = insert_metrics(ts=ts, role="repeater", metrics=status_metrics) + log.debug(f"Stored {inserted} status metrics (ts={ts})") + except Exception as e: + log.error(f"Failed to store status metrics: 
{e}") + return 1 + + # Phase 2: Telemetry collection (does NOT affect circuit breaker) + if cfg.telemetry_enabled and status_ok: + log.debug("Querying repeater telemetry...") + try: + # Note: Telemetry uses its own retry settings and does NOT + # affect circuit breaker. Status success proves the link is up; + # telemetry failures are likely firmware/capability issues. + telem_success, telem_payload, telem_err = await with_retries( + lambda: cmd.req_telemetry_sync( + contact, timeout=0, min_timeout=cfg.telemetry_timeout_s + ), + attempts=cfg.telemetry_retry_attempts, + backoff_s=cfg.telemetry_retry_backoff_s, + name="req_telemetry_sync", + ) + + if telem_success and telem_payload: + log.debug(f"req_telemetry_sync: {telem_payload}") + lpp_data = extract_lpp_from_payload(telem_payload) + if lpp_data is not None: + telemetry_metrics = extract_telemetry_metrics(lpp_data) + log.debug(f"Extracted {len(telemetry_metrics)} telemetry metrics") + + # Store telemetry metrics + if telemetry_metrics: + try: + inserted = insert_metrics(ts=ts, role="repeater", metrics=telemetry_metrics) + log.debug(f"Stored {inserted} telemetry metrics") + except Exception as e: + log.warn(f"Failed to store telemetry metrics: {e}") + else: + log.warn(f"req_telemetry_sync failed: {telem_err}") + except Exception as e: + log.warn(f"Telemetry collection error (continuing): {e}") + except Exception as e: log.error(f"Error during collection: {e}") cb.record_failure(cfg.remote_cb_fails, cfg.remote_cb_cooldown_s) @@ -248,28 +296,21 @@ async def collect_repeater() -> int: # Print summary summary_parts = [f"ts={ts}"] - if "bat" in metrics: - bat_v = metrics["bat"] / 1000.0 + if "bat" in status_metrics: + bat_v = status_metrics["bat"] / 1000.0 summary_parts.append(f"bat={bat_v:.2f}V") - if "uptime" in metrics: - uptime_days = metrics["uptime"] // 86400 + if "uptime" in status_metrics: + uptime_days = status_metrics["uptime"] // 86400 summary_parts.append(f"uptime={int(uptime_days)}d") - if "nb_recv" in 
metrics: - summary_parts.append(f"rx={int(metrics['nb_recv'])}") - if "nb_sent" in metrics: - summary_parts.append(f"tx={int(metrics['nb_sent'])}") + if "nb_recv" in status_metrics: + summary_parts.append(f"rx={int(status_metrics['nb_recv'])}") + if "nb_sent" in status_metrics: + summary_parts.append(f"tx={int(status_metrics['nb_sent'])}") + if telemetry_metrics: + summary_parts.append(f"telem={len(telemetry_metrics)}") log.info(f"Repeater ({node_name}): {', '.join(summary_parts)}") - # Write metrics to database - if status_ok and metrics: - try: - inserted = insert_metrics(ts=ts, role="repeater", metrics=metrics) - log.debug(f"Inserted {inserted} metrics to database (ts={ts})") - except Exception as e: - log.error(f"Failed to write metrics to database: {e}") - return 1 - return 0 if status_ok else 1 diff --git a/src/meshmon/env.py b/src/meshmon/env.py index 1561cda..0adb357 100644 --- a/src/meshmon/env.py +++ b/src/meshmon/env.py @@ -155,6 +155,14 @@ class Config: self.remote_cb_fails = get_int("REMOTE_CB_FAILS", 6) self.remote_cb_cooldown_s = get_int("REMOTE_CB_COOLDOWN_S", 3600) + # Telemetry collection (requires sensor board on repeater) + self.telemetry_enabled = get_bool("TELEMETRY_ENABLED", False) + # Separate settings allow tuning if telemetry proves problematic + # Defaults match status settings - tune down if needed + self.telemetry_timeout_s = get_int("TELEMETRY_TIMEOUT_S", 10) + self.telemetry_retry_attempts = get_int("TELEMETRY_RETRY_ATTEMPTS", 2) + self.telemetry_retry_backoff_s = get_int("TELEMETRY_RETRY_BACKOFF_S", 4) + # Paths (defaults are Docker container paths; native installs override via config) self.state_dir = get_path("STATE_DIR", "/data/state") self.out_dir = get_path("OUT_DIR", "/out") diff --git a/src/meshmon/telemetry.py b/src/meshmon/telemetry.py new file mode 100644 index 0000000..b867448 --- /dev/null +++ b/src/meshmon/telemetry.py @@ -0,0 +1,102 @@ +"""Telemetry data extraction from Cayenne LPP format.""" + +from typing import Any 
import math

from . import log

__all__ = ["extract_lpp_from_payload", "extract_telemetry_metrics"]


def extract_lpp_from_payload(payload: Any) -> list | None:
    """Extract the LPP data list from a telemetry payload.

    Two payload shapes are accepted:
    - Dict format: {'pubkey_pre': '...', 'lpp': [...]}
    - Direct list format: [...]

    Args:
        payload: Raw telemetry payload from get_self_telemetry() or
            req_telemetry_sync().

    Returns:
        The LPP data list, or None if the payload is None, is a dict without
        a list under 'lpp', or is of any other type.
    """
    if payload is None:
        return None

    if isinstance(payload, dict):
        lpp = payload.get("lpp")
        if lpp is None:
            log.debug("No 'lpp' key in telemetry payload dict")
            return None
        if not isinstance(lpp, list):
            log.debug(f"Unexpected LPP data type in payload: {type(lpp).__name__}")
            return None
        return lpp

    if isinstance(payload, list):
        return payload

    log.debug(f"Unexpected telemetry payload type: {type(payload).__name__}")
    return None


def _normalize_key_part(raw: Any) -> str | None:
    """Return *raw* as a lowercase, underscore-separated key part, or None if unusable."""
    if not isinstance(raw, str):
        return None
    cleaned = raw.strip().lower().replace(" ", "_")
    return cleaned or None


def _to_finite_float(value: Any) -> float | None:
    """Return *value* as a finite float, or None if it is not a usable number."""
    # bool is a subclass of int, so digital on/off readings become 1.0 / 0.0.
    if not isinstance(value, (int, float)):
        return None
    try:
        number = float(value)
    except OverflowError:
        # An int too large for a float must not abort the whole extraction.
        return None
    # NaN and +/-Infinity are not meaningful readings and cannot be charted.
    return number if math.isfinite(number) else None


def extract_telemetry_metrics(lpp_data: Any) -> dict[str, float]:
    """Extract numeric telemetry values from a Cayenne LPP response.

    Expected format:
        [
            {"type": "temperature", "channel": 0, "value": 23.5},
            {"type": "gps", "channel": 1,
             "value": {"latitude": 51.5, "longitude": -0.1, "altitude": 10}}
        ]

    Keys are formatted as:
    - telemetry.{type}.{channel} for scalar values
    - telemetry.{type}.{channel}.{subkey} for compound values (e.g., GPS)

    Args:
        lpp_data: List of LPP reading dicts. Any other type yields an empty
            result and a warning.

    Returns:
        Dict mapping metric keys to finite float values. Readings that are
        not dicts, have no usable type, or carry a non-numeric or non-finite
        (NaN/Infinity) value are skipped.
    """
    if not isinstance(lpp_data, list):
        log.warn(f"Expected list for LPP data, got {type(lpp_data).__name__}")
        return {}

    metrics: dict[str, float] = {}

    for i, reading in enumerate(lpp_data):
        if not isinstance(reading, dict):
            log.debug(f"Skipping non-dict LPP reading at index {i}")
            continue

        sensor_type = _normalize_key_part(reading.get("type"))
        if sensor_type is None:
            log.debug(f"Skipping reading with invalid type at index {i}")
            continue

        channel = reading.get("channel", 0)
        # Reject bool explicitly: True/False are ints and would otherwise
        # produce keys such as "telemetry.temperature.True".
        if isinstance(channel, bool) or not isinstance(channel, int):
            channel = 0

        base_key = f"telemetry.{sensor_type}.{channel}"
        value = reading.get("value")

        if isinstance(value, dict):
            # Compound reading (e.g. GPS): one metric per numeric sub-field.
            for subkey, subval in value.items():
                subkey_clean = _normalize_key_part(subkey)
                if subkey_clean is None:
                    continue
                number = _to_finite_float(subval)
                if number is not None:
                    metrics[f"{base_key}.{subkey_clean}"] = number
        else:
            number = _to_finite_float(value)
            if number is not None:
                metrics[base_key] = number

    return metrics