diff --git a/etc/prometheus/alerts.yml b/etc/prometheus/alerts.yml index 5c478cb..ad968b3 100644 --- a/etc/prometheus/alerts.yml +++ b/etc/prometheus/alerts.yml @@ -7,10 +7,10 @@ groups: - name: meshcore rules: - alert: NodeNotSeen - expr: time() - meshcore_node_last_seen_timestamp_seconds > 48 * 3600 + expr: time() - meshcore_node_last_seen_timestamp_seconds{role="infra"} > 48 * 3600 for: 5m labels: severity: warning annotations: - summary: "Node {{ $labels.node_name }} not seen for 48+ hours" - description: "Node {{ $labels.public_key }} ({{ $labels.adv_type }}) last seen {{ $value | humanizeDuration }} ago." + summary: "Node {{ $labels.node_name }} ({{ $labels.role }}) not seen for 48+ hours" + description: "Node {{ $labels.public_key }} ({{ $labels.adv_type }}, role={{ $labels.role }}) last seen {{ $value | humanizeDuration }} ago." diff --git a/src/meshcore_hub/api/metrics.py b/src/meshcore_hub/api/metrics.py index 17e5da2..0d19f92 100644 --- a/src/meshcore_hub/api/metrics.py +++ b/src/meshcore_hub/api/metrics.py @@ -16,6 +16,7 @@ from meshcore_hub.common.models import ( Member, Message, Node, + NodeTag, Telemetry, TracePath, ) @@ -144,19 +145,31 @@ def collect_metrics(session: Any) -> bytes: node_last_seen = Gauge( "meshcore_node_last_seen_timestamp_seconds", "Unix timestamp of when the node was last seen", - ["public_key", "node_name", "adv_type"], + ["public_key", "node_name", "adv_type", "role"], registry=registry, ) + role_subq = ( + select(NodeTag.node_id, NodeTag.value.label("role")) + .where(NodeTag.key == "role") + .subquery() + ) nodes_with_last_seen = session.execute( - select(Node.public_key, Node.name, Node.adv_type, Node.last_seen).where( - Node.last_seen.isnot(None) + select( + Node.public_key, + Node.name, + Node.adv_type, + Node.last_seen, + role_subq.c.role, ) + .outerjoin(role_subq, Node.id == role_subq.c.node_id) + .where(Node.last_seen.isnot(None)) ).all() - for public_key, name, adv_type, last_seen in nodes_with_last_seen: + for public_key, name, adv_type, last_seen, role in nodes_with_last_seen: node_last_seen.labels( public_key=public_key, node_name=name or "", adv_type=adv_type or "unknown", + role=role or "", ).set(last_seen.timestamp()) # -- Messages total by type -- diff --git a/tests/test_api/test_metrics.py b/tests/test_api/test_metrics.py index 61d9b52..2590a3b 100644 --- a/tests/test_api/test_metrics.py +++ b/tests/test_api/test_metrics.py @@ -13,7 +13,7 @@ from meshcore_hub.api.dependencies import ( get_db_session, get_mqtt_client, ) -from meshcore_hub.common.models import Node +from meshcore_hub.common.models import Node, NodeTag def _make_basic_auth(username: str, password: str) -> str: @@ -198,7 +198,36 @@ class TestMetricsData: "meshcore_node_last_seen_timestamp_seconds" '{adv_type="REPEATER",' 'node_name="Seen Node",' - 'public_key="lastseen1234lastseen1234lastseen"}' + 'public_key="lastseen1234lastseen1234lastseen",' + 'role=""}' + ) in response.text + + def test_node_last_seen_timestamp_with_role(self, api_db_session, client_no_auth): + """Test that node_last_seen_timestamp includes role label from node tags.""" + seen_at = datetime(2025, 6, 15, 12, 0, 0, tzinfo=timezone.utc) + node = Node( + public_key="rolenode1234rolenode1234rolenode", + name="Infra Node", + adv_type="REPEATER", + first_seen=seen_at, + last_seen=seen_at, + ) + api_db_session.add(node) + api_db_session.flush() + + tag = NodeTag(node_id=node.id, key="role", value="infra") + api_db_session.add(tag) + api_db_session.commit() + + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert ( + "meshcore_node_last_seen_timestamp_seconds" + '{adv_type="REPEATER",' + 'node_name="Infra Node",' + 'public_key="rolenode1234rolenode1234rolenode",' + 'role="infra"}' ) in response.text def test_node_last_seen_timestamp_skips_null(self, api_db_session, client_no_auth):