Add role label to node last seen metric and filter alerts by role

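Outer-joins NodeTag rows (key="role") into the node last-seen Prometheus
metric query and exposes the tag value as a "role" label (empty when a node
has no role tag), so alert rules can target infrastructure nodes only
(role="infra").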

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Louis King
2026-02-19 00:01:20 +00:00
parent de85e0cd7a
commit 2a153a5239
3 changed files with 51 additions and 9 deletions

View File

@@ -7,10 +7,10 @@ groups:
- name: meshcore
rules:
- alert: NodeNotSeen
expr: time() - meshcore_node_last_seen_timestamp_seconds > 48 * 3600
expr: time() - meshcore_node_last_seen_timestamp_seconds{role="infra"} > 48 * 3600
for: 5m
labels:
severity: warning
annotations:
summary: "Node {{ $labels.node_name }} not seen for 48+ hours"
description: "Node {{ $labels.public_key }} ({{ $labels.adv_type }}) last seen {{ $value | humanizeDuration }} ago."
summary: "Node {{ $labels.node_name }} ({{ $labels.role }}) not seen for 48+ hours"
description: "Node {{ $labels.public_key }} ({{ $labels.adv_type }}, role={{ $labels.role }}) last seen {{ $value | humanizeDuration }} ago."

View File

@@ -16,6 +16,7 @@ from meshcore_hub.common.models import (
Member,
Message,
Node,
NodeTag,
Telemetry,
TracePath,
)
@@ -144,19 +145,31 @@ def collect_metrics(session: Any) -> bytes:
node_last_seen = Gauge(
"meshcore_node_last_seen_timestamp_seconds",
"Unix timestamp of when the node was last seen",
["public_key", "node_name", "adv_type"],
["public_key", "node_name", "adv_type", "role"],
registry=registry,
)
nodes_with_last_seen = session.execute(
select(Node.public_key, Node.name, Node.adv_type, Node.last_seen).where(
Node.last_seen.isnot(None)
role_subq = (
select(NodeTag.node_id, NodeTag.value.label("role"))
.where(NodeTag.key == "role")
.subquery()
)
nodes_with_last_seen = session.execute(
select(
Node.public_key,
Node.name,
Node.adv_type,
Node.last_seen,
role_subq.c.role,
)
.outerjoin(role_subq, Node.id == role_subq.c.node_id)
.where(Node.last_seen.isnot(None))
).all()
for public_key, name, adv_type, last_seen in nodes_with_last_seen:
for public_key, name, adv_type, last_seen, role in nodes_with_last_seen:
node_last_seen.labels(
public_key=public_key,
node_name=name or "",
adv_type=adv_type or "unknown",
role=role or "",
).set(last_seen.timestamp())
# -- Messages total by type --

View File

@@ -13,7 +13,7 @@ from meshcore_hub.api.dependencies import (
get_db_session,
get_mqtt_client,
)
from meshcore_hub.common.models import Node
from meshcore_hub.common.models import Node, NodeTag
def _make_basic_auth(username: str, password: str) -> str:
@@ -198,7 +198,36 @@ class TestMetricsData:
"meshcore_node_last_seen_timestamp_seconds"
'{adv_type="REPEATER",'
'node_name="Seen Node",'
'public_key="lastseen1234lastseen1234lastseen"}'
'public_key="lastseen1234lastseen1234lastseen",'
'role=""}'
) in response.text
def test_node_last_seen_timestamp_with_role(self, api_db_session, client_no_auth):
    """Check that a node's "role" tag is exported as the metric's role label.

    Stores a node together with a ``role`` tag whose value is ``infra``,
    requests ``/metrics`` and expects the node's last-seen sample line to
    carry ``role="infra"``.
    """
    timestamp = datetime(2025, 6, 15, 12, 0, 0, tzinfo=timezone.utc)
    infra_node = Node(
        public_key="rolenode1234rolenode1234rolenode",
        name="Infra Node",
        adv_type="REPEATER",
        first_seen=timestamp,
        last_seen=timestamp,
    )
    api_db_session.add(infra_node)
    # Flush before building the tag: the tag reads infra_node.id, which is
    # presumably only assigned once the row has been sent to the database.
    api_db_session.flush()
    api_db_session.add(NodeTag(node_id=infra_node.id, key="role", value="infra"))
    api_db_session.commit()

    # NOTE(review): helper defined elsewhere in this module; presumably drops
    # any cached /metrics payload so the request below sees the new rows.
    _clear_metrics_cache()

    # The asserted sample line lists its labels in alphabetical order.
    expected_sample = (
        "meshcore_node_last_seen_timestamp_seconds"
        '{adv_type="REPEATER",'
        'node_name="Infra Node",'
        'public_key="rolenode1234rolenode1234rolenode",'
        'role="infra"}'
    )
    response = client_no_auth.get("/metrics")
    assert response.status_code == 200
    assert expected_sample in response.text
def test_node_last_seen_timestamp_skips_null(self, api_db_session, client_no_auth):