Merge pull request #126 from ipnet-mesh/feat/prometheus

Add Prometheus metrics endpoint, Alertmanager, and 1h stats window
This commit is contained in:
JingleManSweep
2026-02-18 23:09:22 +00:00
committed by GitHub
13 changed files with 843 additions and 2 deletions

View File

@@ -190,6 +190,25 @@ API_PORT=8000
API_READ_KEY=
API_ADMIN_KEY=
# -------------------
# Prometheus Metrics
# -------------------
# Prometheus metrics endpoint exposed at /metrics on the API service
# Enable Prometheus metrics endpoint
# Default: true
METRICS_ENABLED=true
# Seconds to cache metrics output (reduces database load)
# Default: 60
METRICS_CACHE_TTL=60
# External Prometheus port (when using --profile metrics)
PROMETHEUS_PORT=9090
# External Alertmanager port (when using --profile metrics)
ALERTMANAGER_PORT=9093
# =============================================================================
# WEB DASHBOARD SETTINGS
# =============================================================================

View File

@@ -281,6 +281,7 @@ meshcore-hub/
│ │ ├── app.py # FastAPI app
│ │ ├── auth.py # Authentication
│ │ ├── dependencies.py
│ │ ├── metrics.py # Prometheus metrics endpoint
│ │ └── routes/ # API routes
│ │ ├── members.py # Member CRUD endpoints
│ │ └── ...
@@ -311,7 +312,12 @@ meshcore-hub/
│ ├── env.py
│ └── versions/
├── etc/
── mosquitto.conf # MQTT broker configuration
── mosquitto.conf # MQTT broker configuration
│ ├── prometheus/ # Prometheus configuration
│ │ ├── prometheus.yml # Scrape and alerting config
│ │ └── alerts.yml # Alert rules
│ └── alertmanager/ # Alertmanager configuration
│ └── alertmanager.yml # Routing and receiver config
├── example/
│ ├── seed/ # Example seed data files
│ │ ├── node_tags.yaml # Example node tags
@@ -609,6 +615,8 @@ Key variables:
- `WEB_AUTO_REFRESH_SECONDS` - Auto-refresh interval in seconds for list pages (default: `30`, `0` to disable)
- `TZ` - Timezone for web dashboard date/time display (default: `UTC`, e.g., `America/New_York`, `Europe/London`)
- `FEATURE_DASHBOARD`, `FEATURE_NODES`, `FEATURE_ADVERTISEMENTS`, `FEATURE_MESSAGES`, `FEATURE_MAP`, `FEATURE_MEMBERS`, `FEATURE_PAGES` - Feature flags to enable/disable specific web dashboard pages (default: all `true`). Dependencies: Dashboard auto-disables when all of Nodes/Advertisements/Messages are disabled. Map auto-disables when Nodes is disabled.
- `METRICS_ENABLED` - Enable Prometheus metrics endpoint at /metrics (default: `true`)
- `METRICS_CACHE_TTL` - Seconds to cache metrics output (default: `60`)
- `LOG_LEVEL` - Logging verbosity
The database defaults to `sqlite:///{DATA_HOME}/collector/meshcore.db` and does not typically need to be configured.

View File

@@ -185,6 +185,7 @@ Docker Compose uses **profiles** to select which services to run:
| `mock` | interface-mock-receiver | Testing without hardware |
| `migrate` | db-migrate | One-time database migration |
| `seed` | seed | One-time seed data import |
| `metrics` | prometheus, alertmanager | Prometheus metrics and alerting |
**Note:** Most deployments connect to an external MQTT broker. Add `--profile mqtt` only if you need a local broker.
@@ -337,6 +338,8 @@ The collector automatically cleans up old event data and inactive nodes:
| `API_PORT` | `8000` | API port |
| `API_READ_KEY` | *(none)* | Read-only API key |
| `API_ADMIN_KEY` | *(none)* | Admin API key (required for commands) |
| `METRICS_ENABLED` | `true` | Enable Prometheus metrics endpoint at `/metrics` |
| `METRICS_CACHE_TTL` | `60` | Seconds to cache metrics output (reduces database load) |
### Web Dashboard Settings
@@ -541,6 +544,7 @@ Health check endpoints are also available:
- **Health**: http://localhost:8000/health
- **Ready**: http://localhost:8000/health/ready (includes database check)
- **Metrics**: http://localhost:8000/metrics (Prometheus format)
### Authentication
@@ -648,7 +652,7 @@ meshcore-hub/
│ └── locales/ # Translation files (en.json, languages.md)
├── tests/ # Test suite
├── alembic/ # Database migrations
├── etc/ # Configuration files (mosquitto.conf)
├── etc/ # Configuration files (MQTT, Prometheus, Alertmanager)
├── example/ # Example files for reference
│ ├── seed/ # Example seed data files
│ │ ├── node_tags.yaml # Example node tags

View File

@@ -215,6 +215,8 @@ services:
- API_PORT=8000
- API_READ_KEY=${API_READ_KEY:-}
- API_ADMIN_KEY=${API_ADMIN_KEY:-}
- METRICS_ENABLED=${METRICS_ENABLED:-true}
- METRICS_CACHE_TTL=${METRICS_CACHE_TTL:-60}
command: ["api"]
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
@@ -326,6 +328,48 @@ services:
# Imports both node_tags.yaml and members.yaml if they exist
command: ["collector", "seed"]
# ==========================================================================
# Prometheus - Metrics collection and monitoring (optional, use --profile metrics)
# ==========================================================================
prometheus:
image: prom/prometheus:latest
container_name: meshcore-prometheus
profiles:
- all
- metrics
restart: unless-stopped
depends_on:
api:
condition: service_healthy
ports:
- "${PROMETHEUS_PORT:-9090}:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.retention.time=30d'
volumes:
- ./etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./etc/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
- prometheus_data:/prometheus
# ==========================================================================
# Alertmanager - Alert routing and notifications (optional, use --profile metrics)
# ==========================================================================
alertmanager:
image: prom/alertmanager:latest
container_name: meshcore-alertmanager
profiles:
- all
- metrics
restart: unless-stopped
ports:
- "${ALERTMANAGER_PORT:-9093}:9093"
volumes:
- ./etc/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
- alertmanager_data:/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
# ==========================================================================
# Volumes
# ==========================================================================
@@ -336,3 +380,7 @@ volumes:
name: meshcore_mosquitto_data
mosquitto_log:
name: meshcore_mosquitto_log
prometheus_data:
name: meshcore_prometheus_data
alertmanager_data:
name: meshcore_alertmanager_data

View File

@@ -0,0 +1,35 @@
# Alertmanager configuration for MeshCore Hub
#
# Default configuration routes all alerts to a "blackhole" receiver
# (logs only, no external notifications).
#
# To receive notifications, configure a receiver below.
# See: https://prometheus.io/docs/alerting/latest/configuration/
#
# Examples:
#
# Email:
# receivers:
# - name: 'email'
# email_configs:
# - to: 'admin@example.com'
# from: 'alertmanager@example.com'
# smarthost: 'smtp.example.com:587'
# auth_username: 'alertmanager@example.com'
# auth_password: 'password'
#
# Webhook (e.g. Slack incoming webhook, ntfy, Gotify):
# receivers:
# - name: 'webhook'
# webhook_configs:
# - url: 'https://example.com/webhook'
route:
receiver: 'default'
group_by: ['alertname']
group_wait: 30s
group_interval: 5m
repeat_interval: 4h
receivers:
- name: 'default'

16
etc/prometheus/alerts.yml Normal file
View File

@@ -0,0 +1,16 @@
# Prometheus alert rules for MeshCore Hub
#
# These rules are evaluated by Prometheus and fired alerts are sent
# to Alertmanager for routing and notification.
groups:
- name: meshcore
rules:
- alert: NodeNotSeen
expr: time() - meshcore_node_last_seen_timestamp_seconds > 48 * 3600
for: 5m
labels:
severity: warning
annotations:
summary: "Node {{ $labels.node_name }} not seen for 48+ hours"
description: "Node {{ $labels.public_key }} ({{ $labels.adv_type }}) last seen {{ $value | humanizeDuration }} ago."

View File

@@ -0,0 +1,29 @@
# Prometheus scrape configuration for MeshCore Hub
#
# This file is used when running Prometheus via Docker Compose:
# docker compose --profile core --profile metrics up -d
#
# The scrape interval matches the default metrics cache TTL (60s)
# to avoid unnecessary database queries.
global:
scrape_interval: 60s
evaluation_interval: 60s
alerting:
alertmanagers:
- static_configs:
- targets: ['alertmanager:9093']
rule_files:
- 'alerts.yml'
scrape_configs:
- job_name: 'meshcore-hub'
metrics_path: '/metrics'
# Uncomment basic_auth if API_READ_KEY is configured
# basic_auth:
# username: 'metrics'
# password: '<API_READ_KEY>'
static_configs:
- targets: ['api:8000']

View File

@@ -41,6 +41,7 @@ dependencies = [
"pyyaml>=6.0.0",
"python-frontmatter>=1.0.0",
"markdown>=3.5.0",
"prometheus-client>=0.20.0",
]
[project.optional-dependencies]
@@ -116,6 +117,7 @@ module = [
"meshcore.*",
"frontmatter.*",
"markdown.*",
"prometheus_client.*",
]
ignore_missing_imports = true

View File

@@ -54,6 +54,8 @@ def create_app(
mqtt_prefix: str = "meshcore",
mqtt_tls: bool = False,
cors_origins: list[str] | None = None,
metrics_enabled: bool = True,
metrics_cache_ttl: int = 60,
) -> FastAPI:
"""Create and configure the FastAPI application.
@@ -66,6 +68,8 @@ def create_app(
mqtt_prefix: MQTT topic prefix
mqtt_tls: Enable TLS/SSL for MQTT connection
cors_origins: Allowed CORS origins
metrics_enabled: Enable Prometheus metrics endpoint at /metrics
metrics_cache_ttl: Seconds to cache metrics output
Returns:
Configured FastAPI application
@@ -88,6 +92,7 @@ def create_app(
app.state.mqtt_port = mqtt_port
app.state.mqtt_prefix = mqtt_prefix
app.state.mqtt_tls = mqtt_tls
app.state.metrics_cache_ttl = metrics_cache_ttl
# Configure CORS
if cors_origins is None:
@@ -106,6 +111,12 @@ def create_app(
app.include_router(api_router, prefix="/api/v1")
# Include Prometheus metrics endpoint
if metrics_enabled:
from meshcore_hub.api.metrics import router as metrics_router
app.include_router(metrics_router)
# Health check endpoints
@app.get("/health", tags=["Health"])
async def health() -> dict:

View File

@@ -81,6 +81,19 @@ import click
envvar="CORS_ORIGINS",
help="Comma-separated list of allowed CORS origins",
)
@click.option(
"--metrics-enabled/--no-metrics",
default=True,
envvar="METRICS_ENABLED",
help="Enable Prometheus metrics endpoint at /metrics",
)
@click.option(
"--metrics-cache-ttl",
type=int,
default=60,
envvar="METRICS_CACHE_TTL",
help="Seconds to cache metrics output (reduces database load)",
)
@click.option(
"--reload",
is_flag=True,
@@ -101,6 +114,8 @@ def api(
mqtt_prefix: str,
mqtt_tls: bool,
cors_origins: str | None,
metrics_enabled: bool,
metrics_cache_ttl: int,
reload: bool,
) -> None:
"""Run the REST API server.
@@ -149,6 +164,8 @@ def api(
click.echo(f"Read key configured: {read_key is not None}")
click.echo(f"Admin key configured: {admin_key is not None}")
click.echo(f"CORS origins: {cors_origins or 'none'}")
click.echo(f"Metrics enabled: {metrics_enabled}")
click.echo(f"Metrics cache TTL: {metrics_cache_ttl}s")
click.echo(f"Reload mode: {reload}")
click.echo("=" * 50)
@@ -181,6 +198,8 @@ def api(
mqtt_prefix=mqtt_prefix,
mqtt_tls=mqtt_tls,
cors_origins=origins_list,
metrics_enabled=metrics_enabled,
metrics_cache_ttl=metrics_cache_ttl,
)
click.echo("\nStarting API server...")

View File

@@ -0,0 +1,318 @@
"""Prometheus metrics endpoint for MeshCore Hub API."""
import base64
import logging
import time
from typing import Any
from fastapi import APIRouter, Request, Response
from fastapi.responses import PlainTextResponse
from prometheus_client import CollectorRegistry, Gauge, generate_latest
from sqlalchemy import func, select
from meshcore_hub.common.models import (
Advertisement,
EventLog,
Member,
Message,
Node,
Telemetry,
TracePath,
)
logger = logging.getLogger(__name__)
router = APIRouter()
# Module-level cache
_cache: dict[str, Any] = {"output": b"", "expires_at": 0.0}
def verify_basic_auth(request: Request) -> bool:
"""Verify HTTP Basic Auth credentials for metrics endpoint.
Uses username 'metrics' and the API read key as password.
Returns True if no read key is configured (public access).
Args:
request: FastAPI request
Returns:
True if authentication passes
"""
read_key = getattr(request.app.state, "read_key", None)
# No read key configured = public access
if not read_key:
return True
auth_header = request.headers.get("Authorization", "")
if not auth_header.startswith("Basic "):
return False
try:
decoded = base64.b64decode(auth_header[6:]).decode("utf-8")
username, password = decoded.split(":", 1)
return username == "metrics" and password == read_key
except Exception:
return False
def collect_metrics(session: Any) -> bytes:
"""Collect all metrics from the database and generate Prometheus output.
Creates a fresh CollectorRegistry per call to avoid global state issues.
Args:
session: SQLAlchemy database session
Returns:
Prometheus text exposition format as bytes
"""
from meshcore_hub import __version__
registry = CollectorRegistry()
# -- Info gauge --
info_gauge = Gauge(
"meshcore_info",
"MeshCore Hub application info",
["version"],
registry=registry,
)
info_gauge.labels(version=__version__).set(1)
# -- Nodes total --
nodes_total = Gauge(
"meshcore_nodes_total",
"Total number of nodes",
registry=registry,
)
count = session.execute(select(func.count(Node.id))).scalar() or 0
nodes_total.set(count)
# -- Nodes active by time window --
nodes_active = Gauge(
"meshcore_nodes_active",
"Number of active nodes in time window",
["window"],
registry=registry,
)
for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]:
cutoff = time.time() - (hours * 3600)
from datetime import datetime, timezone
cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc)
count = (
session.execute(
select(func.count(Node.id)).where(Node.last_seen >= cutoff_dt)
).scalar()
or 0
)
nodes_active.labels(window=window).set(count)
# -- Nodes by type --
nodes_by_type = Gauge(
"meshcore_nodes_by_type",
"Number of nodes by advertisement type",
["adv_type"],
registry=registry,
)
type_counts = session.execute(
select(Node.adv_type, func.count(Node.id)).group_by(Node.adv_type)
).all()
for adv_type, count in type_counts:
nodes_by_type.labels(adv_type=adv_type or "unknown").set(count)
# -- Nodes with location --
nodes_with_location = Gauge(
"meshcore_nodes_with_location",
"Number of nodes with GPS coordinates",
registry=registry,
)
count = (
session.execute(
select(func.count(Node.id)).where(
Node.lat.isnot(None), Node.lon.isnot(None)
)
).scalar()
or 0
)
nodes_with_location.set(count)
# -- Node last seen timestamp --
node_last_seen = Gauge(
"meshcore_node_last_seen_timestamp_seconds",
"Unix timestamp of when the node was last seen",
["public_key", "node_name", "adv_type"],
registry=registry,
)
nodes_with_last_seen = session.execute(
select(Node.public_key, Node.name, Node.adv_type, Node.last_seen).where(
Node.last_seen.isnot(None)
)
).all()
for public_key, name, adv_type, last_seen in nodes_with_last_seen:
node_last_seen.labels(
public_key=public_key,
node_name=name or "",
adv_type=adv_type or "unknown",
).set(last_seen.timestamp())
# -- Messages total by type --
messages_total = Gauge(
"meshcore_messages_total",
"Total number of messages by type",
["type"],
registry=registry,
)
msg_type_counts = session.execute(
select(Message.message_type, func.count(Message.id)).group_by(
Message.message_type
)
).all()
for msg_type, count in msg_type_counts:
messages_total.labels(type=msg_type).set(count)
# -- Messages received by type and window --
messages_received = Gauge(
"meshcore_messages_received",
"Messages received in time window by type",
["type", "window"],
registry=registry,
)
for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]:
cutoff = time.time() - (hours * 3600)
cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc)
window_counts = session.execute(
select(Message.message_type, func.count(Message.id))
.where(Message.received_at >= cutoff_dt)
.group_by(Message.message_type)
).all()
for msg_type, count in window_counts:
messages_received.labels(type=msg_type, window=window).set(count)
# -- Advertisements total --
advertisements_total = Gauge(
"meshcore_advertisements_total",
"Total number of advertisements",
registry=registry,
)
count = session.execute(select(func.count(Advertisement.id))).scalar() or 0
advertisements_total.set(count)
# -- Advertisements received by window --
advertisements_received = Gauge(
"meshcore_advertisements_received",
"Advertisements received in time window",
["window"],
registry=registry,
)
for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]:
cutoff = time.time() - (hours * 3600)
cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc)
count = (
session.execute(
select(func.count(Advertisement.id)).where(
Advertisement.received_at >= cutoff_dt
)
).scalar()
or 0
)
advertisements_received.labels(window=window).set(count)
# -- Telemetry total --
telemetry_total = Gauge(
"meshcore_telemetry_total",
"Total number of telemetry records",
registry=registry,
)
count = session.execute(select(func.count(Telemetry.id))).scalar() or 0
telemetry_total.set(count)
# -- Trace paths total --
trace_paths_total = Gauge(
"meshcore_trace_paths_total",
"Total number of trace path records",
registry=registry,
)
count = session.execute(select(func.count(TracePath.id))).scalar() or 0
trace_paths_total.set(count)
# -- Events by type --
events_total = Gauge(
"meshcore_events_total",
"Total events by type from event log",
["event_type"],
registry=registry,
)
event_counts = session.execute(
select(EventLog.event_type, func.count(EventLog.id)).group_by(
EventLog.event_type
)
).all()
for event_type, count in event_counts:
events_total.labels(event_type=event_type).set(count)
# -- Members total --
members_total = Gauge(
"meshcore_members_total",
"Total number of network members",
registry=registry,
)
count = session.execute(select(func.count(Member.id))).scalar() or 0
members_total.set(count)
output: bytes = generate_latest(registry)
return output
@router.get("/metrics")
async def metrics(request: Request) -> Response:
"""Prometheus metrics endpoint.
Returns metrics in Prometheus text exposition format.
Supports HTTP Basic Auth with username 'metrics' and API read key as password.
Results are cached with a configurable TTL to reduce database load.
"""
# Check authentication
if not verify_basic_auth(request):
return PlainTextResponse(
"Unauthorized",
status_code=401,
headers={"WWW-Authenticate": 'Basic realm="metrics"'},
)
# Check cache
cache_ttl = getattr(request.app.state, "metrics_cache_ttl", 60)
now = time.time()
if _cache["output"] and now < _cache["expires_at"]:
return Response(
content=_cache["output"],
media_type="text/plain; version=0.0.4; charset=utf-8",
)
# Collect fresh metrics
try:
from meshcore_hub.api.app import get_db_manager
db_manager = get_db_manager()
with db_manager.session_scope() as session:
output = collect_metrics(session)
# Update cache
_cache["output"] = output
_cache["expires_at"] = now + cache_ttl
return Response(
content=output,
media_type="text/plain; version=0.0.4; charset=utf-8",
)
except Exception as e:
logger.exception("Failed to collect metrics: %s", e)
return PlainTextResponse(
f"# Error collecting metrics: {e}\n",
status_code=500,
media_type="text/plain; version=0.0.4; charset=utf-8",
)

View File

@@ -2,6 +2,7 @@
import os
import tempfile
from contextlib import contextmanager
from datetime import datetime, timezone
from unittest.mock import MagicMock, patch
@@ -81,6 +82,20 @@ def mock_db_manager(api_db_engine):
manager = MagicMock(spec=DatabaseManager)
Session = sessionmaker(bind=api_db_engine)
manager.get_session = lambda: Session()
@contextmanager
def _session_scope():
session = Session()
try:
yield session
session.commit()
except Exception:
session.rollback()
raise
finally:
session.close()
manager.session_scope = _session_scope
return manager

View File

@@ -0,0 +1,317 @@
"""Tests for Prometheus metrics endpoint."""
import base64
from datetime import datetime, timezone
from unittest.mock import patch
from fastapi.testclient import TestClient
from sqlalchemy.orm import sessionmaker
from meshcore_hub.api.app import create_app
from meshcore_hub.api.dependencies import (
get_db_manager,
get_db_session,
get_mqtt_client,
)
from meshcore_hub.common.models import Node
def _make_basic_auth(username: str, password: str) -> str:
"""Create a Basic auth header value."""
credentials = base64.b64encode(f"{username}:{password}".encode()).decode()
return f"Basic {credentials}"
def _clear_metrics_cache() -> None:
"""Clear the metrics module cache."""
from meshcore_hub.api.metrics import _cache
_cache["output"] = b""
_cache["expires_at"] = 0.0
class TestMetricsEndpoint:
"""Tests for basic metrics endpoint availability."""
def test_metrics_endpoint_available(self, client_no_auth):
"""Test that /metrics endpoint returns 200."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
def test_metrics_content_type(self, client_no_auth):
"""Test that metrics returns correct content type."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert "text/plain" in response.headers["content-type"]
def test_metrics_contains_expected_names(self, client_no_auth):
"""Test that metrics output contains expected metric names."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
content = response.text
assert "meshcore_info" in content
assert "meshcore_nodes_total" in content
assert "meshcore_nodes_active" in content
assert "meshcore_advertisements_total" in content
assert "meshcore_telemetry_total" in content
assert "meshcore_trace_paths_total" in content
assert "meshcore_members_total" in content
def test_metrics_info_has_version(self, client_no_auth):
"""Test that meshcore_info includes version label."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert 'meshcore_info{version="' in response.text
class TestMetricsAuth:
"""Tests for metrics endpoint authentication."""
def test_no_auth_when_no_read_key(self, client_no_auth):
"""Test that no auth is required when no read key is configured."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
def test_401_when_read_key_set_no_auth(self, client_with_auth):
"""Test 401 when read key is set but no auth provided."""
_clear_metrics_cache()
response = client_with_auth.get("/metrics")
assert response.status_code == 401
assert "WWW-Authenticate" in response.headers
def test_success_with_correct_basic_auth(self, client_with_auth):
"""Test successful auth with correct Basic credentials."""
_clear_metrics_cache()
response = client_with_auth.get(
"/metrics",
headers={"Authorization": _make_basic_auth("metrics", "test-read-key")},
)
assert response.status_code == 200
def test_fail_with_wrong_password(self, client_with_auth):
"""Test 401 with incorrect password."""
_clear_metrics_cache()
response = client_with_auth.get(
"/metrics",
headers={"Authorization": _make_basic_auth("metrics", "wrong-key")},
)
assert response.status_code == 401
def test_fail_with_wrong_username(self, client_with_auth):
"""Test 401 with incorrect username."""
_clear_metrics_cache()
response = client_with_auth.get(
"/metrics",
headers={
"Authorization": _make_basic_auth("admin", "test-read-key"),
},
)
assert response.status_code == 401
def test_fail_with_bearer_auth(self, client_with_auth):
"""Test that Bearer auth does not work for metrics."""
_clear_metrics_cache()
response = client_with_auth.get(
"/metrics",
headers={"Authorization": "Bearer test-read-key"},
)
assert response.status_code == 401
class TestMetricsData:
"""Tests for metrics data accuracy."""
def test_nodes_total_reflects_database(self, client_no_auth, sample_node):
"""Test that nodes_total matches actual node count."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
# Should have at least 1 node
assert "meshcore_nodes_total 1.0" in response.text
def test_messages_total_reflects_database(self, client_no_auth, sample_message):
"""Test that messages_total reflects database state."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
assert "meshcore_messages_total" in response.text
def test_advertisements_total_reflects_database(
self, client_no_auth, sample_advertisement
):
"""Test that advertisements_total reflects database state."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
assert "meshcore_advertisements_total 1.0" in response.text
def test_members_total_reflects_database(self, client_no_auth, sample_member):
"""Test that members_total reflects database state."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
assert "meshcore_members_total 1.0" in response.text
def test_nodes_by_type_has_labels(self, client_no_auth, sample_node):
"""Test that nodes_by_type includes adv_type labels."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
assert 'meshcore_nodes_by_type{adv_type="REPEATER"}' in response.text
def test_telemetry_total_reflects_database(self, client_no_auth, sample_telemetry):
"""Test that telemetry_total reflects database state."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
assert "meshcore_telemetry_total 1.0" in response.text
def test_trace_paths_total_reflects_database(
self, client_no_auth, sample_trace_path
):
"""Test that trace_paths_total reflects database state."""
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
assert "meshcore_trace_paths_total 1.0" in response.text
def test_node_last_seen_timestamp_present(self, api_db_session, client_no_auth):
"""Test that node_last_seen_timestamp is present for nodes with last_seen."""
seen_at = datetime(2025, 6, 15, 12, 0, 0, tzinfo=timezone.utc)
node = Node(
public_key="lastseen1234lastseen1234lastseen",
name="Seen Node",
adv_type="REPEATER",
first_seen=seen_at,
last_seen=seen_at,
)
api_db_session.add(node)
api_db_session.commit()
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
# Labels are sorted alphabetically by prometheus_client
assert (
"meshcore_node_last_seen_timestamp_seconds"
'{adv_type="REPEATER",'
'node_name="Seen Node",'
'public_key="lastseen1234lastseen1234lastseen"}'
) in response.text
def test_node_last_seen_timestamp_skips_null(self, api_db_session, client_no_auth):
"""Test that nodes with last_seen=None are excluded from the metric."""
node = Node(
public_key="neverseen1234neverseen1234neversx",
name="Never Seen",
adv_type="CLIENT",
first_seen=datetime.now(timezone.utc),
last_seen=None,
)
api_db_session.add(node)
api_db_session.commit()
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
assert "neverseen1234neverseen1234neversx" not in response.text
def test_node_last_seen_timestamp_multiple_nodes(
self, api_db_session, client_no_auth
):
"""Test that multiple nodes each get their own labeled time series."""
seen1 = datetime(2025, 6, 15, 10, 0, 0, tzinfo=timezone.utc)
seen2 = datetime(2025, 6, 15, 11, 0, 0, tzinfo=timezone.utc)
node1 = Node(
public_key="multinode1multinode1multinode1mu",
name="Node One",
adv_type="REPEATER",
first_seen=seen1,
last_seen=seen1,
)
node2 = Node(
public_key="multinode2multinode2multinode2mu",
name="Node Two",
adv_type="CHAT",
first_seen=seen2,
last_seen=seen2,
)
api_db_session.add_all([node1, node2])
api_db_session.commit()
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
assert ('public_key="multinode1multinode1multinode1mu"') in response.text
assert ('public_key="multinode2multinode2multinode2mu"') in response.text
def test_nodes_with_location(self, api_db_session, client_no_auth):
"""Test that nodes_with_location counts correctly."""
node = Node(
public_key="locationtest1234locationtest1234",
name="GPS Node",
adv_type="CHAT",
lat=37.7749,
lon=-122.4194,
first_seen=datetime.now(timezone.utc),
last_seen=datetime.now(timezone.utc),
)
api_db_session.add(node)
api_db_session.commit()
_clear_metrics_cache()
response = client_no_auth.get("/metrics")
assert response.status_code == 200
assert "meshcore_nodes_with_location 1.0" in response.text
class TestMetricsDisabled:
"""Tests for when metrics are disabled."""
def test_metrics_404_when_disabled(
self, test_db_path, api_db_engine, mock_mqtt, mock_db_manager
):
"""Test that /metrics returns 404 when disabled."""
db_url = f"sqlite:///{test_db_path}"
with patch("meshcore_hub.api.app._db_manager", mock_db_manager):
app = create_app(
database_url=db_url,
metrics_enabled=False,
)
Session = sessionmaker(bind=api_db_engine)
def override_get_db_manager(request=None):
return mock_db_manager
def override_get_db_session():
session = Session()
try:
yield session
finally:
session.close()
def override_get_mqtt_client(request=None):
return mock_mqtt
app.dependency_overrides[get_db_manager] = override_get_db_manager
app.dependency_overrides[get_db_session] = override_get_db_session
app.dependency_overrides[get_mqtt_client] = override_get_mqtt_client
client = TestClient(app, raise_server_exceptions=True)
response = client.get("/metrics")
assert response.status_code == 404
class TestMetricsCache:
"""Tests for metrics caching behavior."""
def test_cache_returns_same_output(self, client_no_auth):
"""Test that cached responses return the same content."""
_clear_metrics_cache()
response1 = client_no_auth.get("/metrics")
response2 = client_no_auth.get("/metrics")
assert response1.text == response2.text