From 5a20da3afa6287c1d4a25b6997503f38880ed534 Mon Sep 17 00:00:00 2001 From: Louis King Date: Wed, 18 Feb 2026 23:06:07 +0000 Subject: [PATCH] Add Prometheus metrics endpoint, Alertmanager, and 1h stats window Add /metrics endpoint with Prometheus gauges for nodes, messages, advertisements, telemetry, trace paths, events, and members. Include per-node last_seen timestamps for alerting. Add Alertmanager service to Docker Compose metrics profile with default blackhole receiver. Add NodeNotSeen alert rule (48h threshold). Add 1h time window to all windowed metrics alongside existing 24h/7d/30d windows. Co-Authored-By: Claude Opus 4.6 --- .env.example | 19 ++ AGENTS.md | 10 +- README.md | 6 +- docker-compose.yml | 48 +++++ etc/alertmanager/alertmanager.yml | 35 ++++ etc/prometheus/alerts.yml | 16 ++ etc/prometheus/prometheus.yml | 29 +++ pyproject.toml | 2 + src/meshcore_hub/api/app.py | 11 ++ src/meshcore_hub/api/cli.py | 19 ++ src/meshcore_hub/api/metrics.py | 318 ++++++++++++++++++++++++++++++ tests/test_api/conftest.py | 15 ++ tests/test_api/test_metrics.py | 317 +++++++++++++++++++++++++++++ 13 files changed, 843 insertions(+), 2 deletions(-) create mode 100644 etc/alertmanager/alertmanager.yml create mode 100644 etc/prometheus/alerts.yml create mode 100644 etc/prometheus/prometheus.yml create mode 100644 src/meshcore_hub/api/metrics.py create mode 100644 tests/test_api/test_metrics.py diff --git a/.env.example b/.env.example index 03e5a09..9942c97 100644 --- a/.env.example +++ b/.env.example @@ -190,6 +190,25 @@ API_PORT=8000 API_READ_KEY= API_ADMIN_KEY= +# ------------------- +# Prometheus Metrics +# ------------------- +# Prometheus metrics endpoint exposed at /metrics on the API service + +# Enable Prometheus metrics endpoint +# Default: true +METRICS_ENABLED=true + +# Seconds to cache metrics output (reduces database load) +# Default: 60 +METRICS_CACHE_TTL=60 + +# External Prometheus port (when using --profile metrics) +PROMETHEUS_PORT=9090 + +# External Alertmanager port (when using --profile metrics) +ALERTMANAGER_PORT=9093 + # ============================================================================= # WEB DASHBOARD SETTINGS # ============================================================================= diff --git a/AGENTS.md b/AGENTS.md index cd94a80..0f84468 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -281,6 +281,7 @@ meshcore-hub/ │ │ ├── app.py # FastAPI app │ │ ├── auth.py # Authentication │ │ ├── dependencies.py +│ │ ├── metrics.py # Prometheus metrics endpoint │ │ └── routes/ # API routes │ │ ├── members.py # Member CRUD endpoints │ │ └── ... @@ -311,7 +312,12 @@ meshcore-hub/ │ ├── env.py │ └── versions/ ├── etc/ -│ └── mosquitto.conf # MQTT broker configuration +│ ├── mosquitto.conf # MQTT broker configuration +│ ├── prometheus/ # Prometheus configuration +│ │ ├── prometheus.yml # Scrape and alerting config +│ │ └── alerts.yml # Alert rules +│ └── alertmanager/ # Alertmanager configuration +│ └── alertmanager.yml # Routing and receiver config ├── example/ │ ├── seed/ # Example seed data files │ │ ├── node_tags.yaml # Example node tags @@ -609,6 +615,8 @@ Key variables: - `WEB_AUTO_REFRESH_SECONDS` - Auto-refresh interval in seconds for list pages (default: `30`, `0` to disable) - `TZ` - Timezone for web dashboard date/time display (default: `UTC`, e.g., `America/New_York`, `Europe/London`) - `FEATURE_DASHBOARD`, `FEATURE_NODES`, `FEATURE_ADVERTISEMENTS`, `FEATURE_MESSAGES`, `FEATURE_MAP`, `FEATURE_MEMBERS`, `FEATURE_PAGES` - Feature flags to enable/disable specific web dashboard pages (default: all `true`). Dependencies: Dashboard auto-disables when all of Nodes/Advertisements/Messages are disabled. Map auto-disables when Nodes is disabled. +- `METRICS_ENABLED` - Enable Prometheus metrics endpoint at /metrics (default: `true`) +- `METRICS_CACHE_TTL` - Seconds to cache metrics output (default: `60`) - `LOG_LEVEL` - Logging verbosity The database defaults to `sqlite:///{DATA_HOME}/collector/meshcore.db` and does not typically need to be configured. diff --git a/README.md b/README.md index c49b7c5..f868b4a 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,7 @@ Docker Compose uses **profiles** to select which services to run: | `mock` | interface-mock-receiver | Testing without hardware | | `migrate` | db-migrate | One-time database migration | | `seed` | seed | One-time seed data import | +| `metrics` | prometheus, alertmanager | Prometheus metrics and alerting | **Note:** Most deployments connect to an external MQTT broker. Add `--profile mqtt` only if you need a local broker. @@ -337,6 +338,8 @@ The collector automatically cleans up old event data and inactive nodes: | `API_PORT` | `8000` | API port | | `API_READ_KEY` | *(none)* | Read-only API key | | `API_ADMIN_KEY` | *(none)* | Admin API key (required for commands) | +| `METRICS_ENABLED` | `true` | Enable Prometheus metrics endpoint at `/metrics` | +| `METRICS_CACHE_TTL` | `60` | Seconds to cache metrics output (reduces database load) | ### Web Dashboard Settings @@ -541,6 +544,7 @@ Health check endpoints are also available: - **Health**: http://localhost:8000/health - **Ready**: http://localhost:8000/health/ready (includes database check) +- **Metrics**: http://localhost:8000/metrics (Prometheus format) ### Authentication @@ -648,7 +652,7 @@ meshcore-hub/ │ └── locales/ # Translation files (en.json, languages.md) ├── tests/ # Test suite ├── alembic/ # Database migrations -├── etc/ # Configuration files (mosquitto.conf) +├── etc/ # Configuration files (MQTT, Prometheus, Alertmanager) ├── example/ # Example files for reference │ ├── seed/ # Example seed data files │ │ ├── node_tags.yaml # Example node tags diff --git a/docker-compose.yml b/docker-compose.yml index 29e41cf..0a6602b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -215,6 +215,8 @@ services: - API_PORT=8000 - API_READ_KEY=${API_READ_KEY:-} - API_ADMIN_KEY=${API_ADMIN_KEY:-} + - METRICS_ENABLED=${METRICS_ENABLED:-true} + - METRICS_CACHE_TTL=${METRICS_CACHE_TTL:-60} command: ["api"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"] @@ -326,6 +328,48 @@ services: # Imports both node_tags.yaml and members.yaml if they exist command: ["collector", "seed"] + # ========================================================================== + # Prometheus - Metrics collection and monitoring (optional, use --profile metrics) + # ========================================================================== + prometheus: + image: prom/prometheus:latest + container_name: meshcore-prometheus + profiles: + - all + - metrics + restart: unless-stopped + depends_on: + api: + condition: service_healthy + ports: + - "${PROMETHEUS_PORT:-9090}:9090" + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time=30d' + volumes: + - ./etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./etc/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro + - prometheus_data:/prometheus + + # ========================================================================== + # Alertmanager - Alert routing and notifications (optional, use --profile metrics) + # ========================================================================== + alertmanager: + image: prom/alertmanager:latest + container_name: meshcore-alertmanager + profiles: + - all + - metrics + restart: unless-stopped + ports: + - "${ALERTMANAGER_PORT:-9093}:9093" + volumes: + - ./etc/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro + - alertmanager_data:/alertmanager + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + - '--storage.path=/alertmanager' + # ========================================================================== # Volumes # ========================================================================== @@ -336,3 +380,7 @@ volumes: name: meshcore_mosquitto_data mosquitto_log: name: meshcore_mosquitto_log + prometheus_data: + name: meshcore_prometheus_data + alertmanager_data: + name: meshcore_alertmanager_data diff --git a/etc/alertmanager/alertmanager.yml b/etc/alertmanager/alertmanager.yml new file mode 100644 index 0000000..9bb4b2d --- /dev/null +++ b/etc/alertmanager/alertmanager.yml @@ -0,0 +1,35 @@ +# Alertmanager configuration for MeshCore Hub +# +# Default configuration routes all alerts to a "blackhole" receiver +# (logs only, no external notifications). +# +# To receive notifications, configure a receiver below. +# See: https://prometheus.io/docs/alerting/latest/configuration/ +# +# Examples: +# +# Email: +# receivers: +# - name: 'email' +# email_configs: +# - to: 'admin@example.com' +# from: 'alertmanager@example.com' +# smarthost: 'smtp.example.com:587' +# auth_username: 'alertmanager@example.com' +# auth_password: 'password' +# +# Webhook (e.g. Slack incoming webhook, ntfy, Gotify): +# receivers: +# - name: 'webhook' +# webhook_configs: +# - url: 'https://example.com/webhook' + +route: + receiver: 'default' + group_by: ['alertname'] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + +receivers: + - name: 'default' diff --git a/etc/prometheus/alerts.yml b/etc/prometheus/alerts.yml new file mode 100644 index 0000000..5c478cb --- /dev/null +++ b/etc/prometheus/alerts.yml @@ -0,0 +1,16 @@ +# Prometheus alert rules for MeshCore Hub +# +# These rules are evaluated by Prometheus and fired alerts are sent +# to Alertmanager for routing and notification. + +groups: + - name: meshcore + rules: + - alert: NodeNotSeen + expr: time() - meshcore_node_last_seen_timestamp_seconds > 48 * 3600 + for: 5m + labels: + severity: warning + annotations: + summary: "Node {{ $labels.node_name }} not seen for 48+ hours" + description: "Node {{ $labels.public_key }} ({{ $labels.adv_type }}) last seen {{ $value | humanizeDuration }} ago." diff --git a/etc/prometheus/prometheus.yml b/etc/prometheus/prometheus.yml new file mode 100644 index 0000000..11a4e2e --- /dev/null +++ b/etc/prometheus/prometheus.yml @@ -0,0 +1,29 @@ +# Prometheus scrape configuration for MeshCore Hub +# +# This file is used when running Prometheus via Docker Compose: +# docker compose --profile core --profile metrics up -d +# +# The scrape interval matches the default metrics cache TTL (60s) +# to avoid unnecessary database queries. + +global: + scrape_interval: 60s + evaluation_interval: 60s + +alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] + +rule_files: + - 'alerts.yml' + +scrape_configs: + - job_name: 'meshcore-hub' + metrics_path: '/metrics' + # Uncomment basic_auth if API_READ_KEY is configured + # basic_auth: + # username: 'metrics' + # password: '' + static_configs: + - targets: ['api:8000'] diff --git a/pyproject.toml b/pyproject.toml index 75e896a..e0f54d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ dependencies = [ "pyyaml>=6.0.0", "python-frontmatter>=1.0.0", "markdown>=3.5.0", + "prometheus-client>=0.20.0", ] [project.optional-dependencies] @@ -116,6 +117,7 @@ module = [ "meshcore.*", "frontmatter.*", "markdown.*", + "prometheus_client.*", ] ignore_missing_imports = true diff --git a/src/meshcore_hub/api/app.py b/src/meshcore_hub/api/app.py index a1bba91..87c220a 100644 --- a/src/meshcore_hub/api/app.py +++ b/src/meshcore_hub/api/app.py @@ -54,6 +54,8 @@ def create_app( mqtt_prefix: str = "meshcore", mqtt_tls: bool = False, cors_origins: list[str] | None = None, + metrics_enabled: bool = True, + metrics_cache_ttl: int = 60, ) -> FastAPI: """Create and configure the FastAPI application. @@ -66,6 +68,8 @@ def create_app( mqtt_prefix: MQTT topic prefix mqtt_tls: Enable TLS/SSL for MQTT connection cors_origins: Allowed CORS origins + metrics_enabled: Enable Prometheus metrics endpoint at /metrics + metrics_cache_ttl: Seconds to cache metrics output Returns: Configured FastAPI application @@ -88,6 +92,7 @@ def create_app( app.state.mqtt_port = mqtt_port app.state.mqtt_prefix = mqtt_prefix app.state.mqtt_tls = mqtt_tls + app.state.metrics_cache_ttl = metrics_cache_ttl # Configure CORS if cors_origins is None: @@ -106,6 +111,12 @@ def create_app( app.include_router(api_router, prefix="/api/v1") + # Include Prometheus metrics endpoint + if metrics_enabled: + from meshcore_hub.api.metrics import router as metrics_router + + app.include_router(metrics_router) + # Health check endpoints @app.get("/health", tags=["Health"]) async def health() -> dict: diff --git a/src/meshcore_hub/api/cli.py b/src/meshcore_hub/api/cli.py index cd30211..4e42d5a 100644 --- a/src/meshcore_hub/api/cli.py +++ b/src/meshcore_hub/api/cli.py @@ -81,6 +81,19 @@ import click envvar="CORS_ORIGINS", help="Comma-separated list of allowed CORS origins", ) +@click.option( + "--metrics-enabled/--no-metrics", + default=True, + envvar="METRICS_ENABLED", + help="Enable Prometheus metrics endpoint at /metrics", +) +@click.option( + "--metrics-cache-ttl", + type=int, + default=60, + envvar="METRICS_CACHE_TTL", + help="Seconds to cache metrics output (reduces database load)", +) @click.option( "--reload", is_flag=True, @@ -101,6 +114,8 @@ def api( mqtt_prefix: str, mqtt_tls: bool, cors_origins: str | None, + metrics_enabled: bool, + metrics_cache_ttl: int, reload: bool, ) -> None: """Run the REST API server. @@ -149,6 +164,8 @@ def api( click.echo(f"Read key configured: {read_key is not None}") click.echo(f"Admin key configured: {admin_key is not None}") click.echo(f"CORS origins: {cors_origins or 'none'}") + click.echo(f"Metrics enabled: {metrics_enabled}") + click.echo(f"Metrics cache TTL: {metrics_cache_ttl}s") click.echo(f"Reload mode: {reload}") click.echo("=" * 50) @@ -181,6 +198,8 @@ def api( mqtt_prefix=mqtt_prefix, mqtt_tls=mqtt_tls, cors_origins=origins_list, + metrics_enabled=metrics_enabled, + metrics_cache_ttl=metrics_cache_ttl, ) click.echo("\nStarting API server...") diff --git a/src/meshcore_hub/api/metrics.py b/src/meshcore_hub/api/metrics.py new file mode 100644 index 0000000..17e5da2 --- /dev/null +++ b/src/meshcore_hub/api/metrics.py @@ -0,0 +1,318 @@ +"""Prometheus metrics endpoint for MeshCore Hub API.""" + +import base64 +import logging +import time +from typing import Any + +from fastapi import APIRouter, Request, Response +from fastapi.responses import PlainTextResponse +from prometheus_client import CollectorRegistry, Gauge, generate_latest +from sqlalchemy import func, select + +from meshcore_hub.common.models import ( + Advertisement, + EventLog, + Member, + Message, + Node, + Telemetry, + TracePath, +) + +logger = logging.getLogger(__name__) + +router = APIRouter() + +# Module-level cache +_cache: dict[str, Any] = {"output": b"", "expires_at": 0.0} + + +def verify_basic_auth(request: Request) -> bool: + """Verify HTTP Basic Auth credentials for metrics endpoint. + + Uses username 'metrics' and the API read key as password. + Returns True if no read key is configured (public access). + + Args: + request: FastAPI request + + Returns: + True if authentication passes + """ + read_key = getattr(request.app.state, "read_key", None) + + # No read key configured = public access + if not read_key: + return True + + auth_header = request.headers.get("Authorization", "") + if not auth_header.startswith("Basic "): + return False + + try: + decoded = base64.b64decode(auth_header[6:]).decode("utf-8") + username, password = decoded.split(":", 1) + return username == "metrics" and password == read_key + except Exception: + return False + + +def collect_metrics(session: Any) -> bytes: + """Collect all metrics from the database and generate Prometheus output. + + Creates a fresh CollectorRegistry per call to avoid global state issues. + + Args: + session: SQLAlchemy database session + + Returns: + Prometheus text exposition format as bytes + """ + from meshcore_hub import __version__ + + registry = CollectorRegistry() + + # -- Info gauge -- + info_gauge = Gauge( + "meshcore_info", + "MeshCore Hub application info", + ["version"], + registry=registry, + ) + info_gauge.labels(version=__version__).set(1) + + # -- Nodes total -- + nodes_total = Gauge( + "meshcore_nodes_total", + "Total number of nodes", + registry=registry, + ) + count = session.execute(select(func.count(Node.id))).scalar() or 0 + nodes_total.set(count) + + # -- Nodes active by time window -- + nodes_active = Gauge( + "meshcore_nodes_active", + "Number of active nodes in time window", + ["window"], + registry=registry, + ) + for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]: + cutoff = time.time() - (hours * 3600) + from datetime import datetime, timezone + + cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc) + count = ( + session.execute( + select(func.count(Node.id)).where(Node.last_seen >= cutoff_dt) + ).scalar() + or 0 + ) + nodes_active.labels(window=window).set(count) + + # -- Nodes by type -- + nodes_by_type = Gauge( + "meshcore_nodes_by_type", + "Number of nodes by advertisement type", + ["adv_type"], + registry=registry, + ) + type_counts = session.execute( + select(Node.adv_type, func.count(Node.id)).group_by(Node.adv_type) + ).all() + for adv_type, count in type_counts: + nodes_by_type.labels(adv_type=adv_type or "unknown").set(count) + + # -- Nodes with location -- + nodes_with_location = Gauge( + "meshcore_nodes_with_location", + "Number of nodes with GPS coordinates", + registry=registry, + ) + count = ( + session.execute( + select(func.count(Node.id)).where( + Node.lat.isnot(None), Node.lon.isnot(None) + ) + ).scalar() + or 0 + ) + nodes_with_location.set(count) + + # -- Node last seen timestamp -- + node_last_seen = Gauge( + "meshcore_node_last_seen_timestamp_seconds", + "Unix timestamp of when the node was last seen", + ["public_key", "node_name", "adv_type"], + registry=registry, + ) + nodes_with_last_seen = session.execute( + select(Node.public_key, Node.name, Node.adv_type, Node.last_seen).where( + Node.last_seen.isnot(None) + ) + ).all() + for public_key, name, adv_type, last_seen in nodes_with_last_seen: + node_last_seen.labels( + public_key=public_key, + node_name=name or "", + adv_type=adv_type or "unknown", + ).set(last_seen.timestamp()) + + # -- Messages total by type -- + messages_total = Gauge( + "meshcore_messages_total", + "Total number of messages by type", + ["type"], + registry=registry, + ) + msg_type_counts = session.execute( + select(Message.message_type, func.count(Message.id)).group_by( + Message.message_type + ) + ).all() + for msg_type, count in msg_type_counts: + messages_total.labels(type=msg_type).set(count) + + # -- Messages received by type and window -- + messages_received = Gauge( + "meshcore_messages_received", + "Messages received in time window by type", + ["type", "window"], + registry=registry, + ) + for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]: + cutoff = time.time() - (hours * 3600) + cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc) + window_counts = session.execute( + select(Message.message_type, func.count(Message.id)) + .where(Message.received_at >= cutoff_dt) + .group_by(Message.message_type) + ).all() + for msg_type, count in window_counts: + messages_received.labels(type=msg_type, window=window).set(count) + + # -- Advertisements total -- + advertisements_total = Gauge( + "meshcore_advertisements_total", + "Total number of advertisements", + registry=registry, + ) + count = session.execute(select(func.count(Advertisement.id))).scalar() or 0 + advertisements_total.set(count) + + # -- Advertisements received by window -- + advertisements_received = Gauge( + "meshcore_advertisements_received", + "Advertisements received in time window", + ["window"], + registry=registry, + ) + for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]: + cutoff = time.time() - (hours * 3600) + cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc) + count = ( + session.execute( + select(func.count(Advertisement.id)).where( + Advertisement.received_at >= cutoff_dt + ) + ).scalar() + or 0 + ) + advertisements_received.labels(window=window).set(count) + + # -- Telemetry total -- + telemetry_total = Gauge( + "meshcore_telemetry_total", + "Total number of telemetry records", + registry=registry, + ) + count = session.execute(select(func.count(Telemetry.id))).scalar() or 0 + telemetry_total.set(count) + + # -- Trace paths total -- + trace_paths_total = Gauge( + "meshcore_trace_paths_total", + "Total number of trace path records", + registry=registry, + ) + count = session.execute(select(func.count(TracePath.id))).scalar() or 0 + trace_paths_total.set(count) + + # -- Events by type -- + events_total = Gauge( + "meshcore_events_total", + "Total events by type from event log", + ["event_type"], + registry=registry, + ) + event_counts = session.execute( + select(EventLog.event_type, func.count(EventLog.id)).group_by( + EventLog.event_type + ) + ).all() + for event_type, count in event_counts: + events_total.labels(event_type=event_type).set(count) + + # -- Members total -- + members_total = Gauge( + "meshcore_members_total", + "Total number of network members", + registry=registry, + ) + count = session.execute(select(func.count(Member.id))).scalar() or 0 + members_total.set(count) + + output: bytes = generate_latest(registry) + return output + + +@router.get("/metrics") +async def metrics(request: Request) -> Response: + """Prometheus metrics endpoint. + + Returns metrics in Prometheus text exposition format. + Supports HTTP Basic Auth with username 'metrics' and API read key as password. + Results are cached with a configurable TTL to reduce database load. + """ + # Check authentication + if not verify_basic_auth(request): + return PlainTextResponse( + "Unauthorized", + status_code=401, + headers={"WWW-Authenticate": 'Basic realm="metrics"'}, + ) + + # Check cache + cache_ttl = getattr(request.app.state, "metrics_cache_ttl", 60) + now = time.time() + + if _cache["output"] and now < _cache["expires_at"]: + return Response( + content=_cache["output"], + media_type="text/plain; version=0.0.4; charset=utf-8", + ) + + # Collect fresh metrics + try: + from meshcore_hub.api.app import get_db_manager + + db_manager = get_db_manager() + with db_manager.session_scope() as session: + output = collect_metrics(session) + + # Update cache + _cache["output"] = output + _cache["expires_at"] = now + cache_ttl + + return Response( + content=output, + media_type="text/plain; version=0.0.4; charset=utf-8", + ) + except Exception as e: + logger.exception("Failed to collect metrics: %s", e) + return PlainTextResponse( + f"# Error collecting metrics: {e}\n", + status_code=500, + media_type="text/plain; version=0.0.4; charset=utf-8", + ) diff --git a/tests/test_api/conftest.py b/tests/test_api/conftest.py index 0529695..4cbbb41 100644 --- a/tests/test_api/conftest.py +++ b/tests/test_api/conftest.py @@ -2,6 +2,7 @@ import os import tempfile +from contextlib import contextmanager from datetime import datetime, timezone from unittest.mock import MagicMock, patch @@ -81,6 +82,20 @@ def mock_db_manager(api_db_engine): manager = MagicMock(spec=DatabaseManager) Session = sessionmaker(bind=api_db_engine) manager.get_session = lambda: Session() + + @contextmanager + def _session_scope(): + session = Session() + try: + yield session + session.commit() + except Exception: + session.rollback() + raise + finally: + session.close() + + manager.session_scope = _session_scope return manager diff --git a/tests/test_api/test_metrics.py b/tests/test_api/test_metrics.py new file mode 100644 index 0000000..61d9b52 --- /dev/null +++ b/tests/test_api/test_metrics.py @@ -0,0 +1,317 @@ +"""Tests for Prometheus metrics endpoint.""" + +import base64 +from datetime import datetime, timezone +from unittest.mock import patch + +from fastapi.testclient import TestClient +from sqlalchemy.orm import sessionmaker + +from meshcore_hub.api.app import create_app +from meshcore_hub.api.dependencies import ( + get_db_manager, + get_db_session, + get_mqtt_client, +) +from meshcore_hub.common.models import Node + + +def _make_basic_auth(username: str, password: str) -> str: + """Create a Basic auth header value.""" + credentials = base64.b64encode(f"{username}:{password}".encode()).decode() + return f"Basic {credentials}" + + +def _clear_metrics_cache() -> None: + """Clear the metrics module cache.""" + from meshcore_hub.api.metrics import _cache + + _cache["output"] = b"" + _cache["expires_at"] = 0.0 + + +class TestMetricsEndpoint: + """Tests for basic metrics endpoint availability.""" + + def test_metrics_endpoint_available(self, client_no_auth): + """Test that /metrics endpoint returns 200.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + + def test_metrics_content_type(self, client_no_auth): + """Test that metrics returns correct content type.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert "text/plain" in response.headers["content-type"] + + def test_metrics_contains_expected_names(self, client_no_auth): + """Test that metrics output contains expected metric names.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + content = response.text + assert "meshcore_info" in content + assert "meshcore_nodes_total" in content + assert "meshcore_nodes_active" in content + assert "meshcore_advertisements_total" in content + assert "meshcore_telemetry_total" in content + assert "meshcore_trace_paths_total" in content + assert "meshcore_members_total" in content + + def test_metrics_info_has_version(self, client_no_auth): + """Test that meshcore_info includes version label.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert 'meshcore_info{version="' in response.text + + +class TestMetricsAuth: + """Tests for metrics endpoint authentication.""" + + def test_no_auth_when_no_read_key(self, client_no_auth): + """Test that no auth is required when no read key is configured.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + + def test_401_when_read_key_set_no_auth(self, client_with_auth): + """Test 401 when read key is set but no auth provided.""" + _clear_metrics_cache() + response = client_with_auth.get("/metrics") + assert response.status_code == 401 + assert "WWW-Authenticate" in response.headers + + def test_success_with_correct_basic_auth(self, client_with_auth): + """Test successful auth with correct Basic credentials.""" + _clear_metrics_cache() + response = client_with_auth.get( + "/metrics", + headers={"Authorization": _make_basic_auth("metrics", "test-read-key")}, + ) + assert response.status_code == 200 + + def test_fail_with_wrong_password(self, client_with_auth): + """Test 401 with incorrect password.""" + _clear_metrics_cache() + response = client_with_auth.get( + "/metrics", + headers={"Authorization": _make_basic_auth("metrics", "wrong-key")}, + ) + assert response.status_code == 401 + + def test_fail_with_wrong_username(self, client_with_auth): + """Test 401 with incorrect username.""" + _clear_metrics_cache() + response = client_with_auth.get( + "/metrics", + headers={ + "Authorization": _make_basic_auth("admin", "test-read-key"), + }, + ) + assert response.status_code == 401 + + def test_fail_with_bearer_auth(self, client_with_auth): + """Test that Bearer auth does not work for metrics.""" + _clear_metrics_cache() + response = client_with_auth.get( + "/metrics", + headers={"Authorization": "Bearer test-read-key"}, + ) + assert response.status_code == 401 + + +class TestMetricsData: + """Tests for metrics data accuracy.""" + + def test_nodes_total_reflects_database(self, client_no_auth, sample_node): + """Test that nodes_total matches actual node count.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + # Should have at least 1 node + assert "meshcore_nodes_total 1.0" in response.text + + def test_messages_total_reflects_database(self, client_no_auth, sample_message): + """Test that messages_total reflects database state.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_messages_total" in response.text + + def test_advertisements_total_reflects_database( + self, client_no_auth, sample_advertisement + ): + """Test that advertisements_total reflects database state.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_advertisements_total 1.0" in response.text + + def test_members_total_reflects_database(self, client_no_auth, sample_member): + """Test that members_total reflects database state.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_members_total 1.0" in response.text + + def test_nodes_by_type_has_labels(self, client_no_auth, sample_node): + """Test that nodes_by_type includes adv_type labels.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert 'meshcore_nodes_by_type{adv_type="REPEATER"}' in response.text + + def test_telemetry_total_reflects_database(self, client_no_auth, sample_telemetry): + """Test that telemetry_total reflects database state.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_telemetry_total 1.0" in response.text + + def test_trace_paths_total_reflects_database( + self, client_no_auth, sample_trace_path + ): + """Test that trace_paths_total reflects database state.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_trace_paths_total 1.0" in response.text + + def test_node_last_seen_timestamp_present(self, api_db_session, client_no_auth): + """Test that node_last_seen_timestamp is present for nodes with last_seen.""" + seen_at = datetime(2025, 6, 15, 12, 0, 0, tzinfo=timezone.utc) + node = Node( + public_key="lastseen1234lastseen1234lastseen", + name="Seen Node", + adv_type="REPEATER", + first_seen=seen_at, + last_seen=seen_at, + ) + api_db_session.add(node) + api_db_session.commit() + + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + # Labels are sorted alphabetically by prometheus_client + assert ( + "meshcore_node_last_seen_timestamp_seconds" + '{adv_type="REPEATER",' + 'node_name="Seen Node",' + 'public_key="lastseen1234lastseen1234lastseen"}' + ) in response.text + + def test_node_last_seen_timestamp_skips_null(self, api_db_session, client_no_auth): + """Test that nodes with last_seen=None are excluded from the metric.""" + node = Node( + public_key="neverseen1234neverseen1234neversx", + name="Never Seen", + adv_type="CLIENT", + first_seen=datetime.now(timezone.utc), + last_seen=None, + ) + api_db_session.add(node) + api_db_session.commit() + + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "neverseen1234neverseen1234neversx" not in response.text + + def test_node_last_seen_timestamp_multiple_nodes( + self, api_db_session, client_no_auth + ): + """Test that multiple nodes each get their own labeled time series.""" + seen1 = datetime(2025, 6, 15, 10, 0, 0, tzinfo=timezone.utc) + seen2 = datetime(2025, 6, 15, 11, 0, 0, tzinfo=timezone.utc) + node1 = Node( + public_key="multinode1multinode1multinode1mu", + name="Node One", + adv_type="REPEATER", + first_seen=seen1, + last_seen=seen1, + ) + node2 = Node( + public_key="multinode2multinode2multinode2mu", + name="Node Two", + adv_type="CHAT", + first_seen=seen2, + last_seen=seen2, + ) + api_db_session.add_all([node1, node2]) + api_db_session.commit() + + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert ('public_key="multinode1multinode1multinode1mu"') in response.text + assert ('public_key="multinode2multinode2multinode2mu"') in response.text + + def test_nodes_with_location(self, api_db_session, client_no_auth): + """Test that nodes_with_location counts correctly.""" + node = Node( + public_key="locationtest1234locationtest1234", + name="GPS Node", + adv_type="CHAT", + lat=37.7749, + lon=-122.4194, + first_seen=datetime.now(timezone.utc), + last_seen=datetime.now(timezone.utc), + ) + api_db_session.add(node) + api_db_session.commit() + + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_nodes_with_location 1.0" in response.text + + +class TestMetricsDisabled: + """Tests for when metrics are disabled.""" + + def test_metrics_404_when_disabled( + self, test_db_path, api_db_engine, mock_mqtt, mock_db_manager + ): + """Test that /metrics returns 404 when disabled.""" + db_url = f"sqlite:///{test_db_path}" + + with patch("meshcore_hub.api.app._db_manager", mock_db_manager): + app = create_app( + database_url=db_url, + metrics_enabled=False, + ) + + Session = sessionmaker(bind=api_db_engine) + + def override_get_db_manager(request=None): + return mock_db_manager + + def override_get_db_session(): + session = Session() + try: + yield session + finally: + session.close() + + def override_get_mqtt_client(request=None): + return mock_mqtt + + app.dependency_overrides[get_db_manager] = override_get_db_manager + app.dependency_overrides[get_db_session] = override_get_db_session + app.dependency_overrides[get_mqtt_client] = override_get_mqtt_client + + client = TestClient(app, raise_server_exceptions=True) + response = client.get("/metrics") + assert response.status_code == 404 + + +class TestMetricsCache: + """Tests for metrics caching behavior.""" + + def test_cache_returns_same_output(self, client_no_auth): + """Test that cached responses return the same content.""" + _clear_metrics_cache() + response1 = client_no_auth.get("/metrics") + response2 = client_no_auth.get("/metrics") + assert response1.text == response2.text