diff --git a/.env.example b/.env.example index 03e5a09..9942c97 100644 --- a/.env.example +++ b/.env.example @@ -190,6 +190,25 @@ API_PORT=8000 API_READ_KEY= API_ADMIN_KEY= +# ------------------- +# Prometheus Metrics +# ------------------- +# Prometheus metrics endpoint exposed at /metrics on the API service + +# Enable Prometheus metrics endpoint +# Default: true +METRICS_ENABLED=true + +# Seconds to cache metrics output (reduces database load) +# Default: 60 +METRICS_CACHE_TTL=60 + +# External Prometheus port (when using --profile metrics) +PROMETHEUS_PORT=9090 + +# External Alertmanager port (when using --profile metrics) +ALERTMANAGER_PORT=9093 + # ============================================================================= # WEB DASHBOARD SETTINGS # ============================================================================= diff --git a/AGENTS.md b/AGENTS.md index cd94a80..0f84468 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -281,6 +281,7 @@ meshcore-hub/ │ │ ├── app.py # FastAPI app │ │ ├── auth.py # Authentication │ │ ├── dependencies.py +│ │ ├── metrics.py # Prometheus metrics endpoint │ │ └── routes/ # API routes │ │ ├── members.py # Member CRUD endpoints │ │ └── ... @@ -311,7 +312,12 @@ meshcore-hub/ │ ├── env.py │ └── versions/ ├── etc/ -│ └── mosquitto.conf # MQTT broker configuration +│ ├── mosquitto.conf # MQTT broker configuration +│ ├── prometheus/ # Prometheus configuration +│ │ ├── prometheus.yml # Scrape and alerting config +│ │ └── alerts.yml # Alert rules +│ └── alertmanager/ # Alertmanager configuration +│ └── alertmanager.yml # Routing and receiver config ├── example/ │ ├── seed/ # Example seed data files │ │ ├── node_tags.yaml # Example node tags @@ -609,6 +615,8 @@ Key variables: - `WEB_AUTO_REFRESH_SECONDS` - Auto-refresh interval in seconds for list pages (default: `30`, `0` to disable) - `TZ` - Timezone for web dashboard date/time display (default: `UTC`, e.g., `America/New_York`, `Europe/London`) - `FEATURE_DASHBOARD`, `FEATURE_NODES`, `FEATURE_ADVERTISEMENTS`, `FEATURE_MESSAGES`, `FEATURE_MAP`, `FEATURE_MEMBERS`, `FEATURE_PAGES` - Feature flags to enable/disable specific web dashboard pages (default: all `true`). Dependencies: Dashboard auto-disables when all of Nodes/Advertisements/Messages are disabled. Map auto-disables when Nodes is disabled. +- `METRICS_ENABLED` - Enable Prometheus metrics endpoint at /metrics (default: `true`) +- `METRICS_CACHE_TTL` - Seconds to cache metrics output (default: `60`) - `LOG_LEVEL` - Logging verbosity The database defaults to `sqlite:///{DATA_HOME}/collector/meshcore.db` and does not typically need to be configured. diff --git a/README.md b/README.md index c49b7c5..f868b4a 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,7 @@ Docker Compose uses **profiles** to select which services to run: | `mock` | interface-mock-receiver | Testing without hardware | | `migrate` | db-migrate | One-time database migration | | `seed` | seed | One-time seed data import | +| `metrics` | prometheus, alertmanager | Prometheus metrics and alerting | **Note:** Most deployments connect to an external MQTT broker. Add `--profile mqtt` only if you need a local broker. @@ -337,6 +338,8 @@ The collector automatically cleans up old event data and inactive nodes: | `API_PORT` | `8000` | API port | | `API_READ_KEY` | *(none)* | Read-only API key | | `API_ADMIN_KEY` | *(none)* | Admin API key (required for commands) | +| `METRICS_ENABLED` | `true` | Enable Prometheus metrics endpoint at `/metrics` | +| `METRICS_CACHE_TTL` | `60` | Seconds to cache metrics output (reduces database load) | ### Web Dashboard Settings @@ -541,6 +544,7 @@ Health check endpoints are also available: - **Health**: http://localhost:8000/health - **Ready**: http://localhost:8000/health/ready (includes database check) +- **Metrics**: http://localhost:8000/metrics (Prometheus format) ### Authentication @@ -648,7 +652,7 @@ meshcore-hub/ │ └── locales/ # Translation files (en.json, languages.md) ├── tests/ # Test suite ├── alembic/ # Database migrations -├── etc/ # Configuration files (mosquitto.conf) +├── etc/ # Configuration files (MQTT, Prometheus, Alertmanager) ├── example/ # Example files for reference │ ├── seed/ # Example seed data files │ │ ├── node_tags.yaml # Example node tags diff --git a/docker-compose.yml b/docker-compose.yml index 29e41cf..0a6602b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -215,6 +215,8 @@ services: - API_PORT=8000 - API_READ_KEY=${API_READ_KEY:-} - API_ADMIN_KEY=${API_ADMIN_KEY:-} + - METRICS_ENABLED=${METRICS_ENABLED:-true} + - METRICS_CACHE_TTL=${METRICS_CACHE_TTL:-60} command: ["api"] healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"] @@ -326,6 +328,48 @@ services: # Imports both node_tags.yaml and members.yaml if they exist command: ["collector", "seed"] + # ========================================================================== + # Prometheus - Metrics collection and monitoring (optional, use --profile metrics) + # ========================================================================== + prometheus: + image: prom/prometheus:latest + container_name: meshcore-prometheus + profiles: + - all + - metrics + restart: unless-stopped + depends_on: + api: + condition: service_healthy + ports: + - "${PROMETHEUS_PORT:-9090}:9090" + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time=30d' + volumes: + - ./etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./etc/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro + - prometheus_data:/prometheus + + # ========================================================================== + # Alertmanager - Alert routing and notifications (optional, use --profile metrics) + # ========================================================================== + alertmanager: + image: prom/alertmanager:latest + container_name: meshcore-alertmanager + profiles: + - all + - metrics + restart: unless-stopped + ports: + - "${ALERTMANAGER_PORT:-9093}:9093" + volumes: + - ./etc/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro + - alertmanager_data:/alertmanager + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + - '--storage.path=/alertmanager' + # ========================================================================== # Volumes # ========================================================================== @@ -336,3 +380,7 @@ volumes: name: meshcore_mosquitto_data mosquitto_log: name: meshcore_mosquitto_log + prometheus_data: + name: meshcore_prometheus_data + alertmanager_data: + name: meshcore_alertmanager_data diff --git a/etc/alertmanager/alertmanager.yml b/etc/alertmanager/alertmanager.yml new file mode 100644 index 0000000..9bb4b2d --- /dev/null +++ b/etc/alertmanager/alertmanager.yml @@ -0,0 +1,35 @@ +# Alertmanager configuration for MeshCore Hub +# +# Default configuration routes all alerts to a "blackhole" receiver +# (logs only, no external notifications). +# +# To receive notifications, configure a receiver below. +# See: https://prometheus.io/docs/alerting/latest/configuration/ +# +# Examples: +# +# Email: +# receivers: +# - name: 'email' +# email_configs: +# - to: 'admin@example.com' +# from: 'alertmanager@example.com' +# smarthost: 'smtp.example.com:587' +# auth_username: 'alertmanager@example.com' +# auth_password: 'password' +# +# Webhook (e.g. Slack incoming webhook, ntfy, Gotify): +# receivers: +# - name: 'webhook' +# webhook_configs: +# - url: 'https://example.com/webhook' + +route: + receiver: 'default' + group_by: ['alertname'] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + +receivers: + - name: 'default' diff --git a/etc/prometheus/alerts.yml b/etc/prometheus/alerts.yml new file mode 100644 index 0000000..5c478cb --- /dev/null +++ b/etc/prometheus/alerts.yml @@ -0,0 +1,16 @@ +# Prometheus alert rules for MeshCore Hub +# +# These rules are evaluated by Prometheus and fired alerts are sent +# to Alertmanager for routing and notification. + +groups: + - name: meshcore + rules: + - alert: NodeNotSeen + expr: time() - meshcore_node_last_seen_timestamp_seconds > 48 * 3600 + for: 5m + labels: + severity: warning + annotations: + summary: "Node {{ $labels.node_name }} not seen for 48+ hours" + description: "Node {{ $labels.public_key }} ({{ $labels.adv_type }}) last seen {{ $value | humanizeDuration }} ago." diff --git a/etc/prometheus/prometheus.yml b/etc/prometheus/prometheus.yml new file mode 100644 index 0000000..11a4e2e --- /dev/null +++ b/etc/prometheus/prometheus.yml @@ -0,0 +1,29 @@ +# Prometheus scrape configuration for MeshCore Hub +# +# This file is used when running Prometheus via Docker Compose: +# docker compose --profile core --profile metrics up -d +# +# The scrape interval matches the default metrics cache TTL (60s) +# to avoid unnecessary database queries. + +global: + scrape_interval: 60s + evaluation_interval: 60s + +alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] + +rule_files: + - 'alerts.yml' + +scrape_configs: + - job_name: 'meshcore-hub' + metrics_path: '/metrics' + # Uncomment basic_auth if API_READ_KEY is configured + # basic_auth: + # username: 'metrics' + # password: '' + static_configs: + - targets: ['api:8000'] diff --git a/pyproject.toml b/pyproject.toml index 75e896a..e0f54d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ dependencies = [ "pyyaml>=6.0.0", "python-frontmatter>=1.0.0", "markdown>=3.5.0", + "prometheus-client>=0.20.0", ] [project.optional-dependencies] @@ -116,6 +117,7 @@ module = [ "meshcore.*", "frontmatter.*", "markdown.*", + "prometheus_client.*", ] ignore_missing_imports = true diff --git a/src/meshcore_hub/api/app.py b/src/meshcore_hub/api/app.py index a1bba91..87c220a 100644 --- a/src/meshcore_hub/api/app.py +++ b/src/meshcore_hub/api/app.py @@ -54,6 +54,8 @@ def create_app( mqtt_prefix: str = "meshcore", mqtt_tls: bool = False, cors_origins: list[str] | None = None, + metrics_enabled: bool = True, + metrics_cache_ttl: int = 60, ) -> FastAPI: """Create and configure the FastAPI application. @@ -66,6 +68,8 @@ def create_app( mqtt_prefix: MQTT topic prefix mqtt_tls: Enable TLS/SSL for MQTT connection cors_origins: Allowed CORS origins + metrics_enabled: Enable Prometheus metrics endpoint at /metrics + metrics_cache_ttl: Seconds to cache metrics output Returns: Configured FastAPI application @@ -88,6 +92,7 @@ def create_app( app.state.mqtt_port = mqtt_port app.state.mqtt_prefix = mqtt_prefix app.state.mqtt_tls = mqtt_tls + app.state.metrics_cache_ttl = metrics_cache_ttl # Configure CORS if cors_origins is None: @@ -106,6 +111,12 @@ def create_app( app.include_router(api_router, prefix="/api/v1") + # Include Prometheus metrics endpoint + if metrics_enabled: + from meshcore_hub.api.metrics import router as metrics_router + + app.include_router(metrics_router) + # Health check endpoints @app.get("/health", tags=["Health"]) async def health() -> dict: diff --git a/src/meshcore_hub/api/cli.py b/src/meshcore_hub/api/cli.py index cd30211..4e42d5a 100644 --- a/src/meshcore_hub/api/cli.py +++ b/src/meshcore_hub/api/cli.py @@ -81,6 +81,19 @@ import click envvar="CORS_ORIGINS", help="Comma-separated list of allowed CORS origins", ) +@click.option( + "--metrics-enabled/--no-metrics", + default=True, + envvar="METRICS_ENABLED", + help="Enable Prometheus metrics endpoint at /metrics", +) +@click.option( + "--metrics-cache-ttl", + type=int, + default=60, + envvar="METRICS_CACHE_TTL", + help="Seconds to cache metrics output (reduces database load)", +) @click.option( "--reload", is_flag=True, @@ -101,6 +114,8 @@ def api( mqtt_prefix: str, mqtt_tls: bool, cors_origins: str | None, + metrics_enabled: bool, + metrics_cache_ttl: int, reload: bool, ) -> None: """Run the REST API server. @@ -149,6 +164,8 @@ def api( click.echo(f"Read key configured: {read_key is not None}") click.echo(f"Admin key configured: {admin_key is not None}") click.echo(f"CORS origins: {cors_origins or 'none'}") + click.echo(f"Metrics enabled: {metrics_enabled}") + click.echo(f"Metrics cache TTL: {metrics_cache_ttl}s") click.echo(f"Reload mode: {reload}") click.echo("=" * 50) @@ -181,6 +198,8 @@ def api( mqtt_prefix=mqtt_prefix, mqtt_tls=mqtt_tls, cors_origins=origins_list, + metrics_enabled=metrics_enabled, + metrics_cache_ttl=metrics_cache_ttl, ) click.echo("\nStarting API server...") diff --git a/src/meshcore_hub/api/metrics.py b/src/meshcore_hub/api/metrics.py new file mode 100644 index 0000000..17e5da2 --- /dev/null +++ b/src/meshcore_hub/api/metrics.py @@ -0,0 +1,318 @@ +"""Prometheus metrics endpoint for MeshCore Hub API.""" + +import base64 +import logging +import time +from typing import Any + +from fastapi import APIRouter, Request, Response +from fastapi.responses import PlainTextResponse +from prometheus_client import CollectorRegistry, Gauge, generate_latest +from sqlalchemy import func, select + +from meshcore_hub.common.models import ( + Advertisement, + EventLog, + Member, + Message, + Node, + Telemetry, + TracePath, +) + +logger = logging.getLogger(__name__) + +router = APIRouter() + +# Module-level cache +_cache: dict[str, Any] = {"output": b"", "expires_at": 0.0} + + +def verify_basic_auth(request: Request) -> bool: + """Verify HTTP Basic Auth credentials for metrics endpoint. + + Uses username 'metrics' and the API read key as password. + Returns True if no read key is configured (public access). + + Args: + request: FastAPI request + + Returns: + True if authentication passes + """ + read_key = getattr(request.app.state, "read_key", None) + + # No read key configured = public access + if not read_key: + return True + + auth_header = request.headers.get("Authorization", "") + if not auth_header.startswith("Basic "): + return False + + try: + decoded = base64.b64decode(auth_header[6:]).decode("utf-8") + username, password = decoded.split(":", 1) + return username == "metrics" and password == read_key + except Exception: + return False + + +def collect_metrics(session: Any) -> bytes: + """Collect all metrics from the database and generate Prometheus output. + + Creates a fresh CollectorRegistry per call to avoid global state issues. + + Args: + session: SQLAlchemy database session + + Returns: + Prometheus text exposition format as bytes + """ + from meshcore_hub import __version__ + + registry = CollectorRegistry() + + # -- Info gauge -- + info_gauge = Gauge( + "meshcore_info", + "MeshCore Hub application info", + ["version"], + registry=registry, + ) + info_gauge.labels(version=__version__).set(1) + + # -- Nodes total -- + nodes_total = Gauge( + "meshcore_nodes_total", + "Total number of nodes", + registry=registry, + ) + count = session.execute(select(func.count(Node.id))).scalar() or 0 + nodes_total.set(count) + + # -- Nodes active by time window -- + nodes_active = Gauge( + "meshcore_nodes_active", + "Number of active nodes in time window", + ["window"], + registry=registry, + ) + for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]: + cutoff = time.time() - (hours * 3600) + from datetime import datetime, timezone + + cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc) + count = ( + session.execute( + select(func.count(Node.id)).where(Node.last_seen >= cutoff_dt) + ).scalar() + or 0 + ) + nodes_active.labels(window=window).set(count) + + # -- Nodes by type -- + nodes_by_type = Gauge( + "meshcore_nodes_by_type", + "Number of nodes by advertisement type", + ["adv_type"], + registry=registry, + ) + type_counts = session.execute( + select(Node.adv_type, func.count(Node.id)).group_by(Node.adv_type) + ).all() + for adv_type, count in type_counts: + nodes_by_type.labels(adv_type=adv_type or "unknown").set(count) + + # -- Nodes with location -- + nodes_with_location = Gauge( + "meshcore_nodes_with_location", + "Number of nodes with GPS coordinates", + registry=registry, + ) + count = ( + session.execute( + select(func.count(Node.id)).where( + Node.lat.isnot(None), Node.lon.isnot(None) + ) + ).scalar() + or 0 + ) + nodes_with_location.set(count) + + # -- Node last seen timestamp -- + node_last_seen = Gauge( + "meshcore_node_last_seen_timestamp_seconds", + "Unix timestamp of when the node was last seen", + ["public_key", "node_name", "adv_type"], + registry=registry, + ) + nodes_with_last_seen = session.execute( + select(Node.public_key, Node.name, Node.adv_type, Node.last_seen).where( + Node.last_seen.isnot(None) + ) + ).all() + for public_key, name, adv_type, last_seen in nodes_with_last_seen: + node_last_seen.labels( + public_key=public_key, + node_name=name or "", + adv_type=adv_type or "unknown", + ).set(last_seen.timestamp()) + + # -- Messages total by type -- + messages_total = Gauge( + "meshcore_messages_total", + "Total number of messages by type", + ["type"], + registry=registry, + ) + msg_type_counts = session.execute( + select(Message.message_type, func.count(Message.id)).group_by( + Message.message_type + ) + ).all() + for msg_type, count in msg_type_counts: + messages_total.labels(type=msg_type).set(count) + + # -- Messages received by type and window -- + messages_received = Gauge( + "meshcore_messages_received", + "Messages received in time window by type", + ["type", "window"], + registry=registry, + ) + for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]: + cutoff = time.time() - (hours * 3600) + cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc) + window_counts = session.execute( + select(Message.message_type, func.count(Message.id)) + .where(Message.received_at >= cutoff_dt) + .group_by(Message.message_type) + ).all() + for msg_type, count in window_counts: + messages_received.labels(type=msg_type, window=window).set(count) + + # -- Advertisements total -- + advertisements_total = Gauge( + "meshcore_advertisements_total", + "Total number of advertisements", + registry=registry, + ) + count = session.execute(select(func.count(Advertisement.id))).scalar() or 0 + advertisements_total.set(count) + + # -- Advertisements received by window -- + advertisements_received = Gauge( + "meshcore_advertisements_received", + "Advertisements received in time window", + ["window"], + registry=registry, + ) + for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]: + cutoff = time.time() - (hours * 3600) + cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc) + count = ( + session.execute( + select(func.count(Advertisement.id)).where( + Advertisement.received_at >= cutoff_dt + ) + ).scalar() + or 0 + ) + advertisements_received.labels(window=window).set(count) + + # -- Telemetry total -- + telemetry_total = Gauge( + "meshcore_telemetry_total", + "Total number of telemetry records", + registry=registry, + ) + count = session.execute(select(func.count(Telemetry.id))).scalar() or 0 + telemetry_total.set(count) + + # -- Trace paths total -- + trace_paths_total = Gauge( + "meshcore_trace_paths_total", + "Total number of trace path records", + registry=registry, + ) + count = session.execute(select(func.count(TracePath.id))).scalar() or 0 + trace_paths_total.set(count) + + # -- Events by type -- + events_total = Gauge( + "meshcore_events_total", + "Total events by type from event log", + ["event_type"], + registry=registry, + ) + event_counts = session.execute( + select(EventLog.event_type, func.count(EventLog.id)).group_by( + EventLog.event_type + ) + ).all() + for event_type, count in event_counts: + events_total.labels(event_type=event_type).set(count) + + # -- Members total -- + members_total = Gauge( + "meshcore_members_total", + "Total number of network members", + registry=registry, + ) + count = session.execute(select(func.count(Member.id))).scalar() or 0 + members_total.set(count) + + output: bytes = generate_latest(registry) + return output + + +@router.get("/metrics") +async def metrics(request: Request) -> Response: + """Prometheus metrics endpoint. + + Returns metrics in Prometheus text exposition format. + Supports HTTP Basic Auth with username 'metrics' and API read key as password. + Results are cached with a configurable TTL to reduce database load. + """ + # Check authentication + if not verify_basic_auth(request): + return PlainTextResponse( + "Unauthorized", + status_code=401, + headers={"WWW-Authenticate": 'Basic realm="metrics"'}, + ) + + # Check cache + cache_ttl = getattr(request.app.state, "metrics_cache_ttl", 60) + now = time.time() + + if _cache["output"] and now < _cache["expires_at"]: + return Response( + content=_cache["output"], + media_type="text/plain; version=0.0.4; charset=utf-8", + ) + + # Collect fresh metrics + try: + from meshcore_hub.api.app import get_db_manager + + db_manager = get_db_manager() + with db_manager.session_scope() as session: + output = collect_metrics(session) + + # Update cache + _cache["output"] = output + _cache["expires_at"] = now + cache_ttl + + return Response( + content=output, + media_type="text/plain; version=0.0.4; charset=utf-8", + ) + except Exception as e: + logger.exception("Failed to collect metrics: %s", e) + return PlainTextResponse( + f"# Error collecting metrics: {e}\n", + status_code=500, + media_type="text/plain; version=0.0.4; charset=utf-8", + ) diff --git a/tests/test_api/conftest.py b/tests/test_api/conftest.py index 0529695..4cbbb41 100644 --- a/tests/test_api/conftest.py +++ b/tests/test_api/conftest.py @@ -2,6 +2,7 @@ import os import tempfile +from contextlib import contextmanager from datetime import datetime, timezone from unittest.mock import MagicMock, patch @@ -81,6 +82,20 @@ def mock_db_manager(api_db_engine): manager = MagicMock(spec=DatabaseManager) Session = sessionmaker(bind=api_db_engine) manager.get_session = lambda: Session() + + @contextmanager + def _session_scope(): + session = Session() + try: + yield session + session.commit() + except Exception: + session.rollback() + raise + finally: + session.close() + + manager.session_scope = _session_scope return manager diff --git a/tests/test_api/test_metrics.py b/tests/test_api/test_metrics.py new file mode 100644 index 0000000..61d9b52 --- /dev/null +++ b/tests/test_api/test_metrics.py @@ -0,0 +1,317 @@ +"""Tests for Prometheus metrics endpoint.""" + +import base64 +from datetime import datetime, timezone +from unittest.mock import patch + +from fastapi.testclient import TestClient +from sqlalchemy.orm import sessionmaker + +from meshcore_hub.api.app import create_app +from meshcore_hub.api.dependencies import ( + get_db_manager, + get_db_session, + get_mqtt_client, +) +from meshcore_hub.common.models import Node + + +def _make_basic_auth(username: str, password: str) -> str: + """Create a Basic auth header value.""" + credentials = base64.b64encode(f"{username}:{password}".encode()).decode() + return f"Basic {credentials}" + + +def _clear_metrics_cache() -> None: + """Clear the metrics module cache.""" + from meshcore_hub.api.metrics import _cache + + _cache["output"] = b"" + _cache["expires_at"] = 0.0 + + +class TestMetricsEndpoint: + """Tests for basic metrics endpoint availability.""" + + def test_metrics_endpoint_available(self, client_no_auth): + """Test that /metrics endpoint returns 200.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + + def test_metrics_content_type(self, client_no_auth): + """Test that metrics returns correct content type.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert "text/plain" in response.headers["content-type"] + + def test_metrics_contains_expected_names(self, client_no_auth): + """Test that metrics output contains expected metric names.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + content = response.text + assert "meshcore_info" in content + assert "meshcore_nodes_total" in content + assert "meshcore_nodes_active" in content + assert "meshcore_advertisements_total" in content + assert "meshcore_telemetry_total" in content + assert "meshcore_trace_paths_total" in content + assert "meshcore_members_total" in content + + def test_metrics_info_has_version(self, client_no_auth): + """Test that meshcore_info includes version label.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert 'meshcore_info{version="' in response.text + + +class TestMetricsAuth: + """Tests for metrics endpoint authentication.""" + + def test_no_auth_when_no_read_key(self, client_no_auth): + """Test that no auth is required when no read key is configured.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + + def test_401_when_read_key_set_no_auth(self, client_with_auth): + """Test 401 when read key is set but no auth provided.""" + _clear_metrics_cache() + response = client_with_auth.get("/metrics") + assert response.status_code == 401 + assert "WWW-Authenticate" in response.headers + + def test_success_with_correct_basic_auth(self, client_with_auth): + """Test successful auth with correct Basic credentials.""" + _clear_metrics_cache() + response = client_with_auth.get( + "/metrics", + headers={"Authorization": _make_basic_auth("metrics", "test-read-key")}, + ) + assert response.status_code == 200 + + def test_fail_with_wrong_password(self, client_with_auth): + """Test 401 with incorrect password.""" + _clear_metrics_cache() + response = client_with_auth.get( + "/metrics", + headers={"Authorization": _make_basic_auth("metrics", "wrong-key")}, + ) + assert response.status_code == 401 + + def test_fail_with_wrong_username(self, client_with_auth): + """Test 401 with incorrect username.""" + _clear_metrics_cache() + response = client_with_auth.get( + "/metrics", + headers={ + "Authorization": _make_basic_auth("admin", "test-read-key"), + }, + ) + assert response.status_code == 401 + + def test_fail_with_bearer_auth(self, client_with_auth): + """Test that Bearer auth does not work for metrics.""" + _clear_metrics_cache() + response = client_with_auth.get( + "/metrics", + headers={"Authorization": "Bearer test-read-key"}, + ) + assert response.status_code == 401 + + +class TestMetricsData: + """Tests for metrics data accuracy.""" + + def test_nodes_total_reflects_database(self, client_no_auth, sample_node): + """Test that nodes_total matches actual node count.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + # Should have at least 1 node + assert "meshcore_nodes_total 1.0" in response.text + + def test_messages_total_reflects_database(self, client_no_auth, sample_message): + """Test that messages_total reflects database state.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_messages_total" in response.text + + def test_advertisements_total_reflects_database( + self, client_no_auth, sample_advertisement + ): + """Test that advertisements_total reflects database state.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_advertisements_total 1.0" in response.text + + def test_members_total_reflects_database(self, client_no_auth, sample_member): + """Test that members_total reflects database state.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_members_total 1.0" in response.text + + def test_nodes_by_type_has_labels(self, client_no_auth, sample_node): + """Test that nodes_by_type includes adv_type labels.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert 'meshcore_nodes_by_type{adv_type="REPEATER"}' in response.text + + def test_telemetry_total_reflects_database(self, client_no_auth, sample_telemetry): + """Test that telemetry_total reflects database state.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_telemetry_total 1.0" in response.text + + def test_trace_paths_total_reflects_database( + self, client_no_auth, sample_trace_path + ): + """Test that trace_paths_total reflects database state.""" + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_trace_paths_total 1.0" in response.text + + def test_node_last_seen_timestamp_present(self, api_db_session, client_no_auth): + """Test that node_last_seen_timestamp is present for nodes with last_seen.""" + seen_at = datetime(2025, 6, 15, 12, 0, 0, tzinfo=timezone.utc) + node = Node( + public_key="lastseen1234lastseen1234lastseen", + name="Seen Node", + adv_type="REPEATER", + first_seen=seen_at, + last_seen=seen_at, + ) + api_db_session.add(node) + api_db_session.commit() + + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + # Labels are sorted alphabetically by prometheus_client + assert ( + "meshcore_node_last_seen_timestamp_seconds" + '{adv_type="REPEATER",' + 'node_name="Seen Node",' + 'public_key="lastseen1234lastseen1234lastseen"}' + ) in response.text + + def test_node_last_seen_timestamp_skips_null(self, api_db_session, client_no_auth): + """Test that nodes with last_seen=None are excluded from the metric.""" + node = Node( + public_key="neverseen1234neverseen1234neversx", + name="Never Seen", + adv_type="CLIENT", + first_seen=datetime.now(timezone.utc), + last_seen=None, + ) + api_db_session.add(node) + api_db_session.commit() + + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "neverseen1234neverseen1234neversx" not in response.text + + def test_node_last_seen_timestamp_multiple_nodes( + self, api_db_session, client_no_auth + ): + """Test that multiple nodes each get their own labeled time series.""" + seen1 = datetime(2025, 6, 15, 10, 0, 0, tzinfo=timezone.utc) + seen2 = datetime(2025, 6, 15, 11, 0, 0, tzinfo=timezone.utc) + node1 = Node( + public_key="multinode1multinode1multinode1mu", + name="Node One", + adv_type="REPEATER", + first_seen=seen1, + last_seen=seen1, + ) + node2 = Node( + public_key="multinode2multinode2multinode2mu", + name="Node Two", + adv_type="CHAT", + first_seen=seen2, + last_seen=seen2, + ) + api_db_session.add_all([node1, node2]) + api_db_session.commit() + + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert ('public_key="multinode1multinode1multinode1mu"') in response.text + assert ('public_key="multinode2multinode2multinode2mu"') in response.text + + def test_nodes_with_location(self, api_db_session, client_no_auth): + """Test that nodes_with_location counts correctly.""" + node = Node( + public_key="locationtest1234locationtest1234", + name="GPS Node", + adv_type="CHAT", + lat=37.7749, + lon=-122.4194, + first_seen=datetime.now(timezone.utc), + last_seen=datetime.now(timezone.utc), + ) + api_db_session.add(node) + api_db_session.commit() + + _clear_metrics_cache() + response = client_no_auth.get("/metrics") + assert response.status_code == 200 + assert "meshcore_nodes_with_location 1.0" in response.text + + +class TestMetricsDisabled: + """Tests for when metrics are disabled.""" + + def test_metrics_404_when_disabled( + self, test_db_path, api_db_engine, mock_mqtt, mock_db_manager + ): + """Test that /metrics returns 404 when disabled.""" + db_url = f"sqlite:///{test_db_path}" + + with patch("meshcore_hub.api.app._db_manager", mock_db_manager): + app = create_app( + database_url=db_url, + metrics_enabled=False, + ) + + Session = sessionmaker(bind=api_db_engine) + + def override_get_db_manager(request=None): + return mock_db_manager + + def override_get_db_session(): + session = Session() + try: + yield session + finally: + session.close() + + def override_get_mqtt_client(request=None): + return mock_mqtt + + app.dependency_overrides[get_db_manager] = override_get_db_manager + app.dependency_overrides[get_db_session] = override_get_db_session + app.dependency_overrides[get_mqtt_client] = override_get_mqtt_client + + client = TestClient(app, raise_server_exceptions=True) + response = client.get("/metrics") + assert response.status_code == 404 + + +class TestMetricsCache: + """Tests for metrics caching behavior.""" + + def test_cache_returns_same_output(self, client_no_auth): + """Test that cached responses return the same content.""" + _clear_metrics_cache() + response1 = client_no_auth.get("/metrics") + response2 = client_no_auth.get("/metrics") + assert response1.text == response2.text