mirror of
https://github.com/ipnet-mesh/meshcore-hub.git
synced 2026-03-28 17:42:56 +01:00
Merge pull request #126 from ipnet-mesh/feat/prometheus
Add Prometheus metrics endpoint, Alertmanager, and 1h stats window
This commit is contained in:
19
.env.example
19
.env.example
@@ -190,6 +190,25 @@ API_PORT=8000
|
||||
API_READ_KEY=
|
||||
API_ADMIN_KEY=
|
||||
|
||||
# -------------------
|
||||
# Prometheus Metrics
|
||||
# -------------------
|
||||
# Prometheus metrics endpoint exposed at /metrics on the API service
|
||||
|
||||
# Enable Prometheus metrics endpoint
|
||||
# Default: true
|
||||
METRICS_ENABLED=true
|
||||
|
||||
# Seconds to cache metrics output (reduces database load)
|
||||
# Default: 60
|
||||
METRICS_CACHE_TTL=60
|
||||
|
||||
# External Prometheus port (when using --profile metrics)
|
||||
PROMETHEUS_PORT=9090
|
||||
|
||||
# External Alertmanager port (when using --profile metrics)
|
||||
ALERTMANAGER_PORT=9093
|
||||
|
||||
# =============================================================================
|
||||
# WEB DASHBOARD SETTINGS
|
||||
# =============================================================================
|
||||
|
||||
10
AGENTS.md
10
AGENTS.md
@@ -281,6 +281,7 @@ meshcore-hub/
|
||||
│ │ ├── app.py # FastAPI app
|
||||
│ │ ├── auth.py # Authentication
|
||||
│ │ ├── dependencies.py
|
||||
│ │ ├── metrics.py # Prometheus metrics endpoint
|
||||
│ │ └── routes/ # API routes
|
||||
│ │ ├── members.py # Member CRUD endpoints
|
||||
│ │ └── ...
|
||||
@@ -311,7 +312,12 @@ meshcore-hub/
|
||||
│ ├── env.py
|
||||
│ └── versions/
|
||||
├── etc/
|
||||
│ └── mosquitto.conf # MQTT broker configuration
|
||||
│ ├── mosquitto.conf # MQTT broker configuration
|
||||
│ ├── prometheus/ # Prometheus configuration
|
||||
│ │ ├── prometheus.yml # Scrape and alerting config
|
||||
│ │ └── alerts.yml # Alert rules
|
||||
│ └── alertmanager/ # Alertmanager configuration
|
||||
│ └── alertmanager.yml # Routing and receiver config
|
||||
├── example/
|
||||
│ ├── seed/ # Example seed data files
|
||||
│ │ ├── node_tags.yaml # Example node tags
|
||||
@@ -609,6 +615,8 @@ Key variables:
|
||||
- `WEB_AUTO_REFRESH_SECONDS` - Auto-refresh interval in seconds for list pages (default: `30`, `0` to disable)
|
||||
- `TZ` - Timezone for web dashboard date/time display (default: `UTC`, e.g., `America/New_York`, `Europe/London`)
|
||||
- `FEATURE_DASHBOARD`, `FEATURE_NODES`, `FEATURE_ADVERTISEMENTS`, `FEATURE_MESSAGES`, `FEATURE_MAP`, `FEATURE_MEMBERS`, `FEATURE_PAGES` - Feature flags to enable/disable specific web dashboard pages (default: all `true`). Dependencies: Dashboard auto-disables when all of Nodes/Advertisements/Messages are disabled. Map auto-disables when Nodes is disabled.
|
||||
- `METRICS_ENABLED` - Enable Prometheus metrics endpoint at /metrics (default: `true`)
|
||||
- `METRICS_CACHE_TTL` - Seconds to cache metrics output (default: `60`)
|
||||
- `LOG_LEVEL` - Logging verbosity
|
||||
|
||||
The database defaults to `sqlite:///{DATA_HOME}/collector/meshcore.db` and does not typically need to be configured.
|
||||
|
||||
@@ -185,6 +185,7 @@ Docker Compose uses **profiles** to select which services to run:
|
||||
| `mock` | interface-mock-receiver | Testing without hardware |
|
||||
| `migrate` | db-migrate | One-time database migration |
|
||||
| `seed` | seed | One-time seed data import |
|
||||
| `metrics` | prometheus, alertmanager | Prometheus metrics and alerting |
|
||||
|
||||
**Note:** Most deployments connect to an external MQTT broker. Add `--profile mqtt` only if you need a local broker.
|
||||
|
||||
@@ -337,6 +338,8 @@ The collector automatically cleans up old event data and inactive nodes:
|
||||
| `API_PORT` | `8000` | API port |
|
||||
| `API_READ_KEY` | *(none)* | Read-only API key |
|
||||
| `API_ADMIN_KEY` | *(none)* | Admin API key (required for commands) |
|
||||
| `METRICS_ENABLED` | `true` | Enable Prometheus metrics endpoint at `/metrics` |
|
||||
| `METRICS_CACHE_TTL` | `60` | Seconds to cache metrics output (reduces database load) |
|
||||
|
||||
### Web Dashboard Settings
|
||||
|
||||
@@ -541,6 +544,7 @@ Health check endpoints are also available:
|
||||
|
||||
- **Health**: http://localhost:8000/health
|
||||
- **Ready**: http://localhost:8000/health/ready (includes database check)
|
||||
- **Metrics**: http://localhost:8000/metrics (Prometheus format)
|
||||
|
||||
### Authentication
|
||||
|
||||
@@ -648,7 +652,7 @@ meshcore-hub/
|
||||
│ └── locales/ # Translation files (en.json, languages.md)
|
||||
├── tests/ # Test suite
|
||||
├── alembic/ # Database migrations
|
||||
├── etc/ # Configuration files (mosquitto.conf)
|
||||
├── etc/ # Configuration files (MQTT, Prometheus, Alertmanager)
|
||||
├── example/ # Example files for reference
|
||||
│ ├── seed/ # Example seed data files
|
||||
│ │ ├── node_tags.yaml # Example node tags
|
||||
|
||||
@@ -215,6 +215,8 @@ services:
|
||||
- API_PORT=8000
|
||||
- API_READ_KEY=${API_READ_KEY:-}
|
||||
- API_ADMIN_KEY=${API_ADMIN_KEY:-}
|
||||
- METRICS_ENABLED=${METRICS_ENABLED:-true}
|
||||
- METRICS_CACHE_TTL=${METRICS_CACHE_TTL:-60}
|
||||
command: ["api"]
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
|
||||
@@ -326,6 +328,48 @@ services:
|
||||
# Imports both node_tags.yaml and members.yaml if they exist
|
||||
command: ["collector", "seed"]
|
||||
|
||||
# ==========================================================================
|
||||
# Prometheus - Metrics collection and monitoring (optional, use --profile metrics)
|
||||
# ==========================================================================
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: meshcore-prometheus
|
||||
profiles:
|
||||
- all
|
||||
- metrics
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
api:
|
||||
condition: service_healthy
|
||||
ports:
|
||||
- "${PROMETHEUS_PORT:-9090}:9090"
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
volumes:
|
||||
- ./etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- ./etc/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
|
||||
- prometheus_data:/prometheus
|
||||
|
||||
# ==========================================================================
|
||||
# Alertmanager - Alert routing and notifications (optional, use --profile metrics)
|
||||
# ==========================================================================
|
||||
alertmanager:
|
||||
image: prom/alertmanager:latest
|
||||
container_name: meshcore-alertmanager
|
||||
profiles:
|
||||
- all
|
||||
- metrics
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "${ALERTMANAGER_PORT:-9093}:9093"
|
||||
volumes:
|
||||
- ./etc/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||||
- alertmanager_data:/alertmanager
|
||||
command:
|
||||
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
||||
- '--storage.path=/alertmanager'
|
||||
|
||||
# ==========================================================================
|
||||
# Volumes
|
||||
# ==========================================================================
|
||||
@@ -336,3 +380,7 @@ volumes:
|
||||
name: meshcore_mosquitto_data
|
||||
mosquitto_log:
|
||||
name: meshcore_mosquitto_log
|
||||
prometheus_data:
|
||||
name: meshcore_prometheus_data
|
||||
alertmanager_data:
|
||||
name: meshcore_alertmanager_data
|
||||
|
||||
35
etc/alertmanager/alertmanager.yml
Normal file
35
etc/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,35 @@
|
||||
# Alertmanager configuration for MeshCore Hub
|
||||
#
|
||||
# Default configuration routes all alerts to a "blackhole" receiver
|
||||
# (logs only, no external notifications).
|
||||
#
|
||||
# To receive notifications, configure a receiver below.
|
||||
# See: https://prometheus.io/docs/alerting/latest/configuration/
|
||||
#
|
||||
# Examples:
|
||||
#
|
||||
# Email:
|
||||
# receivers:
|
||||
# - name: 'email'
|
||||
# email_configs:
|
||||
# - to: 'admin@example.com'
|
||||
# from: 'alertmanager@example.com'
|
||||
# smarthost: 'smtp.example.com:587'
|
||||
# auth_username: 'alertmanager@example.com'
|
||||
# auth_password: 'password'
|
||||
#
|
||||
# Webhook (e.g. Slack incoming webhook, ntfy, Gotify):
|
||||
# receivers:
|
||||
# - name: 'webhook'
|
||||
# webhook_configs:
|
||||
# - url: 'https://example.com/webhook'
|
||||
|
||||
route:
|
||||
receiver: 'default'
|
||||
group_by: ['alertname']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
|
||||
receivers:
|
||||
- name: 'default'
|
||||
16
etc/prometheus/alerts.yml
Normal file
16
etc/prometheus/alerts.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
# Prometheus alert rules for MeshCore Hub
|
||||
#
|
||||
# These rules are evaluated by Prometheus and fired alerts are sent
|
||||
# to Alertmanager for routing and notification.
|
||||
|
||||
groups:
|
||||
- name: meshcore
|
||||
rules:
|
||||
- alert: NodeNotSeen
|
||||
expr: time() - meshcore_node_last_seen_timestamp_seconds > 48 * 3600
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Node {{ $labels.node_name }} not seen for 48+ hours"
|
||||
description: "Node {{ $labels.public_key }} ({{ $labels.adv_type }}) last seen {{ $value | humanizeDuration }} ago."
|
||||
29
etc/prometheus/prometheus.yml
Normal file
29
etc/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,29 @@
|
||||
# Prometheus scrape configuration for MeshCore Hub
|
||||
#
|
||||
# This file is used when running Prometheus via Docker Compose:
|
||||
# docker compose --profile core --profile metrics up -d
|
||||
#
|
||||
# The scrape interval matches the default metrics cache TTL (60s)
|
||||
# to avoid unnecessary database queries.
|
||||
|
||||
global:
|
||||
scrape_interval: 60s
|
||||
evaluation_interval: 60s
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: ['alertmanager:9093']
|
||||
|
||||
rule_files:
|
||||
- 'alerts.yml'
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'meshcore-hub'
|
||||
metrics_path: '/metrics'
|
||||
# Uncomment basic_auth if API_READ_KEY is configured
|
||||
# basic_auth:
|
||||
# username: 'metrics'
|
||||
# password: '<API_READ_KEY>'
|
||||
static_configs:
|
||||
- targets: ['api:8000']
|
||||
@@ -41,6 +41,7 @@ dependencies = [
|
||||
"pyyaml>=6.0.0",
|
||||
"python-frontmatter>=1.0.0",
|
||||
"markdown>=3.5.0",
|
||||
"prometheus-client>=0.20.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
@@ -116,6 +117,7 @@ module = [
|
||||
"meshcore.*",
|
||||
"frontmatter.*",
|
||||
"markdown.*",
|
||||
"prometheus_client.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
|
||||
@@ -54,6 +54,8 @@ def create_app(
|
||||
mqtt_prefix: str = "meshcore",
|
||||
mqtt_tls: bool = False,
|
||||
cors_origins: list[str] | None = None,
|
||||
metrics_enabled: bool = True,
|
||||
metrics_cache_ttl: int = 60,
|
||||
) -> FastAPI:
|
||||
"""Create and configure the FastAPI application.
|
||||
|
||||
@@ -66,6 +68,8 @@ def create_app(
|
||||
mqtt_prefix: MQTT topic prefix
|
||||
mqtt_tls: Enable TLS/SSL for MQTT connection
|
||||
cors_origins: Allowed CORS origins
|
||||
metrics_enabled: Enable Prometheus metrics endpoint at /metrics
|
||||
metrics_cache_ttl: Seconds to cache metrics output
|
||||
|
||||
Returns:
|
||||
Configured FastAPI application
|
||||
@@ -88,6 +92,7 @@ def create_app(
|
||||
app.state.mqtt_port = mqtt_port
|
||||
app.state.mqtt_prefix = mqtt_prefix
|
||||
app.state.mqtt_tls = mqtt_tls
|
||||
app.state.metrics_cache_ttl = metrics_cache_ttl
|
||||
|
||||
# Configure CORS
|
||||
if cors_origins is None:
|
||||
@@ -106,6 +111,12 @@ def create_app(
|
||||
|
||||
app.include_router(api_router, prefix="/api/v1")
|
||||
|
||||
# Include Prometheus metrics endpoint
|
||||
if metrics_enabled:
|
||||
from meshcore_hub.api.metrics import router as metrics_router
|
||||
|
||||
app.include_router(metrics_router)
|
||||
|
||||
# Health check endpoints
|
||||
@app.get("/health", tags=["Health"])
|
||||
async def health() -> dict:
|
||||
|
||||
@@ -81,6 +81,19 @@ import click
|
||||
envvar="CORS_ORIGINS",
|
||||
help="Comma-separated list of allowed CORS origins",
|
||||
)
|
||||
@click.option(
|
||||
"--metrics-enabled/--no-metrics",
|
||||
default=True,
|
||||
envvar="METRICS_ENABLED",
|
||||
help="Enable Prometheus metrics endpoint at /metrics",
|
||||
)
|
||||
@click.option(
|
||||
"--metrics-cache-ttl",
|
||||
type=int,
|
||||
default=60,
|
||||
envvar="METRICS_CACHE_TTL",
|
||||
help="Seconds to cache metrics output (reduces database load)",
|
||||
)
|
||||
@click.option(
|
||||
"--reload",
|
||||
is_flag=True,
|
||||
@@ -101,6 +114,8 @@ def api(
|
||||
mqtt_prefix: str,
|
||||
mqtt_tls: bool,
|
||||
cors_origins: str | None,
|
||||
metrics_enabled: bool,
|
||||
metrics_cache_ttl: int,
|
||||
reload: bool,
|
||||
) -> None:
|
||||
"""Run the REST API server.
|
||||
@@ -149,6 +164,8 @@ def api(
|
||||
click.echo(f"Read key configured: {read_key is not None}")
|
||||
click.echo(f"Admin key configured: {admin_key is not None}")
|
||||
click.echo(f"CORS origins: {cors_origins or 'none'}")
|
||||
click.echo(f"Metrics enabled: {metrics_enabled}")
|
||||
click.echo(f"Metrics cache TTL: {metrics_cache_ttl}s")
|
||||
click.echo(f"Reload mode: {reload}")
|
||||
click.echo("=" * 50)
|
||||
|
||||
@@ -181,6 +198,8 @@ def api(
|
||||
mqtt_prefix=mqtt_prefix,
|
||||
mqtt_tls=mqtt_tls,
|
||||
cors_origins=origins_list,
|
||||
metrics_enabled=metrics_enabled,
|
||||
metrics_cache_ttl=metrics_cache_ttl,
|
||||
)
|
||||
|
||||
click.echo("\nStarting API server...")
|
||||
|
||||
318
src/meshcore_hub/api/metrics.py
Normal file
318
src/meshcore_hub/api/metrics.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""Prometheus metrics endpoint for MeshCore Hub API."""
|
||||
|
||||
import base64
|
||||
import logging
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Request, Response
|
||||
from fastapi.responses import PlainTextResponse
|
||||
from prometheus_client import CollectorRegistry, Gauge, generate_latest
|
||||
from sqlalchemy import func, select
|
||||
|
||||
from meshcore_hub.common.models import (
|
||||
Advertisement,
|
||||
EventLog,
|
||||
Member,
|
||||
Message,
|
||||
Node,
|
||||
Telemetry,
|
||||
TracePath,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Module-level cache
|
||||
_cache: dict[str, Any] = {"output": b"", "expires_at": 0.0}
|
||||
|
||||
|
||||
def verify_basic_auth(request: Request) -> bool:
|
||||
"""Verify HTTP Basic Auth credentials for metrics endpoint.
|
||||
|
||||
Uses username 'metrics' and the API read key as password.
|
||||
Returns True if no read key is configured (public access).
|
||||
|
||||
Args:
|
||||
request: FastAPI request
|
||||
|
||||
Returns:
|
||||
True if authentication passes
|
||||
"""
|
||||
read_key = getattr(request.app.state, "read_key", None)
|
||||
|
||||
# No read key configured = public access
|
||||
if not read_key:
|
||||
return True
|
||||
|
||||
auth_header = request.headers.get("Authorization", "")
|
||||
if not auth_header.startswith("Basic "):
|
||||
return False
|
||||
|
||||
try:
|
||||
decoded = base64.b64decode(auth_header[6:]).decode("utf-8")
|
||||
username, password = decoded.split(":", 1)
|
||||
return username == "metrics" and password == read_key
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def collect_metrics(session: Any) -> bytes:
|
||||
"""Collect all metrics from the database and generate Prometheus output.
|
||||
|
||||
Creates a fresh CollectorRegistry per call to avoid global state issues.
|
||||
|
||||
Args:
|
||||
session: SQLAlchemy database session
|
||||
|
||||
Returns:
|
||||
Prometheus text exposition format as bytes
|
||||
"""
|
||||
from meshcore_hub import __version__
|
||||
|
||||
registry = CollectorRegistry()
|
||||
|
||||
# -- Info gauge --
|
||||
info_gauge = Gauge(
|
||||
"meshcore_info",
|
||||
"MeshCore Hub application info",
|
||||
["version"],
|
||||
registry=registry,
|
||||
)
|
||||
info_gauge.labels(version=__version__).set(1)
|
||||
|
||||
# -- Nodes total --
|
||||
nodes_total = Gauge(
|
||||
"meshcore_nodes_total",
|
||||
"Total number of nodes",
|
||||
registry=registry,
|
||||
)
|
||||
count = session.execute(select(func.count(Node.id))).scalar() or 0
|
||||
nodes_total.set(count)
|
||||
|
||||
# -- Nodes active by time window --
|
||||
nodes_active = Gauge(
|
||||
"meshcore_nodes_active",
|
||||
"Number of active nodes in time window",
|
||||
["window"],
|
||||
registry=registry,
|
||||
)
|
||||
for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]:
|
||||
cutoff = time.time() - (hours * 3600)
|
||||
from datetime import datetime, timezone
|
||||
|
||||
cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc)
|
||||
count = (
|
||||
session.execute(
|
||||
select(func.count(Node.id)).where(Node.last_seen >= cutoff_dt)
|
||||
).scalar()
|
||||
or 0
|
||||
)
|
||||
nodes_active.labels(window=window).set(count)
|
||||
|
||||
# -- Nodes by type --
|
||||
nodes_by_type = Gauge(
|
||||
"meshcore_nodes_by_type",
|
||||
"Number of nodes by advertisement type",
|
||||
["adv_type"],
|
||||
registry=registry,
|
||||
)
|
||||
type_counts = session.execute(
|
||||
select(Node.adv_type, func.count(Node.id)).group_by(Node.adv_type)
|
||||
).all()
|
||||
for adv_type, count in type_counts:
|
||||
nodes_by_type.labels(adv_type=adv_type or "unknown").set(count)
|
||||
|
||||
# -- Nodes with location --
|
||||
nodes_with_location = Gauge(
|
||||
"meshcore_nodes_with_location",
|
||||
"Number of nodes with GPS coordinates",
|
||||
registry=registry,
|
||||
)
|
||||
count = (
|
||||
session.execute(
|
||||
select(func.count(Node.id)).where(
|
||||
Node.lat.isnot(None), Node.lon.isnot(None)
|
||||
)
|
||||
).scalar()
|
||||
or 0
|
||||
)
|
||||
nodes_with_location.set(count)
|
||||
|
||||
# -- Node last seen timestamp --
|
||||
node_last_seen = Gauge(
|
||||
"meshcore_node_last_seen_timestamp_seconds",
|
||||
"Unix timestamp of when the node was last seen",
|
||||
["public_key", "node_name", "adv_type"],
|
||||
registry=registry,
|
||||
)
|
||||
nodes_with_last_seen = session.execute(
|
||||
select(Node.public_key, Node.name, Node.adv_type, Node.last_seen).where(
|
||||
Node.last_seen.isnot(None)
|
||||
)
|
||||
).all()
|
||||
for public_key, name, adv_type, last_seen in nodes_with_last_seen:
|
||||
node_last_seen.labels(
|
||||
public_key=public_key,
|
||||
node_name=name or "",
|
||||
adv_type=adv_type or "unknown",
|
||||
).set(last_seen.timestamp())
|
||||
|
||||
# -- Messages total by type --
|
||||
messages_total = Gauge(
|
||||
"meshcore_messages_total",
|
||||
"Total number of messages by type",
|
||||
["type"],
|
||||
registry=registry,
|
||||
)
|
||||
msg_type_counts = session.execute(
|
||||
select(Message.message_type, func.count(Message.id)).group_by(
|
||||
Message.message_type
|
||||
)
|
||||
).all()
|
||||
for msg_type, count in msg_type_counts:
|
||||
messages_total.labels(type=msg_type).set(count)
|
||||
|
||||
# -- Messages received by type and window --
|
||||
messages_received = Gauge(
|
||||
"meshcore_messages_received",
|
||||
"Messages received in time window by type",
|
||||
["type", "window"],
|
||||
registry=registry,
|
||||
)
|
||||
for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]:
|
||||
cutoff = time.time() - (hours * 3600)
|
||||
cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc)
|
||||
window_counts = session.execute(
|
||||
select(Message.message_type, func.count(Message.id))
|
||||
.where(Message.received_at >= cutoff_dt)
|
||||
.group_by(Message.message_type)
|
||||
).all()
|
||||
for msg_type, count in window_counts:
|
||||
messages_received.labels(type=msg_type, window=window).set(count)
|
||||
|
||||
# -- Advertisements total --
|
||||
advertisements_total = Gauge(
|
||||
"meshcore_advertisements_total",
|
||||
"Total number of advertisements",
|
||||
registry=registry,
|
||||
)
|
||||
count = session.execute(select(func.count(Advertisement.id))).scalar() or 0
|
||||
advertisements_total.set(count)
|
||||
|
||||
# -- Advertisements received by window --
|
||||
advertisements_received = Gauge(
|
||||
"meshcore_advertisements_received",
|
||||
"Advertisements received in time window",
|
||||
["window"],
|
||||
registry=registry,
|
||||
)
|
||||
for window, hours in [("1h", 1), ("24h", 24), ("7d", 168), ("30d", 720)]:
|
||||
cutoff = time.time() - (hours * 3600)
|
||||
cutoff_dt = datetime.fromtimestamp(cutoff, tz=timezone.utc)
|
||||
count = (
|
||||
session.execute(
|
||||
select(func.count(Advertisement.id)).where(
|
||||
Advertisement.received_at >= cutoff_dt
|
||||
)
|
||||
).scalar()
|
||||
or 0
|
||||
)
|
||||
advertisements_received.labels(window=window).set(count)
|
||||
|
||||
# -- Telemetry total --
|
||||
telemetry_total = Gauge(
|
||||
"meshcore_telemetry_total",
|
||||
"Total number of telemetry records",
|
||||
registry=registry,
|
||||
)
|
||||
count = session.execute(select(func.count(Telemetry.id))).scalar() or 0
|
||||
telemetry_total.set(count)
|
||||
|
||||
# -- Trace paths total --
|
||||
trace_paths_total = Gauge(
|
||||
"meshcore_trace_paths_total",
|
||||
"Total number of trace path records",
|
||||
registry=registry,
|
||||
)
|
||||
count = session.execute(select(func.count(TracePath.id))).scalar() or 0
|
||||
trace_paths_total.set(count)
|
||||
|
||||
# -- Events by type --
|
||||
events_total = Gauge(
|
||||
"meshcore_events_total",
|
||||
"Total events by type from event log",
|
||||
["event_type"],
|
||||
registry=registry,
|
||||
)
|
||||
event_counts = session.execute(
|
||||
select(EventLog.event_type, func.count(EventLog.id)).group_by(
|
||||
EventLog.event_type
|
||||
)
|
||||
).all()
|
||||
for event_type, count in event_counts:
|
||||
events_total.labels(event_type=event_type).set(count)
|
||||
|
||||
# -- Members total --
|
||||
members_total = Gauge(
|
||||
"meshcore_members_total",
|
||||
"Total number of network members",
|
||||
registry=registry,
|
||||
)
|
||||
count = session.execute(select(func.count(Member.id))).scalar() or 0
|
||||
members_total.set(count)
|
||||
|
||||
output: bytes = generate_latest(registry)
|
||||
return output
|
||||
|
||||
|
||||
@router.get("/metrics")
|
||||
async def metrics(request: Request) -> Response:
|
||||
"""Prometheus metrics endpoint.
|
||||
|
||||
Returns metrics in Prometheus text exposition format.
|
||||
Supports HTTP Basic Auth with username 'metrics' and API read key as password.
|
||||
Results are cached with a configurable TTL to reduce database load.
|
||||
"""
|
||||
# Check authentication
|
||||
if not verify_basic_auth(request):
|
||||
return PlainTextResponse(
|
||||
"Unauthorized",
|
||||
status_code=401,
|
||||
headers={"WWW-Authenticate": 'Basic realm="metrics"'},
|
||||
)
|
||||
|
||||
# Check cache
|
||||
cache_ttl = getattr(request.app.state, "metrics_cache_ttl", 60)
|
||||
now = time.time()
|
||||
|
||||
if _cache["output"] and now < _cache["expires_at"]:
|
||||
return Response(
|
||||
content=_cache["output"],
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8",
|
||||
)
|
||||
|
||||
# Collect fresh metrics
|
||||
try:
|
||||
from meshcore_hub.api.app import get_db_manager
|
||||
|
||||
db_manager = get_db_manager()
|
||||
with db_manager.session_scope() as session:
|
||||
output = collect_metrics(session)
|
||||
|
||||
# Update cache
|
||||
_cache["output"] = output
|
||||
_cache["expires_at"] = now + cache_ttl
|
||||
|
||||
return Response(
|
||||
content=output,
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8",
|
||||
)
|
||||
except Exception as e:
|
||||
logger.exception("Failed to collect metrics: %s", e)
|
||||
return PlainTextResponse(
|
||||
f"# Error collecting metrics: {e}\n",
|
||||
status_code=500,
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8",
|
||||
)
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime, timezone
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
@@ -81,6 +82,20 @@ def mock_db_manager(api_db_engine):
|
||||
manager = MagicMock(spec=DatabaseManager)
|
||||
Session = sessionmaker(bind=api_db_engine)
|
||||
manager.get_session = lambda: Session()
|
||||
|
||||
@contextmanager
|
||||
def _session_scope():
|
||||
session = Session()
|
||||
try:
|
||||
yield session
|
||||
session.commit()
|
||||
except Exception:
|
||||
session.rollback()
|
||||
raise
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
manager.session_scope = _session_scope
|
||||
return manager
|
||||
|
||||
|
||||
|
||||
317
tests/test_api/test_metrics.py
Normal file
317
tests/test_api/test_metrics.py
Normal file
@@ -0,0 +1,317 @@
|
||||
"""Tests for Prometheus metrics endpoint."""
|
||||
|
||||
import base64
|
||||
from datetime import datetime, timezone
|
||||
from unittest.mock import patch
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from meshcore_hub.api.app import create_app
|
||||
from meshcore_hub.api.dependencies import (
|
||||
get_db_manager,
|
||||
get_db_session,
|
||||
get_mqtt_client,
|
||||
)
|
||||
from meshcore_hub.common.models import Node
|
||||
|
||||
|
||||
def _make_basic_auth(username: str, password: str) -> str:
|
||||
"""Create a Basic auth header value."""
|
||||
credentials = base64.b64encode(f"{username}:{password}".encode()).decode()
|
||||
return f"Basic {credentials}"
|
||||
|
||||
|
||||
def _clear_metrics_cache() -> None:
|
||||
"""Clear the metrics module cache."""
|
||||
from meshcore_hub.api.metrics import _cache
|
||||
|
||||
_cache["output"] = b""
|
||||
_cache["expires_at"] = 0.0
|
||||
|
||||
|
||||
class TestMetricsEndpoint:
|
||||
"""Tests for basic metrics endpoint availability."""
|
||||
|
||||
def test_metrics_endpoint_available(self, client_no_auth):
|
||||
"""Test that /metrics endpoint returns 200."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
|
||||
def test_metrics_content_type(self, client_no_auth):
|
||||
"""Test that metrics returns correct content type."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert "text/plain" in response.headers["content-type"]
|
||||
|
||||
def test_metrics_contains_expected_names(self, client_no_auth):
|
||||
"""Test that metrics output contains expected metric names."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
content = response.text
|
||||
assert "meshcore_info" in content
|
||||
assert "meshcore_nodes_total" in content
|
||||
assert "meshcore_nodes_active" in content
|
||||
assert "meshcore_advertisements_total" in content
|
||||
assert "meshcore_telemetry_total" in content
|
||||
assert "meshcore_trace_paths_total" in content
|
||||
assert "meshcore_members_total" in content
|
||||
|
||||
def test_metrics_info_has_version(self, client_no_auth):
|
||||
"""Test that meshcore_info includes version label."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert 'meshcore_info{version="' in response.text
|
||||
|
||||
|
||||
class TestMetricsAuth:
|
||||
"""Tests for metrics endpoint authentication."""
|
||||
|
||||
def test_no_auth_when_no_read_key(self, client_no_auth):
|
||||
"""Test that no auth is required when no read key is configured."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
|
||||
def test_401_when_read_key_set_no_auth(self, client_with_auth):
|
||||
"""Test 401 when read key is set but no auth provided."""
|
||||
_clear_metrics_cache()
|
||||
response = client_with_auth.get("/metrics")
|
||||
assert response.status_code == 401
|
||||
assert "WWW-Authenticate" in response.headers
|
||||
|
||||
def test_success_with_correct_basic_auth(self, client_with_auth):
|
||||
"""Test successful auth with correct Basic credentials."""
|
||||
_clear_metrics_cache()
|
||||
response = client_with_auth.get(
|
||||
"/metrics",
|
||||
headers={"Authorization": _make_basic_auth("metrics", "test-read-key")},
|
||||
)
|
||||
assert response.status_code == 200
|
||||
|
||||
def test_fail_with_wrong_password(self, client_with_auth):
|
||||
"""Test 401 with incorrect password."""
|
||||
_clear_metrics_cache()
|
||||
response = client_with_auth.get(
|
||||
"/metrics",
|
||||
headers={"Authorization": _make_basic_auth("metrics", "wrong-key")},
|
||||
)
|
||||
assert response.status_code == 401
|
||||
|
||||
def test_fail_with_wrong_username(self, client_with_auth):
|
||||
"""Test 401 with incorrect username."""
|
||||
_clear_metrics_cache()
|
||||
response = client_with_auth.get(
|
||||
"/metrics",
|
||||
headers={
|
||||
"Authorization": _make_basic_auth("admin", "test-read-key"),
|
||||
},
|
||||
)
|
||||
assert response.status_code == 401
|
||||
|
||||
def test_fail_with_bearer_auth(self, client_with_auth):
|
||||
"""Test that Bearer auth does not work for metrics."""
|
||||
_clear_metrics_cache()
|
||||
response = client_with_auth.get(
|
||||
"/metrics",
|
||||
headers={"Authorization": "Bearer test-read-key"},
|
||||
)
|
||||
assert response.status_code == 401
|
||||
|
||||
|
||||
class TestMetricsData:
|
||||
"""Tests for metrics data accuracy."""
|
||||
|
||||
def test_nodes_total_reflects_database(self, client_no_auth, sample_node):
|
||||
"""Test that nodes_total matches actual node count."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
# Should have at least 1 node
|
||||
assert "meshcore_nodes_total 1.0" in response.text
|
||||
|
||||
def test_messages_total_reflects_database(self, client_no_auth, sample_message):
|
||||
"""Test that messages_total reflects database state."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
assert "meshcore_messages_total" in response.text
|
||||
|
||||
def test_advertisements_total_reflects_database(
|
||||
self, client_no_auth, sample_advertisement
|
||||
):
|
||||
"""Test that advertisements_total reflects database state."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
assert "meshcore_advertisements_total 1.0" in response.text
|
||||
|
||||
def test_members_total_reflects_database(self, client_no_auth, sample_member):
|
||||
"""Test that members_total reflects database state."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
assert "meshcore_members_total 1.0" in response.text
|
||||
|
||||
def test_nodes_by_type_has_labels(self, client_no_auth, sample_node):
|
||||
"""Test that nodes_by_type includes adv_type labels."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
assert 'meshcore_nodes_by_type{adv_type="REPEATER"}' in response.text
|
||||
|
||||
def test_telemetry_total_reflects_database(self, client_no_auth, sample_telemetry):
|
||||
"""Test that telemetry_total reflects database state."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
assert "meshcore_telemetry_total 1.0" in response.text
|
||||
|
||||
def test_trace_paths_total_reflects_database(
|
||||
self, client_no_auth, sample_trace_path
|
||||
):
|
||||
"""Test that trace_paths_total reflects database state."""
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
assert "meshcore_trace_paths_total 1.0" in response.text
|
||||
|
||||
def test_node_last_seen_timestamp_present(self, api_db_session, client_no_auth):
|
||||
"""Test that node_last_seen_timestamp is present for nodes with last_seen."""
|
||||
seen_at = datetime(2025, 6, 15, 12, 0, 0, tzinfo=timezone.utc)
|
||||
node = Node(
|
||||
public_key="lastseen1234lastseen1234lastseen",
|
||||
name="Seen Node",
|
||||
adv_type="REPEATER",
|
||||
first_seen=seen_at,
|
||||
last_seen=seen_at,
|
||||
)
|
||||
api_db_session.add(node)
|
||||
api_db_session.commit()
|
||||
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
# Labels are sorted alphabetically by prometheus_client
|
||||
assert (
|
||||
"meshcore_node_last_seen_timestamp_seconds"
|
||||
'{adv_type="REPEATER",'
|
||||
'node_name="Seen Node",'
|
||||
'public_key="lastseen1234lastseen1234lastseen"}'
|
||||
) in response.text
|
||||
|
||||
def test_node_last_seen_timestamp_skips_null(self, api_db_session, client_no_auth):
|
||||
"""Test that nodes with last_seen=None are excluded from the metric."""
|
||||
node = Node(
|
||||
public_key="neverseen1234neverseen1234neversx",
|
||||
name="Never Seen",
|
||||
adv_type="CLIENT",
|
||||
first_seen=datetime.now(timezone.utc),
|
||||
last_seen=None,
|
||||
)
|
||||
api_db_session.add(node)
|
||||
api_db_session.commit()
|
||||
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
assert "neverseen1234neverseen1234neversx" not in response.text
|
||||
|
||||
def test_node_last_seen_timestamp_multiple_nodes(
|
||||
self, api_db_session, client_no_auth
|
||||
):
|
||||
"""Test that multiple nodes each get their own labeled time series."""
|
||||
seen1 = datetime(2025, 6, 15, 10, 0, 0, tzinfo=timezone.utc)
|
||||
seen2 = datetime(2025, 6, 15, 11, 0, 0, tzinfo=timezone.utc)
|
||||
node1 = Node(
|
||||
public_key="multinode1multinode1multinode1mu",
|
||||
name="Node One",
|
||||
adv_type="REPEATER",
|
||||
first_seen=seen1,
|
||||
last_seen=seen1,
|
||||
)
|
||||
node2 = Node(
|
||||
public_key="multinode2multinode2multinode2mu",
|
||||
name="Node Two",
|
||||
adv_type="CHAT",
|
||||
first_seen=seen2,
|
||||
last_seen=seen2,
|
||||
)
|
||||
api_db_session.add_all([node1, node2])
|
||||
api_db_session.commit()
|
||||
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
assert ('public_key="multinode1multinode1multinode1mu"') in response.text
|
||||
assert ('public_key="multinode2multinode2multinode2mu"') in response.text
|
||||
|
||||
def test_nodes_with_location(self, api_db_session, client_no_auth):
|
||||
"""Test that nodes_with_location counts correctly."""
|
||||
node = Node(
|
||||
public_key="locationtest1234locationtest1234",
|
||||
name="GPS Node",
|
||||
adv_type="CHAT",
|
||||
lat=37.7749,
|
||||
lon=-122.4194,
|
||||
first_seen=datetime.now(timezone.utc),
|
||||
last_seen=datetime.now(timezone.utc),
|
||||
)
|
||||
api_db_session.add(node)
|
||||
api_db_session.commit()
|
||||
|
||||
_clear_metrics_cache()
|
||||
response = client_no_auth.get("/metrics")
|
||||
assert response.status_code == 200
|
||||
assert "meshcore_nodes_with_location 1.0" in response.text
|
||||
|
||||
|
||||
class TestMetricsDisabled:
|
||||
"""Tests for when metrics are disabled."""
|
||||
|
||||
def test_metrics_404_when_disabled(
|
||||
self, test_db_path, api_db_engine, mock_mqtt, mock_db_manager
|
||||
):
|
||||
"""Test that /metrics returns 404 when disabled."""
|
||||
db_url = f"sqlite:///{test_db_path}"
|
||||
|
||||
with patch("meshcore_hub.api.app._db_manager", mock_db_manager):
|
||||
app = create_app(
|
||||
database_url=db_url,
|
||||
metrics_enabled=False,
|
||||
)
|
||||
|
||||
Session = sessionmaker(bind=api_db_engine)
|
||||
|
||||
def override_get_db_manager(request=None):
|
||||
return mock_db_manager
|
||||
|
||||
def override_get_db_session():
|
||||
session = Session()
|
||||
try:
|
||||
yield session
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
def override_get_mqtt_client(request=None):
|
||||
return mock_mqtt
|
||||
|
||||
app.dependency_overrides[get_db_manager] = override_get_db_manager
|
||||
app.dependency_overrides[get_db_session] = override_get_db_session
|
||||
app.dependency_overrides[get_mqtt_client] = override_get_mqtt_client
|
||||
|
||||
client = TestClient(app, raise_server_exceptions=True)
|
||||
response = client.get("/metrics")
|
||||
assert response.status_code == 404
|
||||
|
||||
|
||||
class TestMetricsCache:
|
||||
"""Tests for metrics caching behavior."""
|
||||
|
||||
def test_cache_returns_same_output(self, client_no_auth):
|
||||
"""Test that cached responses return the same content."""
|
||||
_clear_metrics_cache()
|
||||
response1 = client_no_auth.get("/metrics")
|
||||
response2 = client_no_auth.get("/metrics")
|
||||
assert response1.text == response2.text
|
||||
Reference in New Issue
Block a user