mirror of
https://github.com/ajvpot/meshexplorer.git
synced 2026-06-22 10:54:44 +02:00
7cea182c6d
* ingest: batch ClickHouse inserts to stop MQTT flapping & packet loss The meshcore handler did a synchronous per-message ClickHouse insert on paho's single inbound goroutine. At ~86ms/insert (single-row inserts + async_insert wait + materialized views) the goroutine couldn't keep up with the high-volume letsmesh feed, so it stalled past PingTimeout and paho declared "pingresp not received" and reconnected — ~847 cycles in 19.5h, ~45% downtime, ~50% of letsmesh packets lost. The low-volume davekeogh broker never saturated the goroutine and was unaffected. Decouple receipt from insertion: the handler now enqueues decoded rows onto a buffered channel and a single background writer flushes them to meshcore_packets in batched native inserts (every MESHCORE_BATCH_FLUSH_SECONDS or MESHCORE_BATCH_MAX_ROWS rows). The inbound goroutine never blocks, so PINGRESP is always processed in time. - New batch writer with env-configurable flush interval / max rows / buffer size (MESHCORE_BATCH_*), wired in docker-compose. - Drop server-side async_insert (redundant once we batch app-side). - Bump PingTimeout 10s -> 20s (env MQTT_PING_TIMEOUT_SECONDS) for margin against Cloudflare WebSocket buffering jitter. - Enqueue is non-blocking; rows are dropped+counted only if the buffer fills (ClickHouse unavailable). A failed batch is dropped and retried by the next flush (native blocks commit atomically). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * ingest: make MQTT KeepAlive configurable (MQTT_KEEPALIVE_SECONDS) As a near-silent subscriber, paho emits a PINGREQ roughly every KeepAlive seconds; lowering it sends client->server frames more often to keep the Cloudflare-proxied WebSocket path warm in both directions, a lever for the residual mid-stream "pingresp not received" stalls on the letsmesh broker. Default unchanged (30s); wired through docker-compose. 
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * ingest: add configurable MQTT write timeout (MQTT_WRITE_TIMEOUT_SECONDS) Bounds PINGREQ/SUBSCRIBE writes so a stalled write through the Cloudflare WebSocket proxy can't hang the client. Default 0 (paho's existing no-timeout behavior); wired through docker-compose. Recommended ~20s when behind a buffering reverse proxy. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Alex Vanderpot <alex@Alexs-MacBook-Pro-2.local> Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
165 lines
5.0 KiB
YAML
165 lines
5.0 KiB
YAML
# Service definitions for the MeshExplorer stack. Every service joins the
# "meshnet" network; everything except ClickHouse itself waits for ClickHouse
# to report healthy before starting.
services:
  # ClickHouse database (custom image bundles the meshcore decrypt UDF)
  clickhouse:
    build: ./ingest/clickhouse
    environment:
      - CLICKHOUSE_DB=${CLICKHOUSE_DB:-default}
      - CLICKHOUSE_USER=${CLICKHOUSE_USER:-default}
      - CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD:-}
      - CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1
    ports:
      # Published to localhost only, for debugging. Not required by the stack.
      - "127.0.0.1:8123:8123"
      - "127.0.0.1:9000:9000"
    volumes:
      - clickhouse-data:/var/lib/clickhouse
    healthcheck:
      # "$$" is Compose's escape for a literal "$", so both variables are
      # expanded by the shell inside the container, not by Compose.
      # NOTE(review): the expansions are unquoted, so a password containing
      # whitespace or glob characters would be word-split -- confirm that
      # such passwords are never used, or quote the expansions.
      test:
        - CMD-SHELL
        - >-
          clickhouse-client
          --user $$CLICKHOUSE_USER
          --password $$CLICKHOUSE_PASSWORD
          --query 'SELECT 1'
          || exit 1
      interval: 5s
      timeout: 5s
      retries: 30
      start_period: 10s
    restart: unless-stopped
    networks:
      - meshnet

  # One-shot migration runner (goose). Applies the meshcore schema then exits.
  migrate:
    build: ./ingest
    # Flag/value pairs passed to the migrate binary; connects to ClickHouse
    # on port 9000 using the same credentials as the database service.
    command:
      - "./migrate"
      - "-host"
      - "clickhouse"
      - "-port"
      - "9000"
      - "-database"
      - "${CLICKHOUSE_DB:-default}"
      - "-username"
      - "${CLICKHOUSE_USER:-default}"
      - "-password"
      - "${CLICKHOUSE_PASSWORD:-}"
      - "-path"
      - "migrations"
      - "-action"
      - "up"
    depends_on:
      clickhouse:
        condition: service_healthy
    # Quoted so YAML keeps the string "no" instead of reading a boolean.
    restart: "no"
    networks:
      - meshnet

  # MeshCore MQTT -> ClickHouse ingest daemon
  meshcoreingest:
    build: ./ingest
    command: ["./meshcoreingest"]
    environment:
      - CLICKHOUSE_HOST=clickhouse
      - CLICKHOUSE_PORT=9000
      - CLICKHOUSE_DB=${CLICKHOUSE_DB:-default}
      - CLICKHOUSE_USER=${CLICKHOUSE_USER:-default}
      - CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD:-}
      # No default: must be supplied by the environment / .env file.
      - MQTT_BROKERS=${MQTT_BROKERS}
      - MQTT_CLIENT_ID=${MQTT_CLIENT_ID:-meshcore-ingest}
      - MQTT_STALE_AFTER_SECONDS=${MQTT_STALE_AFTER_SECONDS:-300}
      # MQTT keepalive; the client emits a PINGREQ roughly this often.
      - MQTT_KEEPALIVE_SECONDS=${MQTT_KEEPALIVE_SECONDS:-30}
      # How long to wait for a PINGRESP before the connection is declared
      # lost and the client reconnects.
      - MQTT_PING_TIMEOUT_SECONDS=${MQTT_PING_TIMEOUT_SECONDS:-20}
      # Bounds PINGREQ/SUBSCRIBE writes. 0 keeps the client's no-timeout
      # behavior; ~20 is recommended behind a buffering reverse proxy.
      - MQTT_WRITE_TIMEOUT_SECONDS=${MQTT_WRITE_TIMEOUT_SECONDS:-0}
      # Batched inserts: queued rows are flushed every FLUSH_SECONDS or once
      # MAX_ROWS are pending, whichever comes first. BUFFER is the size of
      # the in-memory queue; rows are dropped only if it fills up.
      - MESHCORE_BATCH_FLUSH_SECONDS=${MESHCORE_BATCH_FLUSH_SECONDS:-10}
      - MESHCORE_BATCH_MAX_ROWS=${MESHCORE_BATCH_MAX_ROWS:-5000}
      - MESHCORE_BATCH_BUFFER=${MESHCORE_BATCH_BUFFER:-50000}
    # Start only once ClickHouse is healthy and the migrations have exited
    # successfully.
    depends_on:
      clickhouse:
        condition: service_healthy
      migrate:
        condition: service_completed_successfully
    restart: unless-stopped
    networks:
      - meshnet

  # MeshExplorer web app (Next.js). Reads ClickHouse over HTTP (8123) as the
  # readonly user.
  meshexplorer:
    build:
      context: ./meshexplorer
      dockerfile: Dockerfile
    environment:
      - NODE_ENV=production
      - PORT=3000
      - HOSTNAME=0.0.0.0
      - CLICKHOUSE_HOST=clickhouse
      - CLICKHOUSE_PORT=8123
      - CLICKHOUSE_USER=${CLICKHOUSE_READONLY_USER:-readonly}
      - CLICKHOUSE_PASSWORD=${CLICKHOUSE_READONLY_PASSWORD:-readonly}
      - NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL:-}
    ports:
      # Host port 3001 (all interfaces) -> container port 3000. Host port
      # 3000 is taken by Grafana.
      - "3001:3000"
    depends_on:
      clickhouse:
        condition: service_healthy
      migrate:
        condition: service_completed_successfully
    restart: unless-stopped
    init: true
    networks:
      - meshnet

  # MeshCore -> Discord relay bot. Optional: enabled with the "bot" profile.
  #   docker compose --profile bot up
  discord-bot:
    build:
      context: ./meshexplorer
      dockerfile: Dockerfile.bot
    profiles: ["bot"]
    environment:
      - NODE_ENV=production
      - CLICKHOUSE_HOST=clickhouse
      - CLICKHOUSE_PORT=8123
      - CLICKHOUSE_USER=${CLICKHOUSE_READONLY_USER:-readonly}
      - CLICKHOUSE_PASSWORD=${CLICKHOUSE_READONLY_PASSWORD:-readonly}
      # No default: must be supplied by the environment / .env file.
      - DISCORD_WEBHOOK_URL=${DISCORD_WEBHOOK_URL}
      - DISCORD_THREAD_ID=${DISCORD_THREAD_ID:-}
      - MESH_REGION=${MESH_REGION:-SEA}
      - POLL_INTERVAL=${POLL_INTERVAL:-300}
      - MAX_ROWS_PER_POLL=${MAX_ROWS_PER_POLL:-50}
      - PRIVATE_KEYS=${PRIVATE_KEYS:-}
    depends_on:
      clickhouse:
        condition: service_healthy
      migrate:
        condition: service_completed_successfully
    restart: unless-stopped
    init: true
    networks:
      - meshnet

  # Grafana dashboards, wired to ClickHouse via the read-only user.
  grafana:
    image: grafana/grafana:12.1.1
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_INSTALL_PLUGINS=grafana-clickhouse-datasource
      - GF_USERS_DEFAULT_THEME=dark
      # Consumed by the provisioned ClickHouse datasource (grafana/provisioning).
      - CLICKHOUSE_READONLY_USER=${CLICKHOUSE_READONLY_USER:-readonly}
      - CLICKHOUSE_READONLY_PASSWORD=${CLICKHOUSE_READONLY_PASSWORD:-readonly}
    ports:
      # Published to localhost only.
      - "127.0.0.1:3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
      # Provisioning files are mounted read-only from the repository.
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    depends_on:
      clickhouse:
        condition: service_healthy
    restart: unless-stopped
    networks:
      - meshnet
# Named volumes with default settings. They hold the ClickHouse data directory
# (/var/lib/clickhouse) and Grafana's state (/var/lib/grafana).
volumes:
  clickhouse-data: {}
  grafana-data: {}
# Single bridge network that every service in this file attaches to.
networks:
  meshnet:
    driver: bridge