mirror of
https://github.com/pelgraine/Meck.git
synced 2026-05-16 06:15:54 +02:00
888 lines
32 KiB
C++
888 lines
32 KiB
C++
#pragma once
|
|
// =============================================================================
|
|
// EpubProcessor.h - Convert EPUB files to plain text for TextReaderScreen
|
|
//
|
|
// Pipeline: EPUB (ZIP) → container.xml → OPF spine → extract chapters →
|
|
// strip XHTML tags → concatenated plain text → cached .txt on SD
|
|
//
|
|
// The resulting .txt file is placed in /books/ and picked up automatically
|
|
// by TextReaderScreen's existing pagination, indexing, and bookmarking.
|
|
//
|
|
// Dependencies: EpubZipReader.h (for ZIP extraction)
|
|
// =============================================================================
|
|
|
|
#include <SD.h>
|
|
#include <FS.h>
|
|
#include "EpubZipReader.h"
|
|
#include "Utf8CP437.h"
|
|
|
|
// Maximum chapters in spine (most novels have 20-80)
|
|
#define EPUB_MAX_CHAPTERS 200
|
|
|
|
// Maximum manifest items we track
|
|
#define EPUB_MAX_MANIFEST 256
|
|
|
|
// Size of the path buffers filled from container.xml / the OPF location
// (the OPF path and its base directory). It is not the XML payload size.
|
|
#define EPUB_XML_BUF_SIZE 64
|
|
|
|
class EpubProcessor {
|
|
public:
|
|
|
|
// ----------------------------------------------------------
|
|
// Process an EPUB file: extract text and write to SD cache.
|
|
//
|
|
// epubPath: source, e.g. "/books/The Iliad.epub"
|
|
// txtPath: output, e.g. "/books/The Iliad by Homer.txt"
|
|
//
|
|
// Returns true if the .txt file was written successfully.
|
|
// If txtPath already exists, returns true immediately (cached).
|
|
// ----------------------------------------------------------
|
|
static bool processToText(const char* epubPath, const char* txtPath) {
|
|
// Check if already cached
|
|
if (SD.exists(txtPath)) {
|
|
Serial.printf("EpubProc: '%s' already cached\n", txtPath);
|
|
return true;
|
|
}
|
|
|
|
Serial.printf("EpubProc: Processing '%s'\n", epubPath);
|
|
unsigned long t0 = millis();
|
|
|
|
// Open the EPUB (ZIP archive)
|
|
File epubFile = SD.open(epubPath, FILE_READ);
|
|
if (!epubFile) {
|
|
Serial.println("EpubProc: Cannot open EPUB file");
|
|
return false;
|
|
}
|
|
|
|
// Heap-allocate zip reader (entries table is ~19KB)
|
|
EpubZipReader* zip = new EpubZipReader();
|
|
if (!zip) {
|
|
epubFile.close();
|
|
Serial.println("EpubProc: Cannot allocate ZipReader");
|
|
return false;
|
|
}
|
|
|
|
if (!zip->open(epubFile)) {
|
|
delete zip;
|
|
epubFile.close();
|
|
Serial.println("EpubProc: Cannot parse ZIP structure");
|
|
return false;
|
|
}
|
|
|
|
// Step 1: Find OPF path from container.xml
|
|
char opfPath[EPUB_XML_BUF_SIZE];
|
|
opfPath[0] = '\0';
|
|
if (!_findOpfPath(zip, opfPath, sizeof(opfPath))) {
|
|
delete zip;
|
|
epubFile.close();
|
|
Serial.println("EpubProc: Cannot find OPF path");
|
|
return false;
|
|
}
|
|
Serial.printf("EpubProc: OPF at '%s'\n", opfPath);
|
|
|
|
// Determine the content base directory (e.g., "OEBPS/")
|
|
char baseDir[EPUB_XML_BUF_SIZE];
|
|
_getDirectory(opfPath, baseDir, sizeof(baseDir));
|
|
|
|
// Step 2: Parse OPF to get title and spine chapter order
|
|
char title[128];
|
|
title[0] = '\0';
|
|
|
|
// Chapter paths in spine order
|
|
char** chapterPaths = nullptr;
|
|
int chapterCount = 0;
|
|
|
|
if (!_parseOpf(zip, opfPath, baseDir, title, sizeof(title),
|
|
&chapterPaths, &chapterCount)) {
|
|
delete zip;
|
|
epubFile.close();
|
|
Serial.println("EpubProc: Cannot parse OPF");
|
|
return false;
|
|
}
|
|
|
|
Serial.printf("EpubProc: Title='%s', %d chapters\n", title, chapterCount);
|
|
|
|
// Step 3: Extract each chapter, strip XHTML, write to output .txt
|
|
File outFile = SD.open(txtPath, FILE_WRITE);
|
|
if (!outFile) {
|
|
_freeChapterPaths(chapterPaths, chapterCount);
|
|
delete zip;
|
|
epubFile.close();
|
|
Serial.printf("EpubProc: Cannot create '%s'\n", txtPath);
|
|
return false;
|
|
}
|
|
|
|
// Write title as first line
|
|
if (title[0]) {
|
|
outFile.println(title);
|
|
outFile.println();
|
|
}
|
|
|
|
int chaptersWritten = 0;
|
|
uint32_t totalBytes = 0;
|
|
|
|
for (int i = 0; i < chapterCount; i++) {
|
|
int entryIdx = zip->findEntry(chapterPaths[i]);
|
|
if (entryIdx < 0) {
|
|
Serial.printf("EpubProc: Chapter not found: '%s'\n", chapterPaths[i]);
|
|
continue;
|
|
}
|
|
|
|
uint32_t rawSize = 0;
|
|
uint8_t* rawData = zip->extractEntry(entryIdx, &rawSize);
|
|
if (!rawData || rawSize == 0) {
|
|
Serial.printf("EpubProc: Failed to extract chapter %d\n", i);
|
|
if (rawData) free(rawData);
|
|
continue;
|
|
}
|
|
|
|
// Strip XHTML tags and write plain text
|
|
uint32_t textLen = 0;
|
|
uint8_t* plainText = _stripXhtml(rawData, rawSize, &textLen);
|
|
free(rawData);
|
|
|
|
if (plainText && textLen > 0) {
|
|
outFile.write(plainText, textLen);
|
|
// Add chapter separator
|
|
outFile.print("\n\n");
|
|
totalBytes += textLen + 2;
|
|
chaptersWritten++;
|
|
}
|
|
if (plainText) free(plainText);
|
|
}
|
|
|
|
outFile.flush();
|
|
outFile.close();
|
|
|
|
// Release SD CS for other SPI users
|
|
digitalWrite(SDCARD_CS, HIGH);
|
|
|
|
_freeChapterPaths(chapterPaths, chapterCount);
|
|
delete zip;
|
|
epubFile.close();
|
|
|
|
unsigned long elapsed = millis() - t0;
|
|
Serial.printf("EpubProc: Done! %d chapters, %u bytes in %lu ms -> '%s'\n",
|
|
chaptersWritten, totalBytes, elapsed, txtPath);
|
|
|
|
return chaptersWritten > 0;
|
|
}
|
|
|
|
// ----------------------------------------------------------
|
|
// Extract just the title from an EPUB (for display in file list).
|
|
// Returns false if it can't be determined.
|
|
// ----------------------------------------------------------
|
|
static bool getTitle(const char* epubPath, char* titleBuf, int titleBufSize) {
|
|
File epubFile = SD.open(epubPath, FILE_READ);
|
|
if (!epubFile) return false;
|
|
|
|
EpubZipReader* zip = new EpubZipReader();
|
|
if (!zip) { epubFile.close(); return false; }
|
|
|
|
if (!zip->open(epubFile)) {
|
|
delete zip; epubFile.close(); return false;
|
|
}
|
|
|
|
char opfPath[EPUB_XML_BUF_SIZE];
|
|
if (!_findOpfPath(zip, opfPath, sizeof(opfPath))) {
|
|
delete zip; epubFile.close(); return false;
|
|
}
|
|
|
|
// Extract OPF and find <dc:title>
|
|
int opfIdx = zip->findEntry(opfPath);
|
|
if (opfIdx < 0) { delete zip; epubFile.close(); return false; }
|
|
|
|
uint32_t opfSize = 0;
|
|
uint8_t* opfData = zip->extractEntry(opfIdx, &opfSize);
|
|
delete zip;
|
|
epubFile.close();
|
|
|
|
if (!opfData) return false;
|
|
|
|
bool found = _extractTagContent((const char*)opfData, opfSize,
|
|
"dc:title", titleBuf, titleBufSize);
|
|
free(opfData);
|
|
return found;
|
|
}
|
|
|
|
// ----------------------------------------------------------
|
|
// Build a cache .txt path from an .epub path.
|
|
// e.g., "/books/mybook.epub" -> "/books/.epub_cache/mybook.txt"
|
|
// ----------------------------------------------------------
|
|
static void buildCachePath(const char* epubPath, char* cachePath, int cachePathSize) {
|
|
// Extract filename without extension
|
|
const char* lastSlash = strrchr(epubPath, '/');
|
|
const char* filename = lastSlash ? lastSlash + 1 : epubPath;
|
|
|
|
// Find the directory part
|
|
char dir[128];
|
|
if (lastSlash) {
|
|
int dirLen = lastSlash - epubPath;
|
|
if (dirLen >= (int)sizeof(dir)) dirLen = sizeof(dir) - 1;
|
|
strncpy(dir, epubPath, dirLen);
|
|
dir[dirLen] = '\0';
|
|
} else {
|
|
strcpy(dir, "/books");
|
|
}
|
|
|
|
// Create cache directory if needed
|
|
char cacheDir[160];
|
|
snprintf(cacheDir, sizeof(cacheDir), "%s/.epub_cache", dir);
|
|
if (!SD.exists(cacheDir)) {
|
|
SD.mkdir(cacheDir);
|
|
}
|
|
|
|
// Strip .epub extension
|
|
char baseName[128];
|
|
strncpy(baseName, filename, sizeof(baseName) - 1);
|
|
baseName[sizeof(baseName) - 1] = '\0';
|
|
char* dot = strrchr(baseName, '.');
|
|
if (dot) *dot = '\0';
|
|
|
|
snprintf(cachePath, cachePathSize, "%s/%s.txt", cacheDir, baseName);
|
|
}
|
|
|
|
private:
|
|
|
|
// ----------------------------------------------------------
|
|
// Parse container.xml to find the OPF file path.
|
|
// Returns true if found.
|
|
// ----------------------------------------------------------
|
|
static bool _findOpfPath(EpubZipReader* zip, char* opfPath, int opfPathSize) {
|
|
int idx = zip->findEntry("META-INF/container.xml");
|
|
if (idx < 0) {
|
|
// Fallback: find any .opf file directly
|
|
idx = zip->findEntryBySuffix(".opf");
|
|
if (idx >= 0) {
|
|
const ZipEntry* e = zip->getEntry(idx);
|
|
strncpy(opfPath, e->filename, opfPathSize - 1);
|
|
opfPath[opfPathSize - 1] = '\0';
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
uint32_t size = 0;
|
|
uint8_t* data = zip->extractEntry(idx, &size);
|
|
if (!data) return false;
|
|
|
|
// Find: full-path="OEBPS/content.opf"
|
|
bool found = _extractAttribute((const char*)data, size,
|
|
"full-path", opfPath, opfPathSize);
|
|
free(data);
|
|
return found;
|
|
}
|
|
|
|
// ----------------------------------------------------------
|
|
// Parse OPF to extract title, build manifest, and resolve spine.
|
|
//
|
|
// Populates chapterPaths (heap-allocated array of strings) with
|
|
// full ZIP paths for each chapter in spine order.
|
|
// Caller must free with _freeChapterPaths().
|
|
// ----------------------------------------------------------
|
|
static bool _parseOpf(EpubZipReader* zip, const char* opfPath,
|
|
const char* baseDir, char* title, int titleSize,
|
|
char*** outChapterPaths, int* outChapterCount) {
|
|
int opfIdx = zip->findEntry(opfPath);
|
|
if (opfIdx < 0) return false;
|
|
|
|
uint32_t opfSize = 0;
|
|
uint8_t* opfData = zip->extractEntry(opfIdx, &opfSize);
|
|
if (!opfData) return false;
|
|
|
|
const char* xml = (const char*)opfData;
|
|
|
|
// Extract title
|
|
_extractTagContent(xml, opfSize, "dc:title", title, titleSize);
|
|
|
|
// Build manifest: map id -> href
|
|
// We use two parallel arrays to avoid complex data structures
|
|
struct ManifestItem {
|
|
char id[64];
|
|
char href[128];
|
|
bool isContent; // has media-type containing "html" or "xml"
|
|
};
|
|
|
|
// Heap-allocate manifest (could be large)
|
|
ManifestItem* manifest = (ManifestItem*)ps_malloc(
|
|
EPUB_MAX_MANIFEST * sizeof(ManifestItem));
|
|
if (!manifest) {
|
|
manifest = (ManifestItem*)malloc(EPUB_MAX_MANIFEST * sizeof(ManifestItem));
|
|
}
|
|
if (!manifest) {
|
|
free(opfData);
|
|
return false;
|
|
}
|
|
int manifestCount = 0;
|
|
|
|
// Parse <item> elements from <manifest>
|
|
const char* manifestStart = _findTag(xml, opfSize, "<manifest");
|
|
const char* manifestEnd = manifestStart ?
|
|
_findTag(manifestStart, opfSize - (manifestStart - xml), "</manifest") : nullptr;
|
|
if (!manifestEnd) manifestEnd = xml + opfSize;
|
|
|
|
if (manifestStart) {
|
|
const char* pos = manifestStart;
|
|
while (pos < manifestEnd && manifestCount < EPUB_MAX_MANIFEST) {
|
|
pos = _findTag(pos, manifestEnd - pos, "<item");
|
|
if (!pos || pos >= manifestEnd) break;
|
|
|
|
// Find the closing > of this <item ... />
|
|
const char* tagEnd = (const char*)memchr(pos, '>', manifestEnd - pos);
|
|
if (!tagEnd) break;
|
|
tagEnd++;
|
|
|
|
ManifestItem& item = manifest[manifestCount];
|
|
item.id[0] = '\0';
|
|
item.href[0] = '\0';
|
|
item.isContent = false;
|
|
|
|
_extractAttributeFromTag(pos, tagEnd - pos, "id",
|
|
item.id, sizeof(item.id));
|
|
_extractAttributeFromTag(pos, tagEnd - pos, "href",
|
|
item.href, sizeof(item.href));
|
|
|
|
// Check media-type for content files
|
|
char mediaType[64];
|
|
mediaType[0] = '\0';
|
|
_extractAttributeFromTag(pos, tagEnd - pos, "media-type",
|
|
mediaType, sizeof(mediaType));
|
|
item.isContent = (strstr(mediaType, "html") != nullptr ||
|
|
strstr(mediaType, "xml") != nullptr);
|
|
|
|
if (item.id[0] && item.href[0]) {
|
|
manifestCount++;
|
|
}
|
|
|
|
pos = tagEnd;
|
|
}
|
|
}
|
|
|
|
Serial.printf("EpubProc: Manifest has %d items\n", manifestCount);
|
|
|
|
// Parse <spine> to get reading order
|
|
// Spine contains <itemref idref="..."/> elements
|
|
const char* spineStart = _findTag(xml, opfSize, "<spine");
|
|
const char* spineEnd = spineStart ?
|
|
_findTag(spineStart, opfSize - (spineStart - xml), "</spine") : nullptr;
|
|
if (!spineEnd) spineEnd = xml + opfSize;
|
|
|
|
// Collect spine idrefs
|
|
char** chapterPaths = (char**)ps_malloc(EPUB_MAX_CHAPTERS * sizeof(char*));
|
|
if (!chapterPaths) chapterPaths = (char**)malloc(EPUB_MAX_CHAPTERS * sizeof(char*));
|
|
if (!chapterPaths) {
|
|
free(manifest);
|
|
free(opfData);
|
|
return false;
|
|
}
|
|
int chapterCount = 0;
|
|
|
|
if (spineStart) {
|
|
const char* pos = spineStart;
|
|
while (pos < spineEnd && chapterCount < EPUB_MAX_CHAPTERS) {
|
|
pos = _findTag(pos, spineEnd - pos, "<itemref");
|
|
if (!pos || pos >= spineEnd) break;
|
|
|
|
const char* tagEnd = (const char*)memchr(pos, '>', spineEnd - pos);
|
|
if (!tagEnd) break;
|
|
tagEnd++;
|
|
|
|
char idref[64];
|
|
idref[0] = '\0';
|
|
_extractAttributeFromTag(pos, tagEnd - pos, "idref",
|
|
idref, sizeof(idref));
|
|
|
|
if (idref[0]) {
|
|
// Look up in manifest
|
|
for (int m = 0; m < manifestCount; m++) {
|
|
if (strcmp(manifest[m].id, idref) == 0 && manifest[m].isContent) {
|
|
// Build full path: baseDir + href
|
|
int pathLen = strlen(baseDir) + strlen(manifest[m].href) + 1;
|
|
char* fullPath = (char*)malloc(pathLen);
|
|
if (fullPath) {
|
|
snprintf(fullPath, pathLen, "%s%s", baseDir, manifest[m].href);
|
|
chapterPaths[chapterCount++] = fullPath;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
pos = tagEnd;
|
|
}
|
|
}
|
|
|
|
free(manifest);
|
|
free(opfData);
|
|
|
|
*outChapterPaths = chapterPaths;
|
|
*outChapterCount = chapterCount;
|
|
|
|
return chapterCount > 0;
|
|
}
|
|
|
|
// ----------------------------------------------------------
|
|
// Strip XHTML/HTML tags from raw content, producing plain text.
|
|
//
|
|
// Handles:
|
|
// - Tag removal (everything between < and >)
|
|
// - <p>, <br>, <div>, <h1>-<h6> → newlines
|
|
// - HTML entity decoding (& < > " ' &#NNN; &#xHH;)
|
|
// - Collapse multiple whitespace/newlines
|
|
// - Skip <head>, <style>, <script> content entirely
|
|
//
|
|
// Returns heap-allocated buffer (caller must free).
|
|
// ----------------------------------------------------------
|
|
static uint8_t* _stripXhtml(const uint8_t* input, uint32_t inputLen,
|
|
uint32_t* outLen) {
|
|
// Output can't be larger than input
|
|
uint8_t* output = (uint8_t*)ps_malloc(inputLen + 1);
|
|
if (!output) output = (uint8_t*)malloc(inputLen + 1);
|
|
if (!output) { *outLen = 0; return nullptr; }
|
|
|
|
uint32_t outPos = 0;
|
|
bool inTag = false;
|
|
bool skipContent = false; // Inside <head>, <style>, <script>
|
|
char tagName[32];
|
|
int tagNamePos = 0;
|
|
bool tagNameDone = false;
|
|
bool isClosingTag = false;
|
|
bool lastWasNewline = false;
|
|
bool lastWasSpace = false;
|
|
|
|
// Skip to <body> if present (ignore everything before it)
|
|
const uint8_t* start = input;
|
|
const uint8_t* inputEnd = input + inputLen;
|
|
const char* bodyStart = _findTagCI((const char*)input, inputLen, "<body");
|
|
if (bodyStart) {
|
|
const char* bodyTagEnd = (const char*)memchr(bodyStart, '>',
|
|
inputEnd - (const uint8_t*)bodyStart);
|
|
if (bodyTagEnd) {
|
|
start = (const uint8_t*)(bodyTagEnd + 1);
|
|
}
|
|
}
|
|
const uint8_t* end = inputEnd;
|
|
|
|
for (const uint8_t* p = start; p < end; p++) {
|
|
char c = (char)*p;
|
|
|
|
if (inTag) {
|
|
// Collecting tag name
|
|
if (!tagNameDone) {
|
|
if (tagNamePos == 0 && c == '/') {
|
|
isClosingTag = true;
|
|
continue;
|
|
}
|
|
if (c == '>' || c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '/') {
|
|
tagName[tagNamePos] = '\0';
|
|
tagNameDone = true;
|
|
} else if (tagNamePos < (int)sizeof(tagName) - 1) {
|
|
tagName[tagNamePos++] = (c >= 'A' && c <= 'Z') ? (c + 32) : c;
|
|
}
|
|
}
|
|
|
|
if (c == '>') {
|
|
inTag = false;
|
|
|
|
// Handle skip regions
|
|
if (!isClosingTag) {
|
|
if (strcmp(tagName, "head") == 0 ||
|
|
strcmp(tagName, "style") == 0 ||
|
|
strcmp(tagName, "script") == 0) {
|
|
skipContent = true;
|
|
}
|
|
} else {
|
|
if (strcmp(tagName, "head") == 0 ||
|
|
strcmp(tagName, "style") == 0 ||
|
|
strcmp(tagName, "script") == 0) {
|
|
skipContent = false;
|
|
}
|
|
}
|
|
|
|
if (!skipContent) {
|
|
// Block-level elements produce newlines
|
|
if (strcmp(tagName, "p") == 0 ||
|
|
strcmp(tagName, "div") == 0 ||
|
|
strcmp(tagName, "br") == 0 ||
|
|
strcmp(tagName, "h1") == 0 ||
|
|
strcmp(tagName, "h2") == 0 ||
|
|
strcmp(tagName, "h3") == 0 ||
|
|
strcmp(tagName, "h4") == 0 ||
|
|
strcmp(tagName, "h5") == 0 ||
|
|
strcmp(tagName, "h6") == 0 ||
|
|
strcmp(tagName, "li") == 0 ||
|
|
strcmp(tagName, "tr") == 0 ||
|
|
strcmp(tagName, "blockquote") == 0 ||
|
|
strcmp(tagName, "hr") == 0) {
|
|
if (outPos > 0 && !lastWasNewline) {
|
|
output[outPos++] = '\n';
|
|
lastWasNewline = true;
|
|
lastWasSpace = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
continue;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// Not in a tag
|
|
if (c == '<') {
|
|
inTag = true;
|
|
tagNamePos = 0;
|
|
tagNameDone = false;
|
|
isClosingTag = false;
|
|
continue;
|
|
}
|
|
|
|
if (skipContent) continue;
|
|
|
|
// Handle HTML entities
|
|
if (c == '&') {
|
|
char decoded = _decodeEntity(p, end, &p);
|
|
if (decoded) {
|
|
c = decoded;
|
|
// p now points to the ';' or last char of entity; loop will increment
|
|
}
|
|
}
|
|
|
|
// Handle UTF-8 multi-byte sequences (smart quotes, em dashes, accented chars, etc.)
|
|
// These appear as raw bytes in XHTML. Typographic chars are mapped to ASCII;
|
|
// accented Latin chars are preserved as UTF-8 for CP437 rendering on e-ink.
|
|
if ((uint8_t)c >= 0xC0) {
|
|
uint32_t codepoint = 0;
|
|
int extraBytes = 0;
|
|
|
|
if (((uint8_t)c & 0xE0) == 0xC0) {
|
|
// 2-byte sequence: 110xxxxx 10xxxxxx
|
|
codepoint = (uint8_t)c & 0x1F;
|
|
extraBytes = 1;
|
|
} else if (((uint8_t)c & 0xF0) == 0xE0) {
|
|
// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
|
|
codepoint = (uint8_t)c & 0x0F;
|
|
extraBytes = 2;
|
|
} else if (((uint8_t)c & 0xF8) == 0xF0) {
|
|
// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
codepoint = (uint8_t)c & 0x07;
|
|
extraBytes = 3;
|
|
}
|
|
|
|
// Read continuation bytes
|
|
bool valid = true;
|
|
for (int b = 0; b < extraBytes && p + 1 + b < end; b++) {
|
|
uint8_t cb = *(p + 1 + b);
|
|
if ((cb & 0xC0) != 0x80) { valid = false; break; }
|
|
codepoint = (codepoint << 6) | (cb & 0x3F);
|
|
}
|
|
|
|
if (valid && extraBytes > 0) {
|
|
p += extraBytes; // Skip continuation bytes (loop increments past lead byte)
|
|
|
|
// Map Unicode codepoints to displayable equivalents
|
|
// Typographic chars → ASCII, accented chars → preserved as UTF-8
|
|
char mapped = 0;
|
|
switch (codepoint) {
|
|
case 0x2018: case 0x2019: mapped = '\''; break; // Smart single quotes
|
|
case 0x201C: case 0x201D: mapped = '"'; break; // Smart double quotes
|
|
case 0x2013: case 0x2014: mapped = '-'; break; // En/em dash
|
|
case 0x2026: mapped = '.'; break; // Ellipsis
|
|
case 0x2022: mapped = '*'; break; // Bullet
|
|
case 0x00A0: mapped = ' '; break; // Non-breaking space
|
|
case 0x00AB: case 0x00BB: mapped = '"'; break; // Guillemets
|
|
case 0x2032: mapped = '\''; break; // Prime
|
|
case 0x2033: mapped = '"'; break; // Double prime
|
|
case 0x2010: case 0x2011: mapped = '-'; break; // Hyphens
|
|
case 0x2012: mapped = '-'; break; // Figure dash
|
|
case 0x2015: mapped = '-'; break; // Horizontal bar
|
|
case 0x2039: case 0x203A: mapped = '\''; break; // Single guillemets
|
|
default:
|
|
if (codepoint >= 0x20 && codepoint < 0x7F) {
|
|
mapped = (char)codepoint; // Basic ASCII range
|
|
} else if (unicodeToCP437(codepoint)) {
|
|
// Accented character that the e-ink font can render via CP437.
|
|
// Preserve as UTF-8 in the output; the text reader will decode
|
|
// and map to CP437 at render time.
|
|
if (codepoint <= 0x7FF) {
|
|
output[outPos++] = 0xC0 | (codepoint >> 6);
|
|
output[outPos++] = 0x80 | (codepoint & 0x3F);
|
|
} else if (codepoint <= 0xFFFF) {
|
|
output[outPos++] = 0xE0 | (codepoint >> 12);
|
|
output[outPos++] = 0x80 | ((codepoint >> 6) & 0x3F);
|
|
output[outPos++] = 0x80 | (codepoint & 0x3F);
|
|
}
|
|
lastWasNewline = false;
|
|
lastWasSpace = false;
|
|
continue; // Already wrote to output
|
|
} else {
|
|
continue; // Skip unmappable characters
|
|
}
|
|
break;
|
|
}
|
|
c = mapped;
|
|
} else {
|
|
continue; // Skip malformed UTF-8
|
|
}
|
|
} else if ((uint8_t)c >= 0x80) {
|
|
// Stray continuation byte (0x80-0xBF) — skip
|
|
continue;
|
|
}
|
|
|
|
// Whitespace collapsing
|
|
if (c == '\n' || c == '\r') {
|
|
if (!lastWasNewline && outPos > 0) {
|
|
output[outPos++] = '\n';
|
|
lastWasNewline = true;
|
|
lastWasSpace = false;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
if (c == ' ' || c == '\t') {
|
|
if (!lastWasSpace && !lastWasNewline && outPos > 0) {
|
|
output[outPos++] = ' ';
|
|
lastWasSpace = true;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// Regular character
|
|
output[outPos++] = c;
|
|
lastWasNewline = false;
|
|
lastWasSpace = false;
|
|
}
|
|
|
|
// Trim trailing whitespace
|
|
while (outPos > 0 && (output[outPos-1] == '\n' || output[outPos-1] == ' ')) {
|
|
outPos--;
|
|
}
|
|
|
|
output[outPos] = '\0';
|
|
*outLen = outPos;
|
|
return output;
|
|
}
|
|
|
|
// ----------------------------------------------------------
|
|
// Decode an HTML entity starting at '&'.
|
|
// Advances *pos to the last character consumed.
|
|
// Returns the decoded character, or '&' if not recognized.
|
|
// ----------------------------------------------------------
|
|
static char _decodeEntity(const uint8_t* p, const uint8_t* end,
|
|
const uint8_t** outPos) {
|
|
// Look for ';' within a reasonable range
|
|
const uint8_t* semi = p + 1;
|
|
int maxLen = 10;
|
|
while (semi < end && semi < p + maxLen && *semi != ';') semi++;
|
|
|
|
if (*semi != ';' || semi >= end) {
|
|
*outPos = p; // Not an entity, return '&' literal
|
|
return '&';
|
|
}
|
|
|
|
int entityLen = semi - p - 1; // Length between & and ;
|
|
const char* entity = (const char*)(p + 1);
|
|
|
|
*outPos = semi; // Skip past ';'
|
|
|
|
// Named entities
|
|
if (entityLen == 3 && strncmp(entity, "amp", 3) == 0) return '&';
|
|
if (entityLen == 2 && strncmp(entity, "lt", 2) == 0) return '<';
|
|
if (entityLen == 2 && strncmp(entity, "gt", 2) == 0) return '>';
|
|
if (entityLen == 4 && strncmp(entity, "quot", 4) == 0) return '"';
|
|
if (entityLen == 4 && strncmp(entity, "apos", 4) == 0) return '\'';
|
|
if (entityLen == 4 && strncmp(entity, "nbsp", 4) == 0) return ' ';
|
|
if (entityLen == 5 && strncmp(entity, "mdash", 5) == 0) return '-';
|
|
if (entityLen == 5 && strncmp(entity, "ndash", 5) == 0) return '-';
|
|
if (entityLen == 6 && strncmp(entity, "hellip", 6) == 0) return '.';
|
|
if (entityLen == 5 && strncmp(entity, "lsquo", 5) == 0) return '\'';
|
|
if (entityLen == 5 && strncmp(entity, "rsquo", 5) == 0) return '\'';
|
|
if (entityLen == 5 && strncmp(entity, "ldquo", 5) == 0) return '"';
|
|
if (entityLen == 5 && strncmp(entity, "rdquo", 5) == 0) return '"';
|
|
|
|
// Common accented character entities → CP437 bytes for built-in font
|
|
if (entityLen == 6 && strncmp(entity, "eacute", 6) == 0) return (char)0x82; // é
|
|
if (entityLen == 6 && strncmp(entity, "egrave", 6) == 0) return (char)0x8A; // è
|
|
if (entityLen == 5 && strncmp(entity, "ecirc", 5) == 0) return (char)0x88; // ê
|
|
if (entityLen == 4 && strncmp(entity, "euml", 4) == 0) return (char)0x89; // ë
|
|
if (entityLen == 6 && strncmp(entity, "agrave", 6) == 0) return (char)0x85; // à
|
|
if (entityLen == 6 && strncmp(entity, "aacute", 6) == 0) return (char)0xA0; // á
|
|
if (entityLen == 5 && strncmp(entity, "acirc", 5) == 0) return (char)0x83; // â
|
|
if (entityLen == 4 && strncmp(entity, "auml", 4) == 0) return (char)0x84; // ä
|
|
if (entityLen == 6 && strncmp(entity, "ccedil", 6) == 0) return (char)0x87; // ç
|
|
if (entityLen == 6 && strncmp(entity, "iacute", 6) == 0) return (char)0xA1; // í
|
|
if (entityLen == 5 && strncmp(entity, "icirc", 5) == 0) return (char)0x8C; // î
|
|
if (entityLen == 4 && strncmp(entity, "iuml", 4) == 0) return (char)0x8B; // ï
|
|
if (entityLen == 6 && strncmp(entity, "igrave", 6) == 0) return (char)0x8D; // ì
|
|
if (entityLen == 6 && strncmp(entity, "oacute", 6) == 0) return (char)0xA2; // ó
|
|
if (entityLen == 5 && strncmp(entity, "ocirc", 5) == 0) return (char)0x93; // ô
|
|
if (entityLen == 4 && strncmp(entity, "ouml", 4) == 0) return (char)0x94; // ö
|
|
if (entityLen == 6 && strncmp(entity, "ograve", 6) == 0) return (char)0x95; // ò
|
|
if (entityLen == 6 && strncmp(entity, "uacute", 6) == 0) return (char)0xA3; // ú
|
|
if (entityLen == 5 && strncmp(entity, "ucirc", 5) == 0) return (char)0x96; // û
|
|
if (entityLen == 4 && strncmp(entity, "uuml", 4) == 0) return (char)0x81; // ü
|
|
if (entityLen == 6 && strncmp(entity, "ugrave", 6) == 0) return (char)0x97; // ù
|
|
if (entityLen == 6 && strncmp(entity, "ntilde", 6) == 0) return (char)0xA4; // ñ
|
|
if (entityLen == 6 && strncmp(entity, "Eacute", 6) == 0) return (char)0x90; // É
|
|
if (entityLen == 6 && strncmp(entity, "Ccedil", 6) == 0) return (char)0x80; // Ç
|
|
if (entityLen == 6 && strncmp(entity, "Ntilde", 6) == 0) return (char)0xA5; // Ñ
|
|
if (entityLen == 4 && strncmp(entity, "Auml", 4) == 0) return (char)0x8E; // Ä
|
|
if (entityLen == 4 && strncmp(entity, "Ouml", 4) == 0) return (char)0x99; // Ö
|
|
if (entityLen == 4 && strncmp(entity, "Uuml", 4) == 0) return (char)0x9A; // Ü
|
|
if (entityLen == 5 && strncmp(entity, "szlig", 5) == 0) return (char)0xE1; // ß
|
|
|
|
// Numeric entities: &#NNN; or &#xHH;
|
|
if (entityLen >= 2 && entity[0] == '#') {
|
|
int codepoint = 0;
|
|
if (entity[1] == 'x' || entity[1] == 'X') {
|
|
// Hex
|
|
for (int i = 2; i < entityLen; i++) {
|
|
char ch = entity[i];
|
|
if (ch >= '0' && ch <= '9') codepoint = codepoint * 16 + (ch - '0');
|
|
else if (ch >= 'a' && ch <= 'f') codepoint = codepoint * 16 + (ch - 'a' + 10);
|
|
else if (ch >= 'A' && ch <= 'F') codepoint = codepoint * 16 + (ch - 'A' + 10);
|
|
}
|
|
} else {
|
|
// Decimal
|
|
for (int i = 1; i < entityLen; i++) {
|
|
char ch = entity[i];
|
|
if (ch >= '0' && ch <= '9') codepoint = codepoint * 10 + (ch - '0');
|
|
}
|
|
}
|
|
// Map to displayable character (best effort)
|
|
if (codepoint >= 32 && codepoint < 127) return (char)codepoint;
|
|
if (codepoint == 160) return ' '; // non-breaking space
|
|
// Try CP437 mapping for accented characters.
|
|
// The byte value will be passed through to the built-in font.
|
|
uint8_t cp437 = unicodeToCP437(codepoint);
|
|
if (cp437) return (char)cp437;
|
|
// Unknown codepoint > 127: skip it
|
|
return ' ';
|
|
}
|
|
|
|
// Unknown entity - output as space
|
|
return ' ';
|
|
}
|
|
|
|
// ----------------------------------------------------------
// Find a tag in XML data (case-sensitive, e.g., "<manifest").
//
//   data / dataLen: region to search (need not be NUL-terminated)
//   tag:            NUL-terminated byte pattern to look for
//
// Returns pointer to the first occurrence (the '<' of the found tag),
// or nullptr if absent, if an argument is null, or if the region is
// shorter than the pattern.
// ----------------------------------------------------------
static const char* _findTag(const char* data, int dataLen, const char* tag) {
  if (!data || !tag) return nullptr;

  const int tagLen = (int)strlen(tag);
  // Reject short regions up front: computing data + (dataLen - tagLen)
  // with a negative offset would form a pointer before the buffer.
  if (dataLen < tagLen) return nullptr;

  const char* last = data + (dataLen - tagLen);  // last viable start
  for (const char* p = data; p <= last; p++) {
    if (memcmp(p, tag, tagLen) == 0) return p;
  }
  return nullptr;
}
|
|
|
|
// ----------------------------------------------------------
// Find a tag case-insensitively (for <body>, <BODY>, etc.).
//
//   data / dataLen: region to search
//   tag:            NUL-terminated pattern, compared with strncasecmp
//
// Returns pointer to the first match, or nullptr if absent, if an
// argument is null, or if the region is shorter than the pattern.
// ----------------------------------------------------------
static const char* _findTagCI(const char* data, int dataLen, const char* tag) {
  if (!data || !tag) return nullptr;

  const int tagLen = (int)strlen(tag);
  // Reject short regions up front: computing data + (dataLen - tagLen)
  // with a negative offset would form a pointer before the buffer.
  if (dataLen < tagLen) return nullptr;

  const char* last = data + (dataLen - tagLen);  // last viable start
  for (const char* p = data; p <= last; p++) {
    if (strncasecmp(p, tag, tagLen) == 0) return p;
  }
  return nullptr;
}
|
|
|
|
// ----------------------------------------------------------
// Extract an attribute value from a region of XML.
// Scans for attr="value" (or attr='value') and copies value to outBuf.
//
//   data / dataLen:      region to scan (need not be NUL-terminated)
//   attrName:            attribute name to look for
//   outBuf / outBufSize: receive the value, truncated to fit and
//                        always NUL-terminated
//
// The name must start the region or follow whitespace, so a search
// for "id" does not match inside "data-id=" or "xml:id=". The name
// must be followed directly by '=' and a quote character.
//
// Returns true if a value was copied, false otherwise (including
// null arguments or a non-positive outBufSize).
// ----------------------------------------------------------
static bool _extractAttribute(const char* data, int dataLen,
                              const char* attrName, char* outBuf, int outBufSize) {
  if (!data || !attrName || !outBuf || outBufSize <= 0) return false;

  const int nameLen = (int)strlen(attrName);
  // Smallest possible match is name + '=' + two quotes.
  if (nameLen == 0 || dataLen < nameLen + 3) return false;

  const char* end = data + dataLen;
  const char* lastStart = end - (nameLen + 3);  // last viable start

  for (const char* p = data; p <= lastStart; p++) {
    if (memcmp(p, attrName, nameLen) != 0 || p[nameLen] != '=') continue;

    // Require an attribute-name boundary before the match.
    if (p > data) {
      const char prev = p[-1];
      if (prev != ' ' && prev != '\t' && prev != '\n' && prev != '\r') continue;
    }

    const char quote = p[nameLen + 1];
    if (quote != '"' && quote != '\'') continue;

    const char* val = p + nameLen + 2;
    const char* valEnd = (const char*)memchr(val, quote, end - val);
    if (!valEnd) continue;

    int valLen = (int)(valEnd - val);
    if (valLen >= outBufSize) valLen = outBufSize - 1;
    memcpy(outBuf, val, valLen);
    outBuf[valLen] = '\0';
    return true;
  }
  return false;
}
|
|
|
|
// ----------------------------------------------------------
|
|
// Extract an attribute value from within a single tag string.
|
|
// (More targeted version for parsing <item id="x" href="y"/>)
|
|
// ----------------------------------------------------------
|
|
static bool _extractAttributeFromTag(const char* tag, int tagLen,
|
|
const char* attrName,
|
|
char* outBuf, int outBufSize) {
|
|
return _extractAttribute(tag, tagLen, attrName, outBuf, outBufSize);
|
|
}
|
|
|
|
// ----------------------------------------------------------
|
|
// Extract text content between <tagName>...</tagName>.
|
|
// Works for simple cases like <dc:title>The Iliad</dc:title>.
|
|
// ----------------------------------------------------------
|
|
static bool _extractTagContent(const char* data, int dataLen,
|
|
const char* tagName, char* outBuf, int outBufSize) {
|
|
// Build open tag pattern: "<dc:title" (without >)
|
|
char openTag[64];
|
|
snprintf(openTag, sizeof(openTag), "<%s", tagName);
|
|
|
|
const char* start = _findTag(data, dataLen, openTag);
|
|
if (!start) return false;
|
|
|
|
// Find the > that closes the opening tag
|
|
const char* end = data + dataLen;
|
|
const char* contentStart = (const char*)memchr(start, '>', end - start);
|
|
if (!contentStart) return false;
|
|
contentStart++; // Skip past '>'
|
|
|
|
// Find closing tag
|
|
char closeTag[64];
|
|
snprintf(closeTag, sizeof(closeTag), "</%s>", tagName);
|
|
const char* contentEnd = _findTag(contentStart, end - contentStart, closeTag);
|
|
if (!contentEnd) return false;
|
|
|
|
int len = contentEnd - contentStart;
|
|
if (len >= outBufSize) len = outBufSize - 1;
|
|
memcpy(outBuf, contentStart, len);
|
|
outBuf[len] = '\0';
|
|
return true;
|
|
}
|
|
|
|
// ----------------------------------------------------------
// Copy the directory prefix of a path, trailing '/' included.
//   "OEBPS/content.opf" -> "OEBPS/"
//   "content.opf"       -> ""
//
// The result is truncated to dirBufSize - 1 bytes and NUL-terminated.
// ----------------------------------------------------------
static void _getDirectory(const char* path, char* dirBuf, int dirBufSize) {
  const char* slash = strrchr(path, '/');
  if (!slash) {
    // No separator: the path has no directory part.
    dirBuf[0] = '\0';
    return;
  }

  int keep = (int)(slash - path) + 1;  // Include trailing /
  const int room = dirBufSize - 1;
  if (keep > room) {
    keep = room;
  }
  memcpy(dirBuf, path, keep);
  dirBuf[keep] = '\0';
}
|
|
|
|
// ----------------------------------------------------------
// Release a chapter path table produced by _parseOpf(): each of the
// first `count` strings, then the table itself. A null table is ignored.
// ----------------------------------------------------------
static void _freeChapterPaths(char** paths, int count) {
  if (!paths) {
    return;
  }

  int i = 0;
  while (i < count) {
    free(paths[i]);  // free() ignores null entries
    ++i;
  }
  free(paths);
}
|
|
}; |