// Source listing header (captured from a file viewer): 888 lines, 32 KiB, C++

#pragma once
// =============================================================================
// EpubProcessor.h - Convert EPUB files to plain text for TextReaderScreen
//
// Pipeline: EPUB (ZIP) → container.xml → OPF spine → extract chapters →
// strip XHTML tags → concatenated plain text → cached .txt on SD
//
// The resulting .txt file is placed in /books/ and picked up automatically
// by TextReaderScreen's existing pagination, indexing, and bookmarking.
//
// Dependencies: EpubZipReader.h (for ZIP extraction)
// =============================================================================
#include <SD.h>
#include <FS.h>
#include "EpubZipReader.h"
#include "Utf8CP437.h"
// Maximum chapters in spine (most novels have 20-80)
#define EPUB_MAX_CHAPTERS 200
// Maximum manifest items we track
#define EPUB_MAX_MANIFEST 256
// Buffer size (in bytes) for path strings: the OPF path inside the ZIP and
// the content base directory derived from it. Longer paths are truncated.
#define EPUB_XML_BUF_SIZE 64
class EpubProcessor {
public:
// ----------------------------------------------------------
// Process an EPUB file: extract text and write to SD cache.
//
// epubPath: source, e.g. "/books/The Iliad.epub"
// txtPath: output, e.g. "/books/The Iliad by Homer.txt"
//
// Returns true if the .txt file was written successfully.
// If txtPath already exists, returns true immediately (cached).
// ----------------------------------------------------------
static bool processToText(const char* epubPath, const char* txtPath) {
// Check if already cached
if (SD.exists(txtPath)) {
Serial.printf("EpubProc: '%s' already cached\n", txtPath);
return true;
}
Serial.printf("EpubProc: Processing '%s'\n", epubPath);
unsigned long t0 = millis();
// Open the EPUB (ZIP archive)
File epubFile = SD.open(epubPath, FILE_READ);
if (!epubFile) {
Serial.println("EpubProc: Cannot open EPUB file");
return false;
}
// Heap-allocate zip reader (entries table is ~19KB)
EpubZipReader* zip = new EpubZipReader();
if (!zip) {
epubFile.close();
Serial.println("EpubProc: Cannot allocate ZipReader");
return false;
}
if (!zip->open(epubFile)) {
delete zip;
epubFile.close();
Serial.println("EpubProc: Cannot parse ZIP structure");
return false;
}
// Step 1: Find OPF path from container.xml
char opfPath[EPUB_XML_BUF_SIZE];
opfPath[0] = '\0';
if (!_findOpfPath(zip, opfPath, sizeof(opfPath))) {
delete zip;
epubFile.close();
Serial.println("EpubProc: Cannot find OPF path");
return false;
}
Serial.printf("EpubProc: OPF at '%s'\n", opfPath);
// Determine the content base directory (e.g., "OEBPS/")
char baseDir[EPUB_XML_BUF_SIZE];
_getDirectory(opfPath, baseDir, sizeof(baseDir));
// Step 2: Parse OPF to get title and spine chapter order
char title[128];
title[0] = '\0';
// Chapter paths in spine order
char** chapterPaths = nullptr;
int chapterCount = 0;
if (!_parseOpf(zip, opfPath, baseDir, title, sizeof(title),
&chapterPaths, &chapterCount)) {
delete zip;
epubFile.close();
Serial.println("EpubProc: Cannot parse OPF");
return false;
}
Serial.printf("EpubProc: Title='%s', %d chapters\n", title, chapterCount);
// Step 3: Extract each chapter, strip XHTML, write to output .txt
File outFile = SD.open(txtPath, FILE_WRITE);
if (!outFile) {
_freeChapterPaths(chapterPaths, chapterCount);
delete zip;
epubFile.close();
Serial.printf("EpubProc: Cannot create '%s'\n", txtPath);
return false;
}
// Write title as first line
if (title[0]) {
outFile.println(title);
outFile.println();
}
int chaptersWritten = 0;
uint32_t totalBytes = 0;
for (int i = 0; i < chapterCount; i++) {
int entryIdx = zip->findEntry(chapterPaths[i]);
if (entryIdx < 0) {
Serial.printf("EpubProc: Chapter not found: '%s'\n", chapterPaths[i]);
continue;
}
uint32_t rawSize = 0;
uint8_t* rawData = zip->extractEntry(entryIdx, &rawSize);
if (!rawData || rawSize == 0) {
Serial.printf("EpubProc: Failed to extract chapter %d\n", i);
if (rawData) free(rawData);
continue;
}
// Strip XHTML tags and write plain text
uint32_t textLen = 0;
uint8_t* plainText = _stripXhtml(rawData, rawSize, &textLen);
free(rawData);
if (plainText && textLen > 0) {
outFile.write(plainText, textLen);
// Add chapter separator
outFile.print("\n\n");
totalBytes += textLen + 2;
chaptersWritten++;
}
if (plainText) free(plainText);
}
outFile.flush();
outFile.close();
// Release SD CS for other SPI users
digitalWrite(SDCARD_CS, HIGH);
_freeChapterPaths(chapterPaths, chapterCount);
delete zip;
epubFile.close();
unsigned long elapsed = millis() - t0;
Serial.printf("EpubProc: Done! %d chapters, %u bytes in %lu ms -> '%s'\n",
chaptersWritten, totalBytes, elapsed, txtPath);
return chaptersWritten > 0;
}
// ----------------------------------------------------------
// Extract just the title from an EPUB (for display in file list).
// Returns false if it can't be determined.
// ----------------------------------------------------------
static bool getTitle(const char* epubPath, char* titleBuf, int titleBufSize) {
File epubFile = SD.open(epubPath, FILE_READ);
if (!epubFile) return false;
EpubZipReader* zip = new EpubZipReader();
if (!zip) { epubFile.close(); return false; }
if (!zip->open(epubFile)) {
delete zip; epubFile.close(); return false;
}
char opfPath[EPUB_XML_BUF_SIZE];
if (!_findOpfPath(zip, opfPath, sizeof(opfPath))) {
delete zip; epubFile.close(); return false;
}
// Extract OPF and find <dc:title>
int opfIdx = zip->findEntry(opfPath);
if (opfIdx < 0) { delete zip; epubFile.close(); return false; }
uint32_t opfSize = 0;
uint8_t* opfData = zip->extractEntry(opfIdx, &opfSize);
delete zip;
epubFile.close();
if (!opfData) return false;
bool found = _extractTagContent((const char*)opfData, opfSize,
"dc:title", titleBuf, titleBufSize);
free(opfData);
return found;
}
// ----------------------------------------------------------
// Derive the cache .txt path that belongs to an .epub path.
// e.g., "/books/mybook.epub" -> "/books/.epub_cache/mybook.txt"
//
// Side effect: creates the ".epub_cache" directory next to the EPUB
// when it does not exist yet. A path without any '/' is treated as
// living in "/books".
// ----------------------------------------------------------
static void buildCachePath(const char* epubPath, char* cachePath, int cachePathSize) {
  const char* slash = strrchr(epubPath, '/');

  // Parent directory of the EPUB.
  char parent[128];
  if (slash == nullptr) {
    strcpy(parent, "/books");
  } else {
    int parentLen = slash - epubPath;
    const int parentCap = (int)sizeof(parent) - 1;
    if (parentLen > parentCap) parentLen = parentCap;
    memcpy(parent, epubPath, parentLen);
    parent[parentLen] = '\0';
  }

  // Cache directory, created on demand.
  char cacheDir[160];
  snprintf(cacheDir, sizeof(cacheDir), "%s/.epub_cache", parent);
  if (!SD.exists(cacheDir)) {
    SD.mkdir(cacheDir);
  }

  // File name with everything from the last '.' onwards removed.
  char stem[128];
  snprintf(stem, sizeof(stem), "%s", slash ? slash + 1 : epubPath);
  char* ext = strrchr(stem, '.');
  if (ext != nullptr) {
    *ext = '\0';
  }

  snprintf(cachePath, cachePathSize, "%s/%s.txt", cacheDir, stem);
}
private:
// ----------------------------------------------------------
// Locate the OPF (package) file inside the archive.
//
// Primary source is the full-path attribute in META-INF/container.xml.
// If container.xml is missing, cannot be extracted, or carries no usable
// full-path value, falls back to the first archive entry whose name ends
// in ".opf".
//
// opfPath receives the NUL-terminated path (truncated to opfPathSize - 1).
// Returns true if a path was found.
// ----------------------------------------------------------
static bool _findOpfPath(EpubZipReader* zip, char* opfPath, int opfPathSize) {
  if (!zip || !opfPath || opfPathSize <= 0) return false;
  opfPath[0] = '\0';

  const int containerIdx = zip->findEntry("META-INF/container.xml");
  if (containerIdx >= 0) {
    uint32_t size = 0;
    uint8_t* data = zip->extractEntry(containerIdx, &size);
    if (data) {
      // Find: full-path="OEBPS/content.opf"
      const bool found = _extractAttribute((const char*)data, size,
                                           "full-path", opfPath, opfPathSize);
      free(data);
      if (found && opfPath[0] != '\0') return true;
    }
  }

  // Fallback: find any .opf file directly
  const int opfIdx = zip->findEntryBySuffix(".opf");
  if (opfIdx < 0) return false;
  const ZipEntry* e = zip->getEntry(opfIdx);
  if (!e) return false;
  strncpy(opfPath, e->filename, opfPathSize - 1);
  opfPath[opfPathSize - 1] = '\0';
  return true;
}
// ----------------------------------------------------------
// Parse OPF to extract title, build manifest, and resolve spine.
//
// zip / opfPath: archive and the OPF entry to read from it
// baseDir:       prefix prepended to every manifest href (e.g. "OEBPS/")
// title:         receives <dc:title> content if present (left untouched otherwise)
//
// On success (*outChapterCount > 0) *outChapterPaths is a heap-allocated
// array of heap-allocated strings holding the full ZIP path of each
// chapter in spine order; the caller must free it with _freeChapterPaths().
//
// On failure the function returns false, frees everything it allocated and
// leaves *outChapterPaths == nullptr and *outChapterCount == 0, so callers
// have nothing to release.
//
// NOTE(review): hrefs are joined to baseDir verbatim; percent-encoded or
// "../"-relative hrefs are not normalised here.
// ----------------------------------------------------------
static bool _parseOpf(EpubZipReader* zip, const char* opfPath,
                      const char* baseDir, char* title, int titleSize,
                      char*** outChapterPaths, int* outChapterCount) {
  *outChapterPaths = nullptr;
  *outChapterCount = 0;

  int opfIdx = zip->findEntry(opfPath);
  if (opfIdx < 0) return false;
  uint32_t opfSize = 0;
  uint8_t* opfData = zip->extractEntry(opfIdx, &opfSize);
  if (!opfData) return false;
  const char* xml = (const char*)opfData;

  // Extract title (optional; a missing title is not an error)
  _extractTagContent(xml, opfSize, "dc:title", title, titleSize);

  // Manifest: one record per <item>, mapping id -> href.
  struct ManifestItem {
    char id[64];
    char href[128];
    bool isContent;  // has media-type containing "html" or "xml"
  };

  // Heap-allocate manifest (could be large); prefer PSRAM, fall back to heap.
  ManifestItem* manifest = (ManifestItem*)ps_malloc(
      EPUB_MAX_MANIFEST * sizeof(ManifestItem));
  if (!manifest) {
    manifest = (ManifestItem*)malloc(EPUB_MAX_MANIFEST * sizeof(ManifestItem));
  }
  if (!manifest) {
    free(opfData);
    return false;
  }
  int manifestCount = 0;

  // Parse <item> elements from <manifest>
  const char* manifestStart = _findTag(xml, opfSize, "<manifest");
  const char* manifestEnd = manifestStart ?
      _findTag(manifestStart, opfSize - (manifestStart - xml), "</manifest") : nullptr;
  if (!manifestEnd) manifestEnd = xml + opfSize;
  if (manifestStart) {
    const char* pos = manifestStart;
    while (pos < manifestEnd && manifestCount < EPUB_MAX_MANIFEST) {
      pos = _findTag(pos, manifestEnd - pos, "<item");
      if (!pos || pos >= manifestEnd) break;
      // Find the closing > of this <item ... />
      const char* tagEnd = (const char*)memchr(pos, '>', manifestEnd - pos);
      if (!tagEnd) break;
      tagEnd++;

      ManifestItem& item = manifest[manifestCount];
      item.id[0] = '\0';
      item.href[0] = '\0';
      item.isContent = false;
      _extractAttributeFromTag(pos, tagEnd - pos, "id",
                               item.id, sizeof(item.id));
      _extractAttributeFromTag(pos, tagEnd - pos, "href",
                               item.href, sizeof(item.href));

      // Check media-type for content files
      char mediaType[64];
      mediaType[0] = '\0';
      _extractAttributeFromTag(pos, tagEnd - pos, "media-type",
                               mediaType, sizeof(mediaType));
      item.isContent = (strstr(mediaType, "html") != nullptr ||
                        strstr(mediaType, "xml") != nullptr);

      // Only keep the slot when both id and href were found; otherwise the
      // next <item> overwrites it.
      if (item.id[0] && item.href[0]) {
        manifestCount++;
      }
      pos = tagEnd;
    }
  }
  Serial.printf("EpubProc: Manifest has %d items\n", manifestCount);

  // Parse <spine> to get reading order
  // Spine contains <itemref idref="..."/> elements
  const char* spineStart = _findTag(xml, opfSize, "<spine");
  const char* spineEnd = spineStart ?
      _findTag(spineStart, opfSize - (spineStart - xml), "</spine") : nullptr;
  if (!spineEnd) spineEnd = xml + opfSize;

  // Chapter path table, filled in spine order
  char** chapterPaths = (char**)ps_malloc(EPUB_MAX_CHAPTERS * sizeof(char*));
  if (!chapterPaths) chapterPaths = (char**)malloc(EPUB_MAX_CHAPTERS * sizeof(char*));
  if (!chapterPaths) {
    free(manifest);
    free(opfData);
    return false;
  }
  int chapterCount = 0;

  if (spineStart) {
    const char* pos = spineStart;
    while (pos < spineEnd && chapterCount < EPUB_MAX_CHAPTERS) {
      pos = _findTag(pos, spineEnd - pos, "<itemref");
      if (!pos || pos >= spineEnd) break;
      const char* tagEnd = (const char*)memchr(pos, '>', spineEnd - pos);
      if (!tagEnd) break;
      tagEnd++;

      char idref[64];
      idref[0] = '\0';
      _extractAttributeFromTag(pos, tagEnd - pos, "idref",
                               idref, sizeof(idref));
      if (idref[0]) {
        // Look up in manifest
        for (int m = 0; m < manifestCount; m++) {
          if (strcmp(manifest[m].id, idref) == 0 && manifest[m].isContent) {
            // Build full path: baseDir + href
            int pathLen = strlen(baseDir) + strlen(manifest[m].href) + 1;
            char* fullPath = (char*)malloc(pathLen);
            if (fullPath) {
              snprintf(fullPath, pathLen, "%s%s", baseDir, manifest[m].href);
              chapterPaths[chapterCount++] = fullPath;
            }
            break;
          }
        }
      }
      pos = tagEnd;
    }
  }

  free(manifest);
  free(opfData);

  if (chapterCount == 0) {
    // Nothing resolved: release the table here instead of handing an empty
    // allocation to a caller that treats "false" as "nothing to free".
    free(chapterPaths);
    return false;
  }

  *outChapterPaths = chapterPaths;
  *outChapterCount = chapterCount;
  return true;
}
// ----------------------------------------------------------
// Strip XHTML/HTML tags from raw content, producing plain text.
//
// Handles:
// - Tag removal (everything between < and >)
// - Block-level tags (<p>, <br>, <div>, <h1>-<h6>, <li>, <tr>,
//   <blockquote>, <hr>) → newlines
// - HTML entity decoding via _decodeEntity()
// - UTF-8 input: typographic characters → ASCII, characters with a CP437
//   mapping re-emitted as UTF-8, everything else dropped
// - Collapse multiple whitespace/newlines
// - Skip <head>, <style>, <script> content entirely (a self-closing
//   <script/> or <style/> does not open a skip region)
// - Everything before the end of the <body ...> tag is ignored when a
//   body tag is present
//
// Bytes produced by _decodeEntity() are written through unchanged and are
// never interpreted as UTF-8: for accented entities they are CP437 codes,
// which the UTF-8 decoder would otherwise discard as stray bytes.
// NOTE(review): the output can therefore mix UTF-8 sequences with raw CP437
// bytes — confirm the text reader accepts both.
//
// input/inputLen: raw chapter bytes
// outLen:         receives the number of text bytes produced
//
// Returns a heap-allocated, NUL-terminated buffer (caller must free), or
// nullptr if input/outLen is null or allocation fails.
// ----------------------------------------------------------
static uint8_t* _stripXhtml(const uint8_t* input, uint32_t inputLen,
                            uint32_t* outLen) {
  if (!outLen) return nullptr;
  *outLen = 0;
  if (!input) return nullptr;

  // Output can't be larger than input: every construct handled below emits
  // at most as many bytes as it consumes.
  uint8_t* output = (uint8_t*)ps_malloc(inputLen + 1);
  if (!output) output = (uint8_t*)malloc(inputLen + 1);
  if (!output) return nullptr;

  static const char* const kSkipTags[] = { "head", "style", "script" };
  static const char* const kBlockTags[] = {
    "p", "div", "br", "h1", "h2", "h3", "h4", "h5", "h6",
    "li", "tr", "blockquote", "hr"
  };

  uint32_t outPos = 0;
  bool inTag = false;
  bool skipContent = false;   // Inside <head>, <style>, <script>
  char tagName[32];
  tagName[0] = '\0';
  int tagNamePos = 0;
  bool tagNameDone = false;
  bool isClosingTag = false;
  char prevTagChar = '\0';    // Last character seen inside the current tag
  bool lastWasNewline = false;
  bool lastWasSpace = false;

  // Skip to <body> if present (ignore everything before it)
  const uint8_t* start = input;
  const uint8_t* end = input + inputLen;
  const char* bodyStart = _findTagCI((const char*)input, inputLen, "<body");
  if (bodyStart) {
    const char* bodyTagEnd = (const char*)memchr(bodyStart, '>',
                                                 end - (const uint8_t*)bodyStart);
    if (bodyTagEnd) {
      start = (const uint8_t*)(bodyTagEnd + 1);
    }
  }

  for (const uint8_t* p = start; p < end; p++) {
    char c = (char)*p;

    if (inTag) {
      // Collecting tag name (lower-cased, truncated to fit tagName)
      if (!tagNameDone) {
        if (tagNamePos == 0 && c == '/') {
          isClosingTag = true;
          continue;
        }
        if (c == '>' || c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '/') {
          tagName[tagNamePos] = '\0';
          tagNameDone = true;
        } else if (tagNamePos < (int)sizeof(tagName) - 1) {
          tagName[tagNamePos++] = (c >= 'A' && c <= 'Z') ? (c + 32) : c;
        }
      }
      if (c != '>') {
        prevTagChar = c;
        continue;
      }

      // End of tag
      inTag = false;

      // Handle skip regions
      bool isSkipTag = false;
      for (const char* name : kSkipTags) {
        if (strcmp(tagName, name) == 0) { isSkipTag = true; break; }
      }
      if (isSkipTag) {
        if (isClosingTag) {
          skipContent = false;
        } else if (prevTagChar != '/') {
          // "<script .../>" has no closing tag; entering skip mode for it
          // would discard the rest of the chapter.
          skipContent = true;
        }
      }

      if (!skipContent) {
        // Block-level elements produce newlines
        bool isBlockTag = false;
        for (const char* name : kBlockTags) {
          if (strcmp(tagName, name) == 0) { isBlockTag = true; break; }
        }
        if (isBlockTag && outPos > 0 && !lastWasNewline) {
          output[outPos++] = '\n';
          lastWasNewline = true;
          lastWasSpace = false;
        }
      }
      continue;
    }

    // Not in a tag
    if (c == '<') {
      inTag = true;
      tagNamePos = 0;
      tagNameDone = false;
      isClosingTag = false;
      prevTagChar = '\0';
      continue;
    }
    if (skipContent) continue;

    // Handle HTML entities
    bool fromEntity = false;
    if (c == '&') {
      char decoded = _decodeEntity(p, end, &p);
      if (decoded) {
        c = decoded;
        fromEntity = true;
        // p now points to the ';' or last char of entity; loop will increment
      }
    }

    // Handle UTF-8 multi-byte sequences (smart quotes, em dashes, accented chars, etc.)
    // These appear as raw bytes in XHTML. Typographic chars are mapped to ASCII;
    // accented Latin chars are preserved as UTF-8 for CP437 rendering on e-ink.
    // Entity results are excluded: they are already final output bytes.
    if (!fromEntity && (uint8_t)c >= 0xC0) {
      uint32_t codepoint = 0;
      int extraBytes = 0;
      if (((uint8_t)c & 0xE0) == 0xC0) {
        // 2-byte sequence: 110xxxxx 10xxxxxx
        codepoint = (uint8_t)c & 0x1F;
        extraBytes = 1;
      } else if (((uint8_t)c & 0xF0) == 0xE0) {
        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
        codepoint = (uint8_t)c & 0x0F;
        extraBytes = 2;
      } else if (((uint8_t)c & 0xF8) == 0xF0) {
        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        codepoint = (uint8_t)c & 0x07;
        extraBytes = 3;
      }

      // Read continuation bytes. A sequence cut off by the end of the input
      // is malformed; accepting it would also push p beyond 'end'.
      bool valid = extraBytes > 0 && (end - p) > extraBytes;
      for (int b = 1; valid && b <= extraBytes; b++) {
        const uint8_t cb = p[b];
        if ((cb & 0xC0) != 0x80) {
          valid = false;
        } else {
          codepoint = (codepoint << 6) | (cb & 0x3F);
        }
      }
      if (!valid) continue;  // Skip malformed UTF-8 (lead byte only)

      p += extraBytes;  // Skip continuation bytes (loop increments past lead byte)

      // Map Unicode codepoints to displayable equivalents
      // Typographic chars → ASCII, accented chars → preserved as UTF-8
      char mapped = 0;
      switch (codepoint) {
        case 0x2018: case 0x2019: mapped = '\''; break;  // Smart single quotes
        case 0x201C: case 0x201D: mapped = '"';  break;  // Smart double quotes
        case 0x2013: case 0x2014: mapped = '-';  break;  // En/em dash
        case 0x2026: mapped = '.';  break;               // Ellipsis
        case 0x2022: mapped = '*';  break;               // Bullet
        case 0x00A0: mapped = ' ';  break;               // Non-breaking space
        case 0x00AB: case 0x00BB: mapped = '"';  break;  // Guillemets
        case 0x2032: mapped = '\''; break;               // Prime
        case 0x2033: mapped = '"';  break;               // Double prime
        case 0x2010: case 0x2011: mapped = '-';  break;  // Hyphens
        case 0x2012: mapped = '-';  break;               // Figure dash
        case 0x2015: mapped = '-';  break;               // Horizontal bar
        case 0x2039: case 0x203A: mapped = '\''; break;  // Single guillemets
        default:
          if (codepoint >= 0x20 && codepoint < 0x7F) {
            mapped = (char)codepoint;  // Basic ASCII range
          } else if (unicodeToCP437(codepoint)) {
            // Accented character that the e-ink font can render via CP437.
            // Preserve as UTF-8 in the output; the text reader will decode
            // and map to CP437 at render time.
            if (codepoint <= 0x7FF) {
              output[outPos++] = (uint8_t)(0xC0 | (codepoint >> 6));
              output[outPos++] = (uint8_t)(0x80 | (codepoint & 0x3F));
            } else if (codepoint <= 0xFFFF) {
              output[outPos++] = (uint8_t)(0xE0 | (codepoint >> 12));
              output[outPos++] = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
              output[outPos++] = (uint8_t)(0x80 | (codepoint & 0x3F));
            }
            lastWasNewline = false;
            lastWasSpace = false;
            continue;  // Already wrote to output
          } else {
            continue;  // Skip unmappable characters
          }
          break;
      }
      c = mapped;
    } else if (!fromEntity && (uint8_t)c >= 0x80) {
      // Stray continuation byte (0x80-0xBF) — skip
      continue;
    }

    // Whitespace collapsing
    if (c == '\n' || c == '\r') {
      if (!lastWasNewline && outPos > 0) {
        output[outPos++] = '\n';
        lastWasNewline = true;
        lastWasSpace = false;
      }
      continue;
    }
    if (c == ' ' || c == '\t') {
      if (!lastWasSpace && !lastWasNewline && outPos > 0) {
        output[outPos++] = ' ';
        lastWasSpace = true;
      }
      continue;
    }

    // Regular character
    output[outPos++] = (uint8_t)c;
    lastWasNewline = false;
    lastWasSpace = false;
  }

  // Trim trailing whitespace
  while (outPos > 0 && (output[outPos - 1] == '\n' || output[outPos - 1] == ' ')) {
    outPos--;
  }
  output[outPos] = '\0';
  *outLen = outPos;
  return output;
}
// ----------------------------------------------------------
// Decode an HTML entity starting at '&'.
//
// p:      points at the '&'
// end:    one past the last readable input byte
// outPos: receives the last input byte consumed (the ';' for an entity,
//         p itself when the '&' is literal text); the caller's loop
//         increment then steps past it
//
// Returns:
// - the decoded byte for known named entities and numeric references
//   (ASCII, or a CP437 code for accented characters)
// - ' ' for a well-formed entity that has no known mapping
// - '&' (nothing else consumed) when the text after '&' is not a
//   well-formed entity: no ';' within reach, an empty name, or a
//   character that cannot occur in an entity name
// ----------------------------------------------------------
static char _decodeEntity(const uint8_t* p, const uint8_t* end,
                          const uint8_t** outPos) {
  struct NamedEntity {
    const char* name;
    uint8_t value;
  };
  static const NamedEntity kNamedEntities[] = {
    { "amp", '&' },  { "lt", '<' },     { "gt", '>' },
    { "quot", '"' }, { "apos", '\'' },  { "nbsp", ' ' },
    { "mdash", '-' }, { "ndash", '-' }, { "hellip", '.' },
    { "lsquo", '\'' }, { "rsquo", '\'' },
    { "ldquo", '"' },  { "rdquo", '"' },
    // Common accented character entities → CP437 bytes for built-in font
    { "eacute", 0x82 },  // é
    { "egrave", 0x8A },  // è
    { "ecirc",  0x88 },  // ê
    { "euml",   0x89 },  // ë
    { "agrave", 0x85 },  // à
    { "aacute", 0xA0 },  // á
    { "acirc",  0x83 },  // â
    { "auml",   0x84 },  // ä
    { "ccedil", 0x87 },  // ç
    { "iacute", 0xA1 },  // í
    { "icirc",  0x8C },  // î
    { "iuml",   0x8B },  // ï
    { "igrave", 0x8D },  // ì
    { "oacute", 0xA2 },  // ó
    { "ocirc",  0x93 },  // ô
    { "ouml",   0x94 },  // ö
    { "ograve", 0x95 },  // ò
    { "uacute", 0xA3 },  // ú
    { "ucirc",  0x96 },  // û
    { "uuml",   0x81 },  // ü
    { "ugrave", 0x97 },  // ù
    { "ntilde", 0xA4 },  // ñ
    { "Eacute", 0x90 },  // É
    { "Ccedil", 0x80 },  // Ç
    { "Ntilde", 0xA5 },  // Ñ
    { "Auml",   0x8E },  // Ä
    { "Ouml",   0x99 },  // Ö
    { "Uuml",   0x9A },  // Ü
    { "szlig",  0xE1 },  // ß
  };

  // Look for ';' within a reasonable range, never reading at or past 'end'.
  // Only characters that can appear in an entity name are stepped over, so
  // plain text such as "R&D is ok;" is not mistaken for an entity.
  const int kMaxScan = 10;
  const uint8_t* limit = (end - p > kMaxScan) ? p + kMaxScan : end;
  const uint8_t* semi = p + 1;
  while (semi < limit) {
    const uint8_t ch = *semi;
    const bool nameChar = (ch >= '0' && ch <= '9') ||
                          (ch >= 'a' && ch <= 'z') ||
                          (ch >= 'A' && ch <= 'Z') ||
                          ch == '#';
    if (!nameChar) break;
    semi++;
  }

  const int entityLen = (int)(semi - p) - 1;  // Length between & and ;
  if (semi >= end || *semi != ';' || entityLen <= 0) {
    *outPos = p;  // Not an entity, return '&' literal
    return '&';
  }

  const char* entity = (const char*)(p + 1);
  *outPos = semi;  // Consume through ';'

  // Named entities (case-sensitive: "Eacute" and "eacute" differ)
  for (const NamedEntity& e : kNamedEntities) {
    if ((int)strlen(e.name) == entityLen &&
        memcmp(entity, e.name, entityLen) == 0) {
      return (char)e.value;
    }
  }

  // Numeric entities: &#NNN; or &#xHH;
  if (entityLen >= 2 && entity[0] == '#') {
    int codepoint = 0;
    if (entity[1] == 'x' || entity[1] == 'X') {
      // Hex (characters that are not hex digits are ignored)
      for (int i = 2; i < entityLen; i++) {
        char ch = entity[i];
        if (ch >= '0' && ch <= '9') codepoint = codepoint * 16 + (ch - '0');
        else if (ch >= 'a' && ch <= 'f') codepoint = codepoint * 16 + (ch - 'a' + 10);
        else if (ch >= 'A' && ch <= 'F') codepoint = codepoint * 16 + (ch - 'A' + 10);
      }
    } else {
      // Decimal (characters that are not digits are ignored)
      for (int i = 1; i < entityLen; i++) {
        char ch = entity[i];
        if (ch >= '0' && ch <= '9') codepoint = codepoint * 10 + (ch - '0');
      }
    }

    // Map to displayable character (best effort)
    if (codepoint >= 32 && codepoint < 127) return (char)codepoint;
    if (codepoint == 160) return ' ';  // non-breaking space

    // Try CP437 mapping for accented characters.
    // The byte value will be passed through to the built-in font.
    uint8_t cp437 = unicodeToCP437(codepoint);
    if (cp437) return (char)cp437;

    // No displayable mapping: substitute a space
    return ' ';
  }

  // Unknown entity - output as space
  return ' ';
}
// ----------------------------------------------------------
// Case-sensitive search for a byte pattern such as "<manifest"
// inside the first dataLen bytes of data.
// Returns a pointer to the first occurrence (the '<' of the tag),
// or nullptr when the pattern does not occur.
// ----------------------------------------------------------
static const char* _findTag(const char* data, int dataLen, const char* tag) {
  const int patternLen = strlen(tag);
  // Highest offset at which the whole pattern still fits in the buffer.
  const int lastOffset = dataLen - patternLen;
  int offset = 0;
  while (offset <= lastOffset) {
    const char* candidate = data + offset;
    if (memcmp(candidate, tag, patternLen) == 0) {
      return candidate;
    }
    ++offset;
  }
  return nullptr;
}
// ----------------------------------------------------------
// Case-insensitive variant of the tag search (matches <body>,
// <BODY>, <Body>, ...). Looks only at the first dataLen bytes.
// Returns a pointer to the first match, or nullptr.
// ----------------------------------------------------------
static const char* _findTagCI(const char* data, int dataLen, const char* tag) {
  const int patternLen = strlen(tag);
  // Number of start offsets at which the whole pattern still fits.
  const int candidates = dataLen - patternLen + 1;
  for (int offset = 0; offset < candidates; ++offset) {
    if (strncasecmp(data + offset, tag, patternLen) == 0) {
      return data + offset;
    }
  }
  return nullptr;
}
// ----------------------------------------------------------
// Extract an attribute value from a region of XML.
//
// Scans the first dataLen bytes of data for attrName="value" or
// attrName='value' and copies the value (truncated to outBufSize - 1
// bytes, always NUL-terminated) to outBuf.
//
// - The name must start the region or follow whitespace, so looking up
//   "id" does not match inside "xml:id" or "data-id".
// - Whitespace around '=' is accepted, as XML allows.
//
// Returns true if a value was copied; outBuf is untouched otherwise.
// Returns false for null pointers, an empty name, or a non-positive
// dataLen / outBufSize.
// ----------------------------------------------------------
static bool _extractAttribute(const char* data, int dataLen,
                              const char* attrName, char* outBuf, int outBufSize) {
  if (!data || !attrName || !outBuf || dataLen <= 0 || outBufSize <= 0) return false;
  const int nameLen = (int)strlen(attrName);
  if (nameLen == 0) return false;

  const char* end = data + dataLen;
  auto isSpace = [](char ch) {
    return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
  };

  // The shortest possible match is name + '=' + two quote characters.
  for (const char* p = data; end - p >= nameLen + 3; ++p) {
    if (memcmp(p, attrName, nameLen) != 0) continue;
    if (p != data && !isSpace(p[-1])) continue;  // suffix of a longer name

    const char* q = p + nameLen;
    while (q < end && isSpace(*q)) ++q;
    if (q >= end || *q != '=') continue;
    ++q;
    while (q < end && isSpace(*q)) ++q;
    if (q >= end || (*q != '"' && *q != '\'')) continue;

    const char quote = *q++;
    const char* valEnd = (const char*)memchr(q, quote, end - q);
    if (!valEnd) continue;

    int valLen = (int)(valEnd - q);
    if (valLen >= outBufSize) valLen = outBufSize - 1;
    memcpy(outBuf, q, valLen);
    outBuf[valLen] = '\0';
    return true;
  }
  return false;
}
// ----------------------------------------------------------
// Extract an attribute value from within a single tag string.
// (More targeted version for parsing <item id="x" href="y"/>)
//
// tag/tagLen:        start and byte length of the tag text to search
// attrName:          attribute to look up
// outBuf/outBufSize: destination buffer and its capacity
//
// Currently a thin forwarder: all arguments are passed unchanged to
// _extractAttribute(), whose result is returned. The "targeting" comes
// from the caller restricting tag/tagLen to one tag.
// ----------------------------------------------------------
static bool _extractAttributeFromTag(const char* tag, int tagLen,
const char* attrName,
char* outBuf, int outBufSize) {
return _extractAttribute(tag, tagLen, attrName, outBuf, outBufSize);
}
// ----------------------------------------------------------
// Extract text content between <tagName ...> and </tagName>.
// Works for simple cases like <dc:title>The Iliad</dc:title>.
//
// Leading and trailing whitespace (spaces, tabs, CR, LF) is trimmed, so a
// pretty-printed element spread over several lines yields just its text.
// The result is truncated to outBufSize - 1 bytes and NUL-terminated.
// Entities inside the content are copied verbatim (not decoded).
//
// Returns true if both the opening and closing tag were found; outBuf is
// untouched otherwise. Returns false for null pointers or a non-positive
// dataLen / outBufSize.
// ----------------------------------------------------------
static bool _extractTagContent(const char* data, int dataLen,
                               const char* tagName, char* outBuf, int outBufSize) {
  if (!data || !tagName || !outBuf || dataLen <= 0 || outBufSize <= 0) return false;

  // Build open tag pattern: "<dc:title" (without >, attributes may follow)
  char openTag[64];
  snprintf(openTag, sizeof(openTag), "<%s", tagName);
  const char* start = _findTag(data, dataLen, openTag);
  if (!start) return false;

  // Find the > that closes the opening tag
  const char* end = data + dataLen;
  const char* contentStart = (const char*)memchr(start, '>', end - start);
  if (!contentStart) return false;
  contentStart++;  // Skip past '>'

  // Find closing tag
  char closeTag[64];
  snprintf(closeTag, sizeof(closeTag), "</%s>", tagName);
  const char* contentEnd = _findTag(contentStart, end - contentStart, closeTag);
  if (!contentEnd) return false;

  // Trim surrounding whitespace
  auto isSpace = [](char ch) {
    return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
  };
  while (contentStart < contentEnd && isSpace(*contentStart)) contentStart++;
  while (contentEnd > contentStart && isSpace(contentEnd[-1])) contentEnd--;

  int len = (int)(contentEnd - contentStart);
  if (len >= outBufSize) len = outBufSize - 1;
  memcpy(outBuf, contentStart, len);
  outBuf[len] = '\0';
  return true;
}
// ----------------------------------------------------------
// Copy the directory part of a path, trailing '/' included.
// "OEBPS/content.opf" -> "OEBPS/"
// "content.opf"       -> ""
// The result is cut to dirBufSize - 1 bytes and NUL-terminated.
// ----------------------------------------------------------
static void _getDirectory(const char* path, char* dirBuf, int dirBufSize) {
  const char* separator = strrchr(path, '/');
  if (separator == nullptr) {
    // No directory component at all.
    dirBuf[0] = '\0';
    return;
  }

  int prefixLen = (int)(separator - path) + 1;  // keep the slash itself
  if (prefixLen >= dirBufSize) {
    prefixLen = dirBufSize - 1;
  }
  memcpy(dirBuf, path, prefixLen);
  dirBuf[prefixLen] = '\0';
}
// ----------------------------------------------------------
// Release a chapter path table produced by _parseOpf(): each of
// the first `count` strings, then the table itself.
// A null table is ignored.
// ----------------------------------------------------------
static void _freeChapterPaths(char** paths, int count) {
  if (paths == nullptr) return;

  int remaining = count;
  char** slot = paths;
  while (remaining-- > 0) {
    free(*slot);  // free(nullptr) is a no-op, so empty slots are fine
    ++slot;
  }
  free(paths);
}
};