Epub and Epub3 converter now working

This commit is contained in:
pelgraine
2026-02-09 09:08:48 +11:00
parent 4c4a218b32
commit 69e73440db
4 changed files with 1522 additions and 33 deletions

View File

@@ -351,9 +351,7 @@ void setup() {
reader->bootIndex(*disp);
}
}
} else {
MESH_DEBUG_PRINTLN("setup() - SD card initialization failed!");
}
}
}
#endif

View File

@@ -4,6 +4,7 @@
#include <helpers/ui/DisplayDriver.h>
#include <SD.h>
#include <vector>
#include "EpubProcessor.h"
// Forward declarations
class UITask;
@@ -196,6 +197,38 @@ private:
// Draw directly to display outside the normal render cycle.
// Matches the style of the standalone text reader firmware splash.
// Three-line splash drawn straight to the display: a size-2 green title,
// followed by up to two size-1 lines (subtitle, then detail). Either of the
// smaller lines is omitted when its string is null or empty.
void drawSplash(const char* title, const char* subtitle, const char* detail) {
  if (!_display) return;

  const int kLeftX = 10;     // shared left margin for every line
  const int kLineStep = 8;   // vertical advance after the subtitle
  const bool showSubtitle = (subtitle != nullptr) && (subtitle[0] != '\0');
  const bool showDetail = (detail != nullptr) && (detail[0] != '\0');

  _display->startFrame();

  // Heading
  _display->setTextSize(2);
  _display->setColor(DisplayDriver::GREEN);
  _display->setCursor(kLeftX, 11);
  _display->print(title);

  // Body lines
  _display->setTextSize(1);
  _display->setColor(DisplayDriver::LIGHT);
  int rowY = 35;
  if (showSubtitle) {
    _display->setCursor(kLeftX, rowY);
    _display->print(subtitle);
    rowY += kLineStep;
  }
  if (showDetail) {
    // Takes the subtitle's row when no subtitle was drawn.
    _display->setCursor(kLeftX, rowY);
    _display->print(detail);
  }

  _display->endFrame();
}
// Word-wrapping splash for opening a large book.
// Shows: "Indexing / Pages..." (large), word-wrapped filename, "Please wait. / Loading shortly..."
void drawIndexingSplash(const String& filename) {
@@ -353,9 +386,14 @@ private:
idxFile.read(&fullyFlag, 1);
idxFile.read((uint8_t*)&lastRead, 4);
// Verify file hasn't changed
// Verify file hasn't changed - try BOOKS_FOLDER first, then epub cache
String fullPath = String(BOOKS_FOLDER) + "/" + filename;
File txtFile = SD.open(fullPath.c_str(), FILE_READ);
if (!txtFile) {
// Fallback: check epub cache directory
String cachePath = String("/books/.epub_cache/") + filename;
txtFile = SD.open(cachePath.c_str(), FILE_READ);
}
if (!txtFile) { idxFile.close(); return false; }
unsigned long curSize = txtFile.size();
txtFile.close();
@@ -457,7 +495,8 @@ private:
if (slash >= 0) name = name.substring(slash + 1);
if (!name.startsWith(".") &&
(name.endsWith(".txt") || name.endsWith(".TXT"))) {
(name.endsWith(".txt") || name.endsWith(".TXT") ||
name.endsWith(".epub") || name.endsWith(".EPUB"))) {
_fileList.push_back(name);
}
}
@@ -472,23 +511,86 @@ private:
void openBook(const String& filename) {
if (_fileOpen) closeBook();
// Find cached index
// ---- EPUB auto-conversion ----
String actualFilename = filename;
String actualFullPath = String(BOOKS_FOLDER) + "/" + filename;
bool isEpub = filename.endsWith(".epub") || filename.endsWith(".EPUB");
if (isEpub) {
// Build cache path for this EPUB
char cachePath[160];
EpubProcessor::buildCachePath(actualFullPath.c_str(), cachePath, sizeof(cachePath));
// Check if already converted
digitalWrite(SDCARD_CS, LOW);
bool cached = SD.exists(cachePath);
digitalWrite(SDCARD_CS, HIGH);
if (!cached) {
// Show conversion splash on e-ink
char shortName[28];
if (filename.length() > 24) {
strncpy(shortName, filename.c_str(), 21);
shortName[21] = '\0';
strcat(shortName, "...");
} else {
strncpy(shortName, filename.c_str(), sizeof(shortName) - 1);
shortName[sizeof(shortName) - 1] = '\0';
}
drawSplash("Converting EPUB...", "Please wait", shortName);
Serial.printf("TextReader: Converting EPUB '%s'\n", filename.c_str());
unsigned long t0 = millis();
digitalWrite(SDCARD_CS, LOW);
bool ok = EpubProcessor::processToText(actualFullPath.c_str(), cachePath);
digitalWrite(SDCARD_CS, HIGH);
if (!ok) {
Serial.println("TextReader: EPUB conversion failed!");
drawSplash("Convert failed!", "", shortName);
delay(2000);
return; // Stay in file list
}
Serial.printf("TextReader: EPUB converted in %lu ms\n", millis() - t0);
} else {
Serial.printf("TextReader: EPUB cache hit for '%s'\n", filename.c_str());
}
// Redirect to the cached .txt
actualFullPath = String(cachePath);
const char* lastSlash = strrchr(cachePath, '/');
actualFilename = String(lastSlash ? lastSlash + 1 : cachePath);
}
// ---- End EPUB auto-conversion ----
// Find cached index for this file
FileCache* cache = nullptr;
for (int i = 0; i < (int)_fileCache.size(); i++) {
if (_fileCache[i].filename == filename) {
if (_fileCache[i].filename == actualFilename) {
cache = &_fileCache[i];
break;
}
}
String fullPath = String(BOOKS_FOLDER) + "/" + filename;
_file = SD.open(fullPath.c_str(), FILE_READ);
_file = SD.open(actualFullPath.c_str(), FILE_READ);
// Fallback: try epub cache dir (for files discovered during boot scan)
if (!_file && !isEpub) {
String cacheFallback = String("/books/.epub_cache/") + actualFilename;
_file = SD.open(cacheFallback.c_str(), FILE_READ);
if (_file) {
actualFullPath = cacheFallback;
Serial.printf("TextReader: Opened from epub cache: %s\n", actualFilename.c_str());
}
}
if (!_file) {
Serial.printf("TextReader: Failed to open %s\n", filename.c_str());
Serial.printf("TextReader: Failed to open %s\n", actualFilename.c_str());
return;
}
_currentFile = filename;
_currentFile = actualFilename;
_fileOpen = true;
_currentPage = 0;
_pagePositions.clear();
@@ -501,55 +603,91 @@ private:
_currentPage = cache->lastReadPage;
}
// Already fully indexed - open immediately
// Already fully indexed open immediately
if (cache->fullyIndexed) {
_totalPages = _pagePositions.size();
_mode = READING;
loadPageContent();
Serial.printf("TextReader: Opened %s, %d pages, resume pg %d\n",
filename.c_str(), _totalPages, _currentPage + 1);
actualFilename.c_str(), _totalPages, _currentPage + 1);
return;
}
// Partially indexed - show splash and finish indexing
// Partially indexed finish indexing with splash
Serial.printf("TextReader: Finishing index for %s (have %d pages so far)\n",
filename.c_str(), (int)_pagePositions.size());
actualFilename.c_str(), (int)_pagePositions.size());
drawIndexingSplash(filename);
char shortName[28];
if (actualFilename.length() > 24) {
strncpy(shortName, actualFilename.c_str(), 21);
shortName[21] = '\0';
strcat(shortName, "...");
} else {
strncpy(shortName, actualFilename.c_str(), sizeof(shortName) - 1);
shortName[sizeof(shortName) - 1] = '\0';
}
drawSplash("Indexing...", "Please wait", shortName);
long lastPos = cache->pagePositions.back();
indexPagesWordWrap(_file, lastPos, _pagePositions,
_linesPerPage, _charsPerLine, 0);
if (_pagePositions.empty()) {
// Cache had no pages (e.g. dummy entry) — full index from scratch
_pagePositions.push_back(0);
indexPagesWordWrap(_file, 0, _pagePositions,
_linesPerPage, _charsPerLine, 0);
} else {
long lastPos = cache->pagePositions.back();
indexPagesWordWrap(_file, lastPos, _pagePositions,
_linesPerPage, _charsPerLine, 0);
}
} else {
// No cache at all - full index from scratch with splash
Serial.printf("TextReader: Full index for %s\n", filename.c_str());
// No cache full index from scratch
Serial.printf("TextReader: Full index for %s\n", actualFilename.c_str());
drawIndexingSplash(filename);
char shortName[28];
if (actualFilename.length() > 24) {
strncpy(shortName, actualFilename.c_str(), 21);
shortName[21] = '\0';
strcat(shortName, "...");
} else {
strncpy(shortName, actualFilename.c_str(), sizeof(shortName) - 1);
shortName[sizeof(shortName) - 1] = '\0';
}
drawSplash("Indexing...", "Please wait", shortName);
_pagePositions.push_back(0);
indexPagesWordWrap(_file, 0, _pagePositions,
_linesPerPage, _charsPerLine, 0);
}
// Save complete index
_totalPages = _pagePositions.size();
saveIndex(filename, _pagePositions, _file.size(), true, _currentPage);
// Update cache entry
// Update or create cache entry
bool foundCache = false;
for (int i = 0; i < (int)_fileCache.size(); i++) {
if (_fileCache[i].filename == filename) {
if (_fileCache[i].filename == actualFilename) {
_fileCache[i].pagePositions = _pagePositions;
_fileCache[i].fullyIndexed = true;
_fileCache[i].fileSize = _file.size();
foundCache = true;
break;
}
}
if (!foundCache) {
FileCache newCache;
newCache.filename = actualFilename;
newCache.fileSize = _file.size();
newCache.fullyIndexed = true;
newCache.lastReadPage = _currentPage;
newCache.pagePositions = _pagePositions;
_fileCache.push_back(newCache);
}
// Deselect SD to free SPI bus
digitalWrite(SDCARD_CS, HIGH);
saveIndex(actualFilename, _pagePositions, _file.size(), true, _currentPage);
_mode = READING;
loadPageContent();
Serial.printf("TextReader: Opened %s, %d pages, resume pg %d\n",
filename.c_str(), _totalPages, _currentPage + 1);
Serial.printf("TextReader: Opened %s, %d pages\n",
actualFilename.c_str(), _totalPages);
}
void closeBook() {
@@ -623,11 +761,11 @@ private:
if (_fileList.size() == 0) {
display.setCursor(0, 18);
display.setColor(DisplayDriver::LIGHT);
display.print("No .txt files found");
display.print("No files found");
display.setCursor(0, 30);
display.print("Add files to /books/");
display.print("Add .txt or .epub to");
display.setCursor(0, 42);
display.print("on SD card");
display.print("/books/ on SD card");
} else {
display.setTextSize(0); // Tiny font for file list
int listLineH = 8; // Approximate tiny font line height in virtual coords
@@ -820,9 +958,37 @@ public:
drawBootSplash(0, 0, "Scanning...");
Serial.println("TextReader: Boot indexing started");
// Scan for files
// Scan for files (includes .txt and .epub)
scanFiles();
// Also pick up previously converted EPUB cache files
if (SD.exists("/books/.epub_cache")) {
File cacheDir = SD.open("/books/.epub_cache");
if (cacheDir && cacheDir.isDirectory()) {
File f = cacheDir.openNextFile();
while (f && _fileList.size() < READER_MAX_FILES) {
if (!f.isDirectory()) {
String name = String(f.name());
int slash = name.lastIndexOf('/');
if (slash >= 0) name = name.substring(slash + 1);
if (name.endsWith(".txt") || name.endsWith(".TXT")) {
// Avoid duplicates
bool dup = false;
for (int i = 0; i < (int)_fileList.size(); i++) {
if (_fileList[i] == name) { dup = true; break; }
}
if (!dup) {
_fileList.push_back(name);
Serial.printf("TextReader: Found cached EPUB txt: %s\n", name.c_str());
}
}
}
f = cacheDir.openNextFile();
}
cacheDir.close();
}
}
if (_fileList.size() == 0) {
Serial.println("TextReader: No files to index");
_bootIndexed = true;
@@ -860,11 +1026,22 @@ public:
// Skip files that loaded from cache
if (_fileCache[i].filename.length() > 0) continue;
// Skip .epub files — they'll be converted on first open via openBook()
if (_fileList[i].endsWith(".epub") || _fileList[i].endsWith(".EPUB")) {
needsIndexCount--; // Don't count epubs in progress display
continue;
}
indexProgress++;
drawBootSplash(indexProgress, needsIndexCount, _fileList[i]);
// Try BOOKS_FOLDER first, then epub cache fallback
String fullPath = String(BOOKS_FOLDER) + "/" + _fileList[i];
File file = SD.open(fullPath.c_str(), FILE_READ);
if (!file) {
String cacheFallback = String("/books/.epub_cache/") + _fileList[i];
file = SD.open(cacheFallback.c_str(), FILE_READ);
}
if (!file) continue;
FileCache& cache = _fileCache[i];

View File

@@ -0,0 +1,538 @@
#pragma once
// =============================================================================
// EpubZipReader.h - Minimal ZIP reader for EPUB files on ESP32-S3
//
// Parses ZIP archives directly from SD card File objects.
// Uses the ESP32 ROM's built-in tinfl decompressor for DEFLATE.
// No external library dependencies.
//
// Supports:
// - STORED (method 0) entries - direct copy
// - DEFLATED (method 8) entries - ROM tinfl decompression
// - ZIP64 is NOT supported (EPUBs don't need it)
//
// Memory: Allocates decompression buffers from PSRAM when available.
// Typical EPUB chapter is 5-50KB, well within ESP32-S3's 8MB PSRAM.
// =============================================================================
#include <SD.h>
#include <FS.h>
// ROM tinfl decompressor - built into ESP32/ESP32-S3 ROM
// If this include fails on your platform, see the fallback note at bottom
#if __has_include(<rom/miniz.h>)
#include <rom/miniz.h>
#define HAS_ROM_TINFL 1
#elif __has_include(<esp32s3/rom/miniz.h>)
#include <esp32s3/rom/miniz.h>
#define HAS_ROM_TINFL 1
#elif __has_include(<esp32/rom/miniz.h>)
#include <esp32/rom/miniz.h>
#define HAS_ROM_TINFL 1
#else
#warning "ROM miniz not found - DEFLATED entries will not be supported"
#define HAS_ROM_TINFL 0
#endif
// ---- ZIP format constants ----
#define ZIP_LOCAL_FILE_HEADER_SIG 0x04034b50
#define ZIP_CENTRAL_DIR_SIG 0x02014b50
#define ZIP_END_OF_CENTRAL_DIR_SIG 0x06054b50
#define ZIP_METHOD_STORED 0
#define ZIP_METHOD_DEFLATED 8
// Maximum files we track in a ZIP (EPUBs typically have 20-100 files)
#define ZIP_MAX_ENTRIES 128
// Maximum filename length within the ZIP
#define ZIP_MAX_FILENAME 128
// ---- Data structures ----
// One file record as parsed from a ZIP central directory header by
// EpubZipReader::open(). Directory entries (names ending in '/') are not kept.
struct ZipEntry {
// NUL-terminated entry name; names longer than ZIP_MAX_FILENAME - 1 bytes
// are truncated when copied in.
char filename[ZIP_MAX_FILENAME];
uint16_t compressionMethod; // 0=STORED, 8=DEFLATED
// Size of the entry's data as stored in the archive, in bytes.
uint32_t compressedSize;
// Size of the entry's data after extraction, in bytes.
uint32_t uncompressedSize;
uint32_t localHeaderOffset; // Offset to local file header in ZIP
// CRC-32 value copied from the central directory; the code visible here
// stores it but does not verify extracted data against it.
uint32_t crc32;
};
// ---- Helper: read little-endian values from a byte buffer ----
// Decode an unsigned 16-bit little-endian value from the two bytes at p.
static inline uint16_t zipRead16(const uint8_t* p) {
  const uint16_t lowByte = p[0];
  const uint16_t highByte = p[1];
  return (uint16_t)(lowByte | (uint16_t)(highByte << 8));
}
// Decode an unsigned 32-bit little-endian value from the four bytes at p.
static inline uint32_t zipRead32(const uint8_t* p) {
  uint32_t value = 0;
  // Fold in bytes from most significant (p[3]) down to least (p[0]).
  for (int byteIdx = 3; byteIdx >= 0; --byteIdx) {
    value = (value << 8) | (uint32_t)p[byteIdx];
  }
  return value;
}
// =============================================================================
// EpubZipReader class
// =============================================================================
class EpubZipReader {
public:
EpubZipReader() : _entryCount(0), _isOpen(false), _entries(nullptr) {
// Allocate entries array from PSRAM to avoid stack overflow
// (128 entries × ~146 bytes = ~19KB — too large for 8KB loopTask stack)
#ifdef BOARD_HAS_PSRAM
_entries = (ZipEntry*)ps_malloc(ZIP_MAX_ENTRIES * sizeof(ZipEntry));
#endif
if (!_entries) {
_entries = (ZipEntry*)malloc(ZIP_MAX_ENTRIES * sizeof(ZipEntry));
}
if (!_entries) {
Serial.println("ZipReader: FATAL - failed to allocate entry table");
}
}
~EpubZipReader() {
if (_entries) {
free(_entries);
_entries = nullptr;
}
}
// ----------------------------------------------------------
// Open a ZIP file and parse its central directory.
//
// zipFile: an already-opened File positioned anywhere; the reader keeps its
//          own copy of the handle and does not close it.
//
// Returns true on success, false on error (entry table not allocated,
// invalid/too-small file, EOCD record not found in the last 1024 bytes,
// central directory missing, oversized, out of bounds, or unreadable).
// After open(), entries are available via getEntryCount()/getEntry().
//
// At most ZIP_MAX_ENTRIES file entries are kept; directory entries are
// skipped. Malformed entries whose variable-length fields would run past
// the central directory stop the parse instead of being read.
// ----------------------------------------------------------
bool open(File& zipFile) {
  // Fixed record sizes defined by the ZIP format.
  const uint32_t kEocdMinSize = 22;        // End Of Central Directory, no comment
  const uint32_t kCdHeaderSize = 46;       // central directory header, fixed part
  const uint32_t kEocdSearchWindow = 1024; // how much of the file tail we scan
  const uint32_t kMaxCdSize = 512 * 1024;  // sanity cap on central directory size

  _isOpen = false;
  _entryCount = 0;

  if (!_entries) {
    Serial.println("ZipReader: entry table not allocated");
    return false;
  }
  if (!zipFile || !zipFile.available()) {
    Serial.println("ZipReader: file not valid");
    return false;
  }

  _file = zipFile;
  const uint32_t fileSize = _file.size();
  if (fileSize < kEocdMinSize) {
    Serial.println("ZipReader: file too small for ZIP");
    return false;
  }

  // ---- Step 1: Find the End of Central Directory record ----
  // The EOCD sits at the end of the file, optionally followed by an archive
  // comment of up to 65535 bytes. EPUBs typically carry no comment, so only
  // the final kEocdSearchWindow bytes are scanned.
  const uint32_t searchLen = (fileSize > kEocdSearchWindow) ? kEocdSearchWindow : fileSize;
  const uint32_t searchStart = fileSize - searchLen;

  uint8_t* searchBuf = (uint8_t*)_allocBuffer(searchLen);
  if (!searchBuf) {
    Serial.println("ZipReader: failed to alloc search buffer");
    return false;
  }

  _file.seek(searchStart);
  if (_file.read(searchBuf, searchLen) != (int)searchLen) {
    free(searchBuf);
    Serial.println("ZipReader: failed to read EOCD area");
    return false;
  }

  // Scan backwards so the last signature in the file wins. Starting at
  // searchLen - 22 guarantees every field read below stays inside searchBuf.
  uint32_t eocdOffset = 0;
  bool foundEocd = false;
  for (int i = (int)searchLen - (int)kEocdMinSize; i >= 0; i--) {
    if (zipRead32(&searchBuf[i]) == ZIP_END_OF_CENTRAL_DIR_SIG) {
      eocdOffset = searchStart + i;
      _totalEntries = zipRead16(&searchBuf[i + 10]);
      _cdSize = zipRead32(&searchBuf[i + 12]);
      _cdOffset = zipRead32(&searchBuf[i + 16]);
      foundEocd = true;
      break;
    }
  }
  free(searchBuf);

  if (!foundEocd) {
    Serial.println("ZipReader: EOCD not found - not a valid ZIP");
    return false;
  }

  Serial.printf("ZipReader: EOCD found at %u, %u entries, CD at %u (%u bytes)\n",
                eocdOffset, _totalEntries, _cdOffset, _cdSize);

  // ---- Step 2: Parse Central Directory entries ----
  if (_cdSize == 0 || _cdSize > kMaxCdSize) {
    Serial.println("ZipReader: central directory size unreasonable");
    return false;
  }
  // Reject a directory that claims to extend past the end of the file
  // before allocating a buffer for it.
  if (_cdOffset > fileSize || _cdSize > fileSize - _cdOffset) {
    Serial.println("ZipReader: central directory lies outside the file");
    return false;
  }

  uint8_t* cdBuf = (uint8_t*)_allocBuffer(_cdSize);
  if (!cdBuf) {
    Serial.printf("ZipReader: failed to alloc %u bytes for central directory\n", _cdSize);
    return false;
  }

  _file.seek(_cdOffset);
  if (_file.read(cdBuf, _cdSize) != (int)_cdSize) {
    free(cdBuf);
    Serial.println("ZipReader: failed to read central directory");
    return false;
  }

  uint32_t pos = 0;
  _entryCount = 0;
  while (pos + kCdHeaderSize <= _cdSize && _entryCount < ZIP_MAX_ENTRIES) {
    if (zipRead32(&cdBuf[pos]) != ZIP_CENTRAL_DIR_SIG) {
      break;  // No more central directory entries
    }

    const uint16_t method = zipRead16(&cdBuf[pos + 10]);
    const uint32_t crc = zipRead32(&cdBuf[pos + 16]);
    const uint32_t compSize = zipRead32(&cdBuf[pos + 20]);
    const uint32_t uncompSize = zipRead32(&cdBuf[pos + 24]);
    const uint16_t fnLen = zipRead16(&cdBuf[pos + 28]);
    const uint16_t extraLen = zipRead16(&cdBuf[pos + 30]);
    const uint16_t commentLen = zipRead16(&cdBuf[pos + 32]);
    const uint32_t localOffset = zipRead32(&cdBuf[pos + 42]);

    // The filename follows the fixed header. A corrupt fnLen must not make
    // the copy below read beyond the end of cdBuf.
    if (pos + kCdHeaderSize + fnLen > _cdSize) {
      Serial.println("ZipReader: truncated central directory entry - stopping parse");
      break;
    }

    // Copy filename (truncate if necessary)
    ZipEntry& slot = _entries[_entryCount];
    const int copyLen = (fnLen < ZIP_MAX_FILENAME - 1) ? fnLen : ZIP_MAX_FILENAME - 1;
    memcpy(slot.filename, &cdBuf[pos + kCdHeaderSize], copyLen);
    slot.filename[copyLen] = '\0';
    slot.compressionMethod = method;
    slot.compressedSize = compSize;
    slot.uncompressedSize = uncompSize;
    slot.localHeaderOffset = localOffset;
    slot.crc32 = crc;

    // Keep the slot only for real files; directories (names ending in '/')
    // and empty names are overwritten by the next entry.
    if (copyLen > 0 && slot.filename[copyLen - 1] != '/') {
      _entryCount++;
    }

    // Advance past this central directory entry
    pos += kCdHeaderSize + fnLen + extraLen + commentLen;
  }

  // Make silent truncation visible: the table filled up while more
  // central directory headers were still waiting.
  if (_entryCount >= ZIP_MAX_ENTRIES && pos + kCdHeaderSize <= _cdSize &&
      zipRead32(&cdBuf[pos]) == ZIP_CENTRAL_DIR_SIG) {
    Serial.printf("ZipReader: more than %d files - remaining entries ignored\n",
                  ZIP_MAX_ENTRIES);
  }

  free(cdBuf);

  Serial.printf("ZipReader: parsed %d file entries\n", _entryCount);
  _isOpen = true;
  return true;
}
// ----------------------------------------------------------
// Mark the reader as closed and forget all parsed entries.
// The underlying File is left untouched (not closed here).
// ----------------------------------------------------------
void close() {
  _entryCount = 0;
  _isOpen = false;
}
// ----------------------------------------------------------
// Entry accessors
// ----------------------------------------------------------
// Number of file entries currently held in the table.
int getEntryCount() const {
  return _entryCount;
}
// Entry at the given index, or nullptr when index is outside
// [0, getEntryCount()).
const ZipEntry* getEntry(int index) const {
  const bool inRange = (index >= 0) && (index < _entryCount);
  return inRange ? &_entries[index] : nullptr;
}
// ----------------------------------------------------------
// Look up an entry by its exact name (strcmp, so case-sensitive).
// Returns the index of the first match, or -1 when none matches.
// ----------------------------------------------------------
int findEntry(const char* filename) const {
  int idx = 0;
  while (idx < _entryCount) {
    const bool sameName = (strcmp(_entries[idx].filename, filename) == 0);
    if (sameName) {
      return idx;
    }
    ++idx;
  }
  return -1;
}
// ----------------------------------------------------------
// Look up the first entry whose name ends with suffix
// (e.g., ".opf", ".ncx"). The comparison uses strcasecmp, so it
// ignores case. Returns the index, or -1 when nothing matches.
// ----------------------------------------------------------
int findEntryBySuffix(const char* suffix) const {
  const int wantedLen = strlen(suffix);
  for (int idx = 0; idx < _entryCount; ++idx) {
    const char* name = _entries[idx].filename;
    const int nameLen = strlen(name);
    if (nameLen < wantedLen) {
      continue;  // too short to end with the suffix
    }
    const char* tail = name + (nameLen - wantedLen);
    if (strcasecmp(tail, suffix) == 0) {
      return idx;
    }
  }
  return -1;
}
// ----------------------------------------------------------
// Collect the indices of entries whose name starts with prefix
// (e.g., "OEBPS/"), in table order. Writes at most maxMatches
// indices into matchIndices[] and returns how many were written.
// ----------------------------------------------------------
int findEntriesByPrefix(const char* prefix, int* matchIndices, int maxMatches) const {
  const int prefixLen = strlen(prefix);
  int found = 0;
  for (int idx = 0; idx < _entryCount; ++idx) {
    if (found >= maxMatches) {
      break;  // output array is full
    }
    if (strncmp(_entries[idx].filename, prefix, prefixLen) != 0) {
      continue;
    }
    matchIndices[found] = idx;
    ++found;
  }
  return found;
}
// ----------------------------------------------------------
// Extract a file entry to a newly allocated buffer.
//
// index:   entry index in [0, getEntryCount()).
// outSize: receives the extracted size in bytes; must not be null.
//          It is set to 0 up front, so it holds a defined value
//          even when extraction fails.
//
// On success, returns a malloc'd buffer (caller must free!).
// On failure (reader not open, bad index, null outSize, unreadable or
// invalid local header, unsupported compression method, or a failure in
// the STORED/DEFLATED helper), returns nullptr.
//
// The buffer is allocated from PSRAM if available.
// ----------------------------------------------------------
uint8_t* extractEntry(int index, uint32_t* outSize) {
  if (!outSize) {
    return nullptr;
  }
  *outSize = 0;

  if (!_isOpen || index < 0 || index >= _entryCount) {
    return nullptr;
  }

  const ZipEntry& entry = _entries[index];

  // ---- Read the local file header to get actual data offset ----
  // Local header: 30 bytes fixed + variable filename + extra field.
  // The name/extra lengths here can differ from the central directory's,
  // so the data offset must be computed from the local header itself.
  uint8_t localHeader[30];
  if (!_file.seek(entry.localHeaderOffset) ||
      _file.read(localHeader, 30) != 30) {
    Serial.println("ZipReader: failed to read local header");
    return nullptr;
  }

  if (zipRead32(localHeader) != ZIP_LOCAL_FILE_HEADER_SIG) {
    Serial.println("ZipReader: bad local header signature");
    return nullptr;
  }

  const uint16_t localFnLen = zipRead16(&localHeader[26]);
  const uint16_t localExtraLen = zipRead16(&localHeader[28]);
  const uint32_t dataOffset = entry.localHeaderOffset + 30 + localFnLen + localExtraLen;

  // ---- Handle based on compression method ----
  if (entry.compressionMethod == ZIP_METHOD_STORED) {
    return _extractStored(dataOffset, entry.uncompressedSize, outSize);
  }
  if (entry.compressionMethod == ZIP_METHOD_DEFLATED) {
    return _extractDeflated(dataOffset, entry.compressedSize,
                            entry.uncompressedSize, outSize);
  }

  Serial.printf("ZipReader: unsupported compression method %d for %s\n",
                entry.compressionMethod, entry.filename);
  return nullptr;
}
// ----------------------------------------------------------
// Extract a file entry identified by its exact name.
// Shorthand for findEntry() followed by extractEntry(); returns
// nullptr when no entry has that name.
// ----------------------------------------------------------
uint8_t* extractByName(const char* filename, uint32_t* outSize) {
  const int entryIdx = findEntry(filename);
  return (entryIdx >= 0) ? extractEntry(entryIdx, outSize) : nullptr;
}
// ----------------------------------------------------------
// True once open() has succeeded and close() has not been called since.
// ----------------------------------------------------------
bool isOpen() const {
  return _isOpen;
}
// ----------------------------------------------------------
// Debug: print all entries
// ----------------------------------------------------------
void printEntries() const {
Serial.printf("ZIP contains %d files:\n", _entryCount);
for (int i = 0; i < _entryCount; i++) {
const ZipEntry& e = _entries[i];
Serial.printf(" [%d] %s (%s, %u -> %u bytes)\n",
i, e.filename,
e.compressionMethod == 0 ? "STORED" : "DEFLATED",
e.compressedSize, e.uncompressedSize);
}
}
private:
File _file;
ZipEntry* _entries; // Heap-allocated (PSRAM) entry table
int _entryCount;
bool _isOpen;
uint32_t _cdOffset;
uint32_t _cdSize;
uint16_t _totalEntries;
// ----------------------------------------------------------
// Allocate size bytes, trying ps_malloc() first when BOARD_HAS_PSRAM
// is defined and falling back to malloc(). Returns nullptr when both
// fail. Release the result with free().
// ----------------------------------------------------------
void* _allocBuffer(size_t size) {
#ifdef BOARD_HAS_PSRAM
  if (void* fromPsram = ps_malloc(size)) {
    return fromPsram;
  }
#endif
  return malloc(size);
}
// ----------------------------------------------------------
// Extract a STORED (uncompressed) entry.
//
// dataOffset: file offset of the entry's first data byte.
// size:       number of bytes to copy.
// outSize:    receives size on success.
//
// Returns a buffer of size + 1 bytes (the extra byte is a NUL terminator
// so text can be used as a C string) that the caller must free(), or
// nullptr on allocation failure, short read, or an implausible size.
// The SD chip-select pin is driven HIGH before returning once the file
// has been touched, on success and on the short-read path alike.
// ----------------------------------------------------------
uint8_t* _extractStored(uint32_t dataOffset, uint32_t size, uint32_t* outSize) {
  // size + 1 would wrap to 0 for a corrupt 0xFFFFFFFF size field, and the
  // read below would then overrun a zero-byte allocation.
  if (size == 0xFFFFFFFFu) {
    Serial.println("ZipReader: stored entry size out of range");
    return nullptr;
  }

  uint8_t* buf = (uint8_t*)_allocBuffer(size + 1);  // +1 for null terminator
  if (!buf) {
    Serial.printf("ZipReader: failed to alloc %u bytes for stored entry\n", size);
    return nullptr;
  }

  _file.seek(dataOffset);
  const uint32_t bytesRead = _file.read(buf, size);

  // Release SD CS pin for other SPI users (regardless of read outcome)
  digitalWrite(SDCARD_CS, HIGH);

  if (bytesRead != size) {
    Serial.printf("ZipReader: short read (got %u, expected %u)\n", bytesRead, size);
    free(buf);
    return nullptr;
  }

  buf[size] = '\0';  // Null-terminate for text files
  *outSize = size;
  return buf;
}
// ----------------------------------------------------------
// Extract a DEFLATED entry using ROM tinfl.
//
// dataOffset: file offset of the entry's first compressed byte.
// compSize:   compressed size in bytes.
// uncompSize: expected uncompressed size in bytes.
// outSize:    receives the number of bytes produced on success and
//             0 on every failure path.
//
// Returns a NUL-terminated buffer the caller must free(), or nullptr on
// allocation failure, read failure, decompression failure, an implausible
// size, or when no tinfl implementation was found at compile time.
// A produced size that differs from uncompSize is logged but not treated
// as an error.
// ----------------------------------------------------------
uint8_t* _extractDeflated(uint32_t dataOffset, uint32_t compSize,
                          uint32_t uncompSize, uint32_t* outSize) {
  *outSize = 0;  // defined result for all failure returns below
#if HAS_ROM_TINFL
  // uncompSize + 1 would wrap to 0 for a corrupt 0xFFFFFFFF size field.
  if (uncompSize == 0xFFFFFFFFu) {
    Serial.println("ZipReader: deflated entry size out of range");
    return nullptr;
  }

  // Allocate compressed data buffer (from PSRAM)
  uint8_t* compBuf = (uint8_t*)_allocBuffer(compSize);
  if (!compBuf) {
    Serial.printf("ZipReader: failed to alloc %u bytes for compressed data\n", compSize);
    return nullptr;
  }

  // Allocate output buffer (+1 for null terminator)
  uint8_t* outBuf = (uint8_t*)_allocBuffer(uncompSize + 1);
  if (!outBuf) {
    Serial.printf("ZipReader: failed to alloc %u bytes for decompressed data\n", uncompSize);
    free(compBuf);
    return nullptr;
  }

  // Heap-allocate the decompressor (~11KB struct - too large for 8KB loopTask stack!)
  tinfl_decompressor* decomp = (tinfl_decompressor*)_allocBuffer(sizeof(tinfl_decompressor));
  if (!decomp) {
    Serial.printf("ZipReader: failed to alloc tinfl_decompressor (%u bytes)\n",
                  (uint32_t)sizeof(tinfl_decompressor));
    free(compBuf);
    free(outBuf);
    return nullptr;
  }

  // Read compressed data from file
  _file.seek(dataOffset);
  const bool readOk = (_file.read(compBuf, compSize) == (int)compSize);

  // Release SD CS pin for other SPI users (regardless of read outcome)
  digitalWrite(SDCARD_CS, HIGH);

  if (!readOk) {
    Serial.println("ZipReader: failed to read compressed data");
    free(decomp);
    free(compBuf);
    free(outBuf);
    return nullptr;
  }

  // Decompress using ROM tinfl (low-level API to avoid stack allocation)
  // ZIP DEFLATE is raw deflate (no zlib header).
  tinfl_init(decomp);

  size_t inBytes = compSize;
  size_t outBytes = uncompSize;
  tinfl_status status = tinfl_decompress(
      decomp,
      (const mz_uint8*)compBuf,                  // compressed input
      &inBytes,                                  // in: available, out: consumed
      outBuf,                                    // output buffer base
      outBuf,                                    // current output position
      &outBytes,                                 // in: available, out: produced
      TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF   // raw deflate, single-shot
  );

  free(decomp);
  free(compBuf);

  if (status != TINFL_STATUS_DONE) {
    Serial.printf("ZipReader: DEFLATE failed (status %d)\n", (int)status);
    free(outBuf);
    return nullptr;
  }

  // outBytes never exceeds uncompSize (the capacity passed in), so this
  // index stays within the uncompSize + 1 allocation.
  outBuf[outBytes] = '\0';  // Null-terminate for text files
  *outSize = (uint32_t)outBytes;

  if (outBytes != uncompSize) {
    Serial.printf("ZipReader: decompressed %u bytes, expected %u\n",
                  (uint32_t)outBytes, uncompSize);
  }

  return outBuf;

#else
  // No ROM tinfl available
  Serial.println("ZipReader: DEFLATE not supported (no ROM tinfl)");
  return nullptr;
#endif
}
};
// =============================================================================
// FALLBACK NOTE:
//
// If the ROM tinfl includes fail to compile on your ESP32 variant, you have
// two options:
//
// 1. Install lbernstone/miniz-esp32 from PlatformIO:
// lib_deps = https://github.com/lbernstone/miniz-esp32.git
// Then change the includes above to: #include <miniz.h>
//
// 2. Copy just the tinfl source (~550 lines) from:
// https://github.com/richgel999/miniz/blob/master/miniz_tinfl.c
// into your project. Only tinfl_decompress_mem_to_mem() is needed.
//
// =============================================================================

View File

@@ -0,0 +1,776 @@
#pragma once
// =============================================================================
// EpubProcessor.h - Convert EPUB files to plain text for TextReaderScreen
//
// Pipeline: EPUB (ZIP) → container.xml → OPF spine → extract chapters →
// strip XHTML tags → concatenated plain text → cached .txt on SD
//
// The resulting .txt file is written to the path supplied by the caller
// (buildCachePath() derives one under "<epub dir>/.epub_cache/") and is then
// handled by TextReaderScreen's existing pagination, indexing, and bookmarking.
//
// Dependencies: EpubZipReader.h (for ZIP extraction)
// =============================================================================
#include <SD.h>
#include <FS.h>
#include "EpubZipReader.h"
// Maximum chapters in spine (most novels have 20-80)
#define EPUB_MAX_CHAPTERS 200
// Maximum manifest items we track
#define EPUB_MAX_MANIFEST 256
// Buffer size for path strings taken from the EPUB (the OPF path and the
// content base directory), not for the OPF/container XML contents themselves
#define EPUB_XML_BUF_SIZE 64
class EpubProcessor {
public:
// ----------------------------------------------------------
// Process an EPUB file: extract text and write to SD cache.
//
// epubPath: source, e.g. "/books/The Iliad.epub"
// txtPath:  output, e.g. "/books/.epub_cache/The Iliad.txt"
//
// Returns true if the .txt file was written successfully with at least
// one chapter. If txtPath already exists, returns true immediately (cached).
//
// Because an existing txtPath is treated as a valid cache, a conversion
// that produces no chapters or hits a short write deletes the output file
// before returning false, so a later call retries instead of "hitting" an
// empty or truncated cache file.
// ----------------------------------------------------------
static bool processToText(const char* epubPath, const char* txtPath) {
  // Check if already cached
  if (SD.exists(txtPath)) {
    Serial.printf("EpubProc: '%s' already cached\n", txtPath);
    return true;
  }

  Serial.printf("EpubProc: Processing '%s'\n", epubPath);
  unsigned long t0 = millis();

  // Open the EPUB (ZIP archive)
  File epubFile = SD.open(epubPath, FILE_READ);
  if (!epubFile) {
    Serial.println("EpubProc: Cannot open EPUB file");
    return false;
  }

  // Heap-allocate zip reader (entries table is ~19KB)
  EpubZipReader* zip = new EpubZipReader();
  if (!zip) {
    epubFile.close();
    Serial.println("EpubProc: Cannot allocate ZipReader");
    return false;
  }

  if (!zip->open(epubFile)) {
    delete zip;
    epubFile.close();
    Serial.println("EpubProc: Cannot parse ZIP structure");
    return false;
  }

  // Step 1: Find OPF path from container.xml
  char opfPath[EPUB_XML_BUF_SIZE];
  opfPath[0] = '\0';
  if (!_findOpfPath(zip, opfPath, sizeof(opfPath))) {
    delete zip;
    epubFile.close();
    Serial.println("EpubProc: Cannot find OPF path");
    return false;
  }
  Serial.printf("EpubProc: OPF at '%s'\n", opfPath);

  // Determine the content base directory (e.g., "OEBPS/")
  char baseDir[EPUB_XML_BUF_SIZE];
  _getDirectory(opfPath, baseDir, sizeof(baseDir));

  // Step 2: Parse OPF to get title and spine chapter order
  char title[128];
  title[0] = '\0';

  // Chapter paths in spine order
  char** chapterPaths = nullptr;
  int chapterCount = 0;

  if (!_parseOpf(zip, opfPath, baseDir, title, sizeof(title),
                 &chapterPaths, &chapterCount)) {
    delete zip;
    epubFile.close();
    Serial.println("EpubProc: Cannot parse OPF");
    return false;
  }
  Serial.printf("EpubProc: Title='%s', %d chapters\n", title, chapterCount);

  // Step 3: Extract each chapter, strip XHTML, write to output .txt
  File outFile = SD.open(txtPath, FILE_WRITE);
  if (!outFile) {
    _freeChapterPaths(chapterPaths, chapterCount);
    delete zip;
    epubFile.close();
    Serial.printf("EpubProc: Cannot create '%s'\n", txtPath);
    return false;
  }

  // Write title as first line
  if (title[0]) {
    outFile.println(title);
    outFile.println();
  }

  int chaptersWritten = 0;
  uint32_t totalBytes = 0;
  bool writeFailed = false;

  for (int i = 0; i < chapterCount; i++) {
    int entryIdx = zip->findEntry(chapterPaths[i]);
    if (entryIdx < 0) {
      Serial.printf("EpubProc: Chapter not found: '%s'\n", chapterPaths[i]);
      continue;
    }

    uint32_t rawSize = 0;
    uint8_t* rawData = zip->extractEntry(entryIdx, &rawSize);
    if (!rawData || rawSize == 0) {
      Serial.printf("EpubProc: Failed to extract chapter %d\n", i);
      if (rawData) free(rawData);
      continue;
    }

    // Strip XHTML tags and write plain text
    uint32_t textLen = 0;
    uint8_t* plainText = _stripXhtml(rawData, rawSize, &textLen);
    free(rawData);

    if (plainText && textLen > 0) {
      // A short write (e.g. card full) would leave a truncated book behind;
      // stop and discard the output rather than caching it.
      if (outFile.write(plainText, textLen) != textLen) {
        Serial.printf("EpubProc: Write failed on chapter %d\n", i);
        writeFailed = true;
        free(plainText);
        break;
      }
      // Add chapter separator
      outFile.print("\n\n");
      totalBytes += textLen + 2;
      chaptersWritten++;
    }
    if (plainText) free(plainText);
  }

  outFile.flush();
  outFile.close();

  const bool ok = !writeFailed && chaptersWritten > 0;
  if (!ok) {
    // Do not leave an empty/partial file that SD.exists() would later
    // report as a finished conversion.
    Serial.printf("EpubProc: Conversion failed - removing '%s'\n", txtPath);
    SD.remove(txtPath);
  }

  // Release SD CS for other SPI users
  digitalWrite(SDCARD_CS, HIGH);

  _freeChapterPaths(chapterPaths, chapterCount);
  delete zip;
  epubFile.close();

  unsigned long elapsed = millis() - t0;
  Serial.printf("EpubProc: Done! %d chapters, %u bytes in %lu ms -> '%s'\n",
                chaptersWritten, totalBytes, elapsed, txtPath);

  return ok;
}
// ----------------------------------------------------------
// Extract just the title from an EPUB (for display in file list).
// Returns false if it can't be determined.
// ----------------------------------------------------------
static bool getTitle(const char* epubPath, char* titleBuf, int titleBufSize) {
File epubFile = SD.open(epubPath, FILE_READ);
if (!epubFile) return false;
EpubZipReader* zip = new EpubZipReader();
if (!zip) { epubFile.close(); return false; }
if (!zip->open(epubFile)) {
delete zip; epubFile.close(); return false;
}
char opfPath[EPUB_XML_BUF_SIZE];
if (!_findOpfPath(zip, opfPath, sizeof(opfPath))) {
delete zip; epubFile.close(); return false;
}
// Extract OPF and find <dc:title>
int opfIdx = zip->findEntry(opfPath);
if (opfIdx < 0) { delete zip; epubFile.close(); return false; }
uint32_t opfSize = 0;
uint8_t* opfData = zip->extractEntry(opfIdx, &opfSize);
delete zip;
epubFile.close();
if (!opfData) return false;
bool found = _extractTagContent((const char*)opfData, opfSize,
"dc:title", titleBuf, titleBufSize);
free(opfData);
return found;
}
// ----------------------------------------------------------
// Build a cache .txt path from an .epub path.
// e.g., "/books/mybook.epub" -> "/books/.epub_cache/mybook.txt"
// ----------------------------------------------------------
static void buildCachePath(const char* epubPath, char* cachePath, int cachePathSize) {
  // Note: besides formatting the path, this creates the ".epub_cache"
  // directory on the SD card when it does not exist yet.
  const char* slash = strrchr(epubPath, '/');
  const char* fileName = (slash != nullptr) ? slash + 1 : epubPath;

  // Parent directory of the EPUB without a trailing '/'.
  // A bare file name (no '/') is assumed to live in "/books".
  char parentDir[128];
  if (slash == nullptr) {
    strcpy(parentDir, "/books");
  } else {
    int parentLen = slash - epubPath;
    if (parentLen >= (int)sizeof(parentDir)) {
      parentLen = sizeof(parentDir) - 1;   // truncate over-long directories
    }
    memcpy(parentDir, epubPath, parentLen);
    parentDir[parentLen] = '\0';
  }

  // Make sure <parent>/.epub_cache exists
  char cacheFolder[160];
  snprintf(cacheFolder, sizeof(cacheFolder), "%s/.epub_cache", parentDir);
  if (!SD.exists(cacheFolder)) {
    SD.mkdir(cacheFolder);
  }

  // File name with everything from the last '.' onwards removed
  char stem[128];
  snprintf(stem, sizeof(stem), "%s", fileName);
  char* extension = strrchr(stem, '.');
  if (extension != nullptr) {
    *extension = '\0';
  }

  snprintf(cachePath, cachePathSize, "%s/%s.txt", cacheFolder, stem);
}
private:
// ----------------------------------------------------------
// Parse container.xml to find the OPF file path.
// Returns true if found.
// ----------------------------------------------------------
static bool _findOpfPath(EpubZipReader* zip, char* opfPath, int opfPathSize) {
  const int containerIdx = zip->findEntry("META-INF/container.xml");

  if (containerIdx >= 0) {
    // Normal case: container.xml names the package document in a
    // full-path="OEBPS/content.opf" attribute.
    uint32_t xmlLen = 0;
    uint8_t* xmlBytes = zip->extractEntry(containerIdx, &xmlLen);
    if (!xmlBytes) {
      return false;
    }
    const bool located = _extractAttribute((const char*)xmlBytes, xmlLen,
                                           "full-path", opfPath, opfPathSize);
    free(xmlBytes);
    return located;
  }

  // No container.xml: fall back to the first archive entry ending in ".opf"
  const int opfIdx = zip->findEntryBySuffix(".opf");
  if (opfIdx < 0) {
    return false;
  }

  const ZipEntry* opfEntry = zip->getEntry(opfIdx);
  strncpy(opfPath, opfEntry->filename, opfPathSize - 1);
  opfPath[opfPathSize - 1] = '\0';
  return true;
}
// ----------------------------------------------------------
// Parse OPF to extract title, build manifest, and resolve spine.
//
// Populates chapterPaths (heap-allocated array of strings) with
// full ZIP paths for each chapter in spine order.
// Caller must free with _freeChapterPaths().
// ----------------------------------------------------------
static bool _parseOpf(EpubZipReader* zip, const char* opfPath,
                      const char* baseDir, char* title, int titleSize,
                      char*** outChapterPaths, int* outChapterCount) {
  // Contract:
  //  - title receives the <dc:title> text when that element exists; when it
  //    does not, title is left exactly as the caller passed it in.
  //  - Returns true only when at least one spine entry was resolved. The
  //    caller then owns *outChapterPaths and releases it with
  //    _freeChapterPaths().
  //  - On every false return the caller owns nothing. When the spine resolves
  //    to zero chapters the outputs are explicitly reset to nullptr / 0.
  int opfIdx = zip->findEntry(opfPath);
  if (opfIdx < 0) return false;

  uint32_t opfSize = 0;
  uint8_t* opfData = zip->extractEntry(opfIdx, &opfSize);
  if (!opfData) return false;

  const char* xml = (const char*)opfData;

  // Extract title (best effort: a missing title is not an error)
  _extractTagContent(xml, opfSize, "dc:title", title, titleSize);

  // Manifest table: one fixed-size record per <item>, mapping id -> href.
  struct ManifestItem {
    char id[64];
    char href[128];
    bool isContent;  // media-type contains "html" or "xml"
  };

  // Heap-allocate manifest (could be large): try ps_malloc() first and fall
  // back to plain malloc() when that fails.
  ManifestItem* manifest = (ManifestItem*)ps_malloc(
      EPUB_MAX_MANIFEST * sizeof(ManifestItem));
  if (!manifest) {
    manifest = (ManifestItem*)malloc(EPUB_MAX_MANIFEST * sizeof(ManifestItem));
  }
  if (!manifest) {
    free(opfData);
    return false;
  }
  int manifestCount = 0;

  // Parse <item> elements from <manifest>. When </manifest> is missing the
  // scan simply runs to the end of the document.
  const char* manifestStart = _findTag(xml, opfSize, "<manifest");
  const char* manifestEnd = manifestStart ?
      _findTag(manifestStart, opfSize - (manifestStart - xml), "</manifest") : nullptr;
  if (!manifestEnd) manifestEnd = xml + opfSize;

  if (manifestStart) {
    const char* pos = manifestStart;
    while (pos < manifestEnd && manifestCount < EPUB_MAX_MANIFEST) {
      pos = _findTag(pos, manifestEnd - pos, "<item");
      if (!pos || pos >= manifestEnd) break;

      // Find the closing > of this <item ... />
      const char* tagEnd = (const char*)memchr(pos, '>', manifestEnd - pos);
      if (!tagEnd) break;
      tagEnd++;

      // Fill the next free slot; it is only committed (count++) when both
      // id and href were present.
      ManifestItem& item = manifest[manifestCount];
      item.id[0] = '\0';
      item.href[0] = '\0';
      item.isContent = false;

      _extractAttributeFromTag(pos, tagEnd - pos, "id",
                               item.id, sizeof(item.id));
      _extractAttributeFromTag(pos, tagEnd - pos, "href",
                               item.href, sizeof(item.href));

      // Check media-type for content files
      char mediaType[64];
      mediaType[0] = '\0';
      _extractAttributeFromTag(pos, tagEnd - pos, "media-type",
                               mediaType, sizeof(mediaType));
      item.isContent = (strstr(mediaType, "html") != nullptr ||
                        strstr(mediaType, "xml") != nullptr);

      if (item.id[0] && item.href[0]) {
        manifestCount++;
      }

      pos = tagEnd;
    }
  }

  Serial.printf("EpubProc: Manifest has %d items\n", manifestCount);

  // Parse <spine> to get reading order.
  // Spine contains <itemref idref="..."/> elements.
  const char* spineStart = _findTag(xml, opfSize, "<spine");
  const char* spineEnd = spineStart ?
      _findTag(spineStart, opfSize - (spineStart - xml), "</spine") : nullptr;
  if (!spineEnd) spineEnd = xml + opfSize;

  // Result array: one heap-allocated path string per resolved spine entry
  char** chapterPaths = (char**)ps_malloc(EPUB_MAX_CHAPTERS * sizeof(char*));
  if (!chapterPaths) chapterPaths = (char**)malloc(EPUB_MAX_CHAPTERS * sizeof(char*));
  if (!chapterPaths) {
    free(manifest);
    free(opfData);
    return false;
  }
  int chapterCount = 0;

  if (spineStart) {
    const char* pos = spineStart;
    while (pos < spineEnd && chapterCount < EPUB_MAX_CHAPTERS) {
      pos = _findTag(pos, spineEnd - pos, "<itemref");
      if (!pos || pos >= spineEnd) break;

      const char* tagEnd = (const char*)memchr(pos, '>', spineEnd - pos);
      if (!tagEnd) break;
      tagEnd++;

      char idref[64];
      idref[0] = '\0';
      _extractAttributeFromTag(pos, tagEnd - pos, "idref",
                               idref, sizeof(idref));

      if (idref[0]) {
        // Look up in manifest; the first content item with a matching id wins
        for (int m = 0; m < manifestCount; m++) {
          if (strcmp(manifest[m].id, idref) == 0 && manifest[m].isContent) {
            // Build full path: baseDir + href (+1 for the terminator)
            int pathLen = strlen(baseDir) + strlen(manifest[m].href) + 1;
            char* fullPath = (char*)malloc(pathLen);
            if (fullPath) {
              snprintf(fullPath, pathLen, "%s%s", baseDir, manifest[m].href);
              chapterPaths[chapterCount++] = fullPath;
            }
            // An allocation failure silently drops this chapter
            break;
          }
        }
      }

      pos = tagEnd;
    }
  }

  free(manifest);
  free(opfData);

  if (chapterCount == 0) {
    // Nothing was resolved. Release the empty array here rather than handing
    // it out: a false return means "nothing to free" for the caller, so
    // returning the pointer would leak it.
    free(chapterPaths);
    *outChapterPaths = nullptr;
    *outChapterCount = 0;
    return false;
  }

  *outChapterPaths = chapterPaths;
  *outChapterCount = chapterCount;
  return true;
}
// ----------------------------------------------------------
// Strip XHTML/HTML tags from raw content, producing plain text.
//
// Handles:
// - Tag removal (everything between < and >)
// - <p>, <br>, <div>, <h1>-<h6> → newlines
// - HTML entity decoding (&amp; &lt; &gt; &quot; &apos; &#NNN; &#xHH;)
// - Collapse multiple whitespace/newlines
// - Skip <head>, <style>, <script> content entirely
//
// Returns heap-allocated buffer (caller must free).
// ----------------------------------------------------------
static uint8_t* _stripXhtml(const uint8_t* input, uint32_t inputLen,
                            uint32_t* outLen) {
  // Elements whose whole content is discarded, and elements that force a
  // line break in the plain-text output.
  static const char* const kSkipTags[] = { "head", "style", "script" };
  static const char* const kBlockTags[] = {
    "p", "div", "br", "h1", "h2", "h3", "h4", "h5", "h6",
    "li", "tr", "blockquote", "hr"
  };
  const int kSkipTagCount = sizeof(kSkipTags) / sizeof(kSkipTags[0]);
  const int kBlockTagCount = sizeof(kBlockTags) / sizeof(kBlockTags[0]);

  // Each output byte is emitted while consuming at least one input byte
  // (tags and entities only ever shrink), so inputLen + 1 bytes are enough
  // for the text plus the terminating NUL.
  uint8_t* output = (uint8_t*)ps_malloc(inputLen + 1);
  if (!output) output = (uint8_t*)malloc(inputLen + 1);
  if (!output) { *outLen = 0; return nullptr; }

  uint32_t outPos = 0;
  bool inTag = false;
  bool skipContent = false;  // Inside <head>, <style>, <script>
  char tagName[32];
  tagName[0] = '\0';
  int tagNamePos = 0;
  bool tagNameDone = false;
  bool isClosingTag = false;
  bool lastWasNewline = false;
  bool lastWasSpace = false;

  // Skip to <body> if present (ignore everything before it)
  const uint8_t* start = input;
  const uint8_t* inputEnd = input + inputLen;

  const char* bodyStart = _findTagCI((const char*)input, inputLen, "<body");
  if (bodyStart) {
    const char* bodyTagEnd = (const char*)memchr(bodyStart, '>',
                                 inputEnd - (const uint8_t*)bodyStart);
    if (bodyTagEnd) {
      start = (const uint8_t*)(bodyTagEnd + 1);
    }
  }

  const uint8_t* end = inputEnd;

  for (const uint8_t* p = start; p < end; p++) {
    char c = (char)*p;

    if (inTag) {
      // Collect the (lower-cased) tag name up to the first delimiter
      if (!tagNameDone) {
        if (tagNamePos == 0 && c == '/') {
          isClosingTag = true;
          continue;
        }
        if (c == '>' || c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '/') {
          tagName[tagNamePos] = '\0';
          tagNameDone = true;
        } else if (tagNamePos < (int)sizeof(tagName) - 1) {
          tagName[tagNamePos++] = (c >= 'A' && c <= 'Z') ? (c + 32) : c;
        }
      }

      if (c == '>') {
        inTag = false;

        // An empty-element tag such as <script src="x.js"/> has no matching
        // closing tag. It must not open a skip region, otherwise the rest of
        // the chapter would be dropped. p[-1] is always readable here because
        // the '<' that opened this tag lies before p.
        const bool selfClosing = (*(p - 1) == '/');

        bool isSkipTag = false;
        for (int i = 0; i < kSkipTagCount; i++) {
          if (strcmp(tagName, kSkipTags[i]) == 0) { isSkipTag = true; break; }
        }

        // Handle skip regions
        if (isSkipTag) {
          if (isClosingTag) {
            skipContent = false;
          } else if (!selfClosing) {
            skipContent = true;
          }
        }

        if (!skipContent) {
          // Block-level elements produce newlines (opening and closing tags
          // alike; consecutive breaks are collapsed into one)
          bool isBlockTag = false;
          for (int i = 0; i < kBlockTagCount; i++) {
            if (strcmp(tagName, kBlockTags[i]) == 0) { isBlockTag = true; break; }
          }
          if (isBlockTag && outPos > 0 && !lastWasNewline) {
            output[outPos++] = '\n';
            lastWasNewline = true;
            lastWasSpace = false;
          }
        }
        continue;
      }
      continue;
    }

    // Not in a tag
    if (c == '<') {
      inTag = true;
      tagNamePos = 0;
      tagNameDone = false;
      isClosingTag = false;
      continue;
    }

    if (skipContent) continue;

    // Handle HTML entities. The '<' test above already ran, so a decoded
    // "&lt;" is emitted as text and never mistaken for a tag start.
    if (c == '&') {
      char decoded = _decodeEntity(p, end, &p);
      if (decoded) {
        c = decoded;
        // p now points to the ';' or last char of entity; loop will increment
      }
    }

    // Whitespace collapsing: runs of newlines become one '\n', runs of
    // blanks one ' ', and nothing is emitted at the very start of the output
    if (c == '\n' || c == '\r') {
      if (!lastWasNewline && outPos > 0) {
        output[outPos++] = '\n';
        lastWasNewline = true;
        lastWasSpace = false;
      }
      continue;
    }

    if (c == ' ' || c == '\t') {
      if (!lastWasSpace && !lastWasNewline && outPos > 0) {
        output[outPos++] = ' ';
        lastWasSpace = true;
      }
      continue;
    }

    // Regular character
    output[outPos++] = c;
    lastWasNewline = false;
    lastWasSpace = false;
  }

  // Trim trailing whitespace
  while (outPos > 0 && (output[outPos-1] == '\n' || output[outPos-1] == ' ')) {
    outPos--;
  }

  // NUL-terminate for convenience; *outLen does not count the terminator
  output[outPos] = '\0';
  *outLen = outPos;
  return output;
}
// ----------------------------------------------------------
// Decode an HTML entity starting at '&'.
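// Sets *outPos to the last character consumed: the terminating ';', or the
// '&' itself when no ';' is found within the next few bytes.
// Returns the decoded ASCII character; '&' when there is no terminator, and
// ' ' for entities or code points that are not recognized.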
// ----------------------------------------------------------
static char _decodeEntity(const uint8_t* p, const uint8_t* end,
                          const uint8_t** outPos) {
  // p   : points at the '&' that starts the candidate entity
  // end : one past the last readable byte of the buffer
  // *outPos receives the last byte consumed (the ';' on success, p itself
  // when no terminator is found) so a caller's loop increment steps past it.
  struct NamedEntity { const char* name; char ascii; };
  static const NamedEntity kNamed[] = {
    { "amp",    '&'  }, { "lt",    '<'  }, { "gt",    '>'  },
    { "quot",   '"'  }, { "apos",  '\'' }, { "nbsp",  ' '  },
    { "mdash",  '-'  }, { "ndash", '-'  }, { "hellip", '.' },
    { "lsquo",  '\'' }, { "rsquo", '\'' },
    { "ldquo",  '"'  }, { "rdquo", '"'  },
  };
  const int kNamedCount = sizeof(kNamed) / sizeof(kNamed[0]);
  const int kMaxScan = 10;  // how far past '&' a ';' is searched for

  // Look for ';' within a reasonable range
  const uint8_t* semi = p + 1;
  while (semi < end && (semi - p) < kMaxScan && *semi != ';') semi++;

  // The bounds test must come first: semi can equal end here, and that byte
  // is outside the buffer and must not be read.
  if (semi >= end || *semi != ';') {
    *outPos = p;  // Not an entity, return '&' literal
    return '&';
  }

  const int entityLen = semi - p - 1;  // Length between & and ;
  const char* entity = (const char*)(p + 1);
  *outPos = semi;  // Caller resumes after the ';'

  // Named entities
  for (int i = 0; i < kNamedCount; i++) {
    if ((int)strlen(kNamed[i].name) == entityLen &&
        strncmp(entity, kNamed[i].name, entityLen) == 0) {
      return kNamed[i].ascii;
    }
  }

  // Numeric entities: &#NNN; or &#xHH; (characters that are not digits of
  // the respective base are ignored)
  if (entityLen >= 2 && entity[0] == '#') {
    int codepoint = 0;
    if (entity[1] == 'x' || entity[1] == 'X') {
      // Hex
      for (int i = 2; i < entityLen; i++) {
        char ch = entity[i];
        if (ch >= '0' && ch <= '9') codepoint = codepoint * 16 + (ch - '0');
        else if (ch >= 'a' && ch <= 'f') codepoint = codepoint * 16 + (ch - 'a' + 10);
        else if (ch >= 'A' && ch <= 'F') codepoint = codepoint * 16 + (ch - 'A' + 10);
      }
    } else {
      // Decimal
      for (int i = 1; i < entityLen; i++) {
        char ch = entity[i];
        if (ch >= '0' && ch <= '9') codepoint = codepoint * 10 + (ch - '0');
      }
    }

    // Map to ASCII (best effort - e-ink font is ASCII only)
    if (codepoint >= 32 && codepoint < 127) return (char)codepoint;
    if (codepoint == 160) return ' ';                          // non-breaking space
    if (codepoint == 8211 || codepoint == 8212) return '-';   // en/em dash
    if (codepoint == 8216 || codepoint == 8217) return '\'';  // single smart quotes
    if (codepoint == 8220 || codepoint == 8221) return '"';   // double smart quotes
    if (codepoint == 8230) return '.';                         // ellipsis
    if (codepoint == 8226) return '*';                         // bullet

    // Any other code point has no ASCII stand-in: emit a space
    return ' ';
  }

  // Unknown entity - output as space
  return ' ';
}
// ----------------------------------------------------------
// Find a tag in XML data (case-sensitive, e.g., "<manifest").
// Returns pointer to '<' of found tag, or nullptr.
// ----------------------------------------------------------
static const char* _findTag(const char* data, int dataLen, const char* tag) {
  // Plain byte-wise substring search over the first dataLen bytes of data;
  // data does not need to be NUL-terminated.
  const int needleLen = strlen(tag);
  const int lastOffset = dataLen - needleLen;  // last offset where tag still fits

  int offset = 0;
  while (offset <= lastOffset) {
    const char* candidate = data + offset;
    if (memcmp(candidate, tag, needleLen) == 0) {
      return candidate;
    }
    ++offset;
  }
  return nullptr;
}
// ----------------------------------------------------------
// Find a tag case-insensitively (for <body>, <BODY>, etc.).
// ----------------------------------------------------------
static const char* _findTagCI(const char* data, int dataLen, const char* tag) {
  // Same scan as the case-sensitive search, but each candidate position is
  // compared with strncasecmp().
  const int needleLen = strlen(tag);
  const int lastOffset = dataLen - needleLen;  // last offset where tag still fits

  int offset = 0;
  while (offset <= lastOffset) {
    const char* candidate = data + offset;
    if (strncasecmp(candidate, tag, needleLen) == 0) {
      return candidate;
    }
    ++offset;
  }
  return nullptr;
}
// ----------------------------------------------------------
// Extract an attribute value from a region of XML.
// Scans for attr="value" and copies value to outBuf.
// ----------------------------------------------------------
static bool _extractAttribute(const char* data, int dataLen,
                              const char* attrName, char* outBuf, int outBufSize) {
  // data/dataLen : region to scan (need not be NUL-terminated)
  // attrName     : exact attribute name, e.g. "href"
  // outBuf       : receives the NUL-terminated value, truncated to fit
  // Returns true when name="value" or name='value' was found. outBuf is left
  // untouched otherwise.
  if (!data || !attrName || !outBuf || outBufSize <= 0) return false;

  const int nameLen = strlen(attrName);
  if (nameLen == 0) return false;
  const char* end = data + dataLen;

  // A match needs room for the name, '=', the opening quote and one more byte
  for (int i = 0; i + nameLen + 2 < dataLen; i++) {
    const char* p = data + i;
    if (p[nameLen] != '=' || strncmp(p, attrName, nameLen) != 0) continue;

    // Require a name boundary on the left, so that looking up "id" does not
    // hit the tail of a longer attribute name such as data-id= or xml:id=.
    if (i > 0) {
      const char prev = data[i - 1];
      const bool prevIsNameChar =
          (prev >= 'a' && prev <= 'z') || (prev >= 'A' && prev <= 'Z') ||
          (prev >= '0' && prev <= '9') ||
          prev == '_' || prev == '-' || prev == ':' || prev == '.';
      if (prevIsNameChar) continue;
    }

    // Value must be quoted; either quote style is accepted
    const char quote = p[nameLen + 1];
    if (quote != '"' && quote != '\'') continue;

    const char* valStart = p + nameLen + 2;
    const char* valEnd = (const char*)memchr(valStart, quote, end - valStart);
    if (!valEnd) continue;  // unterminated value: keep scanning

    int valLen = valEnd - valStart;
    if (valLen >= outBufSize) valLen = outBufSize - 1;
    memcpy(outBuf, valStart, valLen);
    outBuf[valLen] = '\0';
    return true;
  }
  return false;
}
// ----------------------------------------------------------
// Extract an attribute value from within a single tag string.
// (More targeted version for parsing <item id="x" href="y"/>)
// ----------------------------------------------------------
static bool _extractAttributeFromTag(const char* tag, int tagLen,
                                     const char* attrName,
                                     char* outBuf, int outBufSize) {
  // Pure forwarder: the tag text is handed to the general attribute scanner
  // as its search region, so only attributes inside [tag, tag + tagLen) match.
  const bool attributeFound =
      _extractAttribute(tag, tagLen, attrName, outBuf, outBufSize);
  return attributeFound;
}
// ----------------------------------------------------------
// Extract text content between <tagName>...</tagName>.
// Works for simple cases like <dc:title>The Iliad</dc:title>.
// ----------------------------------------------------------
static bool _extractTagContent(const char* data, int dataLen,
                               const char* tagName, char* outBuf, int outBufSize) {
  // Copies the raw bytes between the first "<tagName ...>" and the next
  // "</tagName>" into outBuf (truncated to fit, always NUL-terminated).
  // outBuf is not modified when either tag is missing.
  const char* const dataEnd = data + dataLen;

  // Opening pattern has no '>' so tags carrying attributes match as well
  char opener[64];
  snprintf(opener, sizeof(opener), "<%s", tagName);
  const char* openerPos = _findTag(data, dataLen, opener);
  if (openerPos == nullptr) {
    return false;
  }

  // Content begins right after the '>' that ends the opening tag
  const char* openerClose = (const char*)memchr(openerPos, '>', dataEnd - openerPos);
  if (openerClose == nullptr) {
    return false;
  }
  const char* textBegin = openerClose + 1;

  // Content ends where the closing tag starts
  char closer[64];
  snprintf(closer, sizeof(closer), "</%s>", tagName);
  const char* textEnd = _findTag(textBegin, dataEnd - textBegin, closer);
  if (textEnd == nullptr) {
    return false;
  }

  int copyLen = textEnd - textBegin;
  if (copyLen >= outBufSize) {
    copyLen = outBufSize - 1;
  }
  memcpy(outBuf, textBegin, copyLen);
  outBuf[copyLen] = '\0';
  return true;
}
// ----------------------------------------------------------
// Get directory portion of a path.
// "OEBPS/content.opf" -> "OEBPS/"
// "content.opf" -> ""
// ----------------------------------------------------------
static void _getDirectory(const char* path, char* dirBuf, int dirBufSize) {
  // keep = number of leading bytes of path to copy: everything up to and
  // including the last '/', or nothing at all when path has no '/'.
  int keep = 0;

  const char* separator = strrchr(path, '/');
  if (separator != nullptr) {
    keep = separator - path + 1;
    if (keep >= dirBufSize) {
      keep = dirBufSize - 1;  // truncate, leaving room for the terminator
    }
    memcpy(dirBuf, path, keep);
  }

  dirBuf[keep] = '\0';
}
// ----------------------------------------------------------
// Free the chapter paths array allocated by _parseOpf().
// ----------------------------------------------------------
static void _freeChapterPaths(char** paths, int count) {
  // A null array means there is nothing to release
  if (paths == nullptr) {
    return;
  }

  // Release every path string first, then the array that held them.
  // free() ignores null pointers, so empty slots need no special case.
  int slot = 0;
  while (slot < count) {
    free(paths[slot]);
    ++slot;
  }
  free(paths);
}
};