From f33da848cdc171caea8f758f85bf7489715d21fa Mon Sep 17 00:00:00 2001 From: SpudGunMan Date: Fri, 24 Oct 2025 10:32:28 -0700 Subject: [PATCH] cleanup --- modules/wiki.py | 152 ++++++++++++++++++++++++------------------------ 1 file changed, 75 insertions(+), 77 deletions(-) diff --git a/modules/wiki.py b/modules/wiki.py index 75c1cc8..c4b2093 100644 --- a/modules/wiki.py +++ b/modules/wiki.py @@ -2,88 +2,86 @@ from modules.log import * import wikipedia # pip install wikipedia +import requests +import bs4 as bs +from urllib.parse import quote +# ...existing code... -# Kiwix support for local wiki -if use_kiwix_server: - import requests - import bs4 as bs - from urllib.parse import quote -# Kiwix helper functions (only loaded if use_kiwix_server is True) -if wikipedia_enabled and use_kiwix_server: - def tag_visible(element): - """Filter visible text from HTML elements for Kiwix""" - if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']: - return False - if isinstance(element, bs.element.Comment): - return False - return True +def tag_visible(element): + """Filter visible text from HTML elements for Kiwix""" + if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']: + return False + if isinstance(element, bs.element.Comment): + return False + return True - def text_from_html(body): - """Extract visible text from HTML content""" - soup = bs.BeautifulSoup(body, 'html.parser') - texts = soup.find_all(string=True) - visible_texts = filter(tag_visible, texts) - return " ".join(t.strip() for t in visible_texts if t.strip()) +def text_from_html(body): + """Extract visible text from HTML content""" + soup = bs.BeautifulSoup(body, 'html.parser') + texts = soup.find_all(string=True) + visible_texts = filter(tag_visible, texts) + return " ".join(t.strip() for t in visible_texts if t.strip()) - def get_kiwix_summary(search_term): - """Query local Kiwix server for Wikipedia article""" - try: - search_encoded = quote(search_term) - # Try direct article access first - wiki_article = search_encoded.capitalize().replace("%20", "_") - exact_url = f"{kiwix_url}/raw/{kiwix_library_name}/content/A/{wiki_article}" +def get_kiwix_summary(search_term): + """Query local Kiwix server for Wikipedia article""" + try: + search_encoded = quote(search_term) + # Try direct article access first + wiki_article = search_encoded.capitalize().replace("%20", "_") + exact_url = f"{kiwix_url}/raw/{kiwix_library_name}/content/A/{wiki_article}" + + response = requests.get(exact_url, timeout=urlTimeoutSeconds) + if response.status_code == 200: + # Extract and clean text + text = text_from_html(response.text) + # Remove common Wikipedia metadata prefixes + text = text.split("Jump to navigation", 1)[-1] + text = text.split("Jump to search", 1)[-1] + # Truncate to reasonable length (first few sentences) + sentences = text.split('. ') + summary = '. '.join(sentences[:wiki_return_limit]) + if summary and not summary.endswith('.'): + summary += '.' + return summary.strip()[:500] # Hard limit at 500 chars + + # If direct access fails, try search + search_url = f"{kiwix_url}/search?content={kiwix_library_name}&pattern={search_encoded}" + response = requests.get(search_url, timeout=urlTimeoutSeconds) + + if response.status_code == 200 and "No results were found" not in response.text: + soup = BeautifulSoup(response.text, 'html.parser') + links = [a['href'] for a in soup.find_all('a', href=True) if "start=" not in a['href']] - response = requests.get(exact_url, timeout=urlTimeoutSeconds) - if response.status_code == 200: - # Extract and clean text - text = text_from_html(response.text) - # Remove common Wikipedia metadata prefixes - text = text.split("Jump to navigation", 1)[-1] - text = text.split("Jump to search", 1)[-1] - # Truncate to reasonable length (first few sentences) - sentences = text.split('. ') - summary = '. '.join(sentences[:wiki_return_limit]) - if summary and not summary.endswith('.'): - summary += '.' - return summary.strip()[:500] # Hard limit at 500 chars - - # If direct access fails, try search - search_url = f"{kiwix_url}/search?content={kiwix_library_name}&pattern={search_encoded}" - response = requests.get(search_url, timeout=urlTimeoutSeconds) - - if response.status_code == 200 and "No results were found" not in response.text: - soup = BeautifulSoup(response.text, 'html.parser') - links = [a['href'] for a in soup.find_all('a', href=True) if "start=" not in a['href']] - - for link in links[:3]: # Check first 3 results - article_name = link.split("/")[-1] - if not article_name or article_name[0].islower(): - continue - - article_url = f"{kiwix_url}{link}" - article_response = requests.get(article_url, timeout=urlTimeoutSeconds) - if article_response.status_code == 200: - text = text_from_html(article_response.text) - text = text.split("Jump to navigation", 1)[-1] - text = text.split("Jump to search", 1)[-1] - sentences = text.split('. ') - summary = '. '.join(sentences[:wiki_return_limit]) - if summary and not summary.endswith('.'): - summary += '.' - return summary.strip()[:500] - - logger.warning(f"System: No Kiwix Results for:{search_term}") - # try to fall back to online Wikipedia if available - return get_wikipedia_summary(search_term, force=True) - - - except requests.RequestException as e: - logger.warning(f"System: Kiwix connection error: {e}") - return "Unable to connect to local wiki server" - except Exception as e: - logger.warning(f"System: Error with Kiwix for:{search_term} {e}") - return ERROR_FETCHING_DATA + for link in links[:3]: # Check first 3 results + article_name = link.split("/")[-1] + if not article_name or article_name[0].islower(): + continue + + article_url = f"{kiwix_url}{link}" + article_response = requests.get(article_url, timeout=urlTimeoutSeconds) + if article_response.status_code == 200: + text = text_from_html(article_response.text) + text = text.split("Jump to navigation", 1)[-1] + text = text.split("Jump to search", 1)[-1] + sentences = text.split('. ') + summary = '. '.join(sentences[:wiki_return_limit]) + if summary and not summary.endswith('.'): + summary += '.' + return summary.strip()[:500] + + logger.warning(f"System: No Kiwix Results for:{search_term}") + # try to fall back to online Wikipedia if available + return get_wikipedia_summary(search_term, force=True) + + + except requests.RequestException as e: + logger.warning(f"System: Kiwix connection error: {e}") + return "Unable to connect to local wiki server" + except Exception as e: + logger.warning(f"System: Error with Kiwix for:{search_term} {e}") + return ERROR_FETCHING_DATA + def get_wikipedia_summary(search_term, location=None, force=False): lat, lon = location if location else (None, None)