mirror of
https://github.com/SpudGunMan/meshing-around.git
synced 2026-03-28 17:32:36 +01:00
136 lines
6.3 KiB
Python
136 lines
6.3 KiB
Python
# meshbot wiki module
|
|
|
|
from modules.log import logger
|
|
from modules.settings import (use_kiwix_server, kiwix_url, kiwix_library_name,
|
|
urlTimeoutSeconds, wiki_return_limit, ERROR_FETCHING_DATA, wikipedia_enabled)
|
|
#import wikipedia # pip install wikipedia
|
|
import requests
|
|
import bs4 as bs
|
|
from urllib.parse import quote
|
|
|
|
def tag_visible(element):
    """Return True when *element* is text a reader would actually see.

    Rejects text nested inside non-rendered containers (style/script/head/
    title/meta and the document root) as well as HTML comment nodes.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if element.parent.name in hidden_parents:
        return False
    return not isinstance(element, bs.element.Comment)
|
|
|
|
def text_from_html(body):
    """Return the visible article text extracted from an HTML document.

    Prefers the MediaWiki content div ('mw-parser-output'), which works for
    both Kiwix and Wikipedia pages; falls back to <body>, and returns an
    empty string when neither is present.
    """
    soup = bs.BeautifulSoup(body, 'html.parser')
    # A missing (or empty) content div is falsy, triggering the fallback.
    root = soup.find('div', class_='mw-parser-output') or soup.body
    if not root:
        return ""
    visible = (node.strip() for node in root.find_all(string=True)
               if tag_visible(node))
    return " ".join(piece for piece in visible if piece)
|
|
|
|
def get_kiwix_summary(search_term, truncate=True):
    """Query the local Kiwix server for a Wikipedia article summary.

    Searches the configured Kiwix library for *search_term*, fetches up to
    three matching articles, and returns a summary built from the first
    `wiki_return_limit` sentences of the first article with usable text.
    Falls back to the live Wikipedia API when enabled.

    Args:
        search_term: Term to look up; None/blank yields ERROR_FETCHING_DATA.
        truncate: When True (default), cap the summary at 500 characters.

    Returns:
        Summary string, a Wikipedia fallback result, or ERROR_FETCHING_DATA.
    """
    if search_term is None or search_term.strip() == "":
        return ERROR_FETCHING_DATA
    # Build the URL before the try block so the except handler can always
    # reference it when logging (previously a NameError risk).
    search_url = f"{kiwix_url}/search?content={kiwix_library_name}&pattern={quote(search_term)}"
    try:
        response = requests.get(search_url, timeout=urlTimeoutSeconds)

        if response.status_code == 200 and "No results were found" not in response.text:
            soup = bs.BeautifulSoup(response.text, 'html.parser')
            results = soup.select('div.results ul li')
            logger.debug(f"Kiwix: Found {len(results)} results in search results for:{search_term}")
            # Try the top three hits; return the first that yields usable text.
            for li in results[:3]:
                a = li.find('a', href=True)
                if not a:
                    continue
                article_url = f"{kiwix_url}{a['href']}"
                article_response = requests.get(article_url, timeout=urlTimeoutSeconds)
                if article_response.status_code != 200:
                    continue
                text = text_from_html(article_response.text)
                sentences = text.split('. ')
                summary = '. '.join(sentences[:wiki_return_limit]).strip()
                if not summary:
                    # Article body was empty; try the next candidate instead
                    # of returning an empty string to the caller.
                    continue
                if not summary.endswith('.'):
                    summary += '.'
                return summary[:500] if truncate else summary

        logger.debug(f"System: No Kiwix Results for:{search_term}")
        if wikipedia_enabled:
            logger.debug("Kiwix: Falling back to Wikipedia API.")
            return get_wikipedia_summary(search_term, force=True)
        return ERROR_FETCHING_DATA

    except Exception as e:
        logger.warning(f"System: Error with Kiwix for:{search_term} URL:{search_url} {e}")
        return ERROR_FETCHING_DATA
|
|
|
|
def get_wikipedia_summary(search_term, location=None, force=False, truncate=True):
    """Fetch a short summary for *search_term* from the Wikipedia REST API.

    When the Kiwix server is enabled (and *force* is False) the lookup is
    delegated to get_kiwix_summary(). Disambiguation pages are expanded into
    a short "did you mean" list of up to five candidate article links.

    Args:
        search_term: Article title to look up.
        location: Unused; kept for signature compatibility with callers.
        force: Skip the Kiwix delegation even when use_kiwix_server is set
            (used by the Kiwix path itself as a fallback).
        truncate: When True (default), cap the summary at 500 characters.

    Returns:
        Summary text, a disambiguation hint, or ERROR_FETCHING_DATA.
    """
    if use_kiwix_server and not force:
        return get_kiwix_summary(search_term)

    if not search_term or not search_term.strip():
        return ERROR_FETCHING_DATA

    api_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(search_term)}"
    headers = {
        "User-Agent": "MeshBot/1.0 (https://github.com/kkeeton/meshing-around; contact: youremail@example.com)"
    }
    try:
        # Use the shared timeout setting (was a hardcoded 5s, inconsistent
        # with the Kiwix path's urlTimeoutSeconds).
        response = requests.get(api_url, timeout=urlTimeoutSeconds, headers=headers)
        if response.status_code == 404:
            logger.warning(f"System: No Wikipedia Results for:{search_term}")
            return ERROR_FETCHING_DATA
        response.raise_for_status()
        data = response.json()
        logger.debug(f"Wikipedia API response for '{search_term}': {len(data)} keys")
        if "extract" not in data or not data.get("extract"):
            return ERROR_FETCHING_DATA
        if data.get("type") == "disambiguation" or "may refer to:" in data.get("extract", ""):
            # Fetch and parse the HTML disambiguation page for candidate links.
            html_url = f"https://en.wikipedia.org/wiki/{requests.utils.quote(search_term)}"
            html_resp = requests.get(html_url, timeout=urlTimeoutSeconds, headers=headers)
            if html_resp.status_code == 200:
                soup = bs.BeautifulSoup(html_resp.text, 'html.parser')
                items = soup.select('div.mw-parser-output ul li a[href^="/wiki/"]')
                choices = []
                for a in items:
                    title = a.get('title')
                    href = a.get('href')
                    # Skip non-article namespaces (File:, Help:, etc.).
                    if title and href and ':' not in href:
                        choices.append(f"{title} (https://en.wikipedia.org{href})")
                        if len(choices) >= 5:
                            break
                if choices:
                    return f"'{search_term}' is ambiguous. Did you mean:\n- " + "\n- ".join(choices)
            return f"'{search_term}' is ambiguous. Please be more specific. See: {html_url}"
        summary = data.get("extract")
        if not summary or not isinstance(summary, str) or not summary.strip():
            return ERROR_FETCHING_DATA
        sentences = [s for s in summary.split('. ') if s.strip()]
        if not sentences:
            return ERROR_FETCHING_DATA
        summary = '. '.join(sentences[:wiki_return_limit])
        if summary and not summary.endswith('.'):
            summary += '.'
        # Keep mesh messages short: 500-character cap when truncating.
        return summary.strip()[:500] if truncate else summary.strip()
    except Exception as e:
        logger.warning(f"System: Wikipedia API error for:{search_term} {e}")
        return ERROR_FETCHING_DATA
|