mirror of
https://github.com/SpudGunMan/meshing-around.git
synced 2026-05-18 06:55:38 +02:00
cleanup
This commit is contained in:
+75
-77
@@ -2,88 +2,86 @@
|
||||
|
||||
from modules.log import *
|
||||
import wikipedia # pip install wikipedia
|
||||
import requests
|
||||
import bs4 as bs
|
||||
from urllib.parse import quote
|
||||
# ...existing code...
|
||||
|
||||
# Kiwix support for local wiki
|
||||
if use_kiwix_server:
|
||||
import requests
|
||||
import bs4 as bs
|
||||
from urllib.parse import quote
|
||||
|
||||
# Kiwix helper functions (only loaded if use_kiwix_server is True)
|
||||
if wikipedia_enabled and use_kiwix_server:
|
||||
def tag_visible(element):
|
||||
"""Filter visible text from HTML elements for Kiwix"""
|
||||
if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
|
||||
return False
|
||||
if isinstance(element, bs.element.Comment):
|
||||
return False
|
||||
return True
|
||||
def tag_visible(element):
|
||||
"""Filter visible text from HTML elements for Kiwix"""
|
||||
if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
|
||||
return False
|
||||
if isinstance(element, bs.element.Comment):
|
||||
return False
|
||||
return True
|
||||
|
||||
def text_from_html(body):
|
||||
"""Extract visible text from HTML content"""
|
||||
soup = bs.BeautifulSoup(body, 'html.parser')
|
||||
texts = soup.find_all(string=True)
|
||||
visible_texts = filter(tag_visible, texts)
|
||||
return " ".join(t.strip() for t in visible_texts if t.strip())
|
||||
def text_from_html(body):
|
||||
"""Extract visible text from HTML content"""
|
||||
soup = bs.BeautifulSoup(body, 'html.parser')
|
||||
texts = soup.find_all(string=True)
|
||||
visible_texts = filter(tag_visible, texts)
|
||||
return " ".join(t.strip() for t in visible_texts if t.strip())
|
||||
|
||||
def get_kiwix_summary(search_term):
|
||||
"""Query local Kiwix server for Wikipedia article"""
|
||||
try:
|
||||
search_encoded = quote(search_term)
|
||||
# Try direct article access first
|
||||
wiki_article = search_encoded.capitalize().replace("%20", "_")
|
||||
exact_url = f"{kiwix_url}/raw/{kiwix_library_name}/content/A/{wiki_article}"
|
||||
def get_kiwix_summary(search_term):
|
||||
"""Query local Kiwix server for Wikipedia article"""
|
||||
try:
|
||||
search_encoded = quote(search_term)
|
||||
# Try direct article access first
|
||||
wiki_article = search_encoded.capitalize().replace("%20", "_")
|
||||
exact_url = f"{kiwix_url}/raw/{kiwix_library_name}/content/A/{wiki_article}"
|
||||
|
||||
response = requests.get(exact_url, timeout=urlTimeoutSeconds)
|
||||
if response.status_code == 200:
|
||||
# Extract and clean text
|
||||
text = text_from_html(response.text)
|
||||
# Remove common Wikipedia metadata prefixes
|
||||
text = text.split("Jump to navigation", 1)[-1]
|
||||
text = text.split("Jump to search", 1)[-1]
|
||||
# Truncate to reasonable length (first few sentences)
|
||||
sentences = text.split('. ')
|
||||
summary = '. '.join(sentences[:wiki_return_limit])
|
||||
if summary and not summary.endswith('.'):
|
||||
summary += '.'
|
||||
return summary.strip()[:500] # Hard limit at 500 chars
|
||||
|
||||
# If direct access fails, try search
|
||||
search_url = f"{kiwix_url}/search?content={kiwix_library_name}&pattern={search_encoded}"
|
||||
response = requests.get(search_url, timeout=urlTimeoutSeconds)
|
||||
|
||||
if response.status_code == 200 and "No results were found" not in response.text:
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
links = [a['href'] for a in soup.find_all('a', href=True) if "start=" not in a['href']]
|
||||
|
||||
response = requests.get(exact_url, timeout=urlTimeoutSeconds)
|
||||
if response.status_code == 200:
|
||||
# Extract and clean text
|
||||
text = text_from_html(response.text)
|
||||
# Remove common Wikipedia metadata prefixes
|
||||
text = text.split("Jump to navigation", 1)[-1]
|
||||
text = text.split("Jump to search", 1)[-1]
|
||||
# Truncate to reasonable length (first few sentences)
|
||||
sentences = text.split('. ')
|
||||
summary = '. '.join(sentences[:wiki_return_limit])
|
||||
if summary and not summary.endswith('.'):
|
||||
summary += '.'
|
||||
return summary.strip()[:500] # Hard limit at 500 chars
|
||||
|
||||
# If direct access fails, try search
|
||||
search_url = f"{kiwix_url}/search?content={kiwix_library_name}&pattern={search_encoded}"
|
||||
response = requests.get(search_url, timeout=urlTimeoutSeconds)
|
||||
|
||||
if response.status_code == 200 and "No results were found" not in response.text:
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
links = [a['href'] for a in soup.find_all('a', href=True) if "start=" not in a['href']]
|
||||
|
||||
for link in links[:3]: # Check first 3 results
|
||||
article_name = link.split("/")[-1]
|
||||
if not article_name or article_name[0].islower():
|
||||
continue
|
||||
|
||||
article_url = f"{kiwix_url}{link}"
|
||||
article_response = requests.get(article_url, timeout=urlTimeoutSeconds)
|
||||
if article_response.status_code == 200:
|
||||
text = text_from_html(article_response.text)
|
||||
text = text.split("Jump to navigation", 1)[-1]
|
||||
text = text.split("Jump to search", 1)[-1]
|
||||
sentences = text.split('. ')
|
||||
summary = '. '.join(sentences[:wiki_return_limit])
|
||||
if summary and not summary.endswith('.'):
|
||||
summary += '.'
|
||||
return summary.strip()[:500]
|
||||
|
||||
logger.warning(f"System: No Kiwix Results for:{search_term}")
|
||||
# try to fall back to online Wikipedia if available
|
||||
return get_wikipedia_summary(search_term, force=True)
|
||||
|
||||
|
||||
except requests.RequestException as e:
|
||||
logger.warning(f"System: Kiwix connection error: {e}")
|
||||
return "Unable to connect to local wiki server"
|
||||
except Exception as e:
|
||||
logger.warning(f"System: Error with Kiwix for:{search_term} {e}")
|
||||
return ERROR_FETCHING_DATA
|
||||
for link in links[:3]: # Check first 3 results
|
||||
article_name = link.split("/")[-1]
|
||||
if not article_name or article_name[0].islower():
|
||||
continue
|
||||
|
||||
article_url = f"{kiwix_url}{link}"
|
||||
article_response = requests.get(article_url, timeout=urlTimeoutSeconds)
|
||||
if article_response.status_code == 200:
|
||||
text = text_from_html(article_response.text)
|
||||
text = text.split("Jump to navigation", 1)[-1]
|
||||
text = text.split("Jump to search", 1)[-1]
|
||||
sentences = text.split('. ')
|
||||
summary = '. '.join(sentences[:wiki_return_limit])
|
||||
if summary and not summary.endswith('.'):
|
||||
summary += '.'
|
||||
return summary.strip()[:500]
|
||||
|
||||
logger.warning(f"System: No Kiwix Results for:{search_term}")
|
||||
# try to fall back to online Wikipedia if available
|
||||
return get_wikipedia_summary(search_term, force=True)
|
||||
|
||||
|
||||
except requests.RequestException as e:
|
||||
logger.warning(f"System: Kiwix connection error: {e}")
|
||||
return "Unable to connect to local wiki server"
|
||||
except Exception as e:
|
||||
logger.warning(f"System: Error with Kiwix for:{search_term} {e}")
|
||||
return ERROR_FETCHING_DATA
|
||||
|
||||
|
||||
def get_wikipedia_summary(search_term, location=None, force=False):
|
||||
lat, lon = location if location else (None, None)
|
||||
|
||||
Reference in New Issue
Block a user