RcGcDb/src/misc.py

251 lines
8.2 KiB
Python

from __future__ import annotations
import json
from functools import cache
from html.parser import HTMLParser
import base64, re
import logging
from typing import Callable
from urllib.parse import urlparse, urlunparse
from src.config import settings
logger = logging.getLogger("rcgcdw.misc")
def get_paths(wiki: str, request) -> tuple:
    """Build the URLs used to reach a wiki and to link to its pages.

    :param wiki: URL of the wiki's script path; "api.php" is appended to it as is
    :param request: mapping providing request["query"]["general"]["articlepath"]
    :returns: tuple of (API URL, script path, article path, scheme and host only)
    """
    scheme, netloc = urlparse(wiki)[0:2]
    api_path = wiki + "api.php"
    # Scheme + network location with path, params, query and fragment blanked out.
    just_domain = urlunparse((scheme, netloc, "", "", "", ""))
    article_path = just_domain + request["query"]["general"]["articlepath"]
    return api_path, wiki, article_path, just_domain
def get_domain(url: str) -> str:
    """Get domain of given URL.

    Returns the last two dot-separated labels of the URL's network location,
    something like gamepedia.com or fandom.com. Only the network location is
    inspected, so the scheme never leaks into the result, even for hosts that
    consist of just two labels (https://example.com -> example.com).

    Note: the cut is purely positional, so multi-part public suffixes are not
    recognised (a host under co.uk yields "co.uk").

    :param url: absolute URL
    :returns: up to two trailing labels of the host, or "" if the URL has no network location
    """
    labels = urlparse(url).netloc.split(".")
    return ".".join(labels[-2:])
def run_hooks(hooks, *arguments):
    """Call every hook with the given arguments, in order.

    A hook that raises an Exception is logged and skipped when the
    "error_tolerance" setting (default 1) is greater than 0; otherwise the
    exception is re-raised and the remaining hooks do not run.

    Only Exception subclasses are tolerated: KeyboardInterrupt, SystemExit and
    task cancellation always propagate so the process can still be stopped.

    :param hooks: iterable of callables
    :param arguments: positional arguments forwarded to each hook
    """
    for hook in hooks:
        try:
            hook(*arguments)
        except Exception:
            if settings.get("error_tolerance", 1) > 0:
                logger.exception("On running a pre hook, ignoring pre-hook")
            else:
                raise
class LinkParser(HTMLParser):
    """HTMLParser that rewrites fed HTML into text with Markdown-style links.

    The converted text accumulates in new_string. Whenever a start tag carries
    a usable link target, the next chunk of text data is emitted as
    [text](<target>); all other text is emitted escaped.
    """
    # Output accumulated so far.
    new_string = ""
    # Link target waiting for the next text chunk; "" when there is none.
    recent_href = ""

    def __init__(self, DOMAIN_URL: str):
        """:param DOMAIN_URL: prefix prepended to link targets that are not absolute"""
        self.WIKI_JUST_DOMAIN = DOMAIN_URL
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Remember the link target carried by the tag's attributes, if any."""
        for name, value in attrs:
            if value is None:
                # Valueless attribute (e.g. <a href>): HTMLParser reports it as None,
                # there is nothing to build a link from.
                continue
            if name == 'href':
                href = value
                if href.startswith("//"):
                    # Protocol-relative link
                    href = "https:" + href
                elif not href.startswith("http"):
                    href = self.WIKI_JUST_DOMAIN + href
                # A bare ")" would terminate the Markdown link early.
                self.recent_href = href.replace(")", "\\)")
            elif name == 'data-uncrawlable-url':
                # The attribute holds a base64-encoded target that is appended to the domain.
                decoded = base64.b64decode(value.encode('ascii')).decode('ascii')
                self.recent_href = self.WIKI_JUST_DOMAIN + decoded

    def handle_data(self, data):
        """Append text, turning it into a link when a target is pending."""
        if self.recent_href:
            self.new_string = self.new_string + "[{}](<{}>)".format(escape_formatting(data), self.recent_href)
            self.recent_href = ""
        else:
            self.new_string = self.new_string + escape_formatting(data)

    def handle_comment(self, data):
        """Append the content of an HTML comment as escaped text."""
        self.new_string = self.new_string + escape_formatting(data)

    def handle_endtag(self, tag):
        """End tags produce no output."""
        # logger.debug(self.new_string)
        pass
# LinkParse = LinkParser()
# def parse_link(domain: str, to_parse: str) -> str:
# """Because I have strange issues using the LinkParser class myself, this is a helper function
# to utilize the LinkParser properly"""
# LinkParse.WIKI_JUST_DOMAIN = domain
# LinkParse.new_string = ""
# LinkParse.feed(to_parse)
# LinkParse.recent_href = ""
# return LinkParse.new_string
def link_formatter(link: str) -> str:
    """Wrap a link in angle brackets so that it is not embedded.

    Closing parentheses are backslash-escaped and spaces become underscores.
    """
    escaped = link.replace(")", "\\)")
    return "<{}>".format(escaped.replace(" ", "_"))
def escape_formatting(data: str) -> str:
    """Escape Discord formatting.

    Prefixes each of the characters ` _ * ~ : < > { } @ / | \\ [ ] ( ) with a
    backslash.

    :param data: text to escape; None is accepted
    :returns: escaped text, or "" when data is None
    """
    if data is None:
        return ""
    # All occurrences are replaced by default; the count is not passed
    # positionally because that form is deprecated since Python 3.13.
    return re.sub(r"([`_*~:<>{}@/|\\\[\]\(\)])", "\\\\\\1", data)
def create_article_path(article: str, WIKI_ARTICLE_PATH: str) -> str:
    """Build a URL to an article by substituting its name for "$1" in the article path.

    Spaces become underscores; "%" and backslashes are percent-encoded. When the
    article path contains a query string, "&" is encoded as well, otherwise "?" is.
    """
    substitutions = {" ": "_", "%": "%25", "\\": "%5C"}
    if "?" in WIKI_ARTICLE_PATH:
        substitutions["&"] = "%26"
    else:
        substitutions["?"] = "%3F"
    # Single pass over the name: every character is mapped independently.
    encoded = article.translate(str.maketrans(substitutions))
    return WIKI_ARTICLE_PATH.replace("$1", encoded)
def profile_field_name(name, embed, _):
    """Return the translated label for a profile field key.

    :param name: field key, e.g. "profile-location"
    :param embed: selects the capitalised fallback ("Unknown") over the lowercase one ("unknown")
    :param _: translation callable applied to the label
    :returns: translated label, or the translated fallback for an unrecognised key
    """
    labels = {
        "profile-location": _("Location"),
        "profile-aboutme": _("About me"),
        "profile-link-google": _("Google link"),
        "profile-link-facebook": _("Facebook link"),
        "profile-link-twitter": _("Twitter link"),
        "profile-link-reddit": _("Reddit link"),
        "profile-link-twitch": _("Twitch link"),
        "profile-link-psn": _("PSN link"),
        "profile-link-vk": _("VK link"),
        "profile-link-xbl": _("XBL link"),
        "profile-link-steam": _("Steam link"),
        "profile-link-discord": _("Discord handle"),
        "profile-link-battlenet": _("Battle.net handle"),
    }
    if name in labels:
        return labels[name]
    return _("Unknown") if embed else _("unknown")
def class_searcher(attribs: list) -> str:
    """Function to return classes of given element in HTMLParser on handle_starttag

    :param attribs: list of (name, value) attribute pairs as passed to handle_starttag
    :returns: the value of the first "class" attribute, or an empty string when the
        element has no class attribute or the attribute has no value
    """
    for attr in attribs:
        if attr[0] == "class":
            # HTMLParser reports a valueless attribute (<td class>) as None;
            # normalise it so callers can always treat the result as a string.
            return attr[1] or ""
    return ""
class ContentParser(HTMLParser):
    """ContentParser is an implementation of HTMLParser that parses output of
    action=compare&prop=diff API request for two MediaWiki revisions.

    It extracts the following:
    small_prev_ins - preview of added text; nothing more is collected once ins_length passes 1000
    small_prev_del - preview of removed text; nothing more is collected once del_length passes 1000
    ins_length - running length counter for inserted text (starts at the length of the "more" suffix)
    del_length - running length counter for deleted text (starts at the length of the "more" suffix)
    """
    # Which element text currently belongs to: "ins", "del", "tda" (added-line
    # cell), "tdd" (deleted-line cell) or "" for anything else.
    current_tag = ""
    # Text of the row being built; None while no added/deleted cell is open in the row.
    last_ins = None
    last_del = None
    # Set when the current row contains a "diff-empty" cell.
    empty = False
    small_prev_ins = ""
    small_prev_del = ""

    def __init__(self, lang: Callable):
        """:param lang: translation callable applied to the "and more" suffix"""
        super().__init__()
        self.more = lang("\n__And more__")
        # The suffix is counted up front in both counters.
        self.ins_length = self.del_length = len(self.more)

    def handle_starttag(self, tagname, attribs):
        """Track which kind of diff element is being entered."""
        if tagname in ("ins", "del"):
            self.current_tag = tagname
            return
        if tagname != "td":
            return
        classes = class_searcher(attribs).split(' ')
        if "diff-addedline" in classes and self.ins_length <= 1000:
            self.current_tag = "tda"
            self.last_ins = ""
        if "diff-deletedline" in classes and self.del_length <= 1000:
            self.current_tag = "tdd"
            self.last_del = ""
        if "diff-empty" in classes:
            self.empty = True

    def handle_data(self, data):
        """Add escaped text to the row being built while its counter stays within 1000."""
        data = escape_formatting(data)
        where = self.current_tag
        if where == "ins":
            if self.ins_length <= 1000:
                # Text inside <ins> is wrapped in bold markers, which count too.
                self.ins_length += len(data) + 4
                if self.ins_length <= 1000:
                    self.last_ins += "**" + data + "**"
        elif where == "del":
            if self.del_length <= 1000:
                # Text inside <del> is wrapped in strikethrough markers.
                self.del_length += len(data) + 4
                if self.del_length <= 1000:
                    self.last_del += "~~" + data + "~~"
        elif where == "tda":
            if self.ins_length <= 1000:
                self.ins_length += len(data)
                if self.ins_length <= 1000:
                    self.last_ins += data
        elif where == "tdd":
            if self.del_length <= 1000:
                self.del_length += len(data)
                if self.del_length <= 1000:
                    self.last_del += data

    def handle_endtag(self, tagname):
        """Leave the current element; at the end of a row, move the row text into the previews."""
        after_close = {"ins": "tda", "del": "tdd", "td": ""}
        if tagname in after_close:
            self.current_tag = after_close[tagname]
            return
        if tagname != "tr":
            return
        if self.last_ins is not None:
            self.ins_length += 1  # the newline that joins rows
            if self.empty and not self.last_ins.isspace():
                # The whole row is wrapped in bold, so inner bold markers are
                # swapped for underline markers.
                self.ins_length += 4
                self.last_ins = "**" + self.last_ins.replace("**", "__") + "**"
            self.small_prev_ins += "\n" + self.last_ins
            if self.ins_length > 1000:
                self.small_prev_ins += self.more
            self.last_ins = None
        if self.last_del is not None:
            self.del_length += 1  # the newline that joins rows
            if self.empty and not self.last_del.isspace():
                # Same as above, with strikethrough as the outer wrapper.
                self.del_length += 4
                self.last_del = "~~" + self.last_del.replace("~~", "__") + "~~"
            self.small_prev_del += "\n" + self.last_del
            if self.del_length > 1000:
                self.small_prev_del += self.more
            self.last_del = None
        self.empty = False
@cache
def prepare_settings(display_mode: int) -> dict:
    """Prepares dict of RcGcDw compatible settings based on a template and display mode of given call.

    The template is read from "src/api/template_settings.json", resolved against
    the current working directory, and decoded as UTF-8 regardless of the
    platform's default encoding.

    Results are cached per display_mode, so repeated calls return the very same
    dict object; callers should treat it as read-only.

    :param display_mode: values above 1 enable embed images, values above 2 also
        enable showing edit changes
    :returns: settings dict with both appearance.embed flags filled in
    """
    with open("src/api/template_settings.json", "r", encoding="utf-8") as template_json:
        template = json.load(template_json)
    embed_settings = template["appearance"]["embed"]
    embed_settings["embed_images"] = display_mode > 1
    embed_settings["show_edit_changes"] = display_mode > 2
    return template
class LimitedList(list):
    """List whose append() drops the oldest item once the list is full.

    Only append() enforces the limit; the constructor and other list methods
    (extend, insert, ...) are not restricted.
    """

    def __init__(self, *args, limit=settings.get("queue_limit", 30)):
        """
        :param args: forwarded to list() to provide the initial content
        :param limit: maximum number of items kept by append(); the default is the
            "queue_limit" setting (30 when unset), read once when the class is defined
        """
        list.__init__(self, *args)
        self.queue_limit = limit

    def append(self, obj) -> None:
        """Append obj, first discarding the oldest item if the list already holds queue_limit items."""
        # ">=" rather than ">": the list is full at queue_limit items, not one past it.
        # The emptiness check keeps pop() safe for a limit of 0 or less.
        if self and len(self) >= self.queue_limit:
            self.pop(0)
        super().append(obj)

    def __repr__(self):
        """One str() of each item per line."""
        return "\n".join([str(x) for x in self])