RcGcDw/src/misc.py

392 lines
14 KiB
Python
Raw Normal View History

2019-05-20 15:28:55 +00:00
# -*- coding: utf-8 -*-
# This file is part of Recent changes Goat compatible Discord webhook (RcGcDw).
2019-05-20 13:32:23 +00:00
# RcGcDw is free software: you can redistribute it and/or modify
2019-05-20 13:32:23 +00:00
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# RcGcDw is distributed in the hope that it will be useful,
2019-05-20 13:32:23 +00:00
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with RcGcDw. If not, see <http://www.gnu.org/licenses/>.
import base64
2021-05-15 10:11:56 +00:00
import json, logging, sys, re, platform
2019-05-20 13:11:30 +00:00
from html.parser import HTMLParser
from typing import Callable, Tuple, List, Optional, Union
2021-04-25 11:20:58 +00:00
from urllib.parse import urlparse, urlunparse
import requests
2021-04-25 11:20:58 +00:00
from src.api.context import Context
2022-10-12 11:41:22 +00:00
from src.argparser import command_args
2020-07-07 11:21:49 +00:00
from src.configloader import settings
2021-05-23 19:06:59 +00:00
import src.api.util
from src.discord.message import DiscordMessage, DiscordMessageMetadata
from src.discord.queue import messagequeue, send_to_discord
from src.exceptions import MediaWikiError
2020-08-03 13:27:24 +00:00
from src.i18n import misc
2019-05-20 19:01:45 +00:00
2020-08-03 13:27:24 +00:00
# Shorthand for the gettext translation function of this module's catalog
_ = misc.gettext
# Create a custom logger
misc_logger = logging.getLogger("rcgcdw.misc")
# Template written to a freshly generated data file; tracks the last processed
# event ids and the accumulated daily-overview statistics
data_template = {"rcid": None, "discussion_id": 0, "abuse_log_id": None,
                 "daily_overview": {"edits": None, "new_files": None, "admin_actions": None, "bytes_changed": None,
                                    "new_articles": None, "unique_editors": None, "day_score": None, "days_tracked": 0}}
# Resolved wiki URL fragments — populated at import time by prepare_paths() below
WIKI_API_PATH: str = ""
WIKI_ARTICLE_PATH: str = ""
WIKI_SCRIPT_PATH: str = ""
WIKI_JUST_DOMAIN: str = ""
2022-11-10 14:39:29 +00:00
def send_simple(msgtype, message, name, avatar):
    """Send a plain compact-style message to the configured webhook.

    Any queued (previously failed) messages are flushed first so ordering
    on the Discord side stays consistent."""
    simple_message = DiscordMessage("compact", msgtype, settings["webhookURL"], content=message)
    simple_message.set_avatar(avatar)
    simple_message.set_name(name)
    messagequeue.resend_msgs()
    send_to_discord(simple_message, meta=DiscordMessageMetadata("POST"))
class DataFile:
    """Data class which instance of is shared by multiple modules to remain consistent and do not cause too many IO operations.

    Persists small pieces of state (last processed rcid/discussion/abuse-log ids
    and the daily overview statistics) to a JSON file between runs."""
    def __init__(self):
        # Path of the backing JSON file, configurable via the "datafile_path" setting
        self.data_filename: str = settings.get("datafile_path", "data.json")
        self.data: dict = self.load_datafile()
        misc_logger.debug("Current contents of {} {}".format(self.data_filename, self.data))
        # Dirty flag — save_datafile() is a no-op while this stays False
        self.changed: bool = False

    def generate_datafile(self):
        """Generate a data.json file from a template."""
        try:
            with open(self.data_filename, 'w', encoding="utf-8") as data:
                data.write(json.dumps(data_template, indent=4))
        except PermissionError:
            misc_logger.critical("Could not create a data file (no permissions). No way to store last edit.")
            sys.exit(1)

    def load_datafile(self) -> dict:
        """Read a data.json file and return a dictionary with contents
        :rtype: dict
        """
        try:
            with open(self.data_filename, encoding="utf-8") as data:
                return json.loads(data.read())
        except FileNotFoundError:
            # First run (or file removed): create the file and greet the channel
            self.generate_datafile()
            misc_logger.info("The data file could not be found. Generating a new one...")
            if not command_args.nowelcome:
                send_simple("welcome", _("RcGcDw is now running and checking {wiki}.").format(wiki=settings["wikiname"]),
                            _("Welcome"), settings["avatars"].get("welcome", None))
            # NOTE(review): returns the shared module-level template dict itself,
            # not a copy — later mutations also touch data_template
            return data_template

    def save_datafile(self):
        """Overwrites the data.json file with given dictionary"""
        if self.changed is False:  # don't cause unnecessary write operations
            return
        try:
            with open(self.data_filename, "w", encoding="utf-8") as data_file:
                data_file.write(json.dumps(self.data, indent=4))
            self.changed = False
            misc_logger.debug("Saving the database succeeded.")
        except PermissionError:
            misc_logger.critical("Could not modify a data file (no permissions). No way to store last edit.")
            sys.exit(1)
        except OSError as e:
            if settings.get("error_tolerance", 1) > 1:
                if platform.system() == "Windows":
                    if "Invalid argument: '" + self.data_filename + "'" in str(e):
                        misc_logger.error("Saving the data file failed due to Invalid argument exception, we've seen it "
                                          "before in issue #209, if you know the reason for it happening please reopen the "
                                          "issue with explanation, for now we are going to just ignore it.")  # Reference #209
                        return
            raise

    def __setitem__(self, instance, value):
        # Bug fix: the previous comparison `self.data[instance] != value` raised
        # KeyError for keys not yet present, which broke __getitem__'s fallback
        # path that stores None for unknown keys. A missing key counts as changed.
        if instance not in self.data or self.data[instance] != value:
            self.data[instance] = value
            self.changed = True

    def __getitem__(self, item):
        try:
            return self.data[item]
        except KeyError:  # if such value doesn't exist, set to and return none
            self.__setitem__(item, None)
            self.save_datafile()
            return None
datafile = DataFile()
def weighted_average(value, weight, new_value):
    """Fold new_value (weight 1) into value (carrying the given weight).

    Returns the combined average rounded to two decimal places."""
    weighted_sum = value * weight + new_value
    return round(weighted_sum / (weight + 1), 2)
2021-05-02 22:02:03 +00:00
def class_searcher(attribs: list) -> str:
    """Return the value of the first ``class`` attribute in an HTMLParser
    attribute list (as given to handle_starttag), or "" when absent.

    :returns a string with all of the classes of element
    """
    return next((value for name, value in attribs if name == "class"), "")
2019-05-20 13:11:30 +00:00
2019-05-20 13:11:30 +00:00
class ContentParser(HTMLParser):
    """ContentParser is an implementation of HTMLParser that parses output of action=compare&prop=diff API request
    for two MediaWiki revisions. It extracts the following:
    small_prev_ins - storing up to 1000 characters of added text
    small_prev_del - storing up to 1000 characters of removed text
    ins_length - storing length of inserted text
    del_length - storing length of deleted text
    """
    # Localized suffix appended once a summary exceeds its 1000-character budget
    more = _("\n__And more__")
    # Context the parser is currently inside: "", "ins", "del", "tda" or "tdd"
    current_tag = ""
    # Buffers for the diff line currently being collected; None means "no open line"
    last_ins = None
    last_del = None
    # Set when the current row contains a <td class="diff-empty"> cell,
    # i.e. the whole line was added or removed rather than edited
    empty = False
    small_prev_ins = ""
    small_prev_del = ""
    # Budgets start at len(more) so the "And more" suffix always fits under the limit
    ins_length = len(more)
    del_length = len(more)

    def handle_starttag(self, tagname, attribs):
        # <ins>/<del> mark inline (word-level) changes inside a changed line
        if tagname == "ins" or tagname == "del":
            self.current_tag = tagname
        if tagname == "td":
            classes = class_searcher(attribs).split(' ')
            # Open a collection buffer only while still under the character budget
            if "diff-addedline" in classes and self.ins_length <= 1000:
                self.current_tag = "tda"
                self.last_ins = ""
            if "diff-deletedline" in classes and self.del_length <= 1000:
                self.current_tag = "tdd"
                self.last_del = ""
            if "diff-empty" in classes:
                self.empty = True

    def handle_data(self, data):
        def escape_formatting(data: str) -> str:
            """Escape Discord formatting"""
            return re.sub(r"([`_*~:<>{}@/|#\-\.\\\[\]\(\)])", "\\\\\\1", data)
        data = escape_formatting(data)
        # Inline insertions are bolded; the budget counts the ** markers too.
        # Length is bumped first so overflowing text still flips the counter
        # past 1000 and triggers the "And more" suffix at row end.
        if self.current_tag == "ins" and self.ins_length <= 1000:
            self.ins_length += len("**" + data + "**")
            if self.ins_length <= 1000:
                self.last_ins = self.last_ins + "**" + data + "**"
        # Inline deletions are struck through
        if self.current_tag == "del" and self.del_length <= 1000:
            self.del_length += len("~~" + data + "~~")
            if self.del_length <= 1000:
                self.last_del = self.last_del + "~~" + data + "~~"
        # Unchanged text inside an added/deleted line cell is appended verbatim
        if self.current_tag == "tda" and self.ins_length <= 1000:
            self.ins_length += len(data)
            if self.ins_length <= 1000:
                self.last_ins = self.last_ins + data
        if self.current_tag == "tdd" and self.del_length <= 1000:
            self.del_length += len(data)
            if self.del_length <= 1000:
                self.last_del = self.last_del + data

    def handle_endtag(self, tagname):
        if tagname == "ins":
            # Back inside the added-line cell after an inline insertion
            self.current_tag = "tda"
        elif tagname == "del":
            self.current_tag = "tdd"
        elif tagname == "td":
            self.current_tag = ""
        elif tagname == "tr":
            # End of a diff row: flush the collected line buffers into the summaries
            if self.last_ins is not None:
                self.ins_length += 1  # accounts for the "\n" separator added below
                if self.empty and not self.last_ins.isspace():
                    # Entire line was added: bold the whole line, demoting any
                    # inline ** markers to __ so the outer bold is not broken
                    if "**" in self.last_ins:
                        self.last_ins = self.last_ins.replace("**", "__")
                    self.ins_length += 4
                    self.last_ins = "**" + self.last_ins + "**"
                self.small_prev_ins = self.small_prev_ins + "\n" + self.last_ins
                if self.ins_length > 1000:
                    self.small_prev_ins = self.small_prev_ins + self.more
                self.last_ins = None
            if self.last_del is not None:
                self.del_length += 1  # accounts for the "\n" separator added below
                if self.empty and not self.last_del.isspace():
                    # Entire line was removed: strike through the whole line,
                    # demoting inline ~~ markers to __
                    if "~~" in self.last_del:
                        self.last_del = self.last_del.replace("~~", "__")
                    self.del_length += 4
                    self.last_del = "~~" + self.last_del + "~~"
                self.small_prev_del = self.small_prev_del + "\n" + self.last_del
                if self.del_length > 1000:
                    self.small_prev_del = self.small_prev_del + self.more
                self.last_del = None
            self.empty = False
def safe_read(request, *keys):
    """Safely drill into the JSON payload of a response, following *keys*.

    Returns the nested value, or None when the response is missing, its body
    is not valid JSON, or any key along the path is absent."""
    if request is None:
        return None
    try:
        current = request.json()
    except ValueError:
        misc_logger.warning("Failure while extracting data from request in {change}".format(change=request))
        return None
    for key in keys:
        try:
            current = current[key]
        except KeyError:
            misc_logger.warning(
                "Failure while extracting data from request on key {key} in {change}".format(key=key, change=current))
            return None
    return current
def parse_mw_request_info(request_data: dict, url: str):
    """A function parsing request JSON message from MediaWiki logging all warnings and raising on MediaWiki errors"""
    reported_errors: dict = request_data.get("errors", {})  # Is it ugly? I don't know tbh
    if reported_errors:
        raise MediaWikiError(str(reported_errors))
    for module_name, module_warning in request_data.get("warnings", {}).items():
        misc_logger.warning("MediaWiki returned the following warning on module {module}: {text} on {url}.".format(
            module=module_name, text=module_warning.get("warnings", ""), url=url
        ))
    return request_data
2021-05-18 11:46:00 +00:00
def add_to_dict(dictionary, key):
    """Increment the counter stored under key, creating it at 1 when absent.

    Mutates and returns the same dictionary."""
    dictionary[key] = dictionary.get(key, 0) + 1
    return dictionary
2021-05-18 11:46:00 +00:00
2022-01-02 13:39:20 +00:00
def prepare_paths(path: str, dry=False):
    """Set the URL paths for article namespace and script namespace
    WIKI_API_PATH will be: WIKI_DOMAIN/api.php
    WIKI_ARTICLE_PATH will be: WIKI_DOMAIN/articlepath/$1 where $1 is the replaced string
    WIKI_SCRIPT_PATH will be: WIKI_DOMAIN/
    WIKI_JUST_DOMAIN will be: WIKI_DOMAIN

    When dry is True the module globals are left untouched and only the
    detected bare domain is returned; on total failure the process exits."""
    global WIKI_API_PATH
    global WIKI_ARTICLE_PATH
    global WIKI_SCRIPT_PATH
    global WIKI_JUST_DOMAIN

    def quick_try_url(url):
        """Quickly test if URL is the proper script path,
        False if it appears invalid
        dictionary when it appears valid"""
        try:
            request = requests.get(url, timeout=5, headers=settings["header"])
            if request.status_code == requests.codes.ok:
                # A parseable siteinfo response proves this is the script path
                if request.json()["query"]["general"] is not None:
                    return request
            else:
                misc_logger.debug(f"Request to the wiki failed with code {request.status_code} and text: {request.text}")
            return False
        except (KeyError, requests.exceptions.ConnectionError):
            return False
    try:
        parsed_url = urlparse(path)
    except KeyError:
        # NOTE(review): urlparse on a str does not raise KeyError — this guard
        # looks like a leftover from when the settings lookup happened here;
        # kept as-is to preserve behavior. TODO confirm and simplify.
        misc_logger.critical("wiki_url is not specified in the settings. Please provide the wiki url in the settings and start the script again.")
        sys.exit(1)
    # Probe candidates: the URL as given, the part before any "wiki" segment,
    # and the bare scheme+domain
    for url_scheme in (path, path.split("wiki")[0], urlunparse((*parsed_url[0:2], "", "", "", ""))):  # check different combinations, it's supposed to be idiot-proof
        tested = quick_try_url(url_scheme.rstrip("/") + "/api.php?action=query&format=json&meta=siteinfo")
        if tested:
            if not dry:
                # Build the final paths from the domain plus the script/article
                # paths the wiki itself reports in siteinfo
                WIKI_API_PATH = urlunparse((*parsed_url[0:2], "", "", "", "")) + tested.json()["query"]["general"]["scriptpath"] + "/api.php"
                WIKI_SCRIPT_PATH = urlunparse((*parsed_url[0:2], "", "", "", "")) + tested.json()["query"]["general"]["scriptpath"] + "/"
                WIKI_ARTICLE_PATH = urlunparse((*parsed_url[0:2], "", "", "", "")) + tested.json()["query"]["general"]["articlepath"]
                WIKI_JUST_DOMAIN = urlunparse((*parsed_url[0:2], "", "", "", ""))
                break
            # Dry run: report the detected domain without touching the globals
            return urlunparse((*parsed_url[0:2], "", "", "", ""))
    else:  # for-else: no candidate URL validated
        misc_logger.critical("Could not verify wikis paths. Please make sure you have given the proper wiki URLs in settings.json ({path} should be script path to your wiki) and your Internet connection is working.".format(path=path))
        sys.exit(1)
2020-04-05 00:07:56 +00:00
2020-08-23 13:32:12 +00:00
prepare_paths(settings["wiki_url"])
2020-04-05 00:07:56 +00:00
def run_hooks(hooks: Union[List[Tuple[Callable[[Context, dict], None], int, Optional[Callable]]], List[Tuple[Callable[[DiscordMessage, DiscordMessageMetadata, Context, dict], None], int, Optional[Callable]]]], *arguments):
    """Run every registered hook with *arguments*.

    Each hook is a (function, priority, optional check) tuple. A check
    returning False skips its hook. With "error_tolerance" > 0 a failing
    check or hook is logged and skipped instead of aborting the run."""
    for hook in hooks:
        if hook[2] is not None:
            try:
                if hook[2](*arguments) is False:
                    misc_logger.debug(f"Ignoring hook {hook[0].__name__} due to conditions not being met for execution")
                    continue
            # Fix: was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt raised inside a hook's check function
            except Exception:
                if settings.get("error_tolerance", 1) > 0:
                    misc_logger.exception("On running a hook check function, ignoring hook")
                else:
                    raise
        try:
            misc_logger.debug(f"Running {hook[0].__name__} hook")
            hook[0](*arguments)
        # Fix: narrowed from a bare `except:` for the same reason as above
        except Exception:
            if settings.get("error_tolerance", 1) > 0:
                misc_logger.exception("On running a hook, ignoring hook")
            else:
                raise
def profile_field_name(name, embed, _):
    """Translate a curse-profile field identifier into a human-readable label.

    Unrecognized identifiers fall back to "Unknown" (capitalized for embeds)
    via the supplied gettext callable."""
    profile_fields = {"profile-location": _("Location"), "profile-aboutme": _("About me"),
                      "profile-link-google": _("Google link"), "profile-link-facebook": _("Facebook link"),
                      "profile-link-twitter": _("Twitter link"), "profile-link-reddit": _("Reddit link"),
                      "profile-link-twitch": _("Twitch link"), "profile-link-psn": _("PSN link"),
                      "profile-link-vk": _("VK link"), "profile-link-xbl": _("XBL link"),
                      "profile-link-steam": _("Steam link"), "profile-link-discord": _("Discord handle"),
                      "profile-link-battlenet": _("Battle.net handle")}
    label = profile_fields.get(name)
    if label is not None:
        return label
    return _("Unknown") if embed else _("unknown")
class LinkParser(HTMLParser):
    """HTML parser turning anchor tags from MediaWiki-rendered comments into
    Discord markdown links, accumulating the result in new_string."""
    new_string = ""
    recent_href = ""

    def __init__(self, DOMAIN_URL: str):
        # Domain prepended to relative hrefs
        self.WIKI_JUST_DOMAIN = DOMAIN_URL
        super().__init__()

    def handle_starttag(self, tag, attrs):
        for attr_name, attr_value in attrs:
            if attr_name == 'href':
                if attr_value.startswith("//"):
                    # Protocol-relative URL — force https
                    self.recent_href = "https:{rest}".format(rest=attr_value)
                elif attr_value.startswith("http"):
                    self.recent_href = attr_value
                else:
                    # Relative URL — anchor it to the wiki domain
                    self.recent_href = self.WIKI_JUST_DOMAIN + attr_value
                # Escape closing parens so the markdown link does not break
                self.recent_href = self.recent_href.replace(")", "\\)")
            elif attr_name == 'data-uncrawlable-url':
                # Fandom obfuscates some URLs as base64; decode and re-anchor.
                # NOTE(review): this branch skips the ")" escaping done above.
                decoded = base64.b64decode(attr_value.encode('ascii'))
                self.recent_href = self.WIKI_JUST_DOMAIN + decoded.decode('ascii')

    def handle_data(self, data):
        if self.recent_href:
            self.new_string += "[{}](<{}>)".format(src.api.util.sanitize_to_markdown(data), self.recent_href)
            self.recent_href = ""
        else:
            self.new_string += src.api.util.sanitize_to_markdown(data)

    def handle_comment(self, data):
        self.new_string += src.api.util.sanitize_to_markdown(data)

    def handle_endtag(self, tag):
        misc_logger.debug(self.new_string)