# RcGcDb/src/wiki.py

from dataclasses import dataclass
import re
import logging
import asyncio
import sqlite3

import aiohttp
# noinspection PyPackageRequirements
from bs4 import BeautifulSoup

from src.exceptions import *
from src.database import db_cursor, db_connection
from src.formatters.rc import embed_formatter, compact_formatter
from src.formatters.discussions import feeds_embed_formatter, feeds_compact_formatter
from src.misc import parse_link
from src.i18n import langs
from src.wiki_ratelimiter import RateLimiter
import src.discord
from src.config import settings

logger = logging.getLogger("rcgcdb.wiki")

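# logtype/logaction pairs the bot has dedicated message formatting for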
supported_logs = {"protect/protect", "protect/modify", "protect/unprotect", "upload/overwrite", "upload/upload",
"delete/delete", "delete/delete_redir", "delete/restore", "delete/revision", "delete/event",
"import/upload", "import/interwiki", "merge/merge", "move/move", "move/move_redir",
"protect/move_prot", "block/block", "block/unblock", "block/reblock", "rights/rights",
"rights/autopromote", "abusefilter/modify", "abusefilter/create", "interwiki/iw_add",
"interwiki/iw_edit", "interwiki/iw_delete", "curseprofile/comment-created",
"curseprofile/comment-edited", "curseprofile/comment-deleted", "curseprofile/comment-purged",
"curseprofile/profile-edited", "curseprofile/comment-replied", "contentmodel/change", "sprite/sprite",
"sprite/sheet", "sprite/slice", "managetags/create", "managetags/delete", "managetags/activate",
"managetags/deactivate", "tag/update", "cargo/createtable", "cargo/deletetable",
"cargo/recreatetable", "cargo/replacetable", "upload/revert", "newusers/create",
"newusers/autocreate", "newusers/create2", "newusers/byemail", "newusers/newusers",
"managewiki/settings", "managewiki/delete", "managewiki/lock", "managewiki/unlock",
"managewiki/namespaces", "managewiki/namespaces-delete", "managewiki/rights", "managewiki/undelete"}


@dataclass
class Wiki:
	mw_messages: int = None
	fail_times: int = 0  # number of times connecting to the wiki failed for client-side reasons (HTTP 400-499)
	session: aiohttp.ClientSession = None
	rc_active: int = 0

	@staticmethod
	async def fetch_wiki(extended, script_path, session: aiohttp.ClientSession, ratelimiter: RateLimiter, amount=20) -> aiohttp.ClientResponse:
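		"""Request recent changes from the wiki's api.php; an extended fetch additionally pulls the MediaWiki messages and tag data used for category tracking."""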
		await ratelimiter.timeout_wait()
		url_path = script_path + "api.php"
		if extended:
			params = {"action": "query", "format": "json", "uselang": "content", "list": "tags|recentchanges",
			          "meta": "allmessages|siteinfo",
			          "utf8": 1, "tglimit": "max", "tgprop": "displayname",
			          "rcprop": "title|redirect|timestamp|ids|loginfo|parsedcomment|sizes|flags|tags|user",
			          "rclimit": amount, "rcshow": "!bot", "rctype": "edit|new|log|categorize",
			          "ammessages": "recentchanges-page-added-to-category|recentchanges-page-removed-from-category|recentchanges-page-added-to-category-bundled|recentchanges-page-removed-from-category-bundled",
			          "amenableparser": 1, "amincludelocal": 1, "siprop": "namespaces|general"}
		else:
			params = {"action": "query", "format": "json", "uselang": "content", "list": "tags|recentchanges",
			          "meta": "siteinfo", "utf8": 1,
			          "tglimit": "max", "rcshow": "!bot", "tgprop": "displayname",
			          "rcprop": "title|redirect|timestamp|ids|loginfo|parsedcomment|sizes|flags|tags|user",
			          "rclimit": amount, "rctype": "edit|new|log|categorize", "siprop": "namespaces|general"}
		try:
			response = await session.get(url_path, params=params)
			ratelimiter.timeout_add(1.0)
		except (aiohttp.ClientConnectionError, aiohttp.ServerTimeoutError, asyncio.TimeoutError):
			logger.error("A connection error occurred while requesting {}".format(url_path))
			raise WikiServerError
		return response

	@staticmethod
	async def fetch_feeds(wiki, session: aiohttp.ClientSession) -> aiohttp.ClientResponse:
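		"""Request the 20 latest Discussions posts from Fandom's wikia.php DiscussionPost endpoint."""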
		url_path = "{wiki}wikia.php".format(wiki=wiki)
		params = {"controller": "DiscussionPost", "method": "getPosts", "sortDirection": "descending", "sortKey": "creation_date", "limit": 20}
		try:
			response = await session.get(url_path, params=params)
			response.raise_for_status()
		except (aiohttp.ClientConnectionError, aiohttp.ServerTimeoutError, asyncio.TimeoutError, aiohttp.ClientResponseError):
			logger.error("A connection error occurred while requesting {}".format(url_path))
			raise WikiServerError
		return response

	@staticmethod
	async def safe_request(url, ratelimiter, *keys):
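		"""Fetch a URL in a short-lived session and walk the JSON response down the given keys, returning None on failure.

		Illustration (hypothetical call): safe_request(url, ratelimiter, "query", "pages") returns response_json["query"]["pages"].
		"""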
		await ratelimiter.timeout_wait()
		try:
			async with aiohttp.ClientSession(headers=settings["header"], timeout=aiohttp.ClientTimeout(6.0)) as session:
				request = await session.get(url, allow_redirects=False)
				ratelimiter.timeout_add(1.0)
				request.raise_for_status()
				json_request = await request.json(encoding="UTF-8")
		except (aiohttp.ClientConnectionError, aiohttp.ServerTimeoutError, asyncio.TimeoutError, aiohttp.ClientResponseError):
			logger.error("Reached connection error for request on link {url}".format(url=url))
		else:
			try:
				for item in keys:
					json_request = json_request[item]
			except KeyError:
				logger.warning("Failure while extracting data from request on key {key} in {change}".format(key=item, change=json_request))
				return None
			return json_request

	async def fail_add(self, wiki_url, status):
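		"""Bump the failure counter by 3 and remove the wiki once the counter exceeds 9."""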
		logger.debug("Increasing fail_times to {}".format(self.fail_times + 3))
		self.fail_times += 3
		if self.fail_times > 9:
			await self.remove(wiki_url, status)

	async def check_status(self, wiki_url, status):
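		"""React to the HTTP status of a wiki response: 2xx decays fail_times, 401-499 bumps it and raises WikiError, 5xx raises WikiServerError."""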
		if 199 < status < 300:
			self.fail_times -= 1
		elif 400 < status < 500:  # ignore the exact 400 code since it might be our fault
			await self.fail_add(wiki_url, status)
			logger.warning("Wiki {} responded with HTTP code {}, increased fail_times to {}, skipping...".format(wiki_url, status, self.fail_times))
			raise WikiError
		elif 499 < status < 600:
			logger.warning("Wiki {} responded with HTTP code {}, skipping...".format(wiki_url, status))
			raise WikiServerError

	@staticmethod
	async def remove(wiki_url, reason):
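		"""Notify Discord about the removal and delete all of the wiki's subscriptions from the database."""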
		logger.info("Removing a wiki {}".format(wiki_url))
		await src.discord.wiki_removal(wiki_url, reason)
		await src.discord.wiki_removal_monitor(wiki_url, reason)
		db_cursor.execute('DELETE FROM rcgcdw WHERE wiki = ?', (wiki_url,))
		logger.warning('{} rows affected by DELETE FROM rcgcdw WHERE wiki = "{}"'.format(db_cursor.rowcount, wiki_url))
		db_connection.commit()

	async def pull_comment(self, comment_id, WIKI_API_PATH, rate_limiter):
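		"""Fetch the raw text of a CurseProfile comment, truncated to 1000 characters; returns an empty string when it cannot be resolved."""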
		try:
			comment = await self.safe_request(
				"{wiki}?action=comment&do=getRaw&comment_id={comment}&format=json".format(wiki=WIKI_API_PATH,
				                                                                          comment=comment_id), rate_limiter, "text")
			logger.debug("Got the following comment from the API: {}".format(comment))
			if comment is None:
				raise TypeError
		except (TypeError, AttributeError):
			logger.exception("Could not resolve the comment text.")
		except KeyError:
			logger.exception("CurseProfile extension API did not respond with a valid comment content.")
		else:
			if len(comment) > 1000:
				comment = comment[0:1000] + "…"
			return comment
		return ""


async def process_cats(event: dict, local_wiki: Wiki, category_msgs: dict, categorize_events: dict):
	"""Record category additions and removals by matching the event's parsed comment against the wiki's localized MediaWiki messages."""
	if event["type"] == "categorize":
		if "commenthidden" not in event:
			if local_wiki.mw_messages is not None:
				cat_title = event["title"].split(':', 1)[1]
				# I so much hate this, blame Markus for making me do this
				if event["revid"] not in categorize_events:
					categorize_events[event["revid"]] = {"new": set(), "removed": set()}
				comment_to_match = re.sub(r'<.*?a>', '', event["parsedcomment"])
				wiki_cat_mw_messages = category_msgs[local_wiki.mw_messages]
				if wiki_cat_mw_messages[0][1] in comment_to_match or wiki_cat_mw_messages[2][1] in comment_to_match:  # Added to category
					categorize_events[event["revid"]]["new"].add(cat_title)
					logger.debug("Matched {} to added category for {}".format(cat_title, event["revid"]))
				elif wiki_cat_mw_messages[1][1] in comment_to_match or wiki_cat_mw_messages[3][1] in comment_to_match:  # Removed from category
					categorize_events[event["revid"]]["removed"].add(cat_title)
					logger.debug("Matched {} to removed category for {}".format(cat_title, event["revid"]))
				else:
					logger.debug(
						"Unknown match for category change with messages {}, {}, {}, {} and comment_to_match {}".format(
							wiki_cat_mw_messages[0], wiki_cat_mw_messages[1], wiki_cat_mw_messages[2], wiki_cat_mw_messages[3],
							comment_to_match))
			else:
				logger.warning("Init information not available, could not read category information. Please restart the bot.")
		else:
			logger.debug("Log entry got suppressed, ignoring entry.")


async def process_mwmsgs(wiki_response: dict, local_wiki: Wiki, mw_msgs: dict):
	"""Parse the initial extended wiki fetch and point local_wiki.mw_messages at the matching entry of mw_msgs.

	mw_msgs maps an integer id to a tuple of (message name, message text) pairs for the category MW messages.
	Wikis with identical message sets share a single entry, which avoids duplicating the data in memory
	(so Markus doesn't complain about high RAM usage) at the cost of scanning the dict once per wiki.
	:param wiki_response: JSON of the extended api.php query
	:param local_wiki: Wiki object to update
	:param mw_msgs: shared cache of MW message tuples
	:return:
	"""
	msgs = []
	for message in wiki_response["query"]["allmessages"]:
		if "missing" not in message:  # ignore missing strings
			msgs.append((message["name"], re.sub(r'\[\[.*?\]\]', '', message["*"])))
		else:
			logger.warning("Could not fetch the MW message translation for: {}".format(message["name"]))
	msgs = tuple(msgs)
	for key, stored_msgs in mw_msgs.items():
		if msgs == stored_msgs:
			local_wiki.mw_messages = key
			return
	# if the same entry is not in mw_msgs yet, add it under a new key
	key = len(mw_msgs)
	mw_msgs[key] = msgs  # it may be a little bit messy for sure, however I don't expect any reason to remove mw_msgs entries one by one
	local_wiki.mw_messages = key


# db_wiki: webhook, wiki, lang, display, wikiid, rcid, postid
async def essential_info(change: dict, changed_categories, local_wiki: Wiki, target: tuple, paths: tuple, request: dict,
                         rate_limiter: RateLimiter):
	"""Prepare the information shared by the embed and compact message formats and dispatch the change to the right formatter."""
	_ = langs[target[0][0]]["wiki"].gettext
	changed_categories = changed_categories.get(change["revid"], None)
	logger.debug("List of categories in essential_info: {}".format(changed_categories))
	appearance_mode = embed_formatter if target[0][1] > 0 else compact_formatter
	if "actionhidden" in change or "suppressed" in change:  # if the event is hidden using suppression
		await appearance_mode("suppressed", change, "", changed_categories, local_wiki, target, paths, rate_limiter)
		return
	if "commenthidden" not in change:
		parsed_comment = parse_link(paths[3], change["parsedcomment"])
	else:
		parsed_comment = _("~~hidden~~")
	if not parsed_comment:
		parsed_comment = None
	if change["type"] in ["edit", "new"]:
		if "userhidden" in change:
			change["user"] = _("hidden")
		identification_string = change["type"]
	elif change["type"] == "log":
		identification_string = "{logtype}/{logaction}".format(logtype=change["logtype"], logaction=change["logaction"])
	elif change["type"] == "categorize":
		return
	else:
		identification_string = change["type"]
	additional_data = {"namespaces": request["query"]["namespaces"], "tags": {}}
	for tag in request["query"]["tags"]:
		try:
			additional_data["tags"][tag["name"]] = (BeautifulSoup(tag["displayname"], "lxml")).get_text()
		except KeyError:
			additional_data["tags"][tag["name"]] = None  # Tags with no displayname
	await appearance_mode(identification_string, change, parsed_comment, changed_categories, local_wiki, target, paths, rate_limiter, additional_data=additional_data)


async def essential_feeds(change: dict, comment_pages: dict, db_wiki: sqlite3.Row, target: tuple):
	"""Prepare the information shared by the embed and compact Discussions formats and dispatch the post to the right formatter."""
	appearance_mode = feeds_embed_formatter if target[0][1] > 0 else feeds_compact_formatter
	identification_string = change["_embedded"]["thread"][0]["containerType"]
	comment_page = None
	if identification_string == "ARTICLE_COMMENT" and comment_pages is not None:
		comment_page = comment_pages.get(change["forumId"], None)
		if comment_page is not None:
			# Prepend the wiki's scheme and host to the comment's relative URL, escaping parentheses so they don't break Markdown links
			comment_page["fullUrl"] = "/".join(db_wiki["wiki"].split("/", 3)[:3]) + comment_page["relativeUrl"].replace("(", "\\(").replace(")", "\\)")
	await appearance_mode(identification_string, change, target, db_wiki["wiki"], article_page=comment_page)