from __future__ import annotations

import asyncio
import datetime
import functools
import logging
import re
import time

import aiohttp
import requests

from src.api.util import default_message
from src.misc import prepare_settings, run_hooks
from src.discord.queue import messagequeue, QueueEntry
from src.mw_messages import MWMessages
from src.exceptions import *
from src.queue_handler import dbmanager
from src.api.hooks import formatter_hooks, pre_hooks, post_hooks
from src.api.client import Client
from src.api.context import Context
from src.discord.message import DiscordMessage, DiscordMessageMetadata, StackedDiscordMessage
from src.i18n import langs
from src.statistics import Statistics, Log, LogType
from src.config import settings
# noinspection PyPackageRequirements
from bs4 import BeautifulSoup
from collections import OrderedDict, defaultdict, namedtuple
from typing import Union, Optional, TYPE_CHECKING, List

Settings = namedtuple("Settings", ["lang", "display", "buttons"])
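# A Settings tuple describes one output configuration: interface language, display mode
# (0 renders compact text messages, higher values render embeds; 3 additionally triggers
# the extra redaction handling in rc_processor) and whether to attach interaction buttons.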

logger = logging.getLogger("rcgcdb.wiki")

# wiki_removal_reasons = {410: _("wiki deleted"), 404: _("wiki deleted"), 401: _("wiki inaccessible"),
#                         402: _("wiki inaccessible"), 403: _("wiki inaccessible"), 1000: _("discussions disabled")}

if TYPE_CHECKING:
    from src.domain import Domain

MESSAGE_LIMIT = settings.get("message_limit", 30)
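# Per-target cap on how many StackedDiscordMessages are kept in Wiki.message_history;
# older messages get trimmed in Wiki.add_message and can no longer be patched or redacted.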


class Wiki:
    def __init__(self, script_url: str, rc_id: Optional[int], discussion_id: Optional[str]):
        self.script_url: str = script_url
        self.session: aiohttp.ClientSession = aiohttp.ClientSession(headers=settings["header"], timeout=aiohttp.ClientTimeout(total=6))
        self.statistics: Statistics = Statistics(rc_id, discussion_id)
        self.mw_messages: Optional[MWMessages] = None
        self.tags: dict[str, Optional[str]] = {}  # Tag can be None if hidden
        self.first_fetch_done: bool = False
        self.domain: Optional[Domain] = None
        self.rc_targets: Optional[defaultdict[Settings, list[str]]] = None
        self.discussion_targets: Optional[defaultdict[Settings, list[str]]] = None
        self.client: Client = Client(formatter_hooks, self)
        self.message_history: list[StackedDiscordMessage] = list()
        self.namespaces: Optional[dict] = None
        self.recache_requested: bool = False
        self.session_requests = requests.Session()
        self.session_requests.headers.update(settings["header"])
        logger.debug("Creating new wiki object for {}".format(script_url))

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return (
            f"<statistics={self.statistics} tags={self.tags} first_fetch_done={self.first_fetch_done}, rc_targets={self.rc_targets}, discussion_targets={self.discussion_targets}, "
            f"recache_requested={self.recache_requested}>")

    @property
    def rc_id(self):
        return self.statistics.last_action

    @property
    def discussion_id(self):
        return self.statistics.last_post

    @property
    def last_request(self):
        return self.statistics.last_request

    @last_request.setter
    def last_request(self, value):
        self.statistics.last_request = value

    # async def remove(self, reason):
    #     logger.info("Removing a wiki {}".format(self.script_url))
    #     await src.discord.wiki_removal(self.script_url, reason)
    #     await src.discord.wiki_removal_monitor(self.script_url, reason)
    #     async with db.pool().acquire() as connection:
    #         result = await connection.execute('DELETE FROM rcgcdw WHERE wiki = $1', self.script_url)
    #         logger.warning('{} rows affected by DELETE FROM rcgcdw WHERE wiki = "{}"'.format(result, self.script_url))

    def add_message(self, message: StackedDiscordMessage):
        self.message_history.append(message)
        if len(self.message_history) > MESSAGE_LIMIT * len(self.rc_targets):
            self.message_history = self.message_history[len(self.message_history) - MESSAGE_LIMIT * len(self.rc_targets):]

    def set_domain(self, domain: Domain):
        self.domain = domain

    def find_middle_next(self, ids: List[str], pageid: int) -> list:
        """To address #235, RcGcDw should now remove diffs in revisions that come next after redacted ones,
        to protect information in revisions that revert revdeleted information.
        This function fetches all messages for the given page and finds the rev_ids of the messages that come next after ids.
        :arg ids: list of str
        :arg pageid: int
        :return: list"""
        def extract_revid(item: tuple[StackedDiscordMessage, list[int]]):
            rev_ids = set()
            for message_id in sorted(item[1], reverse=True):
                rev_ids.add(item[0].message_list[message_id].metadata.rev_id)
            return rev_ids
        ids = [int(x) for x in ids]
        result = set()
        ids.sort()  # Just to be sure, sort the list to make sure it's always sorted
        search = self.search_message_history({"message_display": 3, "page_id": pageid})
        # messages = db_cursor.execute("SELECT revid FROM event WHERE pageid = ? AND revid >= ? ORDER BY revid",
        #                              (pageid, ids[0],))
        all_in_page = sorted(set([x for row in map(extract_revid, search) for x in row]))  # Flatten the result
        for ID in ids:
            try:
                result.add(all_in_page[all_in_page.index(ID) + 1])
            except (IndexError, ValueError):  # IndexError when there is no newer revision, ValueError when ID is not in the list
                logger.debug(f"Value {ID} not in {all_in_page} or no value after that.")
        return list(result - set(ids))
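    # Worked example (hypothetical data): if page 42 has messages for rev_ids
    # [100, 101, 102, 103] and revisions 100 and 102 get revdeleted, then
    # find_middle_next(["100", "102"], 42) returns [101, 103], the follow-up revisions
    # whose diffs could still expose the redacted content.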

    def search_message_history(self, params: dict) -> list[tuple[StackedDiscordMessage, list[int]]]:
        """Search self.message_history for messages which match all properties in params and return them in a list.
        :param params: a dictionary the messages are compared against; all names and values must be equal for a match.
        Matches metadata from discord.message.DiscordMessageMetadata.
        :returns: [(StackedDiscordMessage, [index ids of matching messages in that StackedMessage])]"""
        output = []
        for message in self.message_history:
            returned_matches_for_stacked = message.filter(params)
            if returned_matches_for_stacked:
                output.append((message, [x[0] for x in returned_matches_for_stacked]))
        return output
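    # Example: search_message_history({"message_display": 3, "page_id": 42}) yields every
    # stacked message (with the indices of its matching members) sent for page 42 to
    # targets configured with display mode 3.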

    def delete_messages(self, params: dict):
        """Delete messages from message_history whose DiscordMessageMetadata matches all properties in params."""
        # Delete all messages with given IDs
        for stacked_message, ids in self.search_message_history(params):
            stacked_message.delete_message_by_id(ids)
            # If all messages were removed, send a DELETE to Discord, otherwise PATCH the remaining stack
            if len(stacked_message.message_list) == 0:
                messagequeue.add_message(QueueEntry(stacked_message, [stacked_message.webhook], self, method="DELETE"))
            else:
                messagequeue.add_message(QueueEntry(stacked_message, [stacked_message.webhook], self, method="PATCH"))

    def redact_messages(self, context: Context, ids: list[int], mode: str, censored_properties: dict):
        # ids can refer to multiple events, and search does not support additive mode, so we have to loop it for all ids
        for revlogid in ids:
            for stacked_message, message_ids in self.search_message_history({mode: revlogid}):  # renamed to message_ids so it doesn't shadow the ids we iterate over
                for message in [message for num, message in enumerate(stacked_message.message_list) if num in message_ids]:
                    if "user" in censored_properties and "url" in message["author"]:
                        message["author"]["name"] = context._("hidden")
                        message["author"].pop("url")
                    if "action" in censored_properties and "url" in message:
                        message["title"] = context._("~~hidden~~")
                        message["embed"].pop("url")
                    if "content" in censored_properties and "fields" in message:
                        message["embed"].pop("fields")
                    if "comment" in censored_properties:
                        message["description"] = context._("~~hidden~~")
                messagequeue.add_message(QueueEntry(stacked_message, [stacked_message.webhook], self, method="PATCH"))
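    # mode names the DiscordMessageMetadata field to match on ("log_id" or "rev_id" in
    # rc_processor below); redaction edits the already-sent messages in place via PATCH.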

    # async def downtime_controller(self, down, reason=None):
    #     if down:
    #         self.fail_times += 1
    #         if self.fail_times > 20:
    #             await self.remove(reason)
    #     else:
    #         self.fail_times -= 1

    async def update_targets(self) -> None:
        """This function generates all possible variations of outputs that we need to generate messages for.
        It populates self.rc_targets and self.discussion_targets, each a defaultdict[Settings, list[str]]
        mapping a Settings named tuple to the list of webhooks configured with those settings."""
        target_settings: defaultdict[Settings, list[str]] = defaultdict(list)
        discussion_targets: defaultdict[Settings, list[str]] = defaultdict(list)
        async for webhook in dbmanager.fetch_rows("SELECT webhook, lang, display, rcid, postid, buttons FROM rcgcdb WHERE wiki = $1", self.script_url):
            if webhook['rcid'] == -1 and webhook['postid'] == '-1':
                await self.remove_wiki_from_db(4)
            if webhook['rcid'] != -1:
                target_settings[Settings(webhook["lang"], webhook["display"], webhook["buttons"])].append(webhook["webhook"])
            if webhook['postid'] != '-1':
                discussion_targets[Settings(webhook["lang"], webhook["display"], webhook["buttons"])].append(webhook["webhook"])
        self.rc_targets = target_settings
        self.discussion_targets = discussion_targets
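    # Example shape of the result (hypothetical values):
    # self.rc_targets == {Settings(lang="en", display=1, buttons=False): ["<webhook1>", "<webhook2>"],
    #                     Settings(lang="de", display=0, buttons=False): ["<webhook3>"]}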

    def parse_mw_request_info(self, request_data: dict, url: str):
        """Parse a JSON response from MediaWiki, logging all warnings and raising MediaWikiError on MediaWiki errors."""
        errors: list = request_data.get("errors", [])
        if errors:
            raise MediaWikiError(str(errors))
        warnings: list = request_data.get("warnings", [])
        if warnings:
            for warning in warnings:
                logger.warning("MediaWiki returned the following warning: {code} - {text} on {url}.".format(
                    code=warning["code"], text=warning.get("text", warning.get("*", "")), url=url
                ))
        return request_data
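    # With errorformat=raw, MediaWiki reports errors and warnings as lists of objects
    # carrying a "code" and the message text under "text" (or the legacy "*" key), which
    # is what the fallback chain above relies on.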

    async def api_request(self, params: Union[str, OrderedDict], *json_path: str, timeout: int = 10,
                          allow_redirects: bool = False) -> dict:
        """Method to GET request data from the wiki's API with error handling, including recognition of MediaWiki errors.

        Parameters:
            params (str, OrderedDict): a string or collections.OrderedDict object containing query parameters
            json_path (str): *args taking strings as values. After the request is parsed as JSON, data will be extracted following the given JSON path
            timeout (int, float) (default=10): time limit in seconds for receiving a full response from the server before raising TimeoutError
            allow_redirects (bool) (default=False): switches whether the request should follow redirects or not

        Returns:
            request_content (dict): a dict resulting from JSON extraction of the HTTP GET request with given json_path

        Raises:
            ServerError: when the connection with the wiki fails due to a server error
            ClientError: when the connection with the wiki fails due to a client error
            KeyError: when json_path contains keys that weren't found in the JSON response
            BadRequest: when the params argument is of a wrong type
            MediaWikiError: when MediaWiki returns an error
        """
        # Making request
        try:
            if isinstance(params, str):
                request = await self.session.get(self.script_url + "api.php?" + params + "&errorformat=raw", timeout=timeout,
                                                 allow_redirects=allow_redirects)
            elif isinstance(params, OrderedDict):
                params["errorformat"] = "raw"
                request = await self.session.get(self.script_url + "api.php", params=params, timeout=timeout,
                                                 allow_redirects=allow_redirects)
            else:
                raise BadRequest(params)
        except (aiohttp.ServerConnectionError, aiohttp.ServerTimeoutError, aiohttp.ContentTypeError) as exc:
            logger.warning("Reached {error} error for request on link {url}".format(error=repr(exc),
                                                                                    url=self.script_url + str(params)))
            raise ServerError
        # Catching HTTP errors
        if 499 < request.status < 600:
            logger.warning(f"A request to {self.script_url} {params} resulted in {request.status}")
            raise ServerError
        elif request.status == 302:
            logger.critical(
                "Redirect detected! Either the wiki given in the script settings (wiki field) is incorrect, the wiki got removed, or it is giving us a false value. Please provide the real URL to the wiki, the current URL redirects to {}".format(
                    request.url))
        elif 399 < request.status < 500:
            logger.error("Request returned ClientError status code on {url}".format(url=request.url))
            self.statistics.update(Log(type=LogType.HTTP_ERROR, title="{} error".format(request.status), details=str(request.headers) + "\n" + str(request.url)))
            raise ClientError(request)
        else:
            # JSON Extraction
            try:
                response_json = self.parse_mw_request_info(await request.json(encoding="UTF-8"), str(request.url))
                request_json = response_json
                for item in json_path:
                    request_json = request_json[item]
            except ValueError:
                logger.warning("ValueError when extracting JSON data on {url}".format(url=request.url))
                raise ServerError
            except MediaWikiError:
                logger.exception("MediaWiki error on request: {}".format(request.url))
                raise
            except KeyError:
                logger.exception("KeyError while iterating over json_path, full response: {}".format(response_json))
                raise
            return request_json
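    # Usage sketch (hypothetical call): the json_path varargs walk the parsed response, so
    #   await wiki.api_request(OrderedDict(action="query", meta="siteinfo", siprop="general",
    #                                      format="json"), "query", "general", "sitename")
    # would return just the wiki's site name instead of the whole response dict.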

    def sync_api_request(self, params: Union[str, OrderedDict], *json_path: str, timeout: int = 10,
                         allow_redirects: bool = False) -> dict:
        """Synchronous function based on api_request, created for compatibility with the RcGcDw API."""
        try:
            if isinstance(params, str):
                request = self.session_requests.get(self.script_url + "api.php" + params + "&errorformat=raw", timeout=timeout, allow_redirects=allow_redirects)
            elif isinstance(params, OrderedDict):
                request = self.session_requests.get(self.script_url + "api.php", params=params, timeout=timeout, allow_redirects=allow_redirects)
            else:
                raise BadRequest(params)
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as exc:
            logger.warning("Reached {error} error for request on link {url}".format(error=repr(exc),
                                                                                    url=self.client.WIKI_API_PATH + str(params)))
            raise ServerError
        if 499 < request.status_code < 600:
            logger.warning(f"A request to {self.script_url} {params} resulted in {request.status_code}")
            raise ServerError
        elif request.status_code == 302:
            logger.critical(
                "Redirect detected! Either the wiki given in the script settings (wiki field) is incorrect, the wiki got removed, or it is giving us a false value. Please provide the real URL to the wiki, the current URL redirects to {}".format(
                    request.url))
        elif 399 < request.status_code < 500:
            logger.error("Request returned ClientError status code on {url}".format(url=request.url))
            self.statistics.update(Log(type=LogType.HTTP_ERROR, title="{} error".format(request.status_code), details=str(request.headers) + "\n" + str(request.url)))
            raise ClientError(request)
        else:
            try:
                request_json = self.parse_mw_request_info(request.json(), request.url)
                for item in json_path:
                    request_json = request_json[item]
            except ValueError:
                logger.warning("ValueError when extracting JSON data on {url}".format(url=request.url))
                raise ServerError
            except MediaWikiError:
                logger.exception("MediaWiki error on request: {}".format(request.url))
                raise
            except KeyError:
                logger.exception("KeyError while iterating over json_path, full response: {}".format(request.json()))
                raise
            return request_json
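    # Unlike api_request, string params passed here must include their own leading "?"
    # separator; see pull_comment below for an actual call.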

    async def fetch_wiki(self, amount=10) -> dict:
        """Fetch recent changes and tags from the wiki's API; on the first fetch additionally request
        the MediaWiki system messages (allmessages) used for category tracking."""
        if self.mw_messages is None:
            params = OrderedDict({"action": "query", "format": "json", "uselang": "content", "list": "tags|recentchanges",
                                  "meta": "allmessages|siteinfo",
                                  "utf8": 1, "tglimit": "max", "tgprop": "displayname",
                                  "rcprop": "title|redirect|timestamp|ids|loginfo|parsedcomment|sizes|flags|tags|user|userid",
                                  "rclimit": amount, "rcshow": "!bot", "rctype": "edit|new|log|categorize",
                                  "ammessages": "recentchanges-page-added-to-category|recentchanges-page-removed-from-category|recentchanges-page-added-to-category-bundled|recentchanges-page-removed-from-category-bundled",
                                  "amenableparser": 1, "amincludelocal": 1, "siprop": "namespaces|general"})
        else:
            params = OrderedDict({"action": "query", "format": "json", "uselang": "content", "list": "tags|recentchanges",
                                  "meta": "siteinfo", "utf8": 1, "rcshow": "!bot",
                                  "rcprop": "title|redirect|timestamp|ids|loginfo|parsedcomment|sizes|flags|tags|user|userid",
                                  "rclimit": amount, "rctype": "edit|new|log|categorize", "siprop": "namespaces|general"})
        try:
            response = await self.api_request(params=params, allow_redirects=True)
        except (aiohttp.ClientConnectionError, aiohttp.ServerTimeoutError, asyncio.TimeoutError) as e:
            logger.error("A connection error occurred while requesting {}".format(params))
            raise WikiServerError(e)
        return response

    async def scan(self, amount=10):
        """Main routine for fetching RecentChanges of a wiki.

        :raises WikiServerError
        """
        while True:  # Trap event in case there are more changes needed to be fetched
            try:
                request = await self.fetch_wiki(amount=amount)
                self.client.last_request = request
            except WikiServerError as e:
                # If WikiServerError comes up more than 9 times in the recent time window, reraise the exception, otherwise wait 2 seconds and retry
                self.statistics.update(Log(type=LogType.CONNECTION_ERROR, title=str(e.exception)))
                if self.statistics.recent_connection_errors() > 9:
                    raise e
                await asyncio.sleep(2.0)
                continue
            if not self.mw_messages or self.recache_requested:
                process_cachable(request, self)
            try:
                recent_changes = request["query"]["recentchanges"]
                recent_changes.reverse()
            except KeyError:
                raise WikiError
            if self.rc_id in (0, None, -1):
                if len(recent_changes) > 0:
                    self.statistics.update(last_action=recent_changes[-1]["rcid"])
                    dbmanager.add(("UPDATE rcgcdb SET rcid = $1 WHERE wiki = $2 AND (rcid != -1 OR rcid IS NULL)",
                                   (recent_changes[-1]["rcid"], self.script_url)))
                else:
                    self.statistics.update(last_action=0)
                    dbmanager.add(("UPDATE rcgcdb SET rcid = 0 WHERE wiki = $1 AND (rcid != -1 OR rcid IS NULL)", (self.script_url,)))  # parameters must be a tuple
                self.statistics.update(Log(type=LogType.VALUE_UPDATE,
                                           title="Committed first fetch and updated rcid value from {} to {}".format(
                                               self.rc_id, self.statistics.last_action)))
                return
            categorize_events = {}
            new_events = 0
            self.statistics.last_checked_rc = int(time.time())
            highest_id = self.rc_id
            old_highest_id = self.rc_id
            for change in recent_changes:
                if change["rcid"] > highest_id and amount != 450:
                    new_events += 1
                    if new_events == 10:
                        # call the function again with max limit for more results, ignore the ones in this request
                        logger.debug("There were too many new events, queuing wiki with 450 limit.")
                        amount = 450
                        break
                await process_cats(change, self, categorize_events)
            else:  # adequate amount of changes
                message_list = []  # Collect all messages so they can be efficiently merged in the Discord message sender
                for change in recent_changes:  # Yeah, second loop since the categories require to be all loaded up
                    if change["rcid"] > self.rc_id:
                        if highest_id is None or change["rcid"] > highest_id:  # make sure highest_id is really the highest rcid, but allow entries with lower rcids to come after it without breaking the cycle
                            highest_id = change["rcid"]
                        for combination, webhooks in self.rc_targets.items():
                            message = await rc_processor(self, change, categorize_events.get(change.get("revid"), None), combination, webhooks)
                            if message is None:
                                break
                            message.wiki = self
                            message_list.append(QueueEntry(message, webhooks, self))
                messagequeue.add_messages(message_list)
                if old_highest_id != highest_id:  # update only when it differs
                    self.statistics.update(last_action=highest_id)
                    dbmanager.add(("UPDATE rcgcdb SET rcid = $1 WHERE wiki = $2 AND (rcid != -1 OR rcid IS NULL)", (highest_id, self.script_url)))  # If this is not enough for the future, save the rcid in the message sending function to make sure we always send all of the changes
                return

    async def remove_webhook_from_db(self, reason: str):
        raise NotImplementedError

    async def remove_wiki_from_db(self, reason: str):
        raise NotImplementedError  # TODO

    async def fetch_discussions(self, params: dict) -> tuple[aiohttp.ClientResponse, dict]:
        header = settings["header"]
        header["Accept"] = "application/hal+json"
        async with aiohttp.ClientSession(headers=header,
                                         timeout=aiohttp.ClientTimeout(6.0)) as session:
            url_path = "{wiki}wikia.php".format(wiki=self.script_url)
            try:
                feeds_response = await session.get(url_path, params=params)
                feeds_response.raise_for_status()
            except (aiohttp.ClientConnectionError, aiohttp.ServerTimeoutError, asyncio.TimeoutError,
                    aiohttp.ClientResponseError, aiohttp.TooManyRedirects) as e:
                logger.error("A connection error occurred while requesting {}".format(url_path))  # feeds_response may be unbound here, so log the URL we tried
                raise WikiServerError(e)
            return feeds_response, await feeds_response.json(encoding="UTF-8")

    def pull_comment(self, comment_id):
        try:
            comment = self.sync_api_request("?action=comment&do=getRaw&comment_id={comment}&format=json".format(comment=comment_id), "text")
            logger.debug("Got the following comment from the API: {}".format(comment))
        except (ServerError, MediaWikiError):
            pass
        except (BadRequest, ClientError):
            logger.exception("Some kind of issue while creating a request (most likely client error).")
        except KeyError:
            logger.exception("CurseProfile extension API did not respond with valid comment content.")
        else:
            if len(comment) > 1000:
                comment = comment[0:1000] + "…"
            return comment
        return ""


def process_cachable(response: dict, wiki_object: Wiki) -> None:
    """This function processes cachable objects, such as MediaWiki system messages and wiki tag display names,
    to be used for processing of DiscordMessages, and saves them in a wiki object."""
    mw_messages = response.get("query", {}).get("allmessages", [])
    final_mw_messages = dict()
    for msg in mw_messages:
        if "missing" not in msg:  # ignore missing strings
            final_mw_messages[msg["name"]] = re.sub(r'\[\[.*?]]', '', msg["*"])
        else:
            logger.warning("Could not fetch the MW message translation for: {}".format(msg["name"]))
    wiki_object.mw_messages = MWMessages(final_mw_messages)
    for tag in response["query"]["tags"]:
        try:
            wiki_object.tags[tag["name"]] = (BeautifulSoup(tag["displayname"], "lxml")).get_text()
        except KeyError:
            wiki_object.tags[tag["name"]] = None
    wiki_object.namespaces = response["query"]["namespaces"]
    wiki_object.recache_requested = False
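# Example: a tag entry like {"name": "mw-undo", "displayname": "<a href=...>Undo</a>"} would be
# stored as wiki_object.tags["mw-undo"] = "Undo", while a hidden tag without "displayname" maps to None.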


async def rc_processor(wiki: Wiki, change: dict, changed_categories: dict, display_options: Settings, webhooks: list) -> Optional[DiscordMessage]:
    """This function takes more vital information, communicates with a formatter and constructs a DiscordMessage with it.
    It creates DiscordMessageMetadata object, LinkParser and Context. Prepares a comment."""
    from src.misc import LinkParser
    link_parser = LinkParser(wiki.client.WIKI_JUST_DOMAIN)  # lowercase name so the instance doesn't shadow the class
    metadata = DiscordMessageMetadata("POST", rev_id=change.get("revid", None), log_id=change.get("logid", None),
                                      page_id=change.get("pageid", None), message_display=display_options.display,
                                      time_of_change=datetime.datetime.strptime(change["timestamp"], '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=datetime.timezone.utc),
                                      domain=wiki.domain)
    context = Context("embed" if display_options.display > 0 else "compact", "recentchanges", webhooks, wiki.client,
                      langs[display_options.lang]["formatters"], prepare_settings(display_options.display), display_options.buttons)
    if ("actionhidden" in change or "suppressed" in change) and "suppressed" not in settings["ignored"]:  # if the event is hidden using suppression
        context.event = "suppressed"
        run_hooks(pre_hooks, context, change)
        try:
            discord_message: Optional[DiscordMessage] = await asyncio.get_event_loop().run_in_executor(
                None, functools.partial(default_message("suppressed", context.message_type, formatter_hooks), context, change))
        except Exception:
            if settings.get("error_tolerance", 1) > 0:
                discord_message: Optional[DiscordMessage] = None  # It's handled by send_to_discord, we still want other code to run
            else:
                raise
    else:
        if "commenthidden" not in change:
            link_parser.feed(change.get("parsedcomment", ""))
            parsed_comment = link_parser.new_string
        else:
            parsed_comment = langs[display_options.lang]["wiki"].gettext("~~hidden~~")
        if not parsed_comment and context.message_type == "embed" and settings["appearance"].get("embed", {}).get(
                "show_no_description_provided", True):
            parsed_comment = langs[display_options.lang]["wiki"].gettext("No description provided")
        context.set_parsedcomment(parsed_comment)
        if "userhidden" in change:
            change["user"] = langs[display_options.lang]["wiki"].gettext("hidden")
        if change.get("ns", -1) in settings.get("ignored_namespaces", ()):
            return
        if change["type"] in ["edit", "new"]:
            logger.debug("List of categories in essential_info: {}".format(changed_categories))
            identification_string = change["type"]
            context.set_categories(changed_categories)
        elif change["type"] == "categorize":
            return
        elif change["type"] == "log":
            identification_string = "{logtype}/{logaction}".format(logtype=change["logtype"],
                                                                   logaction=change["logaction"])
        else:
            identification_string = change.get("type", "unknown")  # If the event doesn't have a type
        if identification_string in settings["ignored"]:
            return
        context.event = identification_string
        try:
            discord_message: Optional[DiscordMessage] = await asyncio.get_event_loop().run_in_executor(
                None, functools.partial(default_message(identification_string, context.message_type, formatter_hooks), context,
                                        change))
        except Exception:
            if settings.get("error_tolerance", 1) > 0:
                discord_message: Optional[DiscordMessage] = None  # It's handled by send_to_discord, we still want other code to run
            else:
                raise
        if identification_string in ("delete/delete", "delete/delete_redir"):  # TODO Move it into a hook?
            wiki.delete_messages(dict(page_id=change.get("pageid")))
        elif identification_string == "delete/event":
            logparams = change.get('logparams', {"ids": []})
            if context.message_type == "embed":
                wiki.redact_messages(context, logparams.get("ids", []), "log_id", logparams.get("new", {}))
            else:
                for logid in logparams.get("ids", []):
                    wiki.delete_messages(dict(log_id=logid))  # key renamed to log_id to match DiscordMessageMetadata fields
        elif identification_string == "delete/revision":
            logparams = change.get('logparams', {"ids": []})
            if context.message_type == "embed":
                wiki.redact_messages(context, logparams.get("ids", []), "rev_id", logparams.get("new", {}))
                if display_options.display == 3:
                    wiki.redact_messages(context, wiki.find_middle_next(logparams.get("ids", []), change.get("pageid", -1)), "rev_id",
                                         {"content": ""})
            else:
                for revid in logparams.get("ids", []):
                    wiki.delete_messages(dict(rev_id=revid))  # key renamed to rev_id to match DiscordMessageMetadata fields
    run_hooks(post_hooks, discord_message, metadata, context, change)
    if discord_message:  # TODO How to react when None? (crash in formatter), probably bad handling atm
        discord_message.finish_embed()
        discord_message.metadata = metadata
    return discord_message


async def process_cats(event: dict, local_wiki: Wiki, categorize_events: dict):
    """Process categories based on local MW messages."""
    if event["type"] == "categorize":
        if "commenthidden" not in event:
            if local_wiki.mw_messages is not None:
                cat_title = event["title"].split(':', 1)[1]
                # I so much hate this, blame Markus for making me do this
                if event["revid"] not in categorize_events:
                    categorize_events[event["revid"]] = {"new": set(), "removed": set()}
                comment_to_match = re.sub(r'<.*?a>', '', event["parsedcomment"])
                if local_wiki.mw_messages["recentchanges-page-added-to-category"] in comment_to_match or local_wiki.mw_messages["recentchanges-page-added-to-category-bundled"] in comment_to_match:  # Added to category
                    categorize_events[event["revid"]]["new"].add(cat_title)
                    # logger.debug("Matched {} to added category for {}".format(cat_title, event["revid"]))
                elif local_wiki.mw_messages["recentchanges-page-removed-from-category"] in comment_to_match or local_wiki.mw_messages["recentchanges-page-removed-from-category-bundled"] in comment_to_match:  # Removed from category
                    categorize_events[event["revid"]]["removed"].add(cat_title)
                    # logger.debug("Matched {} to removed category for {}".format(cat_title, event["revid"]))
                else:
                    logger.debug(
                        "Unknown match for category change with messages {} and comment_to_match {}".format(local_wiki.mw_messages, comment_to_match))
            else:
                logger.warning(
                    "Init information not available, could not read category information. Please restart the bot.")
        else:
            logger.debug("Log entry got suppressed, ignoring entry.")
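# Example (hypothetical data): a categorize event on revid 57 with title "Category:Stubs" whose
# parsed comment contains the wiki's "recentchanges-page-added-to-category" message leaves
# categorize_events as {57: {"new": {"Stubs"}, "removed": set()}}.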


# This function has been removed. While its implementation seems sound, it should be considered only if we find
# performance concerns with RcGcDb
# async def process_mwmsgs(wiki_response: dict, local_wiki: Wiki, mw_msgs: dict):
#     """
#     This function is made to parse the initial wiki extended information to update local_wiki.mw_messages that stores the key
#     to mw_msgs that is a dict storing id: tuple where tuple is a set of MW messages for categories.
#     The reason it's constructed this way is to prevent duplication of data in memory so Markus doesn't complain about
#     high RAM usage. It does however affect CPU performance as every wiki requires to check the list for the matching
#     tuples of MW messages.
#
#     :param wiki_response:
#     :param local_wiki:
#     :param mw_msgs:
#     :return:
#     """
#     msgs = []
#     for message in wiki_response["query"]["allmessages"]:
#         if not "missing" in message:  # ignore missing strings
#             msgs.append((message["name"], re.sub(r'\[\[.*?\]\]', '', message["*"])))
#         else:
#             logger.warning("Could not fetch the MW message translation for: {}".format(message["name"]))
#     msgs = tuple(msgs)
#     for key, set in mw_msgs.items():
#         if msgs == set:
#             local_wiki.mw_messages = key
#             return
#     # if same entry is not in mw_msgs
#     key = len(mw_msgs)
#     mw_msgs[key] = msgs  # it may be a little bit messy for sure, however I don't expect any reason to remove mw_msgs entries by one
#     local_wiki.mw_messages = key