# -*- coding: utf-8 -*-

# This file is part of Recent changes Goat compatible Discord webhook (RcGcDw).

# RcGcDw is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# RcGcDw is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with RcGcDw. If not, see <http://www.gnu.org/licenses/>.
2020-07-16 12:46:23 +00:00
import base64
2021-05-15 10:11:56 +00:00
import json , logging , sys , re , platform
2019-05-20 13:11:30 +00:00
from html . parser import HTMLParser
2025-01-02 00:44:33 +00:00
from typing import Callable , Tuple , List , Optional , Union
2021-04-25 11:20:58 +00:00
from urllib . parse import urlparse , urlunparse
2020-04-04 12:29:18 +00:00
import requests
2021-04-25 11:20:58 +00:00
2025-01-25 09:35:57 +00:00
from src . api . context import Context
2022-10-12 11:41:22 +00:00
from src . argparser import command_args
2020-07-07 11:21:49 +00:00
from src . configloader import settings
2021-05-23 19:06:59 +00:00
import src . api . util
2020-11-08 21:29:15 +00:00
from src . discord . message import DiscordMessage , DiscordMessageMetadata
from src . discord . queue import messagequeue , send_to_discord
2021-04-24 12:32:23 +00:00
from src . exceptions import MediaWikiError
2020-08-03 13:27:24 +00:00
from src . i18n import misc
2019-05-20 19:01:45 +00:00
2020-08-03 13:27:24 +00:00
_ = misc . gettext
2019-05-19 15:03:05 +00:00
# Create a custom logger
2019-05-20 10:41:40 +00:00
2019-05-19 15:03:05 +00:00
misc_logger = logging . getLogger ( " rcgcdw.misc " )
2020-10-17 23:45:06 +00:00
data_template = { " rcid " : None , " discussion_id " : 0 , " abuse_log_id " : None ,
2019-05-19 15:03:05 +00:00
" daily_overview " : { " edits " : None , " new_files " : None , " admin_actions " : None , " bytes_changed " : None ,
2019-05-19 16:25:20 +00:00
" new_articles " : None , " unique_editors " : None , " day_score " : None , " days_tracked " : 0 } }
2019-05-19 15:03:05 +00:00
2020-04-04 12:29:18 +00:00
WIKI_API_PATH : str = " "
WIKI_ARTICLE_PATH : str = " "
WIKI_SCRIPT_PATH : str = " "
WIKI_JUST_DOMAIN : str = " "
2022-11-10 14:39:29 +00:00
def send_simple ( msgtype , message , name , avatar ) :
discord_msg = DiscordMessage ( " compact " , msgtype , settings [ " webhookURL " ] , content = message )
discord_msg . set_avatar ( avatar )
discord_msg . set_name ( name )
messagequeue . resend_msgs ( )
send_to_discord ( discord_msg , meta = DiscordMessageMetadata ( " POST " ) )
2020-04-04 12:29:18 +00:00
class DataFile :
""" Data class which instance of is shared by multiple modules to remain consistent and do not cause too many IO operations. """
def __init__ ( self ) :
2022-01-02 13:39:20 +00:00
self . data_filename : str = settings . get ( " datafile_path " , " data.json " )
self . data : dict = self . load_datafile ( )
2021-11-18 16:27:02 +00:00
misc_logger . debug ( " Current contents of {} {} " . format ( self . data_filename , self . data ) )
2022-01-02 13:39:20 +00:00
self . changed : bool = False
2020-04-04 12:29:18 +00:00
2021-11-18 16:27:02 +00:00
def generate_datafile ( self ) :
2020-04-04 12:29:18 +00:00
""" Generate a data.json file from a template. """
try :
2021-11-18 16:27:02 +00:00
with open ( self . data_filename , ' w ' , encoding = " utf-8 " ) as data :
2020-04-04 12:29:18 +00:00
data . write ( json . dumps ( data_template , indent = 4 ) )
except PermissionError :
misc_logger . critical ( " Could not create a data file (no permissions). No way to store last edit. " )
sys . exit ( 1 )
def load_datafile ( self ) - > dict :
""" Read a data.json file and return a dictionary with contents
: rtype : dict
"""
try :
2021-11-18 16:27:02 +00:00
with open ( self . data_filename , encoding = " utf-8 " ) as data :
2020-04-04 12:29:18 +00:00
return json . loads ( data . read ( ) )
except FileNotFoundError :
self . generate_datafile ( )
misc_logger . info ( " The data file could not be found. Generating a new one... " )
2022-10-12 11:41:22 +00:00
if not command_args . nowelcome :
send_simple ( " welcome " , _ ( " RcGcDw is now running and checking {wiki} . " ) . format ( wiki = settings [ " wikiname " ] ) ,
2023-01-10 12:56:18 +00:00
_ ( " Welcome " ) , settings [ " avatars " ] . get ( " welcome " , None ) )
2020-04-04 12:29:18 +00:00
return data_template
def save_datafile ( self ) :
""" Overwrites the data.json file with given dictionary """
2020-10-17 23:45:06 +00:00
if self . changed is False : # don't cause unnecessary write operations
return
2020-04-04 12:29:18 +00:00
try :
2021-11-18 16:27:02 +00:00
with open ( self . data_filename , " w " , encoding = " utf-8 " ) as data_file :
2020-04-04 12:29:18 +00:00
data_file . write ( json . dumps ( self . data , indent = 4 ) )
2020-10-17 23:45:06 +00:00
self . changed = False
2021-05-03 11:37:42 +00:00
misc_logger . debug ( " Saving the database succeeded. " )
2020-04-04 12:29:18 +00:00
except PermissionError :
misc_logger . critical ( " Could not modify a data file (no permissions). No way to store last edit. " )
sys . exit ( 1 )
2021-05-15 10:11:56 +00:00
except OSError as e :
if settings . get ( " error_tolerance " , 1 ) > 1 :
if platform . system ( ) == " Windows " :
2021-11-18 16:27:02 +00:00
if " Invalid argument: ' " + self . data_filename + " ' " in str ( e ) :
2021-05-15 10:11:56 +00:00
misc_logger . error ( " Saving the data file failed due to Invalid argument exception, we ' ve seen it "
" before in issue #209, if you know the reason for it happening please reopen the "
" issue with explanation, for now we are going to just ignore it. " ) # Reference #209
return
raise
2020-04-04 12:29:18 +00:00
2020-10-17 23:45:06 +00:00
def __setitem__ ( self , instance , value ) :
2021-11-19 12:35:36 +00:00
if self . data [ instance ] != value :
self . data [ instance ] = value
self . changed = True
2020-10-17 23:45:06 +00:00
def __getitem__ ( self , item ) :
2020-11-18 13:58:13 +00:00
try :
return self . data [ item ]
except KeyError : # if such value doesn't exist, set to and return none
self . __setitem__ ( item , None )
self . save_datafile ( )
return None
2020-10-17 23:45:06 +00:00
2020-04-04 12:29:18 +00:00
datafile = DataFile ( )
2019-05-19 16:25:20 +00:00
2020-10-18 11:25:50 +00:00
2019-05-19 16:25:20 +00:00
def weighted_average ( value , weight , new_value ) :
""" Calculates weighted average of value number with weight weight and new_value with weight 1 """
return round ( ( ( value * weight ) + new_value ) / ( weight + 1 ) , 2 )
2019-05-20 10:41:40 +00:00
2021-05-02 22:02:03 +00:00
def class_searcher ( attribs : list ) - > str :
""" Function to return classes of given element in HTMLParser on handle_starttag
2020-10-18 11:25:50 +00:00
2021-05-02 22:02:03 +00:00
: returns a string with all of the classes of element
2021-05-02 21:44:32 +00:00
"""
for attr in attribs :
if attr [ 0 ] == " class " :
2021-05-02 22:02:03 +00:00
return attr [ 1 ]
return " "
2019-05-20 13:11:30 +00:00
2020-10-18 11:25:50 +00:00
2019-05-20 13:11:30 +00:00
class ContentParser ( HTMLParser ) :
2021-04-24 09:19:38 +00:00
""" ContentPerser is an implementation of HTMLParser that parses output of action=compare&prop=diff API request
for two MediaWiki revisions . It extracts the following :
small_prev_ins - storing up to 1000 characters of added text
small_prev_del - storing up to 1000 chracters of removed text
ins_length - storing length of inserted text
del_length - storing length of deleted text
"""
2019-05-20 13:11:30 +00:00
more = _ ( " \n __And more__ " )
current_tag = " "
2020-11-09 17:04:36 +00:00
last_ins = None
last_del = None
empty = False
2019-05-20 13:11:30 +00:00
small_prev_ins = " "
small_prev_del = " "
ins_length = len ( more )
del_length = len ( more )
def handle_starttag ( self , tagname , attribs ) :
if tagname == " ins " or tagname == " del " :
self . current_tag = tagname
2021-05-02 22:02:03 +00:00
if tagname == " td " :
classes = class_searcher ( attribs ) . split ( ' ' )
if " diff-addedline " in classes and self . ins_length < = 1000 :
self . current_tag = " tda "
self . last_ins = " "
if " diff-deletedline " in classes and self . del_length < = 1000 :
self . current_tag = " tdd "
self . last_del = " "
if " diff-empty " in classes :
self . empty = True
2019-05-20 13:11:30 +00:00
def handle_data ( self , data ) :
2021-05-01 12:25:03 +00:00
def escape_formatting ( data : str ) - > str :
""" Escape Discord formatting """
2023-04-18 22:37:36 +00:00
return re . sub ( r " ([`_*~:<> {} @/|# \ - \ . \\ \ [ \ ] \ ( \ )]) " , " \\ \\ \\ 1 " , data )
2020-11-09 17:04:36 +00:00
data = escape_formatting ( data )
2019-05-20 13:11:30 +00:00
if self . current_tag == " ins " and self . ins_length < = 1000 :
2020-11-09 17:04:36 +00:00
self . ins_length + = len ( " ** " + data + " ** " )
2019-05-20 13:11:30 +00:00
if self . ins_length < = 1000 :
2020-11-09 17:04:36 +00:00
self . last_ins = self . last_ins + " ** " + data + " ** "
2019-05-20 13:11:30 +00:00
if self . current_tag == " del " and self . del_length < = 1000 :
2020-11-09 17:04:36 +00:00
self . del_length + = len ( " ~~ " + data + " ~~ " )
2019-05-20 13:11:30 +00:00
if self . del_length < = 1000 :
2020-11-09 17:04:36 +00:00
self . last_del = self . last_del + " ~~ " + data + " ~~ "
if self . current_tag == " tda " and self . ins_length < = 1000 :
2019-05-20 13:11:30 +00:00
self . ins_length + = len ( data )
if self . ins_length < = 1000 :
2020-11-09 17:04:36 +00:00
self . last_ins = self . last_ins + data
if self . current_tag == " tdd " and self . del_length < = 1000 :
2019-05-20 13:11:30 +00:00
self . del_length + = len ( data )
if self . del_length < = 1000 :
2020-11-09 17:04:36 +00:00
self . last_del = self . last_del + data
2019-05-20 13:11:30 +00:00
def handle_endtag ( self , tagname ) :
if tagname == " ins " :
2020-11-09 17:04:36 +00:00
self . current_tag = " tda "
2019-05-20 13:11:30 +00:00
elif tagname == " del " :
2020-11-09 17:04:36 +00:00
self . current_tag = " tdd "
2021-06-12 17:22:25 +00:00
elif tagname == " td " :
self . current_tag = " "
2020-11-09 17:04:36 +00:00
elif tagname == " tr " :
if self . last_ins is not None :
self . ins_length + = 1
2021-06-12 17:22:25 +00:00
if self . empty and not self . last_ins . isspace ( ) :
2021-07-20 06:16:34 +00:00
if " ** " in self . last_ins :
2021-06-12 17:22:25 +00:00
self . last_ins = self . last_ins . replace ( " ** " , " __ " )
2020-11-09 17:04:36 +00:00
self . ins_length + = 4
self . last_ins = " ** " + self . last_ins + " ** "
self . small_prev_ins = self . small_prev_ins + " \n " + self . last_ins
if self . ins_length > 1000 :
self . small_prev_ins = self . small_prev_ins + self . more
self . last_ins = None
if self . last_del is not None :
self . del_length + = 1
2021-06-12 17:22:25 +00:00
if self . empty and not self . last_del . isspace ( ) :
2021-07-20 06:16:34 +00:00
if " ~~ " in self . last_del :
2021-06-12 17:22:25 +00:00
self . last_del = self . last_del . replace ( " ~~ " , " __ " )
2020-11-09 17:04:36 +00:00
self . del_length + = 4
self . last_del = " ~~ " + self . last_del + " ~~ "
self . small_prev_del = self . small_prev_del + " \n " + self . last_del
if self . del_length > 1000 :
self . small_prev_del = self . small_prev_del + self . more
self . last_del = None
self . empty = False
2019-05-20 19:23:19 +00:00
def safe_read ( request , * keys ) :
if request is None :
return None
try :
request = request . json ( )
for item in keys :
request = request [ item ]
except KeyError :
misc_logger . warning (
" Failure while extracting data from request on key {key} in {change} " . format ( key = item , change = request ) )
return None
except ValueError :
misc_logger . warning ( " Failure while extracting data from request in {change} " . format ( change = request ) )
return None
return request
2021-04-24 12:32:23 +00:00
def parse_mw_request_info ( request_data : dict , url : str ) :
""" A function parsing request JSON message from MediaWiki logging all warnings and raising on MediaWiki errors """
# any([True for k in request_data.keys() if k in ("error", "errors")])
2024-10-15 14:31:32 +00:00
errors : dict = request_data . get ( " errors " , { } ) # Is it ugly? I don't know tbh
2021-04-24 12:32:23 +00:00
if errors :
raise MediaWikiError ( str ( errors ) )
2024-10-15 14:31:32 +00:00
warnings : dict = request_data . get ( " warnings " , { } )
2021-04-24 12:32:23 +00:00
if warnings :
2024-10-15 14:31:32 +00:00
for module , warning_data in warnings . items ( ) :
misc_logger . warning ( " MediaWiki returned the following warning on module {module} : {text} on {url} . " . format (
module = module , text = warning_data . get ( " warnings " , " " ) , url = url
2021-04-24 12:32:23 +00:00
) )
return request_data
2021-05-18 11:46:00 +00:00
2019-05-20 19:23:19 +00:00
def add_to_dict ( dictionary , key ) :
if key in dictionary :
dictionary [ key ] + = 1
else :
dictionary [ key ] = 1
2020-04-04 12:29:18 +00:00
return dictionary
2021-05-18 11:46:00 +00:00
2022-01-02 13:39:20 +00:00
def prepare_paths ( path : str , dry = False ) :
2020-04-04 12:29:18 +00:00
""" Set the URL paths for article namespace and script namespace
WIKI_API_PATH will be : WIKI_DOMAIN / api . php
WIKI_ARTICLE_PATH will be : WIKI_DOMAIN / articlepath / $ 1 where $ 1 is the replaced string
WIKI_SCRIPT_PATH will be : WIKI_DOMAIN /
WIKI_JUST_DOMAIN will be : WIKI_DOMAIN """
2022-01-02 13:39:20 +00:00
global WIKI_API_PATH
global WIKI_ARTICLE_PATH
global WIKI_SCRIPT_PATH
global WIKI_JUST_DOMAIN
2020-04-04 12:29:18 +00:00
def quick_try_url ( url ) :
""" Quickly test if URL is the proper script path,
False if it appears invalid
dictionary when it appears valid """
try :
2024-08-08 20:49:40 +00:00
request = requests . get ( url , timeout = 5 , headers = settings [ " header " ] )
2020-04-04 12:29:18 +00:00
if request . status_code == requests . codes . ok :
if request . json ( ) [ " query " ] [ " general " ] is not None :
return request
2024-08-08 21:13:52 +00:00
else :
2024-08-08 21:16:05 +00:00
misc_logger . debug ( f " Request to the wiki failed with code { request . status_code } and text: { request . text } " )
2020-04-04 12:29:18 +00:00
return False
except ( KeyError , requests . exceptions . ConnectionError ) :
return False
try :
2020-08-23 13:32:12 +00:00
parsed_url = urlparse ( path )
2020-04-04 12:29:18 +00:00
except KeyError :
misc_logger . critical ( " wiki_url is not specified in the settings. Please provide the wiki url in the settings and start the script again. " )
sys . exit ( 1 )
2020-08-23 13:32:12 +00:00
for url_scheme in ( path , path . split ( " wiki " ) [ 0 ] , urlunparse ( ( * parsed_url [ 0 : 2 ] , " " , " " , " " , " " ) ) ) : # check different combinations, it's supposed to be idiot-proof
2023-12-14 16:19:43 +00:00
tested = quick_try_url ( url_scheme . rstrip ( " / " ) + " /api.php?action=query&format=json&meta=siteinfo " )
2020-04-04 12:29:18 +00:00
if tested :
2020-08-23 13:32:12 +00:00
if not dry :
WIKI_API_PATH = urlunparse ( ( * parsed_url [ 0 : 2 ] , " " , " " , " " , " " ) ) + tested . json ( ) [ " query " ] [ " general " ] [ " scriptpath " ] + " /api.php "
WIKI_SCRIPT_PATH = urlunparse ( ( * parsed_url [ 0 : 2 ] , " " , " " , " " , " " ) ) + tested . json ( ) [ " query " ] [ " general " ] [ " scriptpath " ] + " / "
WIKI_ARTICLE_PATH = urlunparse ( ( * parsed_url [ 0 : 2 ] , " " , " " , " " , " " ) ) + tested . json ( ) [ " query " ] [ " general " ] [ " articlepath " ]
WIKI_JUST_DOMAIN = urlunparse ( ( * parsed_url [ 0 : 2 ] , " " , " " , " " , " " ) )
break
return urlunparse ( ( * parsed_url [ 0 : 2 ] , " " , " " , " " , " " ) )
2020-04-04 12:29:18 +00:00
else :
2020-08-23 13:32:12 +00:00
misc_logger . critical ( " Could not verify wikis paths. Please make sure you have given the proper wiki URLs in settings.json ( {path} should be script path to your wiki) and your Internet connection is working. " . format ( path = path ) )
2020-04-04 12:29:18 +00:00
sys . exit ( 1 )
2020-04-05 00:07:56 +00:00
2020-08-23 13:32:12 +00:00
prepare_paths ( settings [ " wiki_url " ] )
2020-04-05 00:07:56 +00:00
2025-01-02 00:44:33 +00:00
def run_hooks ( hooks : Union [ List [ Tuple [ Callable [ [ Context , dict ] , None ] , int , Optional [ Callable ] ] ] , List [ Tuple [ Callable [ [ DiscordMessage , DiscordMessageMetadata , Context , dict ] , None ] , int , Optional [ Callable ] ] ] ] , * arguments ) :
2021-05-14 12:30:52 +00:00
for hook in hooks :
2025-01-02 00:44:33 +00:00
if hook [ 2 ] is not None :
try :
if hook [ 2 ] ( * arguments ) is False :
misc_logger . debug ( f " Ignoring hook { hook [ 0 ] . __name__ } due to conditions not being met for execution " )
continue
except :
if settings . get ( " error_tolerance " , 1 ) > 0 :
misc_logger . exception ( " On running a hook check function, ignoring hook " )
else :
raise
2021-05-14 12:30:52 +00:00
try :
2025-01-02 00:44:33 +00:00
misc_logger . debug ( f " Running { hook [ 0 ] . __name__ } hook " )
hook [ 0 ] ( * arguments )
2021-05-14 12:30:52 +00:00
except :
if settings . get ( " error_tolerance " , 1 ) > 0 :
2025-01-02 00:44:33 +00:00
misc_logger . exception ( " On running a hook, ignoring hook " )
2021-05-14 12:30:52 +00:00
else :
raise
2022-11-10 14:21:16 +00:00
def profile_field_name ( name , embed , _ ) :
profile_fields = { " profile-location " : _ ( " Location " ) , " profile-aboutme " : _ ( " About me " ) ,
" profile-link-google " : _ ( " Google link " ) , " profile-link-facebook " : _ ( " Facebook link " ) ,
" profile-link-twitter " : _ ( " Twitter link " ) , " profile-link-reddit " : _ ( " Reddit link " ) ,
" profile-link-twitch " : _ ( " Twitch link " ) , " profile-link-psn " : _ ( " PSN link " ) ,
" profile-link-vk " : _ ( " VK link " ) , " profile-link-xbl " : _ ( " XBL link " ) ,
" profile-link-steam " : _ ( " Steam link " ) , " profile-link-discord " : _ ( " Discord handle " ) ,
" profile-link-battlenet " : _ ( " Battle.net handle " ) }
2020-07-16 12:46:23 +00:00
try :
return profile_fields [ name ]
except KeyError :
if embed :
return _ ( " Unknown " )
else :
return _ ( " unknown " )
class LinkParser ( HTMLParser ) :
new_string = " "
recent_href = " "
2024-08-13 10:30:55 +00:00
def __init__ ( self , DOMAIN_URL : str ) :
self . WIKI_JUST_DOMAIN = DOMAIN_URL
super ( ) . __init__ ( )
2020-07-16 12:46:23 +00:00
def handle_starttag ( self , tag , attrs ) :
for attr in attrs :
if attr [ 0 ] == ' href ' :
self . recent_href = attr [ 1 ]
if self . recent_href . startswith ( " // " ) :
self . recent_href = " https: {rest} " . format ( rest = self . recent_href )
elif not self . recent_href . startswith ( " http " ) :
2024-08-13 10:30:55 +00:00
self . recent_href = self . WIKI_JUST_DOMAIN + self . recent_href
2020-07-16 12:46:23 +00:00
self . recent_href = self . recent_href . replace ( " ) " , " \\ ) " )
elif attr [ 0 ] == ' data-uncrawlable-url ' :
self . recent_href = attr [ 1 ] . encode ( ' ascii ' )
self . recent_href = base64 . b64decode ( self . recent_href )
2024-08-13 10:30:55 +00:00
self . recent_href = self . WIKI_JUST_DOMAIN + self . recent_href . decode ( ' ascii ' )
2020-07-16 12:46:23 +00:00
def handle_data ( self , data ) :
if self . recent_href :
2021-05-23 19:06:59 +00:00
self . new_string = self . new_string + " [ {} ](< {} >) " . format ( src . api . util . sanitize_to_markdown ( data ) , self . recent_href )
2020-07-16 12:46:23 +00:00
self . recent_href = " "
else :
2021-05-23 19:06:59 +00:00
self . new_string = self . new_string + src . api . util . sanitize_to_markdown ( data )
2020-07-16 12:46:23 +00:00
def handle_comment ( self , data ) :
2021-05-23 19:06:59 +00:00
self . new_string = self . new_string + src . api . util . sanitize_to_markdown ( data )
2020-07-16 12:46:23 +00:00
def handle_endtag ( self , tag ) :
2020-10-26 12:25:14 +00:00
misc_logger . debug ( self . new_string )