Additional work done on the ratelimiting

Frisk 2020-08-03 16:44:42 +02:00
parent c1831b992b
commit a4462369bb
2 changed files with 91 additions and 68 deletions

start.py

@@ -12,7 +12,7 @@ from src.argparser import command_line_args
 from src.config import settings
 from src.database import db_cursor
 from src.exceptions import *
-from src.misc import get_paths
+from src.misc import get_paths, get_domain
 from src.msgqueue import messagequeue
 from src.queue_handler import DBHandler
 from src.wiki import Wiki, process_cats, process_mwmsgs, essential_info, essential_feeds
@@ -41,11 +41,11 @@ for wiki in db_cursor.execute('SELECT DISTINCT wiki FROM rcgcdw'):
 
 # Start queueing logic
 
-def calculate_delay() -> float:
+def calculate_delay_for_group(group_length: int) -> float:
 	"""Calculate the delay between fetching each wiki to avoid rate limits"""
 	min_delay = 60 / settings["max_requests_per_minute"]
-	if (len(all_wikis) * min_delay) < settings["minimal_cooldown_per_wiki_in_sec"]:
-		return settings["minimal_cooldown_per_wiki_in_sec"] / len(all_wikis)
+	if (group_length * min_delay) < settings["minimal_cooldown_per_wiki_in_sec"]:
+		return settings["minimal_cooldown_per_wiki_in_sec"] / group_length
 	else:
 		return min_delay
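
Note: the delay math above, run standalone with assumed example settings. The two setting values below are illustrative, not taken from this repository's config.

# Sketch of calculate_delay_for_group with assumed example settings.
settings = {"max_requests_per_minute": 30, "minimal_cooldown_per_wiki_in_sec": 60}

def calculate_delay_for_group(group_length: int) -> float:
	min_delay = 60 / settings["max_requests_per_minute"]  # 2.0 s between requests
	if (group_length * min_delay) < settings["minimal_cooldown_per_wiki_in_sec"]:
		# Small group: spread its wikis over the whole cooldown window,
		# so one full pass still takes at least the per-wiki cooldown.
		return settings["minimal_cooldown_per_wiki_in_sec"] / group_length
	return min_delay

print(calculate_delay_for_group(10))   # 6.0, stretched so each wiki cools down 60 s
print(calculate_delay_for_group(100))  # 2.0, the rate-limit floor applies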
@@ -62,15 +62,18 @@ def generate_targets(wiki_url: str) -> defaultdict:
 	return combinations
 
-async def wiki_scanner():
-	"""Wiki scanner is spawned as a task which purpose is to continuously run over wikis in the DB, fetching recent changes
-	to add messages based on the changes to message queue later handled by message_sender coroutine."""
-	try:
-		while True:
-			calc_delay = calculate_delay()
-			fetch_all = db_cursor.execute(
-				'SELECT webhook, wiki, lang, display, wikiid, rcid, postid FROM rcgcdw GROUP BY wiki')
+async def generate_domain_groups():  # oh boy, I cannot wait to learn about async generators
+	combinations = defaultdict(list)
+	fetch_all = db_cursor.execute('SELECT webhook, wiki, lang, display, wikiid, rcid FROM rcgcdw GROUP BY wiki')
+	for db_wiki in fetch_all.fetchall():
+		combinations[get_domain(db_wiki["wiki"])].append(db_wiki)
+	for item in combinations.values():
+		yield item
+
+async def scan_group(group: list):
+	calc_delay = calculate_delay_for_group(len(group))
+	for db_wiki in group:
 		logger.debug("Wiki {}".format(db_wiki["wiki"]))
 		if db_wiki["wiki"] not in all_wikis:
 			logger.info("Registering new wiki locally: {}".format(db_wiki["wiki"]))
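
Note: a runnable sketch of the grouping step above, with made-up wiki rows passed in as a parameter instead of querying the database, and a simplified stand-in for get_domain (the real helper, added to src/misc.py below, uses urllib.parse).

import asyncio
from collections import defaultdict

def get_domain(url: str) -> str:  # simplified stand-in for src.misc.get_domain
	return ".".join(url.split("/")[2].split(".")[-2:])

async def generate_domain_groups(rows):
	combinations = defaultdict(list)
	for db_wiki in rows:
		combinations[get_domain(db_wiki["wiki"])].append(db_wiki)
	for item in combinations.values():
		yield item  # one list of wikis per domain

async def main():
	rows = [{"wiki": "https://minecraft.gamepedia.com/"},
	        {"wiki": "https://gta.fandom.com/"},
	        {"wiki": "https://terraria.gamepedia.com/"}]
	async for group in generate_domain_groups(rows):
		print([w["wiki"] for w in group])  # the two gamepedia.com wikis share one group

asyncio.run(main())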
@@ -133,7 +136,21 @@ async def wiki_scanner():
 					await formatter_exception_logger(db_wiki["wiki"], change, traceback.format_exc())
 		if recent_changes:
 			DBHandler.add(db_wiki["wiki"], change["rcid"])
-		await asyncio.sleep(delay=2.0)  # temporary measure until rate limiting is not implemented
+		await asyncio.sleep(delay=calc_delay)
+
+
+async def wiki_scanner():
+	"""Wiki scanner is spawned as a task which purpose is to continuously run over wikis in the DB, fetching recent changes
+	to add messages based on the changes to message queue later handled by message_sender coroutine."""
+	try:
+		while True:
+			async for group in generate_domain_groups():
+				asyncio.create_task(scan_group(group))
+
 			fetch_all = db_cursor.execute(
 				'SELECT webhook, wiki, lang, display, wikiid, rcid, postid FROM rcgcdw GROUP BY wiki')
 			for db_wiki in fetch_all.fetchall():
+				if db_wiki["wikiid"] is not None:
+					header = settings["header"]
+					header["Accept"] = "application/hal+json"

src/misc.py

@@ -17,6 +17,12 @@ def get_paths(wiki: str, request) -> tuple:
 	return WIKI_API_PATH, WIKI_SCRIPT_PATH, WIKI_ARTICLE_PATH, WIKI_JUST_DOMAIN
 
+
+def get_domain(url: str) -> str:
+	"""Get domain of given URL"""
+	parsed_url = urlparse(url)
+	return ".".join(urlunparse((*parsed_url[0:2], "", "", "", "")).split(".")[-2:])  # something like gamepedia.com, fandom.com
+
 
 class LinkParser(HTMLParser):
 	new_string = ""
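
Note: a quick check of what get_domain returns, with URLs invented for illustration. Since it keeps only the last two dot-separated labels, a domain like example.co.uk would come back as "co.uk"; that is harmless for the gamepedia.com and fandom.com targets named in the code comment.

from urllib.parse import urlparse, urlunparse

def get_domain(url: str) -> str:
	"""Get domain of given URL"""
	parsed_url = urlparse(url)
	# Rebuild scheme://netloc, then keep the last two labels of the host
	return ".".join(urlunparse((*parsed_url[0:2], "", "", "", "")).split(".")[-2:])

print(get_domain("https://minecraft.gamepedia.com/Dirt"))  # gamepedia.com
print(get_domain("https://gta.fandom.com/wiki/GTA_V"))     # fandom.com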