This commit is contained in:
Frisk 2021-05-02 23:44:32 +02:00
parent d3115153df
commit 5f8b537259
No known key found for this signature in database
GPG key ID: 213F7C15068AF8AC

View file

@ -104,6 +104,17 @@ def weighted_average(value, weight, new_value):
return round(((value * weight) + new_value) / (weight + 1), 2) return round(((value * weight) + new_value) / (weight + 1), 2)
def class_searcher(attribs: list, sclass: str) -> bool:
    """Report whether a tag's ``class`` attribute contains the string ``sclass``.

    Meant for the attribute list HTMLParser passes to ``handle_starttag``,
    i.e. a list of ``(name, value)`` pairs.

    :param attribs: attribute pairs of a single tag
    :param sclass: string to look for inside the ``class`` attribute value
        (substring test, so a partial class name matches as well)
    :returns: True if any ``class`` attribute value contains ``sclass``,
        False otherwise (including when there are no attributes at all)
    """
    # A valueless attribute (e.g. ``<td class>``) is reported with a value of
    # None; skip it instead of raising TypeError on the ``in`` test.
    return any(
        attr[0] == "class" and attr[1] is not None and sclass in attr[1]
        for attr in attribs
    )
class ContentParser(HTMLParser): class ContentParser(HTMLParser):
"""ContentPerser is an implementation of HTMLParser that parses output of action=compare&prop=diff API request """ContentPerser is an implementation of HTMLParser that parses output of action=compare&prop=diff API request
for two MediaWiki revisions. It extracts the following: for two MediaWiki revisions. It extracts the following:
@ -125,13 +136,13 @@ class ContentParser(HTMLParser):
def handle_starttag(self, tagname, attribs):
    """Track which diff element the parser has just entered.

    :param tagname: name of the opened tag
    :param attribs: list of ``(name, value)`` attribute pairs of that tag
    """
    if tagname in ("ins", "del"):
        self.current_tag = tagname
    # Look the class up with class_searcher rather than inspecting attribs[0]:
    # attribs[0] is a (name, value) pair, so a substring test against it never
    # matches a multi-class value, and it raises IndexError on a bare <td>.
    if tagname == "td" and class_searcher(attribs, "diff-addedline") and self.ins_length <= 1000:
        # Added-line cell: entered only while ins_length is at most 1000.
        self.current_tag = "tda"
        self.last_ins = ""
    if tagname == "td" and class_searcher(attribs, "diff-deletedline") and self.del_length <= 1000:
        # Deleted-line cell: entered only while del_length is at most 1000.
        self.current_tag = "tdd"
        self.last_del = ""
    if tagname == "td" and class_searcher(attribs, "diff-empty"):
        self.empty = True
def handle_data(self, data): def handle_data(self, data):