Use RcGcDw ContentParser

This commit is contained in:
MarkusRost 2024-07-11 19:07:14 +00:00
parent faf3d3d9eb
commit 1e970337a8

View file

@ -140,6 +140,13 @@ def class_searcher(attribs: list) -> str:
class ContentParser(HTMLParser): class ContentParser(HTMLParser):
"""ContentPerser is an implementation of HTMLParser that parses output of action=compare&prop=diff API request
for two MediaWiki revisions. It extracts the following:
small_prev_ins - storing up to 1000 characters of added text
small_prev_del - storing up to 1000 characters of removed text
ins_length - storing length of inserted text
del_length - storing length of deleted text
"""
current_tag = "" current_tag = ""
last_ins = None last_ins = None
last_del = None last_del = None
@ -172,30 +179,33 @@ class ContentParser(HTMLParser):
if self.current_tag == "ins" and self.ins_length <= 1000: if self.current_tag == "ins" and self.ins_length <= 1000:
self.ins_length += len("**" + data + "**") self.ins_length += len("**" + data + "**")
if self.ins_length <= 1000: if self.ins_length <= 1000:
self.last_ins = self.last_ins or "" + "**" + data + "**" self.last_ins = self.last_ins + "**" + data + "**"
if self.current_tag == "del" and self.del_length <= 1000: if self.current_tag == "del" and self.del_length <= 1000:
self.del_length += len("~~" + data + "~~") self.del_length += len("~~" + data + "~~")
if self.del_length <= 1000: if self.del_length <= 1000:
self.last_del = self.last_del or "" + "~~" + data + "~~" self.last_del = self.last_del + "~~" + data + "~~"
if self.current_tag == "tda" and self.ins_length <= 1000: if self.current_tag == "tda" and self.ins_length <= 1000:
self.ins_length += len(data) self.ins_length += len(data)
if self.ins_length <= 1000: if self.ins_length <= 1000:
self.last_ins = self.last_ins or "" + data self.last_ins = self.last_ins + data
if self.current_tag == "tdd" and self.del_length <= 1000: if self.current_tag == "tdd" and self.del_length <= 1000:
self.del_length += len(data) self.del_length += len(data)
if self.del_length <= 1000: if self.del_length <= 1000:
self.last_del = self.last_del or "" + data self.last_del = self.last_del + data
def handle_endtag(self, tagname): def handle_endtag(self, tagname):
self.current_tag = ""
if tagname == "ins": if tagname == "ins":
self.current_tag = "tda" self.current_tag = "tda"
elif tagname == "del": elif tagname == "del":
self.current_tag = "tdd" self.current_tag = "tdd"
elif tagname == "td":
self.current_tag = ""
elif tagname == "tr": elif tagname == "tr":
if self.last_ins is not None: if self.last_ins is not None:
self.ins_length += 1 self.ins_length += 1
if self.empty and not self.last_ins.isspace() and "**" not in self.last_ins: if self.empty and not self.last_ins.isspace():
if "**" in self.last_ins:
self.last_ins = self.last_ins.replace("**", "__")
self.ins_length += 4 self.ins_length += 4
self.last_ins = "**" + self.last_ins + "**" self.last_ins = "**" + self.last_ins + "**"
self.small_prev_ins = self.small_prev_ins + "\n" + self.last_ins self.small_prev_ins = self.small_prev_ins + "\n" + self.last_ins
@ -204,7 +214,9 @@ class ContentParser(HTMLParser):
self.last_ins = None self.last_ins = None
if self.last_del is not None: if self.last_del is not None:
self.del_length += 1 self.del_length += 1
if self.empty and not self.last_del.isspace() and "~~" not in self.last_del: if self.empty and not self.last_del.isspace():
if "~~" in self.last_del:
self.last_del = self.last_del.replace("~~", "__")
self.del_length += 4 self.del_length += 4
self.last_del = "~~" + self.last_del + "~~" self.last_del = "~~" + self.last_del + "~~"
self.small_prev_del = self.small_prev_del + "\n" + self.last_del self.small_prev_del = self.small_prev_del + "\n" + self.last_del