Update diff parser

Based on Wiki-Bot diff parser
This commit is contained in:
Markus-Rost 2020-11-09 18:05:02 +01:00
parent 203ed89847
commit 1ad94b2e3e

View file

@@ -112,9 +112,11 @@ def profile_field_name(name, embed, lang):
class ContentParser(HTMLParser): class ContentParser(HTMLParser):
current_tag = "" current_tag = ""
last_ins = None
last_del = None
empty = False
small_prev_ins = "" small_prev_ins = ""
small_prev_del = "" small_prev_del = ""
added = False
def __init__(self, lang): def __init__(self, lang):
super().__init__() super().__init__()
@@ -125,58 +127,57 @@ class ContentParser(HTMLParser):
def handle_starttag(self, tagname, attribs): def handle_starttag(self, tagname, attribs):
if tagname == "ins" or tagname == "del": if tagname == "ins" or tagname == "del":
self.current_tag = tagname self.current_tag = tagname
if tagname == "td" and 'diff-addedline' in attribs[0]: if tagname == "td" and "diff-addedline" in attribs[0] and self.ins_length <= 1000:
self.current_tag = tagname + "a" self.current_tag = "tda"
if tagname == "td" and 'diff-deletedline' in attribs[0]: self.last_ins = ""
self.current_tag = tagname + "d" if tagname == "td" and "diff-deletedline" in attribs[0] and self.del_length <= 1000:
if tagname == "td" and 'diff-marker' in attribs[0]: self.current_tag = "tdd"
self.added = True self.last_del = ""
if tagname == "td" and "diff-empty" in attribs[0]:
self.empty = True
def handle_data(self, data): def handle_data(self, data):
data = escape_formatting(data) data = escape_formatting(data)
if self.current_tag == "ins" and self.ins_length <= 1000: if self.current_tag == "ins" and self.ins_length <= 1000:
self.ins_length += len("**" + data + '**') self.ins_length += len("**" + data + "**")
if self.ins_length <= 1000: if self.ins_length <= 1000:
self.small_prev_ins = self.small_prev_ins + "**" + data + '**' self.last_ins = self.last_ins + "**" + data + "**"
else:
self.small_prev_ins = self.small_prev_ins + self.more
if self.current_tag == "del" and self.del_length <= 1000: if self.current_tag == "del" and self.del_length <= 1000:
self.del_length += len("~~" + data + '~~') self.del_length += len("~~" + data + "~~")
if self.del_length <= 1000: if self.del_length <= 1000:
self.small_prev_del = self.small_prev_del + "~~" + data + '~~' self.last_del = self.last_del + "~~" + data + "~~"
else: if self.current_tag == "tda" and self.ins_length <= 1000:
self.small_prev_del = self.small_prev_del + self.more
if (self.current_tag == "afterins" or self.current_tag == "tda") and self.ins_length <= 1000:
self.ins_length += len(data) self.ins_length += len(data)
if self.ins_length <= 1000: if self.ins_length <= 1000:
self.small_prev_ins = self.small_prev_ins + data self.last_ins = self.last_ins + data
else: if self.current_tag == "tdd" and self.del_length <= 1000:
self.small_prev_ins = self.small_prev_ins + self.more
if (self.current_tag == "afterdel" or self.current_tag == "tdd") and self.del_length <= 1000:
self.del_length += len(data) self.del_length += len(data)
if self.del_length <= 1000: if self.del_length <= 1000:
self.small_prev_del = self.small_prev_del + data self.last_del = self.last_del + data
else:
self.small_prev_del = self.small_prev_del + self.more
if self.added:
if data == '+' and self.ins_length <= 1000:
self.ins_length += 1
if self.ins_length <= 1000:
self.small_prev_ins = self.small_prev_ins + '\n'
else:
self.small_prev_ins = self.small_prev_ins + self.more
if data == '' and self.del_length <= 1000:
self.del_length += 1
if self.del_length <= 1000:
self.small_prev_del = self.small_prev_del + '\n'
else:
self.small_prev_del = self.small_prev_del + self.more
self.added = False
def handle_endtag(self, tagname): def handle_endtag(self, tagname):
if tagname == "ins":
self.current_tag = "afterins"
elif tagname == "del":
self.current_tag = "afterdel"
else:
self.current_tag = "" self.current_tag = ""
if tagname == "ins":
self.current_tag = "tda"
elif tagname == "del":
self.current_tag = "tdd"
elif tagname == "tr":
if self.last_ins is not None:
self.ins_length += 1
if self.empty and not self.last_ins.isspace() and "**" not in self.last_ins:
self.ins_length += 4
self.last_ins = "**" + self.last_ins + "**"
self.small_prev_ins = self.small_prev_ins + "\n" + self.last_ins
if self.ins_length > 1000:
self.small_prev_ins = self.small_prev_ins + self.more
self.last_ins = None
if self.last_del is not None:
self.del_length += 1
if self.empty and not self.last_del.isspace() and "~~" not in self.last_del:
self.del_length += 4
self.last_del = "~~" + self.last_del + "~~"
self.small_prev_del = self.small_prev_del + "\n" + self.last_del
if self.del_length > 1000:
self.small_prev_del = self.small_prev_del + self.more
self.last_del = None
self.empty = False