diff --git a/src/misc.py b/src/misc.py index 0f7ea08..579896b 100644 --- a/src/misc.py +++ b/src/misc.py @@ -140,6 +140,13 @@ def class_searcher(attribs: list) -> str: class ContentParser(HTMLParser): + """ContentParser is an implementation of HTMLParser that parses output of action=compare&prop=diff API request + for two MediaWiki revisions. It extracts the following: + small_prev_ins - storing up to 1000 characters of added text + small_prev_del - storing up to 1000 characters of removed text + ins_length - storing length of inserted text + del_length - storing length of deleted text + """ current_tag = "" last_ins = None last_del = None @@ -172,30 +179,33 @@ class ContentParser(HTMLParser): if self.current_tag == "ins" and self.ins_length <= 1000: self.ins_length += len("**" + data + "**") if self.ins_length <= 1000: - self.last_ins = self.last_ins or "" + "**" + data + "**" + self.last_ins = self.last_ins + "**" + data + "**" if self.current_tag == "del" and self.del_length <= 1000: self.del_length += len("~~" + data + "~~") if self.del_length <= 1000: - self.last_del = self.last_del or "" + "~~" + data + "~~" + self.last_del = self.last_del + "~~" + data + "~~" if self.current_tag == "tda" and self.ins_length <= 1000: self.ins_length += len(data) if self.ins_length <= 1000: - self.last_ins = self.last_ins or "" + data + self.last_ins = self.last_ins + data if self.current_tag == "tdd" and self.del_length <= 1000: self.del_length += len(data) if self.del_length <= 1000: - self.last_del = self.last_del or "" + data + self.last_del = self.last_del + data def handle_endtag(self, tagname): - self.current_tag = "" if tagname == "ins": self.current_tag = "tda" elif tagname == "del": self.current_tag = "tdd" + elif tagname == "td": + self.current_tag = "" elif tagname == "tr": if self.last_ins is not None: self.ins_length += 1 - if self.empty and not self.last_ins.isspace() and "**" not in self.last_ins: + if self.empty and not self.last_ins.isspace(): + if "**" in 
self.last_ins: + self.last_ins = self.last_ins.replace("**", "__") self.ins_length += 4 self.last_ins = "**" + self.last_ins + "**" self.small_prev_ins = self.small_prev_ins + "\n" + self.last_ins @@ -204,7 +214,9 @@ class ContentParser(HTMLParser): self.last_ins = None if self.last_del is not None: self.del_length += 1 - if self.empty and not self.last_del.isspace() and "~~" not in self.last_del: + if self.empty and not self.last_del.isspace(): + if "~~" in self.last_del: + self.last_del = self.last_del.replace("~~", "__") self.del_length += 4 self.last_del = "~~" + self.last_del + "~~" self.small_prev_del = self.small_prev_del + "\n" + self.last_del