mirror of
https://gitlab.com/chicken-riders/RcGcDb.git
synced 2025-02-23 00:54:09 +00:00
Use RcGcDw ContentParser
This commit is contained in:
parent
faf3d3d9eb
commit
1e970337a8
26
src/misc.py
26
src/misc.py
|
@ -140,6 +140,13 @@ def class_searcher(attribs: list) -> str:
|
|||
|
||||
|
||||
class ContentParser(HTMLParser):
|
||||
"""ContentParser is an implementation of HTMLParser that parses the output of an action=compare&prop=diff API request
|
||||
for two MediaWiki revisions. It extracts the following:
|
||||
small_prev_ins - storing up to 1000 characters of added text
|
||||
small_prev_del - storing up to 1000 characters of removed text
|
||||
ins_length - storing length of inserted text
|
||||
del_length - storing length of deleted text
|
||||
"""
|
||||
current_tag = ""
|
||||
last_ins = None
|
||||
last_del = None
|
||||
|
@ -172,30 +179,33 @@ class ContentParser(HTMLParser):
|
|||
if self.current_tag == "ins" and self.ins_length <= 1000:
|
||||
self.ins_length += len("**" + data + "**")
|
||||
if self.ins_length <= 1000:
|
||||
self.last_ins = self.last_ins or "" + "**" + data + "**"
|
||||
self.last_ins = self.last_ins + "**" + data + "**"
|
||||
if self.current_tag == "del" and self.del_length <= 1000:
|
||||
self.del_length += len("~~" + data + "~~")
|
||||
if self.del_length <= 1000:
|
||||
self.last_del = self.last_del or "" + "~~" + data + "~~"
|
||||
self.last_del = self.last_del + "~~" + data + "~~"
|
||||
if self.current_tag == "tda" and self.ins_length <= 1000:
|
||||
self.ins_length += len(data)
|
||||
if self.ins_length <= 1000:
|
||||
self.last_ins = self.last_ins or "" + data
|
||||
self.last_ins = self.last_ins + data
|
||||
if self.current_tag == "tdd" and self.del_length <= 1000:
|
||||
self.del_length += len(data)
|
||||
if self.del_length <= 1000:
|
||||
self.last_del = self.last_del or "" + data
|
||||
self.last_del = self.last_del + data
|
||||
|
||||
def handle_endtag(self, tagname):
|
||||
self.current_tag = ""
|
||||
if tagname == "ins":
|
||||
self.current_tag = "tda"
|
||||
elif tagname == "del":
|
||||
self.current_tag = "tdd"
|
||||
elif tagname == "td":
|
||||
self.current_tag = ""
|
||||
elif tagname == "tr":
|
||||
if self.last_ins is not None:
|
||||
self.ins_length += 1
|
||||
if self.empty and not self.last_ins.isspace() and "**" not in self.last_ins:
|
||||
if self.empty and not self.last_ins.isspace():
|
||||
if "**" in self.last_ins:
|
||||
self.last_ins = self.last_ins.replace("**", "__")
|
||||
self.ins_length += 4
|
||||
self.last_ins = "**" + self.last_ins + "**"
|
||||
self.small_prev_ins = self.small_prev_ins + "\n" + self.last_ins
|
||||
|
@ -204,7 +214,9 @@ class ContentParser(HTMLParser):
|
|||
self.last_ins = None
|
||||
if self.last_del is not None:
|
||||
self.del_length += 1
|
||||
if self.empty and not self.last_del.isspace() and "~~" not in self.last_del:
|
||||
if self.empty and not self.last_del.isspace():
|
||||
if "~~" in self.last_del:
|
||||
self.last_del = self.last_del.replace("~~", "__")
|
||||
self.del_length += 4
|
||||
self.last_del = "~~" + self.last_del + "~~"
|
||||
self.small_prev_del = self.small_prev_del + "\n" + self.last_del
|
||||
|
|
Loading…
Reference in a new issue