From d85f7c3109df027110cf8c7ed2d3de2aa128e7ec Mon Sep 17 00:00:00 2001 From: Chris Xiong Date: Sun, 4 Aug 2024 18:48:36 -0400 Subject: Unescape html in tweets. --- utils/twitter_import.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/utils/twitter_import.py b/utils/twitter_import.py index 90d0c7c..76170bb 100644 --- a/utils/twitter_import.py +++ b/utils/twitter_import.py @@ -14,6 +14,11 @@ def procline(x): r = chr(0x200b) + r # extremely hacky return r +def htmlunescape(s): + return s.replace("&lt;", "<")\ + .replace("&gt;", ">")\ + .replace("&amp;", "&") + class Tweet: def __init__(self, tid, text, date, tags, media, urls, replying_to): self.tid = tid @@ -26,12 +31,13 @@ class Tweet: def parse(json): t = json["tweet"] - text = t["full_text"] + text = htmlunescape(t["full_text"]) #tcolinks = TCORE.findall(text) e = t["entities"] ee = t["extended_entities"] if "extended_entities" in t else None tid = t["id"] tags = [tag["text"] for tag in e["hashtags"]] + tags = ["Tweet"] + tags is_reply = "in_reply_to_status_id_str" in t replying_to = None if is_reply: -- cgit v1.2.3