import html
import json
import os
import sys
from datetime import datetime

from config import conf
def htmlunescape(s):
    """Decode HTML character references in *s* and return the resulting text.

    Decoding is delegated to the standard library's ``html.unescape``, which
    processes the string in a single pass. Text that was escaped twice is
    therefore decoded exactly once, instead of an escaped ampersand being
    decoded first and then combining with the following characters into a
    second entity. Named references and numeric references (decimal and
    hexadecimal) are all handled.

    Args:
        s: Text containing HTML character references.

    Returns:
        The text with each character reference replaced by the character it
        denotes. Everything else, including any markup, is returned as-is.
    """
    return html.unescape(s)
def proc_post(o, msdata):
    """Convert one outbox activity into a ``.post`` file plus linked images.

    The post text is the unescaped ``content`` of the activity's object,
    followed by a ``#Toot`` tag line and one ``[basename]`` line per JPEG/PNG
    attachment. The file is written below ``conf.LOCAL_DATA_ROOT/posts/<year>``
    and each image is hard-linked from the archive into
    ``conf.LOCAL_DATA_ROOT/media_orig/<year>`` with the post's Unix timestamp
    as a filename prefix.

    Args:
        o: One entry of the outbox's ``orderedItems`` list. The keys read are
            ``published``, ``cc`` and ``object`` (with ``content`` and
            ``attachment``).
        msdata: Path of the extracted archive directory; attachment URLs are
            resolved relative to it.

    Returns:
        None. Activities whose ``object`` is not an embedded object, and
        activities treated as replies, are skipped without writing anything.

    Raises:
        KeyError: If an expected key is missing from the activity.
        OSError: If a directory, link or file cannot be created (an already
            existing image link is tolerated).
    """
    obj = o["object"]
    # ActivityStreams permits ``object`` to be a bare URI string instead of an
    # embedded object (presumably how boosts appear in an export -- confirm
    # against a real archive). There is no content to convert in that case.
    if not isinstance(obj, dict):
        return

    # An activity with more than one "cc" recipient is treated as a reply,
    # and replies are not exported.
    if len(o["cc"]) > 1:
        return

    date = datetime.fromisoformat(o["published"])
    ts = int(date.timestamp())
    text = htmlunescape(obj["content"])
    images = [
        a["url"]
        for a in obj["attachment"]
        if a["mediaType"] in ("image/jpeg", "image/png")
    ]

    year = str(date.year)
    media_dir = os.path.join(conf.LOCAL_DATA_ROOT, "media_orig", year)
    post_dir = os.path.join(conf.LOCAL_DATA_ROOT, "posts", year)
    # Drops the last six characters of the ISO string and appends "Z".
    # NOTE(review): this assumes isoformat() ends in a "+00:00" offset, i.e.
    # that "published" is a timezone-aware UTC timestamp -- confirm.
    post_fn = f"{ts}-{date.isoformat()[:-6]}Z.post"
    os.makedirs(post_dir, 0o755, exist_ok=True)

    parts = [text, "\n\n#Toot\n"]
    if images:
        parts.append("\n")
        os.makedirs(media_dir, 0o755, exist_ok=True)
        for url in images:
            # Strip the leading slash(es) so os.path.join keeps the path
            # inside the archive directory instead of treating it as absolute.
            rel = url.lstrip("/")
            bn = os.path.basename(rel)
            try:
                os.link(
                    os.path.join(msdata, rel),
                    os.path.join(media_dir, f"{ts}-{bn}"),
                )
            except FileExistsError:
                # Already linked by an earlier run. The directories and the
                # post file are rerun-safe too, so a second run should not
                # abort here.
                pass
            parts.append(f"[{bn}]\n")

    # Explicit UTF-8 so non-ASCII post text does not depend on the locale.
    with open(os.path.join(post_dir, post_fn), "w", encoding="utf-8") as f:
        f.write("".join(parts))
def main():
    """Convert every activity listed in a Mastodon archive's ``outbox.json``.

    The archive directory is taken from the first command-line argument.
    ``outbox.json`` inside it is parsed and each entry of its
    ``orderedItems`` list is passed to ``proc_post`` together with the
    archive directory.

    Raises:
        SystemExit: With status 1 when no archive directory was given.
        OSError: If ``outbox.json`` cannot be opened.
        json.JSONDecodeError: If ``outbox.json`` is not valid JSON.
    """
    if len(sys.argv) < 2:
        print(
            "specify the data directory of your mastodon archive on the command line pls",
            file=sys.stderr,
        )
        # sys.exit rather than the bare exit() helper, which is only
        # installed by the site module and may be absent.
        sys.exit(1)

    msdata = sys.argv[1]
    # JSON interchange is UTF-8; do not depend on the locale's default codec.
    with open(os.path.join(msdata, "outbox.json"), encoding="utf-8") as f:
        outbox = json.load(f)

    for item in outbox["orderedItems"]:
        proc_post(item, msdata)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()