# utils/mastodon_import.py

import os
import sys
import json
from config import conf
from datetime import datetime

def htmlunescape(s):
    """Return *s* with HTML character references decoded.

    The previous hand-rolled chain of ``str.replace`` calls decoded
    ``&amp;`` before ``&#39;`` and ``&quot;``, so text that was escaped on
    purpose (e.g. ``&amp;#39;``, meaning the literal characters ``&#39;``)
    was decoded twice. ``html.unescape`` resolves every reference in a
    single pass and also covers entities the old chain did not know about.

    Args:
        s: HTML-escaped text.

    Returns:
        The text with named and numeric character references replaced by
        the characters they stand for.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    import html

    return html.unescape(s)

def proc_post(o, msdata):
    """Import one outbox activity as a ``.post`` file plus linked images.

    The post text is written to
    ``<LOCAL_DATA_ROOT>/posts/<year>/<ts>-<iso time>Z.post`` and every JPEG
    or PNG attachment is hard-linked into
    ``<LOCAL_DATA_ROOT>/media_orig/<year>/`` with the post timestamp as a
    file-name prefix.

    Args:
        o: One entry of ``orderedItems`` from ``outbox.json``.
        msdata: Root directory of the extracted archive; attachment URLs
            are resolved relative to it.

    Returns:
        None. Activities that are skipped (see below) produce no output.

    Skipped activities:
        * entries whose ``object`` is not a mapping -- e.g. boosts, which
          presumably reference the boosted status by URL only; indexing
          such an object used to raise ``TypeError``;
        * entries with more than one ``cc`` recipient, which this script
          treats as replies.
    """
    obj = o["object"]
    if not isinstance(obj, dict):
        return
    # Reply heuristic: anything cc'ed to more than one recipient is skipped.
    if len(o.get("cc", ())) > 1:
        return

    published = o["published"]
    # datetime.fromisoformat() accepts a trailing "Z" only from Python 3.11
    # on; spell the UTC offset out so older interpreters parse it as well.
    if published.endswith("Z"):
        published = published[:-1] + "+00:00"
    date = datetime.fromisoformat(published)
    ts = int(date.timestamp())

    text = htmlunescape(obj["content"])
    images = [
        a["url"]
        for a in obj.get("attachment", ())
        if a.get("mediaType") in ("image/jpeg", "image/png")
    ]

    year = str(date.year)
    media_dir = os.path.join(conf.LOCAL_DATA_ROOT, "media_orig", year)
    post_dir = os.path.join(conf.LOCAL_DATA_ROOT, "posts", year)
    # Wall-clock time without the UTC offset, then a literal "Z". Dropping
    # tzinfo (rather than slicing off six characters) also works for
    # timestamps that carry no offset at all.
    post_fn = f"{ts}-{date.replace(tzinfo=None).isoformat()}Z.post"
    os.makedirs(post_dir, mode=0o755, exist_ok=True)

    parts = [text, "\n\n#Toot\n"]
    if images:
        parts.append("\n")
        os.makedirs(media_dir, mode=0o755, exist_ok=True)
    for url in images:
        # Attachment URLs are archive-absolute ("/media_attachments/...");
        # drop the leading slash so os.path.join keeps the msdata prefix.
        rel = url.lstrip("/")
        bn = os.path.basename(rel)
        try:
            os.link(os.path.join(msdata, rel),
                    os.path.join(media_dir, f"{ts}-{bn}"))
        except FileExistsError:
            # Already linked by an earlier run; the post file is rewritten
            # below anyway, so a re-import stays idempotent.
            pass
        parts.append(f"[{bn}]\n")

    # Posts routinely contain non-ASCII text; do not rely on the locale's
    # default encoding.
    with open(os.path.join(post_dir, post_fn), "w", encoding="utf-8") as f:
        f.write("".join(parts))

def main():
    """Import every activity from a Mastodon archive's ``outbox.json``.

    The archive's data directory is taken from the first command-line
    argument. Each entry of ``orderedItems`` is handed to ``proc_post``.

    Raises:
        SystemExit: with status 1 when no directory was given.
    """
    if len(sys.argv) < 2:
        print("specify the data directory of your mastodon archive on the command line pls")
        # sys.exit rather than the bare exit() helper, which is injected by
        # the site module and is not guaranteed to exist.
        sys.exit(1)
    msdata = sys.argv[1]
    # JSON is UTF-8; do not depend on the locale's default encoding.
    with open(os.path.join(msdata, "outbox.json"), encoding="utf-8") as f:
        outbox = json.load(f)
    for p in outbox['orderedItems']:
        proc_post(p, msdata)

# Run the import only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()