aboutsummaryrefslogtreecommitdiff
path: root/fetch-documents.php
blob: 315f175f3553a0fd1d4b36c1ab5174df6d2581e9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
<?php

require_once __DIR__.'/common.php';

ini_set('memory_limit', '3072M');

/**
 * Recursively collects the attachments reachable from a message-like array.
 *
 * Sources that are walked:
 *  - every entry of $obj['attachments'];
 *  - for entries of type 'wall' / 'wall_reply', whatever is nested in the
 *    entry itself and in its payload ($attachment[$type]);
 *  - every entry of $obj['fwd_messages'], recursively.
 *
 * Entries whose payload is not stored under the key named by their 'type'
 * are dropped. The rest are de-duplicated: entries with the same type,
 * owner_id and id are returned once (first occurrence wins); entries that
 * carry neither owner_id nor id are de-duplicated by payload content.
 *
 * @param array $obj decoded message, forwarded message or post
 * @return array sequentially indexed list of attachment arrays, each in its
 *               original form (['type' => T, T => payload, ...])
 */
function findAllAttachments(array $obj): array {
    $found = [];

    if (!empty($obj['attachments']) && is_array($obj['attachments'])) {
        foreach ($obj['attachments'] as $attachment) {
            if (!is_array($attachment))
                continue;

            $found[] = $attachment;

            $type = $attachment['type'] ?? null;
            if ($type !== 'wall' && $type !== 'wall_reply')
                continue;

            // Look on the wrapper as before, and also inside the payload.
            // NOTE(review): the payload is read from $attachment[$type]
            // everywhere else, so nested attachments presumably live there
            // too -- confirm against real data.
            $found = array_merge($found, findAllAttachments($attachment));
            if (isset($attachment[$type]) && is_array($attachment[$type]))
                $found = array_merge($found, findAllAttachments($attachment[$type]));
        }
    }

    if (!empty($obj['fwd_messages']) && is_array($obj['fwd_messages'])) {
        foreach ($obj['fwd_messages'] as $fwd_message) {
            if (is_array($fwd_message))
                $found = array_merge($found, findAllAttachments($fwd_message));
        }
    }

    $seen = [];
    $unique = [];
    foreach ($found as $attachment) {
        $type = $attachment['type'] ?? null;
        if (!is_string($type) || !isset($attachment[$type]))
            // weird
            continue;

        $attach = $attachment[$type];

        if (isset($attach['owner_id']) || isset($attach['id'])) {
            // Use the actual id value; appending isset(...) would add "1"
            // for every attachment and merge unrelated ones.
            $key = $type.':'.($attach['owner_id'] ?? '').'_'.($attach['id'] ?? '');
        } else {
            // No identity fields: only identical payloads count as duplicates.
            $key = $type.'#'.md5(serialize($attach));
        }

        if (isset($seen[$key]))
            continue;

        $seen[$key] = true;
        $unique[] = $attachment;
    }
    return $unique;
}

// Downloads every "doc" attachment referenced by the archived API responses
// in ARCHIVE_DIR/messages/api/<dir>/<digits>.txt and stores each one as
// ARCHIVE_DIR/messages/docs/<owner_id>_<id>/<title>.
$api_dir = ARCHIVE_DIR.'/messages/api';

// An existing file of exactly this many bytes is deleted and fetched again.
// NOTE(review): presumably the size of a placeholder/error page saved by an
// earlier run -- confirm.
$stale_size = 56655;

// Characters that must never reach a path component built from remote data.
$unsafe_chars = ['/', '\\', "\0"];

$dirs = scandir($api_dir);
if ($dirs === false) {
    fatalError("failed to scandir({$api_dir})");
    $dirs = []; // only reached if fatalError() returns
}

foreach ($dirs as $n) {
    if ($n == '.' || $n == '..')
        continue;

    // Entries that are not directories (or cannot be listed) are skipped.
    $files = is_dir($api_dir.'/'.$n) ? scandir($api_dir.'/'.$n) : false;
    if ($files === false)
        continue;

    foreach ($files as $file) {
        if (!preg_match('/^\d+\.txt$/', $file))
            continue;

        $path = $api_dir.'/'.$n.'/'.$file;
        $json = file_get_contents($path);
        $obj = $json === false ? null : json_decode($json, true);
        if (!is_array($obj)) {
            // findAllAttachments() requires an array; report and move on.
            fwrite(STDERR, "failed to read or decode {$path}\n");
            continue;
        }
        unset($json);

        $docs = array_filter(findAllAttachments($obj), function($a) {
            return isset($a['type']) && $a['type'] == 'doc';
        });
        if (empty($docs))
            continue;

        foreach ($docs as $doc) {
            $doc = $doc['doc']; // seriously?!
            if (!isset($doc['owner_id'], $doc['id'], $doc['url'])) {
                fwrite(STDERR, "skipping doc without owner_id/id/url in {$path}\n");
                continue;
            }

            $doc_id = str_replace($unsafe_chars, '_', $doc['owner_id'].'_'.$doc['id']);

            $doc_dir = ARCHIVE_DIR.'/messages/docs/'.$doc_id;
            if (!file_exists($doc_dir)) {
                if (!mkdir($doc_dir, 0755, true))
                    fatalError("failed to mkdir({$doc_dir})");
            }

            // The title comes from remote data: neutralise path separators so
            // it cannot escape $doc_dir, and fall back to the doc id when
            // nothing usable is left.
            $doc_name = is_scalar($doc['title'] ?? null)
                ? str_replace($unsafe_chars, '_', (string)$doc['title'])
                : '';
            if ($doc_name === '' || $doc_name === '.' || $doc_name === '..')
                $doc_name = $doc_id;

            $doc_file = $doc_dir.'/'.$doc_name;
            if (file_exists($doc_file)) {
                if (filesize($doc_file) == $stale_size)
                    unlink($doc_file);
                else {
                    echo "$doc_id already exists\n";
                    continue;
                }
            }

            list($code, $body) = httpGet($doc['url']);
            if ($code != 200) {
                // fwrite, not fprintf: the URL may contain '%' sequences that
                // would be parsed as format specifiers.
                fwrite(STDERR, "failed to download {$doc_id} ({$doc['url']})\n");

                // Remove the directory only when it holds nothing else.
                $left = scandir($doc_dir);
                if ($left !== false && count($left) <= 2)
                    rmdir($doc_dir);
                continue;
            }

            $written = file_put_contents($doc_file, $body);
            unset($body);
            if ($written === false) {
                fwrite(STDERR, "failed to write {$doc_file}\n");
                continue;
            }

            echo "$doc_id saved, ".$written." bytes\n";
        }
    }
}