Diff of /docproduct/get_data.py [000000] .. [51873b]

Switch to unified view

a b/docproduct/get_data.py
1
""" Download Pushshift Data """
2
3
import os
4
from urllib import request as req
5
import re
6
import pycurl
7
import hashlib
8
# Define values

# Directory listings that are scraped for archive links.
URLS = [
    "https://files.pushshift.io/reddit/comments/",
    "https://files.pushshift.io/reddit/submissions",
]

# Captures the href of every anchor whose link text ends in ".bz2".
# - `[^>]*` keeps the attribute scan inside a single <a ...> tag, so several
#   anchors on one line are all found instead of only the last one.
# - The href value may not contain quotes, whitespace or ">", so the capture
#   cannot run past its closing quote.
# - The dot before "bz2" is escaped so it only matches a literal ".".
BZ2_LINK_RE_PATTERN = r"<a\s[^>]*href=[\"']([^\"'\s>]+)[\"'][^>]*>\S*\.bz2</a>"

# Captures the href of every anchor whose link text starts with "sha256".
SHA256_LINK_RE_PATTERN = r"<a\s[^>]*href=[\"']([^\"'\s>]+)[\"'][^>]*>sha256\S*</a>"

# Local directory the downloaded archives are written to.
OUTPUT_DIR = "reddit_submissions"
14
# Define functions
15
def _sha256_of_file(path, chunk_size=1024 * 1024):
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in chunks of *chunk_size* bytes so that large archives
    are never held in memory in one piece, and the handle is always closed.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _remove_corrupt_files(base_url, listing_page):
    """Delete files in OUTPUT_DIR whose SHA-256 differs from the published one.

    *listing_page* is the HTML of the directory listing at *base_url*; the
    first link matching SHA256_LINK_RE_PATTERN is fetched and parsed as
    "<hash>  <file name>" lines.
    """
    sha256_links = re.findall(SHA256_LINK_RE_PATTERN, listing_page)
    if not sha256_links:
        # No checksum file is linked from this listing: nothing to verify.
        return

    # Use the first match and remove the "./" from the start of the link.
    sha256_link = sha256_links[0][2:]
    with req.urlopen("%s/%s" % (base_url, sha256_link)) as response:
        hash_file = response.read().decode("utf-8")

    expected_hashes = {}
    for entry in hash_file.splitlines():
        parts = entry.split("  ")
        if len(parts) > 1:
            expected_hashes[parts[1]] = parts[0]

    for name, expected in expected_hashes.items():
        file_path = os.path.join(OUTPUT_DIR, name)
        # isfile() skips names that are missing locally or resolve to a
        # directory (e.g. an empty name), which cannot be hashed.
        if os.path.isfile(file_path) and _sha256_of_file(file_path) != expected:
            print("File is corrput, deleting %s" % file_path)
            os.remove(file_path)


def _download(curl, url, file_path):
    """Download *url* to *file_path* using the given pycurl handle.

    If the transfer is interrupted for any reason the partial file is
    removed, so that a later run does not mistake it for a finished download.
    """
    try:
        with open(file_path, "wb") as target:
            curl.setopt(curl.URL, url)
            curl.setopt(curl.WRITEDATA, target)
            curl.perform()
    except BaseException:
        # BaseException on purpose: Ctrl-C during a long transfer must also
        # clean up the truncated file before the error propagates.
        if os.path.exists(file_path):
            os.remove(file_path)
        raise


def main():
    """The main entrypoint.

    For every listing in URLS: collect the linked .bz2 archives, drop local
    copies in OUTPUT_DIR whose checksum does not match the published one
    (only when OUTPUT_DIR already existed), then download every archive that
    is not present locally.
    """
    for listing_url in URLS:
        with req.urlopen(listing_url) as response:
            submissions_page = response.read().decode("utf-8")

        # Entries in URLS differ in whether they end with "/"; normalise so
        # that joined URLs never contain a double slash.
        base_url = listing_url.rstrip("/")

        # Get BZ2 links: keep relative "./name" links only, without the "./".
        raw_links = re.findall(BZ2_LINK_RE_PATTERN, submissions_page)
        individual_links = {link[2:] for link in raw_links if link.startswith("./")}

        if not os.path.exists(OUTPUT_DIR):
            os.makedirs(OUTPUT_DIR)
        else:
            # Only an already existing directory can contain stale files.
            _remove_corrupt_files(base_url, submissions_page)

        # Download files
        curl = pycurl.Curl()
        try:
            for link in sorted(individual_links):
                file_path = os.path.join(OUTPUT_DIR, link)
                if os.path.exists(file_path):
                    continue
                _download(curl, base_url + "/" + link, file_path)
                print("Downloaded", link)
        finally:
            # Release the handle even when a transfer fails.
            curl.close()
55
56
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()