[51873b]: / docproduct / get_data.py

Download this file

59 lines (47 with data), 2.2 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
""" Download Pushshift Data """
import os
from urllib import request as req
import re
import pycurl
import hashlib
# Define values
# Index pages that list the Pushshift dump archives to download.
URLS = [
    "https://files.pushshift.io/reddit/comments/",
    "https://files.pushshift.io/reddit/submissions",
]
# Both patterns capture the href of an anchor whose link text identifies the
# file.  ``[^>]*`` (rather than a greedy ``.*``) keeps the search for ``href``
# inside a single tag, so several anchors on one line are each matched
# instead of only the last one.  The dot before ``bz2`` is escaped so that it
# matches a literal "." only.
BZ2_LINK_RE_PATTERN = r"<a\s[^>]*href=[\"'](\S+)[\"'][^>]*>\S*\.bz2<\/a>"
SHA256_LINK_RE_PATTERN = r"<a\s[^>]*href=[\"'](\S+)[\"'][^>]*>sha256\S*<\/a>"
# Local directory that receives the downloaded archives.
OUTPUT_DIR = "reddit_submissions"
# Define functions
def _sha256_of_file(path, chunk_size=1 << 20):
    """Return the hex SHA-256 digest of the file at *path*, read in chunks."""
    digest = hashlib.sha256()
    # The archives can be very large, so never hold a whole file in memory.
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _parse_hash_listing(text):
    """Map file name -> expected hex digest from a ``sha256sum``-style listing."""
    expected = {}
    for line in text.splitlines():
        # sha256sum separates digest and name with two characters ("  " or
        # " *"), so split on runs of whitespace, not on a single space.
        fields = line.split()
        if len(fields) < 2:
            continue
        # Keep only the base name: downloads are stored flat in OUTPUT_DIR,
        # and a remote listing must not be able to point outside of it.
        name = fields[-1].lstrip("*").split("/")[-1]
        if name:
            expected[name] = fields[0].lower()
    return expected


def _remove_corrupt_files(root, index_page):
    """Delete files in OUTPUT_DIR whose digest differs from the published one."""
    sha256_links = re.findall(SHA256_LINK_RE_PATTERN, index_page)
    if not sha256_links:
        # The page publishes no checksum listing; nothing can be verified.
        return
    # Use the first match and drop the "./" from the start of the link.
    sha256_link = sha256_links[0]
    if sha256_link.startswith("./"):
        sha256_link = sha256_link[2:]
    with req.urlopen("%s/%s" % (root, sha256_link)) as response:
        expected = _parse_hash_listing(response.read().decode("utf-8"))
    for name, digest in expected.items():
        file_path = os.path.join(OUTPUT_DIR, name)
        if os.path.isfile(file_path) and _sha256_of_file(file_path) != digest:
            print("File is corrput, deleting %s" % file_path)
            os.remove(file_path)


def _download(curl, url, file_path):
    """Fetch *url* into *file_path*, leaving no partial file behind on failure."""
    # Write to a temporary name first: an interrupted transfer must not leave
    # a truncated file that a later run would treat as already downloaded.
    partial_path = file_path + ".part"
    try:
        with open(partial_path, "wb") as out:
            curl.setopt(curl.URL, url)
            curl.setopt(curl.WRITEDATA, out)
            curl.perform()
        os.replace(partial_path, file_path)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)


def main():
    """Download every ``.bz2`` archive linked from the pages in ``URLS``.

    For each index page:

    1. Collect the relative (``./``) links whose text ends in ``.bz2``.
    2. If ``OUTPUT_DIR`` already exists, fetch the page's SHA-256 listing
       and delete local files whose digest does not match, so they are
       downloaded again; otherwise create the directory.
    3. Download each archive that is not yet present in ``OUTPUT_DIR``.

    Errors raised by ``urllib`` or ``pycurl`` propagate to the caller.
    """
    for base_url in URLS:
        # URLS mixes entries with and without a trailing slash; normalise so
        # that joined URLs never contain a double slash.
        root = base_url.rstrip("/")
        with req.urlopen(base_url) as response:
            index_page = response.read().decode("utf-8")

        # Get BZ2 links: unique, relative ones only, without the "./" prefix.
        links = sorted({
            href[2:]
            for href in re.findall(BZ2_LINK_RE_PATTERN, index_page)
            if href.startswith("./")
        })

        # A freshly created directory has nothing to verify.
        if os.path.exists(OUTPUT_DIR):
            _remove_corrupt_files(root, index_page)
        else:
            os.makedirs(OUTPUT_DIR)

        # Download files, reusing one curl handle for the whole page.
        curl = pycurl.Curl()
        try:
            for link in links:
                file_path = os.path.join(OUTPUT_DIR, link)
                if os.path.exists(file_path):
                    continue
                _download(curl, root + "/" + link, file_path)
                print("Downloaded", link)
        finally:
            curl.close()
# Script entry point: start the download only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()