|
a |
|
b/docproduct/get_data.py |
|
|
1 |
""" Download Pushshift Data """ |
|
|
2 |
|
|
|
3 |
import os |
|
|
4 |
from urllib import request as req |
|
|
5 |
import re |
|
|
6 |
import pycurl |
|
|
7 |
import hashlib |
|
|
8 |
# Define values |
|
|
9 |
|
|
|
10 |
# Directory listings that are scraped for archive links.
URLS = [
    "https://files.pushshift.io/reddit/comments/",
    "https://files.pushshift.io/reddit/submissions",
]
# Captures the href of anchors whose link text ends in ".bz2". The dot is
# escaped so that only a literal ".bz2" suffix matches.
BZ2_LINK_RE_PATTERN = r"<a\s.*href=[\"'](\S+)[\"'][^>]*>\S*\.bz2<\/a>"
# Captures the href of anchors whose link text starts with "sha256".
SHA256_LINK_RE_PATTERN = r"<a\s.*href=[\"'](\S+)[\"'][^>]*>sha256\S*<\/a>"
# Local directory that receives every downloaded archive.
OUTPUT_DIR = "reddit_submissions"
|
|
14 |
# Define functions |
|
|
15 |
def _fetch_text(url):
    """Return the body of ``url`` decoded as UTF-8, closing the response."""
    with req.urlopen(url) as response:
        return response.read().decode("utf-8")


def _sha256_of_file(path, chunk_size=1024 * 1024):
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in ``chunk_size``-byte pieces so that a large archive
    is never held in memory in full, and the handle is always closed.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _parse_checksums(listing):
    """Parse ``<digest> <name>`` lines into a ``{name: digest}`` dict.

    Fields are split on any run of whitespace, so one- and two-space
    separators are both accepted. Lines without two fields are ignored.
    Only the final path component of each name is kept, which keeps the
    resulting local paths inside ``OUTPUT_DIR``.
    """
    checksums = {}
    for entry in listing.splitlines():
        fields = entry.split(None, 1)
        if len(fields) != 2:
            continue
        digest, name = fields
        name = os.path.basename(name.strip())
        if name:
            checksums[name] = digest.lower()
    return checksums


def _remove_corrupt_files(checksums):
    """Delete files in ``OUTPUT_DIR`` whose digest differs from ``checksums``."""
    for name, expected in checksums.items():
        file_path = os.path.join(OUTPUT_DIR, name)
        if os.path.isfile(file_path) and _sha256_of_file(file_path) != expected:
            print("File is corrput, deleting %s" % file_path)
            os.remove(file_path)


def _download_missing(base_url, links):
    """Download every entry of ``links`` that is not yet in ``OUTPUT_DIR``.

    Each file is written to a ``.part`` sibling and renamed only after the
    transfer returns, so an interrupted download is not later mistaken for
    a complete file. The curl handle is closed even if a transfer raises.
    """
    curl = pycurl.Curl()
    try:
        for link in sorted(links):
            file_path = os.path.join(OUTPUT_DIR, link)
            if os.path.exists(file_path):
                continue
            partial_path = file_path + ".part"
            with open(partial_path, "wb") as handle:
                curl.setopt(curl.URL, base_url + "/" + link)
                curl.setopt(curl.WRITEDATA, handle)
                curl.perform()
            os.replace(partial_path, file_path)
            print("Downloaded", link)
    finally:
        curl.close()


def main():
    """The main entrypoint.

    For every listing in ``URLS``: collect the ``./``-relative ``.bz2``
    links, create ``OUTPUT_DIR`` if it is missing (otherwise verify the
    files already in it against the listing's sha256 file and delete the
    mismatching ones), then download every archive that is not present.
    """
    for listing_url in URLS:
        # Join without a trailing slash so file URLs never contain "//".
        base_url = listing_url.rstrip("/")
        page = _fetch_text(listing_url)

        # Keep only relative "./name" links, deduplicated, without the "./".
        links = {
            href[2:]
            for href in re.findall(BZ2_LINK_RE_PATTERN, page)
            if href.startswith("./")
        }

        if not os.path.exists(OUTPUT_DIR):
            os.makedirs(OUTPUT_DIR)
        else:
            # The directory already existed, so it may hold earlier
            # downloads: check them against the published digests.
            sha256_links = re.findall(SHA256_LINK_RE_PATTERN, page)
            if sha256_links:
                sha256_link = sha256_links[0]
                if sha256_link.startswith("./"):
                    sha256_link = sha256_link[2:]
                listing = _fetch_text("%s/%s" % (base_url, sha256_link))
                _remove_corrupt_files(_parse_checksums(listing))
            else:
                print("No sha256 listing found at %s, skipping verification" % listing_url)

        _download_missing(base_url, links)
|
|
55 |
|
|
|
56 |
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()