scripts/scrape_drugs_data.py

## This script scrapes the drugs.com website for the given drug name and saves the extracted text to a file.

import requests
from bs4 import BeautifulSoup
import html2text
from multiprocessing import Pool
import os
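
# The third-party imports above come from PyPI:
#   pip install requests beautifulsoup4 html2text
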
def extract_section_by_id(soup, section_id):
    """
    Extracts the section with the given ID from the HTML content.

    Args:
        soup (BeautifulSoup): BeautifulSoup object containing the HTML content
        section_id (str): ID of the section to be extracted

    Returns:
        str: HTML content of the section
    """
    # Find the section with the given ID
    section = soup.find('h2', id=section_id)

    # If the section is not found, return an error message
    if not section:
        return f"Section with ID '{section_id}' not found."

    # Extract the content of the section
    content = []
    for sibling in section.next_siblings:
        # Stop at the next section heading
        if sibling.name == 'h2':
            break
        # Keep the markup if the sibling is a paragraph, list, or div
        if sibling.name in ['p', 'ul', 'ol', 'div']:
            content.append(str(sibling))

    return ' '.join(content)

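# A minimal sketch of what extract_section_by_id returns, using illustrative
# HTML (the real markup on drugs.com may differ):
#
#   demo = BeautifulSoup(
#       '<h2 id="uses">Uses</h2><p>Treats X.</p><h2 id="dosage">Dosage</h2>',
#       'html.parser')
#   extract_section_by_id(demo, "uses")  # -> '<p>Treats X.</p>'
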
def scrape_website(args):
    """
    Scrapes the drugs.com website for the given drug name and saves the extracted text to a file.

    Args:
        args (tuple): Tuple containing the URL and drug name

    Returns:
        None
    """
    # Unpack the arguments
    url, drug_name = args
    print(f"[INFO] Processing: {drug_name}")

    # Fetch the HTML content
    try:
        # A timeout keeps a stalled request from hanging the worker indefinitely
        response = requests.get(url, timeout=30)
        # If the request fails, log an error and skip this drug
        if response.status_code != 200:
            print(f"[ERROR] Failed to fetch {url} with status code {response.status_code}")
            return
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
    except Exception as e:
        print(f"[ERROR] Exception while fetching {url}: {e}")
        return

    # Initialize html2text to convert the extracted HTML into plain text
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
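    # For reference, h.handle converts an HTML fragment to Markdown-style
    # plain text, e.g. h.handle("<p>Take <b>one</b> daily.</p>") yields
    # roughly "Take **one** daily.\n\n" (an illustrative example, not output
    # captured from drugs.com).
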
    # Section IDs to extract
    section_ids = ["uses", "side-effects"]

    # Extract each section and convert its HTML to text
    file_content = ''
    for section_id in section_ids:
        section_html = extract_section_by_id(soup, section_id)
        section_text = h.handle(section_html)
        file_content += section_text + "\n\n"

    # Write the extracted text to a file
    file_name = f"data/raw_drug_info/{drug_name}.txt"
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(file_content)

    print(f"[INFO] Processed {drug_name} successfully")

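# A single-drug invocation looks like this (a sketch; it assumes network
# access and that data/raw_drug_info/ already exists, which __main__ below
# ensures):
#
#   scrape_website(("https://www.drugs.com/abilify.html", "abilify"))
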
if __name__ == "__main__":
    ## List of drugs to be scraped from drugs.com
    DRUG_LIST = ["abilify", "infliximab", "rituximab", "etanercept",
                 "Humira", "Enbrel", "Remicade", "Rituxan",
                 "Nexium", "Prevacid", "Prilosec", "Protonix",
                 "Crestor", "Lipitor", "Zocor", "Vytorin",
                 "Victoza", "Byetta", "Januvia", "Onglyza",
                 "Advair", "Symbicort", "Spiriva", "Singulair",
                 "Cialis", "Viagra", "Levitra", "Staxyn",
                 "AndroGel", "Prezista", "Doxycycline", "Cymbalta",
                 "Neupogen", "Epogen", "Aranesp", "Neulasta",
                 "Lunesta", "Ambien", "Provigil", "Nuvigil",
                 "Metoprolol", "Lisinopril", "Amlodipine", "Atorvastatin",
                 "Zoloft", "Lexapro", "Prozac", "Celexa",
                 "Complera", "Atripla"]

    # Make sure the output directory exists before the workers write to it
    if not os.path.exists("data/raw_drug_info/"):
        os.makedirs("data/raw_drug_info/")

    # Prepare a list of (url, drug_name) tuples for the Pool; drugs.com page
    # slugs are lowercase, so normalize mixed-case names like "Humira"
    tasks = [(f"https://www.drugs.com/{drug.lower()}.html", drug) for drug in DRUG_LIST]

    # Number of processes
    num_processes = 5  # Adjust this based on your machine's capability

    # Create a Pool of processes and map the function to the tasks
    with Pool(num_processes) as pool:
        pool.map(scrape_website, tasks)
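
    # Design note: the work here is I/O-bound (network requests), so a thread
    # pool such as concurrent.futures.ThreadPoolExecutor would serve equally
    # well; the GIL is released while waiting on network I/O.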