# scripts/scrape_drugs_data.py
"""Scrape the drugs.com website for each drug in a list and save the extracted text to a file."""

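# Usage note (an assumption: the script is run from the repository root so the
# relative `data/raw_drug_info/` output path resolves there):
#
#   python scripts/scrape_drugs_data.py
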
import os
from multiprocessing import Pool

import html2text
import requests
from bs4 import BeautifulSoup

def extract_section_by_id(soup, section_id):
    """
    Extract the section with the given ID from the HTML content.

    Args:
        soup (BeautifulSoup): BeautifulSoup object containing the HTML content
        section_id (str): ID of the <h2> heading that opens the section

    Returns:
        str: HTML content of the section, or an empty string if the section
            is not found
    """
    # Find the <h2> heading with the given ID
    section = soup.find('h2', id=section_id)

    # If the heading is not found, log a warning and return an empty string
    # (returning an error message here would end up in the output file)
    if not section:
        print(f"[WARN] Section with ID '{section_id}' not found.")
        return ''

    # Collect everything between this heading and the next <h2>
    content = []
    for sibling in section.next_siblings:
        # Stop at the next section heading
        if sibling.name == 'h2':
            break
        # Keep the sibling if it is a paragraph, list, or div
        if sibling.name in ['p', 'ul', 'ol', 'div']:
            content.append(str(sibling))

    return ' '.join(content)

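# A minimal sketch of how the sibling walk behaves, on hypothetical markup
# (the tags below are illustrative, not the actual drugs.com page structure):
#
#   soup = BeautifulSoup(
#       '<h2 id="uses">Uses</h2><p>First paragraph.</p><ul><li>Item</li></ul>'
#       '<h2 id="side-effects">Side effects</h2><p>Not included.</p>',
#       'html.parser')
#   extract_section_by_id(soup, 'uses')
#   # -> '<p>First paragraph.</p> <ul><li>Item</li></ul>'
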
def scrape_website(args):
    """
    Scrape the drugs.com page for the given drug name and save the extracted
    text to a file.

    Args:
        args (tuple): Tuple containing the URL and drug name

    Returns:
        None
    """
    # Unpack the arguments
    url, drug_name = args
    print(f"[INFO] Processing: {drug_name}")

    # Fetch the HTML content; the timeout stops a dead connection from
    # hanging the worker process indefinitely
    try:
        response = requests.get(url, timeout=30)
        # If the request fails, log an error and skip this drug
        if response.status_code != 200:
            print(f"[ERROR] Failed to fetch {url} with status code {response.status_code}")
            return
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        print(f"[ERROR] Exception while fetching {url}: {e}")
        return

    # Initialize html2text to convert the extracted HTML to plain text
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True

    # Section IDs to extract
    section_ids = ["uses", "side-effects"]

    # Extract each section and convert its HTML to text
    file_content = ''
    for section_id in section_ids:
        section_html = extract_section_by_id(soup, section_id)
        section_text = h.handle(section_html)
        file_content += section_text + "\n\n"

    # Write the extracted text to a file
    file_name = f"data/raw_drug_info/{drug_name}.txt"
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(file_content)

    print(f"[INFO] Processed {drug_name} successfully")

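# Note: some sites reject requests that carry the default `python-requests`
# User-Agent. If fetches start failing with 403, passing a browser-like header
# is a common workaround (a sketch; the header value is an assumption, not
# something drugs.com documents):
#
#   headers = {"User-Agent": "Mozilla/5.0 (compatible; drug-info-scraper/1.0)"}
#   response = requests.get(url, headers=headers, timeout=30)
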
if __name__ == "__main__":
    # List of drugs to scrape from drugs.com
    DRUG_LIST = ["abilify", "infliximab", "rituximab", "etanercept",
                 "Humira", "Enbrel", "Remicade", "Rituxan",
                 "Nexium", "Prevacid", "Prilosec", "Protonix",
                 "Crestor", "Lipitor", "Zocor", "Vytorin",
                 "Victoza", "Byetta", "Januvia", "Onglyza",
                 "Advair", "Symbicort", "Spiriva", "Singulair",
                 "Cialis", "Viagra", "Levitra", "Staxyn",
                 "AndroGel", "Prezista", "Doxycycline", "Cymbalta",
                 "Neupogen", "Epogen", "Aranesp", "Neulasta",
                 "Lunesta", "Ambien", "Provigil", "Nuvigil",
                 "Metoprolol", "Lisinopril", "Amlodipine", "Atorvastatin",
                 "Zoloft", "Lexapro", "Prozac", "Celexa",
                 "Complera", "Atripla"]

    # Create the output directory if it does not already exist
    os.makedirs("data/raw_drug_info/", exist_ok=True)

    # Prepare a list of (url, drug_name) tuples for the Pool
    tasks = [(f"https://www.drugs.com/{drug}.html", drug) for drug in DRUG_LIST]

    # Number of worker processes
    num_processes = 5  # Adjust this based on your machine's capability

    # Create a Pool of processes and map the function to the tasks
    with Pool(num_processes) as pool:
        pool.map(scrape_website, tasks)
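
    # A sketch of an alternative dispatch: imap_unordered streams results as
    # workers finish, which makes simple progress reporting easy; it is
    # functionally equivalent to pool.map here, since scrape_website returns None.
    #
    #   with Pool(num_processes) as pool:
    #       for done, _ in enumerate(pool.imap_unordered(scrape_website, tasks), 1):
    #           print(f"[INFO] Completed {done}/{len(tasks)}")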