[8c54ae]: / src / scraping / scraper.py

Download this file

90 lines (70 with data), 3.8 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Directory containing this script; all data paths are resolved relative to it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Destination for scraped criteria text files: <repo>/data/raw/scraped.
output_dir = os.path.join(current_dir, '..', '..', 'data', 'raw', 'scraped')

# exist_ok=True creates the directory only if missing, without a racy
# exists()-then-makedirs() check.
os.makedirs(output_dir, exist_ok=True)
def scrape_criteria(driver, study_url, nct_number, nct_name):
    """
    Scrape the inclusion/exclusion and other criteria from a clinical trial study page.

    Parameters:
        driver (webdriver): The Selenium WebDriver instance.
        study_url (str): The URL of the clinical trial study.
        nct_number (str): The NCT number of the clinical trial.
        nct_name (str): The title of the clinical trial.

    Saves the scraped criteria to a text file named <nct_number>_criteria.txt
    in the module-level output directory. Errors are reported to stdout and
    swallowed so a single failing study does not abort a batch run.
    """
    # Jump straight to the participation-criteria section of the study page.
    criteria_url = study_url + "#participation-criteria"
    driver.get(criteria_url)
    driver.maximize_window()  # Some lazy-rendered sections need a full-size viewport.
    try:
        # Wait (up to 20s) for the inclusion/exclusion criteria element, then grab its text.
        # NOTE(review): these absolute XPaths are tied to the current ctg page layout
        # and will break silently if the site markup changes — confirm periodically.
        inclusion_exclusion_criteria = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="participation-criteria"]/ctg-participation-criteria/div[2]/div/div[2]/div[1]'))
        )
        inclusion_exclusion_text = inclusion_exclusion_criteria.text

        # Same wait for the "other criteria" sibling element.
        other_criteria = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="participation-criteria"]/ctg-participation-criteria/div[2]/div/div[2]/div[2]'))
        )
        other_criteria_text = other_criteria.text

        # Assemble the output document.
        formatted_text = f"Study Title: {nct_name}\nInclusion/Exclusion Criteria:\n{inclusion_exclusion_text}\n\nOther Criteria:\n{other_criteria_text}"

        # Write as UTF-8 explicitly: trial text is often non-ASCII, and the
        # platform default encoding (e.g. cp1252 on Windows) would raise.
        file_name = f"{nct_number}_criteria.txt"
        output_path = os.path.join(output_dir, file_name)
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(formatted_text)
        print(f"Data for {nct_number} successfully written to {file_name}")
    except Exception as e:
        # Best-effort batch scraping: report and continue with the next study.
        print(f"An error occurred for {nct_number}: {str(e)}")
def main():
    """
    Initialize the WebDriver, read study links from a CSV file, and scrape
    the participation criteria for each study.

    Expects a CSV at <repo>/data/raw/study-links.csv with columns
    'Study URL', 'NCT Number', and 'Study Title'.
    """
    # Launch Chrome; chromedriver must be on PATH (or pass a Service with its path).
    driver = webdriver.Chrome()
    try:
        # CSV of studies to scrape, resolved relative to this script.
        csv_path = os.path.join(current_dir, '..', '..', 'data', 'raw', 'study-links.csv')
        df = pd.read_csv(csv_path)

        # Scrape each study listed in the CSV.
        for index, row in df.iterrows():
            study_url = row['Study URL']
            nct_number = row['NCT Number']
            nct_name = row['Study Title']
            print(f"Scraping data for NCT Number: {nct_number} from URL: {study_url}")
            scrape_criteria(driver, study_url, nct_number, nct_name)
    finally:
        # Always shut the browser down, even if reading the CSV or a scrape
        # raises — otherwise the Chrome process is leaked.
        driver.quit()


if __name__ == "__main__":
    main()