Diff of /XMLextraction.py [000000] .. [6e0d8e]

Switch to unified view

a b/XMLextraction.py
1
import requests  # allows you to send HTTP/1.1 requests extremely easily.
2
from bs4 import BeautifulSoup
3
import pandas as pd
4
5
6
# Enrich the clinical-trial attrition dataset with metadata scraped from each
# trial's ClinicalTrials.gov results XML, then save the result as a new CSV.

INPUT_CSV = "/Users/shania/PycharmProjects/ClinicalAttritionRateMap/ct_attrition_dataset.csv"
OUTPUT_CSV = "/Users/shania/PycharmProjects/ClinicalAttritionRateMap/updated_clinical_attrition.csv"

# Upper bound (seconds) on each HTTP request so one stalled connection cannot
# hang the whole run.
REQUEST_TIMEOUT_SECONDS = 30

# Output column name -> XML tag whose first occurrence supplies the value.
# The insertion order here is the column order of the saved CSV.
TRIAL_FIELDS = {
    # trial characteristics
    'Clinical Title': 'brief_title',
    'Overall Status': 'overall_status',
    'Trial Phase': 'phase',
    'Allocation': 'allocation',
    'Start Date': 'start_date',
    'Completion Date': 'completion_date',
    'Primary Purpose': 'primary_purpose',
    # location: city, state, zipcode
    'City': 'city',
    'State': 'state',
    'Zipcode': 'zip',
    # patient demographics: gender, minimum age, maximum age
    'Gender': 'gender',
    'Minimum Age': 'minimum_age',
    'Maximum Age': 'maximum_age',
}


def _first_tag_text(xmlcontent, tag_name):
    """Return the text of the first ``tag_name`` element, or None.

    None is returned both when the element is absent and when its text is
    empty, so every extracted field uses one "missing" marker.
    """
    node = xmlcontent.find(tag_name)
    if node is None:
        return None
    return node.text or None


def _fetch_trial_details(nct_id):
    """Download one trial's XML and return its fields keyed by output column.

    Always returns a dict with every key of ``TRIAL_FIELDS``. When the id is
    missing, the request fails, or the server does not answer 200, all values
    are None; a message is printed and the caller still gets a record, which
    keeps the records in lockstep with the input rows.
    """
    empty_record = dict.fromkeys(TRIAL_FIELDS)

    # A blank CSV cell is read as NaN (a float); it cannot be put in a URL.
    if not isinstance(nct_id, str) or not nct_id.strip():
        print(f"Skipping row with missing NCT id: {nct_id!r}")
        return empty_record

    # NOTE(review): this is the legacy "ct2" endpoint - confirm it is still served.
    url = "https://clinicaltrials.gov/ct2/show/" + nct_id.strip() + "?resultsxml=true"
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        # Network errors and timeouts should not discard everything fetched so far.
        print(f"Request failed for clinical trial {nct_id}: {error}")
        return empty_record

    if response.status_code != 200:  # 200 is the HTTP status code for "OK"
        print(f"No XML file for this clinical trial {nct_id}")
        return empty_record

    xmlcontent = BeautifulSoup(response.content, features="xml")
    return {
        column: _first_tag_text(xmlcontent, tag_name)
        for column, tag_name in TRIAL_FIELDS.items()
    }


# Step 1: Read the CSV file of attrition rates with corresponding NCT ids.
clinical_attrition = pd.read_csv(INPUT_CSV)
clinical_attrition.head()  # only displays something in an interactive session

# Step 2: visualize the data by creating a boxplot
# datavisualize = clinical_attrition.boxplot(column="dropout_percentage_all", by="nct_id")
# print(datavisualize)

# Steps 3 & 4: build the URL for each trial, retrieve its XML and extract the
# fields. Exactly one record is produced per input row, in row order.
new_data = [_fetch_trial_details(nct_id) for nct_id in clinical_attrition['nct_id']]

# Step 5: add the new columns to the dataframe. The records are given the
# dataframe's own index because assign() aligns on index labels; without it a
# failed request would shift every later trial's metadata onto the wrong row.
trial_details = pd.DataFrame(
    new_data,
    index=clinical_attrition.index,
    columns=list(TRIAL_FIELDS),
)
clinical_attrition = clinical_attrition.assign(
    **{column: trial_details[column] for column in trial_details.columns}
)

# Save the updated DataFrame to a new CSV file
clinical_attrition.to_csv(OUTPUT_CSV, index=False)
clinical_attrition.head()  # only displays something in an interactive session