|
a |
|
b/XMLextraction.py |
|
|
1 |
import requests # allows you to send HTTP/1.1 requests extremely easily. |
|
|
2 |
from bs4 import BeautifulSoup |
|
|
3 |
import pandas as pd |
|
|
4 |
|
|
|
5 |
|
|
|
6 |
# Step 1: load the CSV of attrition rates; the 'nct_id' column read later
# identifies the clinical trial each row belongs to.
clinical_attrition = pd.read_csv(
    filepath_or_buffer="/Users/shania/PycharmProjects/ClinicalAttritionRateMap/ct_attrition_dataset.csv",
)
# Preview of the first rows. The result is not printed or stored, so it is only
# visible when this runs in a notebook/REPL.
clinical_attrition.head()
|
|
9 |
|
|
|
10 |
# Step 2 (currently disabled): visualize the data with a boxplot of
# "dropout_percentage_all" grouped by "nct_id". Uncomment the lines below to enable it.
# datavisualize = clinical_attrition.boxplot(column="dropout_percentage_all", by="nct_id")
# print(datavisualize)
|
|
13 |
|
|
|
14 |
# Steps 3-5: for every trial in clinical_attrition, download its XML record,
# extract the fields listed below, and attach them to the dataframe as new columns.

# Output column name -> XML tag whose text fills that column.
# find() returns the first matching tag, so for the location fields only the
# first <city>/<state>/<zip> in the document is recorded.
_TRIAL_FIELDS = {
    # trial characteristics
    'Clinical Title': 'brief_title',
    'Overall Status': 'overall_status',
    'Trial Phase': 'phase',
    'Allocation': 'allocation',
    'Start Date': 'start_date',
    'Completion Date': 'completion_date',
    'Primary Purpose': 'primary_purpose',
    # location: city, state, zipcode
    'City': 'city',
    'State': 'state',
    'Zipcode': 'zip',
    # patient demographics: gender, minimum age, maximum age
    'Gender': 'gender',
    'Minimum Age': 'minimum_age',
    'Maximum Age': 'maximum_age',
}

# Upper bound (seconds) on each HTTP request so one stalled connection cannot
# hang the whole run.
_REQUEST_TIMEOUT_SECONDS = 30


def _tag_text(xmlcontent, tag_name):
    """Return the text of the first ``tag_name`` element, or None if it is missing or empty."""
    tag = xmlcontent.find(tag_name)
    if tag is None:
        # Check the tag itself before touching .text; a missing tag must not raise.
        return None
    return tag.text or None


def _fetch_trial_fields(nct_id):
    """Download one trial's XML record and return its fields keyed by output column.

    Always returns a dict containing every key of ``_TRIAL_FIELDS``. All values
    are None when the record cannot be retrieved, so the caller gets exactly one
    row per trial and rows never shift relative to ``clinical_attrition``.
    """
    empty_row = dict.fromkeys(_TRIAL_FIELDS)
    # NOTE(review): this is the ct2 "resultsxml" endpoint; confirm it is still served.
    url = f"https://clinicaltrials.gov/ct2/show/{nct_id}?resultsxml=true"
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        # A network failure for one trial should not abort the remaining trials.
        print(f"Request failed for clinical trial {nct_id}: {error}")
        return empty_row
    if response.status_code != 200:  # anything other than HTTP "OK"
        print(f"No XML file for this clinical trial {nct_id}")
        return empty_row
    xmlcontent = BeautifulSoup(response.content, features="xml")
    return {
        column: _tag_text(xmlcontent, tag_name)
        for column, tag_name in _TRIAL_FIELDS.items()
    }


# One dict per trial, in the same order as the rows of clinical_attrition.
new_data = [_fetch_trial_fields(nct_id) for nct_id in clinical_attrition['nct_id']]

# Label the new rows with clinical_attrition's own index so assign() lines each
# trial's metadata up with its original row, whatever that index looks like.
clinical_attrition = clinical_attrition.assign(
    **pd.DataFrame(
        new_data,
        index=clinical_attrition.index,
        columns=list(_TRIAL_FIELDS),
    )
)
|
|
71 |
# Write the enriched dataframe to a new CSV file; index=False keeps the
# dataframe's index out of the file.
clinical_attrition.to_csv(
    path_or_buf="/Users/shania/PycharmProjects/ClinicalAttritionRateMap/updated_clinical_attrition.csv",
    index=False,
)
# Preview of the first rows. The result is not printed or stored, so it is only
# visible when this runs in a notebook/REPL.
clinical_attrition.head()