[6e0d8e]: / XMLextraction.py

Download this file

73 lines (64 with data), 4.0 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import requests # allows you to send HTTP/1.1 requests extremely easily.
from bs4 import BeautifulSoup
import pandas as pd
# Enrich the clinical-trial attrition dataset with trial metadata taken from
# each trial's XML record on clinicaltrials.gov, then save the result as a new
# CSV file.

INPUT_CSV = "/Users/shania/PycharmProjects/ClinicalAttritionRateMap/ct_attrition_dataset.csv"
OUTPUT_CSV = "/Users/shania/PycharmProjects/ClinicalAttritionRateMap/updated_clinical_attrition.csv"

# Upper bound (seconds) on each HTTP request so one stalled connection cannot
# hang the whole run.
REQUEST_TIMEOUT_SECONDS = 30

# Output column name -> XML tag whose text fills that column.
# Only the first occurrence of each tag in the document is used.
XML_FIELDS = {
    # trial characteristics
    'Clinical Title': 'brief_title',
    'Overall Status': 'overall_status',
    'Trial Phase': 'phase',
    'Allocation': 'allocation',
    'Start Date': 'start_date',
    'Completion Date': 'completion_date',
    'Primary Purpose': 'primary_purpose',
    # location
    'City': 'city',
    'State': 'state',
    'Zipcode': 'zip',
    # patient demographics
    'Gender': 'gender',
    'Minimum Age': 'minimum_age',
    'Maximum Age': 'maximum_age',
}


def _tag_text(xml_doc, tag_name):
    """Return the text of the first ``tag_name`` element in ``xml_doc``.

    Returns None when the element is missing or its text is empty, so a trial
    that lacks a field never raises AttributeError.
    """
    tag = xml_doc.find(tag_name)
    if tag is None:
        return None
    return tag.text or None


# Step 1: Read the CSV file of attrition rates with corresponding NCT ids.
clinical_attrition = pd.read_csv(INPUT_CSV)
clinical_attrition.head()  # preview only; the result is discarded when run as a script

# Step 2: visualize the data by creating a boxplot
# datavisualize = clinical_attrition.boxplot(column="dropout_percentage_all", by="nct_id")
# print(datavisualize)

# Step 3: Build the URL for each clinical trial and retrieve its XML file.
new_data = []      # one dict of extracted fields per successfully fetched trial
fetched_rows = []  # index label of the row each entry in new_data belongs to
for index in clinical_attrition.index:
    # .loc is label-based: look up the NCT id of the row labelled `index`.
    nct_id = clinical_attrition.loc[index, 'nct_id']
    # NOTE(review): this is the classic "ct2" URL scheme -- confirm the
    # endpoint is still served before relying on a long run.
    url = f"https://clinicaltrials.gov/ct2/show/{nct_id}?resultsxml=true"

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        # A network failure for one trial should not abort the whole run.
        print(f"Request failed for clinical trial {nct_id}: {error}")
        continue

    if response.status_code != 200:  # 200 is the HTTP status code for "OK"
        print(f"No XML file for this clinical trial {nct_id}")
        continue

    # Step 4: Extract information from the XML file (web scraping).
    xmlcontent = BeautifulSoup(response.content, features="xml")
    new_data.append(
        {column: _tag_text(xmlcontent, tag) for column, tag in XML_FIELDS.items()}
    )
    fetched_rows.append(index)

# Step 5: Update the clinical_attrition dataframe with the new information.
# The extracted frame is indexed by the originating row labels, so each record
# lands on its own trial; rows whose request failed are left empty (NaN)
# instead of shifting every later record onto the wrong trial.
extracted = pd.DataFrame(new_data, index=fetched_rows)
clinical_attrition = clinical_attrition.assign(
    **{column: extracted[column] for column in extracted.columns}
)

# Save the updated DataFrame to a new CSV file
clinical_attrition.to_csv(OUTPUT_CSV, index=False)
clinical_attrition.head()  # preview only; the result is discarded when run as a script