[bc9e98]: / benchmark / extract_from_xml.py

Download this file

104 lines (62 with data), 2.3 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os.path as op
from xml.dom import minidom
from xml.etree import ElementTree as ET
# Base directory for the per-trial XML files: nctid2fulltext joins it with a
# per-ID subfolder and "<nctid>.xml". The value is a relative path.
folder = 'ClinicalTrialGov'
def nctid2fulltext(nctid):
    """Build the expected XML path for trial ``nctid``; always returns None.

    The path has the shape ``<folder>/<first 7 chars of nctid>xxxx/<nctid>.xml``
    where ``folder`` is the module-level base directory, e.g. ``NCT01884350``
    maps to subfolder ``NCT0188xxxx`` and file name ``NCT01884350.xml``.

    NOTE(review): the path is computed and then discarded -- nothing is
    opened, read or returned, so every call yields ``None``. The function
    name suggests the XML content was meant to be loaded; confirm the
    intended return value before relying on this helper.
    """
    # Trials are grouped by the first seven characters of their ID.
    prefix = nctid[:7]
    bucket = prefix + 'xxxx'
    path_parts = (folder, bucket, nctid + '.xml')
    xml_path = op.join(*path_parts)  # currently unused, see NOTE above
    return None
# Hard-coded sample file name. It is a relative path, so ET.parse below looks
# for it in the current working directory.
# NOTE(review): indentation was lost in this copy; these statements are
# assumed to sit at module level, in which case the parse runs on import and
# raises if the file is missing -- confirm.
xml_file = 'NCT01884350.xml'
# The bare string literal below is an expression statement with no runtime
# effect. It presumably records where this file sits in the dataset tree
# (<folder>/<ID prefix>xxxx/<ID>.xml) -- verify against the data layout.
'''
ClinicalTrialGov/NCT0188xxxx/NCT01884350.xml
'''
# Parse the sample document; keep both the ElementTree and its root element.
tree = ET.parse(xml_file)
root = tree.getroot()
# def xml_file_2_tuple(xml_file):
# tree = ET.parse(xml_file)
# root = tree.getroot()
# nctid = root.find('id_info').find('nct_id').text ### nctid: 'NCT00000102'
# study_type = root.find('study_type').text
# if study_type != 'Interventional':
# return (None,) ### invalid
# interventions = [i for i in root.findall('intervention')]
# drug_interventions = [i.find('intervention_name').text for i in interventions \
# if i.find('intervention_type').text=='Drug']
# # or i.find('intervention_type').text=='Biological']
# if len(drug_interventions)==0:
# return (None,)
# try:
# status = root.find('overall_status').text
# except:
# status = ''
# # if status in drop_set:
# # return (None,) ### invalid
# try:
# why_stop = root.find('why_stopped').text
# except:
# why_stop = ''
# label = root2outcome(root)
# label = -1 if label is None else label
# try:
# phase = root.find('phase').text
# # print("phase\n\t\t", phase)
# except:
# phase = ''
# conditions = [i.text for i in root.findall('condition')]
# try:
# criteria = root.find('eligibility').find('criteria').find('textblock').text
# # print("criteria\n\t\t", criteria)
# except:
# criteria = ''
# #if criteria != '':
# # assert "Inclusion Criteria:" in criteria
# # assert "Exclusion Criteria:" in criteria
# # title = root.find('brief_title').text
# # try:
# # summary = root.find('brief_summary').text
# # # print("summary\n\t\t", summary)
# # except:
# # summary = ''
# conditions = [i.lower() for i in conditions]
# drugs = [i.lower() for i in drug_interventions]
# return nctid, status.lower(), why_stop.lower(), label, phase.lower(), conditions, drugs, criteria
# # return nctid, status.lower(), why_stop.lower(), label, phase.lower(), conditions, drugs, title, criteria, summary