--- a
+++ b/benchmark/extract_from_xml.py
@@ -0,0 +1,103 @@
+import os.path as op
+from xml.dom import minidom
+from xml.etree import ElementTree as ET
+
+
+folder = 'ClinicalTrialGov'  # base directory; nctid2fulltext joins it with a per-trial subfolder and '<nctid>.xml'
+
+def nctid2fulltext(nctid):
+	"""Build <folder>/<nctid[:7]>xxxx/<nctid>.xml. NOTE(review): path is neither read nor returned yet; result is None."""
+	xml_path = op.join(folder, nctid[:7] + 'xxxx', nctid + '.xml')
+
+
+# Hard-coded sample record. The original note gives its dataset location as
+# ClinicalTrialGov/NCT0188xxxx/NCT01884350.xml
+xml_file = 'NCT01884350.xml'
+
+# Runs when the module is executed; `root` is the document's root element.
+tree = ET.parse(xml_file)
+root = tree.getroot()
+
+
+
+
+
+
+
+
+
+# def xml_file_2_tuple(xml_file):
+# 	tree = ET.parse(xml_file)
+# 	root = tree.getroot()
+# 	nctid = root.find('id_info').find('nct_id').text	### nctid: 'NCT00000102'
+# 	study_type = root.find('study_type').text 
+# 	if study_type != 'Interventional':
+# 		return (None,)  ### invalid 
+
+# 	interventions = [i for i in root.findall('intervention')]
+# 	drug_interventions = [i.find('intervention_name').text for i in interventions \
+# 														if i.find('intervention_type').text=='Drug']
+# 														# or i.find('intervention_type').text=='Biological']
+# 	if len(drug_interventions)==0:
+# 		return (None,)
+
+# 	try:
+# 		status = root.find('overall_status').text 
+# 	except:
+# 		status = ''
+# 	# if status in drop_set:
+# 	# 	return (None,)  ### invalid 
+# 	try:
+# 		why_stop = root.find('why_stopped').text
+# 	except:
+# 		why_stop = ''
+# 	label = root2outcome(root)
+# 	label = -1 if label is None else label 
+# 	try:
+# 		phase = root.find('phase').text 
+# 		# print("phase\n\t\t", phase)
+# 	except:
+# 		phase = ''
+# 	conditions = [i.text for i in root.findall('condition')]
+
+# 	try:
+# 		criteria = root.find('eligibility').find('criteria').find('textblock').text 
+# 		# print("criteria\n\t\t", criteria)
+# 	except:
+# 		criteria = ''
+# 	#if criteria != '':
+# 	#	assert "Inclusion Criteria:" in criteria 
+# 	#	assert "Exclusion Criteria:" in criteria 
+# 	# title = root.find('brief_title').text	
+# 	# try: 
+# 	# 	summary = root.find('brief_summary').text 
+# 	# 	# print("summary\n\t\t", summary)
+# 	# except:
+# 	# 	summary = '' 
+
+# 	conditions = [i.lower() for i in conditions]
+# 	drugs = [i.lower() for i in drug_interventions]
+
+# 	return nctid, status.lower(), why_stop.lower(), label, phase.lower(), conditions, drugs, criteria
+# 	# return nctid, status.lower(), why_stop.lower(), label, phase.lower(), conditions, drugs, title, criteria, summary
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+