Diff of /benchmark/nctid2date.py [000000] .. [bc9e98]

Switch to side-by-side view

--- a
+++ b/benchmark/nctid2date.py
@@ -0,0 +1,44 @@
+import csv, os
+from tqdm import tqdm 
+from xml.etree import ElementTree as ET
+
+
+def xmlfile_2_date(xml_file):
+	tree = ET.parse(xml_file)
+	root = tree.getroot()
+	try:
+		start_date = root.find('start_date').text	
+		# start_date = int(start_date.split()[-1])
+	except:
+		start_date = ''
+	try:
+		completion_date = root.find('primary_completion_date').text
+	except:
+		try:
+			completion_date = root.find('completion_date').text 
+		except:
+			completion_date = ''
+	return start_date, completion_date 
+
+
+raw_folder = "raw_data"
+nctid_lst = []
+total_num, start_num, completion_num = 0, 0, 0 
+with open("data/raw_data.csv") as fin, open("data/nctid_date.txt", 'w') as fout:
+	readers = list(csv.reader(fin))[1:]
+	for row in tqdm(readers):
+		nctid = row[0]
+		file = os.path.join(raw_folder, nctid[:7]+"xxxx/"+nctid+".xml")
+		start_date, completion_date = xmlfile_2_date(file)
+		if start_date != '':
+			start_num += 1
+		if completion_date != '':
+			completion_num += 1
+		total_num += 1
+		fout.write(nctid + '\t' + start_date + '\t' + completion_date + '\n')
+
+print("total_num", total_num)
+print("start_num", start_num)
+print("completion_num", completion_num)
+
+