|
a |
|
b/benchmark/nctid2date.py |
|
|
1 |
import csv, os |
|
|
2 |
from tqdm import tqdm |
|
|
3 |
from xml.etree import ElementTree as ET |
|
|
4 |
|
|
|
5 |
|
|
|
6 |
def xmlfile_2_date(xml_file):
    """Extract the start and completion dates from one trial XML record.

    Args:
        xml_file: Path to (or open file object of) the XML document; it is
            passed straight to ``ElementTree.parse``.

    Returns:
        A ``(start_date, completion_date)`` tuple of strings holding the raw
        element text (no date parsing is done here).  ``start_date`` comes
        from the root's ``start_date`` child.  ``completion_date`` comes from
        ``primary_completion_date`` and falls back to ``completion_date``
        when the former is absent or empty.  A value that cannot be found is
        returned as ``''`` -- never ``None`` -- so callers can concatenate
        the results safely.

    Raises:
        OSError / xml.etree.ElementTree.ParseError: propagated unchanged from
            ``ElementTree.parse`` when the file is missing or malformed.
    """
    root = ET.parse(xml_file).getroot()

    # findtext() returns the default when the tag is absent and '' when the
    # tag is present but has no text, so both cases collapse to '' without
    # needing a catch-all ``except`` around an attribute access on None.
    start_date = root.findtext('start_date', default='')

    # Prefer the primary completion date; ``or`` moves on to the overall
    # completion date whenever the preferred value is missing or empty.
    completion_date = (
        root.findtext('primary_completion_date', default='')
        or root.findtext('completion_date', default='')
    )
    return start_date, completion_date
|
|
22 |
|
|
|
23 |
|
|
|
24 |
# Driver: for every trial id listed in data/raw_data.csv, look up its XML
# record under ``raw_folder`` and write one "<nctid>\t<start>\t<completion>"
# line to data/nctid_date.txt, then report how many dates were found.
raw_folder = "raw_data"
nctid_lst = []
total_num = 0
start_num = 0
completion_num = 0

with open("data/raw_data.csv") as fin, open("data/nctid_date.txt", 'w') as fout:
    # Read everything up front (tqdm then knows the total) and drop the
    # first row, which is skipped as a header.
    all_rows = list(csv.reader(fin))
    data_rows = all_rows[1:]

    for record in tqdm(data_rows):
        nctid = record[0]

        # XML files are grouped in sub-folders named after the first seven
        # characters of the id followed by "xxxx".
        xml_name = nctid[:7] + "xxxx/" + nctid + ".xml"
        xml_path = os.path.join(raw_folder, xml_name)

        start_date, completion_date = xmlfile_2_date(xml_path)

        # A comparison result is a bool, so int() turns it into 0 or 1.
        start_num += int(start_date != '')
        completion_num += int(completion_date != '')
        total_num += 1

        fout.write('\t'.join([nctid, start_date, completion_date]) + '\n')

for label, count in (
    ("total_num", total_num),
    ("start_num", start_num),
    ("completion_num", completion_num),
):
    print(label, count)
|
|
43 |
|
|
|
44 |
|