Diff of /benchmark/nctid2date.py [000000] .. [bc9e98]

Switch to unified view

a b/benchmark/nctid2date.py
1
import csv, os
2
from tqdm import tqdm 
3
from xml.etree import ElementTree as ET
4
5
6
def xmlfile_2_date(xml_file):
    """Extract the start and completion dates from one XML record.

    The dates are returned exactly as they appear in the document; no
    parsing or normalisation of the date text is attempted.

    Args:
        xml_file: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        A ``(start_date, completion_date)`` tuple of strings.  Either entry
        is ``''`` when the corresponding element is absent or has no text.
        ``completion_date`` is taken from ``<primary_completion_date>`` and
        falls back to ``<completion_date>`` when the former is missing or
        empty.

    Raises:
        Whatever ``ElementTree.parse`` raises for an unreadable or malformed
        file (e.g. ``OSError``, ``xml.etree.ElementTree.ParseError``).
    """
    root = ET.parse(xml_file).getroot()

    # findtext() yields the default for a missing element and '' for an
    # element without text, so no exception handling is needed and the
    # result is always a string (never None).
    start_date = root.findtext('start_date', '')
    completion_date = (
        root.findtext('primary_completion_date', '')
        or root.findtext('completion_date', '')
    )
    return start_date, completion_date
22
23
24
# Folder holding the XML records; each record is looked up at
# <raw_folder>/<first 7 characters of the id>xxxx/<id>.xml
raw_folder = "raw_data"
nctid_lst = []  # not populated anywhere in this script
total_num, start_num, completion_num = 0, 0, 0

# newline='' hands line-ending handling to the csv module, as its
# documentation requires, so quoted fields containing newlines stay intact.
with open("data/raw_data.csv", newline='') as fin, open("data/nctid_date.txt", 'w') as fout:
    # Materialise the rows (header dropped) so tqdm can display a total.
    readers = list(csv.reader(fin))[1:]
    for row in tqdm(readers):
        if not row:
            # csv.reader yields [] for a blank line; there is no id to process.
            continue
        nctid = row[0]
        file = os.path.join(raw_folder, nctid[:7] + "xxxx", nctid + ".xml")
        start_date, completion_date = xmlfile_2_date(file)
        # Treat a None value the same as a missing date so the counters and
        # the tab-separated output line stay consistent.
        start_date = start_date or ''
        completion_date = completion_date or ''
        if start_date != '':
            start_num += 1
        if completion_date != '':
            completion_num += 1
        total_num += 1
        # One line per record: id <TAB> start date <TAB> completion date
        fout.write(f"{nctid}\t{start_date}\t{completion_date}\n")

# Summary: records processed, and how many had each kind of date.
print("total_num", total_num)
print("start_num", start_num)
print("completion_num", completion_num)
43
44