[bc9e98]: / benchmark / pseudolabel.py

Download this file

100 lines (75 with data), 2.1 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
'''
input: 348k data
1. ClinicalTrialGov/NCTxxxx/xxxxxx.xml & all_xml
1. data/diseases.csv
2. data/drug2smiles.pkl
output: data/raw_data.csv
processing:
0.1 Interventional: 273k data (348k total, e.g., observational, surgery)
0.2 intervention_type == Drug (drug not empty)
0.3 drop_set 96k data (273k), (we don't use drop_set to filter out)
0.4 -1 -> 0 based on "why_stop"
0.5 filter out -1(invalid)
1. disease -> icd
2. drug -> smiles
3. inclusive / exclusive criteria ---- to do
requires ~10 minutes.
'''
##### standard library
import os, csv, pickle
from xml.dom import minidom
from xml.etree import ElementTree as ET
from collections import defaultdict
from time import time
import re
from tqdm import tqdm
from utils import get_path_of_all_xml_file, walkData
# overall_status values grouped as a "drop set". The module docstring notes
# that drop_set is not used to filter trials, and nothing in the visible part
# of this file references it.
drop_set = ['Active, not recruiting', 'Enrolling by invitation', 'No longer available',
'Not yet recruiting', 'Recruiting', 'Temporarily not available', 'Unknown status']
'''
14 overall_status
Active, not recruiting
Approved for marketing
Available
Completed
Enrolling by invitation
No longer available
Not yet recruiting
Recruiting
Suspended
Temporarily not available
Terminated
Unknown status
Withdrawn
Withheld
'''
def root2outcome(root):
    """Derive a binary pseudo-label from the first p-value found under ``root``.

    ``walkData`` (imported from ``utils``) is called to fill ``result_list``.
    Each entry is treated here as a pair: the first item is a key that may
    contain ``'p_value'`` and the second item is the associated text. This
    shape is assumed from how the entries are indexed below -- confirm
    against ``utils.walkData``.

    Only the first ``'p_value'`` entry is inspected.

    Returns:
        1    if the p-value text starts with ``'<'`` or parses to a number
             below 0.05.
        0    if the p-value text starts with ``'>'`` or parses to a number
             that is not below 0.05.
        None if there is no ``'p_value'`` entry, or its text is missing,
             blank, or not numeric.
    """
    result_list = []
    walkData(root, prefix='', result_list=result_list)

    p_value_entries = [entry for entry in result_list if 'p_value' in entry[0]]
    if not p_value_entries:
        return None

    outcome = p_value_entries[0][1]
    # Missing or blank text cannot be indexed or parsed; treat it like any
    # other unparseable p-value instead of raising.
    if not isinstance(outcome, str):
        return None
    outcome = outcome.strip()
    if not outcome:
        return None

    # NOTE: inequality prefixes are mapped directly to a label without
    # comparing the bound to 0.05 (e.g. '<0.5' still yields 1).
    if outcome[0] == '<':
        return 1
    if outcome[0] == '>':
        return 0
    if outcome[0] == '=':
        outcome = outcome[1:]

    try:
        label = float(outcome)
    except ValueError:
        # Non-numeric text: no label can be derived.
        return None
    return 1 if label < 0.05 else 0
def xmlfile_2_label(xml_file):
    """Parse a trial XML file and return its pseudo-label.

    Args:
        xml_file: path or file object accepted by ``ElementTree.parse``.

    Returns:
        The value produced by ``root2outcome`` for the parsed root element,
        or -1 when ``root2outcome`` returns None.
    """
    root = ET.parse(xml_file).getroot()
    # The NCT id is not needed to compute the label, so it is not looked up
    # here; a file lacking <id_info>/<nct_id> no longer raises AttributeError.
    label = root2outcome(root)
    return -1 if label is None else label