[bc9e98]: / benchmark / check_statistics_of_raw_data.py

Download this file

136 lines (87 with data), 2.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
'''
input:
    data/raw_data.csv (about 270k rows)
process:
    1. print statistics (number of rows per distinct value of one column)
output:
    None (the counts are only printed)
'''
import csv
from collections import Counter
from collections import defaultdict
# Location of the raw-data CSV, relative to the current working directory.
raw_data_file = "data/raw_data.csv"

# Column names in file order; the script reads status, why_stop, label and
# phase from indices 1-4, which matches this list.
# NOTE(review): the remaining names are not exercised here - confirm against the CSV header.
fieldname = ['nctid', 'status', 'why_stop', 'label', 'phase', 'diseases', 'drugs', 'title', 'criteria', 'summary']

# Status values that an earlier (disabled) variant of this script excluded
# before counting labels. Kept for reference; not used by the counting below.
drop_set = ['Active, not recruiting', 'Enrolling by invitation', 'No longer available',
            'Not yet recruiting', 'Recruiting', 'Temporarily not available', 'Unknown status']


def count_column_values(csv_path, column=1):
    """Count how often each distinct value occurs in one column of a CSV file.

    The first record is treated as the header and skipped. Rows are streamed,
    so the whole file is never held in memory at once.

    Args:
        csv_path: Path of the comma-delimited file to read.
        column: Zero-based index of the column to tally. Defaults to 1,
            the ``status`` column.

    Returns:
        A ``collections.Counter`` mapping each raw cell value to its row count.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank row has no cell at ``column``.
    """
    # newline='' lets the csv module handle line endings itself, which keeps
    # quoted fields that contain embedded newlines intact.
    with open(csv_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header; tolerate a completely empty file
        # csv.reader yields [] for blank lines; those carry no data, so skip them.
        return Counter(row[column] for row in reader if row)


def main():
    """Print the row count of every ``status`` value, most frequent first."""
    # Use fieldname.index('why_stop'), 'label' or 'phase' instead to get the
    # other per-column tables.
    text2cnt = count_column_values(raw_data_file, fieldname.index('status'))
    # most_common() sorts by count in descending order, keeping first-seen
    # order among ties.
    for k, v in text2cnt.most_common():
        print(k.strip(), v)


if __name__ == "__main__":
    main()
'''
Observation
status (indented lines: label counts within that status)
Completed 150900
    -1 139332
    1 6534
    0 5034
Recruiting 38492
    -1 38489
    1 3
Unknown status 28093
    -1 28076
    1 13
    0 4
Terminated 17270
    -1 16468
    0 589
    1 213
Active, not recruiting 14236
    -1 13852
    1 234
    0 150
Not yet recruiting 13331
    -1 13331
Withdrawn 7355
    -1 7355
Enrolling by invitation 2066
    -1 2066
Suspended 1601
    -1 1600
    0 1
why_stop
'' 155449
Slow accrual 173
Lack of funding 140
See termination reason in detailed description. 118
low accrual 103
slow accrual 101
xxxxxxx
xxxxxxx
xxxxxxx
label
-1 164755
1 6747
0 5624
phase
N/A 67084
Phase 2 31069
Phase 1 25649
Phase 3 23084
Phase 4 18714
Phase 1/Phase 2 6520
Phase 2/Phase 3 3347
Early Phase 1 1658
'''