|
a |
|
b/scripts/XmlFiles.py |
|
|
1 |
import os |
|
|
2 |
|
|
|
3 |
# Folder whose sub-folders hold the XML files to scan.
rootPath = "/Users/sriramnarayanan/Downloads/AllPublicXml/"
# Output TSV; a relative path, so it lands in the current working directory.
outputPath = "Eligibility.tsv"


def _clean(text):
    """Remove newlines, tabs and the opening tags so *text* fits one TSV cell."""
    # NOTE(review): newlines/tabs are dropped, not replaced by a space, so words
    # separated only by a line break get fused. The original author's open
    # question ("will this lose information") still stands.
    for token in ("\n", "\t", "<criteria>", "<textblock>"):
        text = text.replace(token, "")
    return text


def extract_criteria(fileText):
    """Split the eligibility criteria of one XML document into two parts.

    The criteria block runs from the first ``<criteria>`` tag to the first
    ``</textblock>`` after it. Text before the literal heading
    ``Exclusion Criteria`` is the inclusion part; text from that heading
    onwards is the exclusion part.

    Args:
        fileText: Full text of one XML file.

    Returns:
        ``(inclusion, exclusion)`` with newlines, tabs and the ``<criteria>``
        / ``<textblock>`` tags removed. When the heading is absent the whole
        block is returned as ``inclusion`` and ``exclusion`` is ``""``.
        ``None`` when there is no ``<criteria>`` tag or no ``</textblock>``
        after it (the caller skips such files).
    """
    start = fileText.find("<criteria>")
    if start == -1:
        return None
    end = fileText.find("</textblock>", start)
    if end == -1:
        return None
    # Only look for the heading inside the criteria block. An unbounded search
    # could match the phrase later in the document and drag unrelated markup
    # into both columns.
    split = fileText.find("Exclusion Criteria", start, end)
    if split == -1:
        split = end
    return _clean(fileText[start:split]), _clean(fileText[split:end])


def write_eligibility_tsv(root=rootPath, output=outputPath):
    """Scan every file in every sub-folder of *root* and write one TSV row each.

    Each row is ``inclusion <TAB> exclusion <TAB> file name without ".xml"``.
    Files for which ``extract_criteria`` returns ``None`` produce no row.

    Args:
        root: Folder containing the sub-folders to scan.
        output: Path of the TSV file to (over)write.

    Raises:
        OSError: If *root*, a sub-folder, or a file cannot be listed/opened.
        UnicodeDecodeError: If a scanned file is not valid UTF-8.
    """
    # `with` closes the output even if a read fails part-way through.
    with open(output, "w", encoding="utf8") as tsvFile:
        for subFolder in os.listdir(root):
            folderPath = os.path.join(root, subFolder)
            # os.listdir order is arbitrary, so slicing off "the first two"
            # entries is unreliable. NOTE(review): this assumes the old
            # `[2:]` slice was meant to skip stray non-folder entries.
            if not os.path.isdir(folderPath):
                continue
            for fname in os.listdir(folderPath):
                with open(os.path.join(folderPath, fname), encoding="utf-8") as xmlFile:
                    fileText = xmlFile.read()
                criteria = extract_criteria(fileText)
                if criteria is None:
                    continue  # go to next file
                inclusion, exclusion = criteria
                # Original note: keep the nct number, inclusion/exclusion
                # criteria, item, corresponding drug/intervention. Only the
                # first two plus the file-name stem are written so far.
                recordId = str(fname.replace(".xml", ""))
                tsvFile.write("\t".join((inclusion, exclusion, recordId)) + "\n")


if __name__ == "__main__":
    write_eligibility_tsv()
|
|
54 |
|
|
|
55 |
#check if index = -1, if it does move to next file |
|
|
56 |
#otherwise find the index of the next occurrence of inclusion criteria
|
|
57 |
#then find index next occurrence of exclusion criteria |
|
|
58 |
#then find index of next occurrence of <textblock> |
|
|
59 |
|
|
|
60 |
#use python array indexing to |
|
|
61 |
#remove the "\n"s and "\t"s from those pieces of text
|
|
62 |
|
|
|
63 |
|
|
|
64 |
|
|
|
65 |
#planned row layout: NCT "tab" inclusion "tab" exclusion (the code above currently writes inclusion "tab" exclusion "tab" NCT)
|
|
66 |
|
|
|
67 |
|
|
|
68 |
|
|
|
69 |
|