Diff of /scripts/XmlFiles.py [000000] .. [c09aa8]

Switch to unified view

a b/scripts/XmlFiles.py
1
import os
2
3
# Folder holding the ClinicalTrials.gov "AllPublicXml" dump: one level of
# sub-folders, each containing per-trial .xml files.
rootPath = "/Users/sriramnarayanan/Downloads/AllPublicXml/"

# Tabs and newlines are the TSV delimiters, so they must not survive inside
# a cell.  NOTE: lines are joined with no separator, exactly as before.
_DELIMITERS = str.maketrans("", "", "\n\t")


def _strip_markup(text):
    """Return *text* without newlines, tabs, or the opening criteria tags."""
    flattened = text.translate(_DELIMITERS)
    return flattened.replace("<criteria>", "").replace("<textblock>", "")


def extract_criteria(file_text):
    """Split the eligibility criteria of one trial record.

    The criteria block is the text from the first ``<criteria>`` up to the
    next ``</textblock>``.  It is split at the first occurrence of
    ``"Exclusion Criteria"`` inside that block.

    Args:
        file_text: Full text of one trial XML file.

    Returns:
        ``(inclusion, exclusion)`` as cleaned single-line strings, or
        ``None`` when the file has no ``<criteria>`` tag or the block is
        never closed.  ``exclusion`` is ``""`` when the block contains no
        ``"Exclusion Criteria"`` heading.
    """
    start = file_text.find("<criteria>")
    if start == -1:
        return None

    end = file_text.find("</textblock>", start)
    if end == -1:
        return None

    # Bound the search to the criteria block: a match further down the
    # document belongs to some other section and must not be used as the
    # split point.
    split = file_text.find("Exclusion Criteria", start, end)
    if split == -1:
        split = end  # everything is inclusion text, exclusion stays empty

    inclusion = _strip_markup(file_text[start:split])
    exclusion = _strip_markup(file_text[split:end])
    return inclusion, exclusion


def main(root=rootPath, out_path="Eligibility.tsv"):
    """Write one TSV row per trial: inclusion, exclusion, file name stem.

    Args:
        root: Directory whose sub-folders contain the trial ``.xml`` files.
        out_path: Destination TSV file (overwritten).

    Returns:
        The number of rows written.
    """
    rows = 0
    with open(out_path, "w", encoding="utf8") as tsv_file:
        # os.listdir() order is arbitrary, so skipping entries by position
        # is unreliable; skip non-directories explicitly and sort so the
        # output order is reproducible.
        for sub_folder in sorted(os.listdir(root)):
            folder_path = os.path.join(root, sub_folder)
            if not os.path.isdir(folder_path):
                continue

            for fname in sorted(os.listdir(folder_path)):
                if not fname.endswith(".xml"):
                    continue  # e.g. .DS_Store or other stray files

                xml_path = os.path.join(folder_path, fname)
                with open(xml_path, encoding="utf-8") as xml_file:
                    file_text = xml_file.read()

                criteria = extract_criteria(file_text)
                if criteria is None:
                    continue

                inclusion, exclusion = criteria
                trial_id = fname.replace(".xml", "")
                tsv_file.write(f"{inclusion}\t{exclusion}\t{trial_id}\n")
                rows += 1
    return rows


if __name__ == "__main__":
    main()
54
55
#check if index = -1, if it does move to next file
56
#otherwise find the index of the next occurrence of inclusion criteria
57
#then find index next occurrence of exclusion criteria
58
#then find index of next occurrence of <textblock>
59
60
#use python array indexing to 
61
#remove the "\n"s and "\t"s from those pieces of text
62
63
64
65
#NCT "tab" inclusion "tab" exclusion
66
67
68
69