|
a |
|
b/dataprep/ProcessUMLS.java |
|
|
1 |
package dataprep; |
|
|
2 |
import java.io.BufferedReader; |
|
|
3 |
import java.io.BufferedWriter; |
|
|
4 |
import java.io.File; |
|
|
5 |
import java.io.FileNotFoundException; |
|
|
6 |
import java.io.FileReader; |
|
|
7 |
import java.io.FileWriter; |
|
|
8 |
import java.io.IOException; |
|
|
9 |
import java.util.ArrayList; |
|
|
10 |
import java.util.HashMap; |
|
|
11 |
import java.util.HashSet; |
|
|
12 |
import java.util.List; |
|
|
13 |
import java.util.Map; |
|
|
14 |
import java.util.Set; |
|
|
15 |
|
|
|
16 |
/**
 * Provides methods for processing UMLS mapped FAERS data.
 *
 * <p>The input file is read as text where every line is a comma-separated
 * list of UMLS term ids. Empty tokens (blank lines, {@code "a,,b"}) are not
 * treated as terms anywhere in this class.
 *
 * @author zhengc
 */
public class ProcessUMLS {

    /** Input used when {@link #main(String[])} is run without arguments. */
    private static final String DEFAULT_INPUT =
            "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD";

    /** Filtered-data output used when {@link #main(String[])} is run without arguments. */
    private static final String DEFAULT_OUTPUT =
            "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_filtered";

    /** Id/number map output used when {@link #main(String[])} is run without arguments. */
    private static final String DEFAULT_ID_NUM_MAP_FILE =
            "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_idnummap";

    /** Terms occurring on this many lines or fewer are filtered out. */
    private static final int MIN_FREQ = 1;

    /** Terms occurring on this many lines or more are filtered out. */
    private static final int MAX_FREQ = 500000;

    /**
     * Runs the whole pipeline: prints the number of distinct terms, writes the
     * per-term line counts as CSV, then writes the input with too rare / too
     * frequent terms removed.
     *
     * @param args either empty (the built-in default paths are used) or exactly
     *             three paths: {@code <input> <output> <idnummapfile>}
     * @throws IOException if any of the files cannot be read or written
     * @throws IllegalArgumentException if one or two, or more than three, arguments are given
     */
    public static void main(String[] args) throws IOException {
        String input = DEFAULT_INPUT;
        String output = DEFAULT_OUTPUT;
        String idnummapfile = DEFAULT_ID_NUM_MAP_FILE;
        if (args.length == 3) {
            input = args[0];
            output = args[1];
            idnummapfile = args[2];
        } else if (args.length != 0) {
            throw new IllegalArgumentException(
                    "Usage: ProcessUMLS [<input> <output> <idnummapfile>]");
        }

        // Compute total terms
        Set<String> terms = getTermSet(input);
        System.out.println(terms.size());

        // Count, for each term id, the number of lines it appears on
        Map<String, Integer> idnummap = getIdNumMap(input);
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(idnummapfile)))) {
            bw.write("UMLS_id" + "," + "Number" + "\n");
            for (Map.Entry<String, Integer> entry : idnummap.entrySet()) {
                bw.write(entry.getKey() + "," + entry.getValue() + "\n");
            }
        }

        System.out.println("Number of total diseases: " + idnummap.size());

        // Set threshold and get processed data
        Set<String> freqset = getFreqTermSet(idnummap, MIN_FREQ, MAX_FREQ);
        System.out.printf("%d disease will be filtered%n", freqset.size());
        procUMLS(input, freqset, output);
    }

    /**
     * Splits one comma-separated line into its distinct, non-empty terms.
     *
     * @param line one line of the input file
     * @return the distinct non-empty terms in order of first occurrence
     */
    private static List<String> splitTerms(String line) {
        List<String> unique = new ArrayList<String>();
        Set<String> seen = new HashSet<String>();
        for (String term : line.split(",")) {
            // "".split(",") yields [""], and "a,,b" yields an empty middle
            // token; neither is a term id.
            if (!term.isEmpty() && seen.add(term)) {
                unique.add(term);
            }
        }
        return unique;
    }

    /**
     * Get all distinct disease terms that appear anywhere in a file.
     *
     * @param filename file with one comma-separated list of terms per line
     * @return disease term set
     * @throws IOException if the file cannot be opened or read
     */
    private static Set<String> getTermSet(String filename) throws IOException {
        Set<String> termSet = new HashSet<String>();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
            String line;
            while ((line = br.readLine()) != null) {
                termSet.addAll(splitTerms(line));
            }
        }
        return termSet;
    }

    /**
     * Compute the number of occurrence of each disease term in FAERS data.
     * A term repeated within one line is counted once for that line, so the
     * value is the number of lines containing the term.
     *
     * @param filename cleaned, UMLS mapped FAERS file
     * @return id_number map whose key is the disease UMLS term and whose value
     *         is the number of lines the term occurs on
     * @throws IOException if the file cannot be opened or read
     */
    private static Map<String, Integer> getIdNumMap(String filename) throws IOException {
        Map<String, Integer> idNumMap = new HashMap<String, Integer>();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
            String line;
            while ((line = br.readLine()) != null) {
                for (String term : splitTerms(line)) {
                    Integer count = idNumMap.get(term);
                    idNumMap.put(term, count == null ? 1 : count + 1);
                }
            }
        }
        return idNumMap;
    }

    /**
     * Select the disease terms that are too rare or too frequent, i.e. the
     * terms that should be removed from the data.
     *
     * @param idNumMap id_number map
     * @param minfreq lower threshold; terms with a count {@code <= minfreq} are selected
     * @param maxfreq upper threshold; terms with a count {@code >= maxfreq} are selected
     * @return the set of terms to filter out
     */
    private static Set<String> getFreqTermSet(Map<String, Integer> idNumMap, int minfreq, int maxfreq) {
        Set<String> freqSet = new HashSet<String>();
        for (Map.Entry<String, Integer> entry : idNumMap.entrySet()) {
            int value = entry.getValue();
            if (value <= minfreq || value >= maxfreq) {
                freqSet.add(entry.getKey());
            }
        }
        return freqSet;
    }

    /**
     * Gets filtered FAERS data. Every input line is rewritten with duplicate
     * terms and the terms in {@code freqset} removed; remaining terms keep
     * their input order. Lines with no remaining term are not written.
     *
     * @param input cleaned, UMLS mapped FAERS data
     * @param freqset disease terms to remove
     * @param output file the filtered FAERS data is written to
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    private static void procUMLS(String input, Set<String> freqset, String output) throws IOException {
        // The reader is opened first so a missing input fails before the
        // output file is created or truncated.
        try (BufferedReader br = new BufferedReader(new FileReader(new File(input)));
                BufferedWriter bw = new BufferedWriter(new FileWriter(new File(output)))) {
            String line;
            while ((line = br.readLine()) != null) {
                StringBuilder sb = new StringBuilder();
                for (String term : splitTerms(line)) {
                    if (!freqset.contains(term)) {
                        if (sb.length() > 0) {
                            sb.append(',');
                        }
                        sb.append(term);
                    }
                }
                if (sb.length() > 0) {
                    bw.write(sb.toString());
                    bw.write("\n");
                }
            }
        }
    }

}