[4fba4e]: / dataprep / ProcessUMLS.java

Download this file

158 lines (145 with data), 4.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
package dataprep;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Provides methods for processing UMLS mapped FAERS data.
 *
 * <p>Every input line is treated as a comma-separated list of terms. The class
 * counts in how many lines each term occurs, selects the terms whose count is at
 * or beyond the configured bounds, and writes a copy of the input with those
 * terms removed.
 *
 * @author zhengc
 */
public class ProcessUMLS {

    /**
     * Runs the whole pipeline on the hard-coded input file: prints the number of
     * distinct terms, writes the per-term line counts as a CSV file, and writes
     * the filtered copy of the input.
     *
     * @param args ignored
     * @throws IOException if any of the files cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        String input = "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD";
        String output = "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_filtered";
        String idnummapfile = "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_idnummap";

        // Compute total terms
        Set<String> terms = getTermSet(input);
        System.out.println(terms.size());

        // Count in how many lines each term ID appears and dump the counts as CSV
        Map<String, Integer> idnummap = getIdNumMap(input);
        // try-with-resources: the writer is flushed and closed even if a write fails
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(idnummapfile)))) {
            bw.write("UMLS_id,Number\n");
            for (Map.Entry<String, Integer> entry : idnummap.entrySet()) {
                bw.write(entry.getKey() + "," + entry.getValue() + "\n");
            }
        }
        System.out.println("Number of total diseases: " + idnummap.size());

        // Set threshold and get processed data
        Set<String> freqset = getFreqTermSet(idnummap, 1, 500000);
        System.out.printf("%d disease will be filtered", freqset.size());
        procUMLS(input, freqset, output);
    }

    /**
     * Gets all distinct terms that occur anywhere in the file.
     *
     * @param filename file whose lines are comma-separated term lists
     * @return set of every distinct term found in the file
     * @throws IOException if the file cannot be read
     */
    private static Set<String> getTermSet(String filename) throws IOException {
        Set<String> termSet = new HashSet<String>();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
            String line;
            while ((line = br.readLine()) != null) {
                termSet.addAll(uniqueTerms(line));
            }
        }
        return termSet;
    }

    /**
     * Computes, for each term, the number of lines it occurs in. A term that is
     * repeated within one line is counted once for that line.
     *
     * @param filename file whose lines are comma-separated term lists
     * @return map from term to the number of lines containing it
     * @throws IOException if the file cannot be read
     */
    private static Map<String, Integer> getIdNumMap(String filename) throws IOException {
        Map<String, Integer> idNumMap = new HashMap<String, Integer>();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
            String line;
            while ((line = br.readLine()) != null) {
                for (String term : uniqueTerms(line)) {
                    Integer seen = idNumMap.get(term);
                    idNumMap.put(term, seen == null ? 1 : seen + 1);
                }
            }
        }
        return idNumMap;
    }

    /**
     * Selects the terms that are too rare or too common, i.e. the terms that
     * should be removed from the data.
     *
     * @param idNumMap map from term to its number of occurrences
     * @param minfreq lower bound; terms with a count less than or equal to it are selected
     * @param maxfreq upper bound; terms with a count greater than or equal to it are selected
     * @return set of terms whose count is {@code <= minfreq} or {@code >= maxfreq}
     */
    private static Set<String> getFreqTermSet(Map<String, Integer> idNumMap, int minfreq, int maxfreq) {
        Set<String> freqSet = new HashSet<String>();
        for (Map.Entry<String, Integer> entry : idNumMap.entrySet()) {
            int count = entry.getValue();
            if (count <= minfreq || count >= maxfreq) {
                freqSet.add(entry.getKey());
            }
        }
        return freqSet;
    }

    /**
     * Writes a filtered copy of the input. For every input line the duplicate
     * terms and the terms contained in {@code freqset} are dropped; the remaining
     * terms are written comma-separated in their original order. Lines with no
     * remaining term are omitted from the output.
     *
     * @param input file whose lines are comma-separated term lists
     * @param freqset terms to remove from every line
     * @param output file the filtered lines are written to
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    private static void procUMLS(String input, Set<String> freqset, String output) throws IOException {
        // The reader is opened first so a missing input fails before the output file is created.
        try (BufferedReader br = new BufferedReader(new FileReader(new File(input)));
                BufferedWriter bw = new BufferedWriter(new FileWriter(new File(output)))) {
            String line;
            while ((line = br.readLine()) != null) {
                List<String> kept = new ArrayList<String>();
                for (String term : uniqueTerms(line)) {
                    if (!freqset.contains(term)) {
                        kept.add(term);
                    }
                }
                if (kept.isEmpty()) {
                    continue;
                }
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < kept.size(); i++) {
                    if (i > 0) {
                        sb.append(',');
                    }
                    sb.append(kept.get(i));
                }
                sb.append('\n');
                bw.write(sb.toString());
            }
        }
    }

    /**
     * Splits a line on commas and removes duplicate terms.
     *
     * @param line one comma-separated line of terms
     * @return the distinct terms of the line, in order of first appearance
     */
    private static Set<String> uniqueTerms(String line) {
        // LinkedHashSet keeps first-appearance order, so results do not depend on hash order.
        Set<String> unique = new LinkedHashSet<String>();
        for (String term : line.split(",")) {
            unique.add(term);
        }
        return unique;
    }
}