Diff of /dataprep/ProcessUMLS.java [000000] .. [4fba4e]

Switch to unified view

a b/dataprep/ProcessUMLS.java
1
package dataprep;
2
import java.io.BufferedReader;
3
import java.io.BufferedWriter;
4
import java.io.File;
5
import java.io.FileNotFoundException;
6
import java.io.FileReader;
7
import java.io.FileWriter;
8
import java.io.IOException;
9
import java.util.ArrayList;
10
import java.util.HashMap;
11
import java.util.HashSet;
12
import java.util.List;
13
import java.util.Map;
14
import java.util.Set;
15
16
/**
 * Provides methods for processing UMLS mapped FAERS data.
 *
 * <p>The data file is plain text with one record per line; each line is a
 * comma-separated list of UMLS term ids. The program
 * <ol>
 *   <li>prints the number of distinct terms in the input,</li>
 *   <li>writes a CSV ({@code UMLS_id,Number}) with the number of records each
 *       term appears in,</li>
 *   <li>writes a filtered copy of the input from which too rare / too frequent
 *       terms have been removed.</li>
 * </ol>
 *
 * @author zhengc
 */
public class ProcessUMLS {

    /** Input file used when no path is given on the command line. */
    private static final String DEFAULT_INPUT =
            "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD";

    /** Filtered output file used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT =
            "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_filtered";

    /** Term-count CSV file used when no path is given on the command line. */
    private static final String DEFAULT_ID_NUM_MAP_FILE =
            "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_idnummap";

    /**
     * Runs the whole pipeline.
     *
     * @param args optional paths: {@code [input] [output] [idnummapfile]}; every
     *             missing argument falls back to the built-in default path, so
     *             running without arguments behaves as before
     * @throws IOException if any of the files cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        String input = argOrDefault(args, 0, DEFAULT_INPUT);
        String output = argOrDefault(args, 1, DEFAULT_OUTPUT);
        String idnummapfile = argOrDefault(args, 2, DEFAULT_ID_NUM_MAP_FILE);

        // Compute total terms
        Set<String> terms = getTermSet(input);
        System.out.println(terms.size());

        // Count in how many records each term id appears and dump the counts as CSV
        Map<String, Integer> idnummap = getIdNumMap(input);
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(idnummapfile))) {
            bw.write("UMLS_id" + "," + "Number" + "\n");
            for (Map.Entry<String, Integer> entry : idnummap.entrySet()) {
                bw.write(entry.getKey() + "," + entry.getValue() + "\n");
            }
        }

        System.out.println("Number of total diseases: " + idnummap.size());

        // Set threshold and get processed data
        Set<String> freqset = getFreqTermSet(idnummap, 1, 500000);
        System.out.printf("%d disease will be filtered", freqset.size());
        procUMLS(input, freqset, output);
    }

    /**
     * Returns {@code args[index]} when present, otherwise {@code fallback}.
     */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index) ? args[index] : fallback;
    }

    /**
     * Splits one record into its distinct, non-empty terms.
     *
     * <p>{@code String.split} yields an empty string for a blank line and for
     * consecutive commas; those are not term ids and are dropped here.
     *
     * @param line one comma-separated record
     * @return the distinct non-empty terms of the record
     */
    private static Set<String> uniqueTerms(String line) {
        Set<String> unique = new HashSet<>();
        for (String term : line.split(",")) {
            if (!term.isEmpty()) {
                unique.add(term);
            }
        }
        return unique;
    }

    /**
     * Gets all distinct terms that occur anywhere in the file.
     *
     * @param filename file with one comma-separated record per line
     * @return set of all non-empty terms found in the file
     * @throws IOException if the file cannot be read
     */
    private static Set<String> getTermSet(String filename) throws IOException {
        Set<String> termSet = new HashSet<>();
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String line;
            while ((line = br.readLine()) != null) {
                termSet.addAll(uniqueTerms(line));
            }
        }
        return termSet;
    }

    /**
     * Computes, for each term, the number of records (lines) it occurs in.
     * A term repeated within one record is counted once for that record.
     *
     * @param filename file with one comma-separated record per line
     * @return map from term to the number of records containing it
     * @throws IOException if the file cannot be read
     */
    private static Map<String, Integer> getIdNumMap(String filename) throws IOException {
        Map<String, Integer> idNumMap = new HashMap<>();
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String line;
            while ((line = br.readLine()) != null) {
                for (String term : uniqueTerms(line)) {
                    Integer seen = idNumMap.get(term);
                    idNumMap.put(term, seen == null ? 1 : seen + 1);
                }
            }
        }
        return idNumMap;
    }

    /**
     * Selects the terms that should be removed because they are too rare or
     * too frequent.
     *
     * <p>Both bounds are inclusive: a term is selected when its count is
     * {@code <= minfreq} or {@code >= maxfreq}.
     *
     * @param idNumMap map from term to its record count
     * @param minfreq  lower threshold; counts at or below it are selected
     * @param maxfreq  upper threshold; counts at or above it are selected
     * @return the set of terms to drop from the data
     */
    private static Set<String> getFreqTermSet(Map<String, Integer> idNumMap, int minfreq, int maxfreq) {
        Set<String> freqSet = new HashSet<>();
        for (Map.Entry<String, Integer> entry : idNumMap.entrySet()) {
            int count = entry.getValue();
            if (count <= minfreq || count >= maxfreq) {
                freqSet.add(entry.getKey());
            }
        }
        return freqSet;
    }

    /**
     * Writes a filtered copy of the input file.
     *
     * <p>Each record is reduced to its distinct terms, the terms contained in
     * {@code freqset} are removed, and the remaining terms are written as one
     * comma-separated line. Records with no remaining term are omitted. The
     * order of terms within an output line is the iteration order of a
     * {@code HashSet} and need not match the input order.
     *
     * @param input   file with one comma-separated record per line
     * @param freqset terms to remove from every record
     * @param output  file that receives the filtered records
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    private static void procUMLS(String input, Set<String> freqset, String output) throws IOException {
        try (BufferedReader br = new BufferedReader(new FileReader(input));
             BufferedWriter bw = new BufferedWriter(new FileWriter(output))) {
            String line;
            while ((line = br.readLine()) != null) {
                StringBuilder kept = new StringBuilder();
                for (String term : uniqueTerms(line)) {
                    if (freqset.contains(term)) {
                        continue;
                    }
                    if (kept.length() > 0) {
                        kept.append(',');
                    }
                    kept.append(term);
                }
                // Terms are never empty, so a non-empty builder means at least one term survived.
                if (kept.length() > 0) {
                    bw.write(kept + "\n");
                }
            }
        }
    }

}