--- a
+++ b/dataprep/ProcessUMLS.java
@@ -0,0 +1,157 @@
+package dataprep;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Filters UMLS mapped FAERS data by how often each disease term occurs.
+ * Each input line is one record: a comma-separated list of UMLS term ids.
+ * Empty fields are ignored; a term repeated within a record is counted once.
+ * @author zhengc
+ */
+public class ProcessUMLS {
+
+	// Removal thresholds: terms in at most MIN_FREQ or at least MAX_FREQ records are dropped.
+	private static final int MIN_FREQ = 1;
+	private static final int MAX_FREQ = 500000;
+
+	/**
+	 * Counts the terms, saves the per-term record counts as CSV, then writes a copy
+	 * of the input without the terms that fall outside the frequency thresholds.
+	 * @param args optional paths, in order: input file, filtered output file,
+	 *             term-count CSV file; a missing path falls back to its built-in default
+	 * @throws IOException if a file cannot be read or written
+	 */
+	public static void main(String[] args) throws IOException {
+		String input = args.length > 0 ? args[0] : "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD";
+		String output = args.length > 1 ? args[1] : "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_filtered";
+		String idnummapfile = args.length > 2 ? args[2] : "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_idnummap";
+
+		// Compute total terms
+		Set<String> terms = getTermSet(input);
+		System.out.println(terms.size());
+
+		// Save the number of records each term id appears in
+		Map<String, Integer> idnummap = getIdNumMap(input);
+		try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(idnummapfile)))) {
+			bw.write("UMLS_id" + "," + "Number" + "\n");
+			for (Map.Entry<String, Integer> entry : idnummap.entrySet()) {
+				bw.write(entry.getKey() + "," + entry.getValue() + "\n");
+			}
+		}
+		System.out.println("Number of total diseases: " + idnummap.size());
+
+		// Pick the terms outside the thresholds and write the data without them
+		Set<String> freqset = getFreqTermSet(idnummap, MIN_FREQ, MAX_FREQ);
+		System.out.printf("%d disease will be filtered%n", freqset.size());
+		procUMLS(input, freqset, output);
+	}
+
+	/**
+	 * Splits one record into its distinct terms.
+	 * @param line one comma-separated record
+	 * @return distinct non-empty terms of the record, in no particular order
+	 */
+	private static Set<String> uniqueTerms(String line) {
+		Set<String> unique = new HashSet<String>();
+		for (String term : line.split(",")) {
+			// A blank line or a doubled comma yields "", which is not a term id.
+			if (!term.isEmpty()) {
+				unique.add(term);
+			}
+		}
+		return unique;
+	}
+
+	/**
+	 * Gets all disease terms.
+	 * @param filename cleaned, UMLS mapped FAERS file
+	 * @return set of every distinct term in the file
+	 * @throws IOException if the file cannot be read
+	 */
+	private static Set<String> getTermSet(String filename) throws IOException {
+		Set<String> termSet = new HashSet<String>();
+		try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
+			String line;
+			while ((line = br.readLine()) != null) {
+				termSet.addAll(uniqueTerms(line));
+			}
+		}
+		return termSet;
+	}
+
+	/**
+	 * Computes the number of records each disease term occurs in.
+	 * @param filename cleaned, UMLS mapped FAERS file
+	 * @return map from disease UMLS term to the number of records (lines) containing it
+	 * @throws IOException if the file cannot be read
+	 */
+	private static Map<String, Integer> getIdNumMap(String filename) throws IOException {
+		Map<String, Integer> idNumMap = new HashMap<String, Integer>();
+		try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
+			String line;
+			while ((line = br.readLine()) != null) {
+				for (String term : uniqueTerms(line)) {
+					Integer seen = idNumMap.get(term);
+					idNumMap.put(term, seen == null ? 1 : seen + 1);
+				}
+			}
+		}
+		return idNumMap;
+	}
+
+	/**
+	 * Selects the terms to remove: those that occur too rarely or too often.
+	 * @param idNumMap map from term to the number of records containing it
+	 * @param minfreq lower threshold; a count at or below it marks the term for removal
+	 * @param maxfreq upper threshold; a count at or above it marks the term for removal
+	 * @return set of terms to filter out of the data
+	 */
+	private static Set<String> getFreqTermSet(Map<String, Integer> idNumMap, int minfreq, int maxfreq) {
+		Set<String> freqSet = new HashSet<String>();
+		for (Map.Entry<String, Integer> entry : idNumMap.entrySet()) {
+			int count = entry.getValue();
+			if (count <= minfreq || count >= maxfreq) {
+				freqSet.add(entry.getKey());
+			}
+		}
+		return freqSet;
+	}
+
+	/**
+	 * Writes the filtered FAERS data.
+	 * @param input cleaned, UMLS mapped FAERS data
+	 * @param freqset terms to leave out of the output
+	 * @param output file that receives one line of distinct remaining terms per record
+	 * @throws IOException if a file cannot be read or written
+	 */
+	private static void procUMLS(String input, Set<String> freqset,  String output) throws IOException {
+		try (BufferedReader br = new BufferedReader(new FileReader(new File(input)));
+				BufferedWriter bw = new BufferedWriter(new FileWriter(new File(output)))) {
+			String line;
+			while ((line = br.readLine()) != null) {
+				StringBuilder sb = new StringBuilder();
+				for (String term : uniqueTerms(line)) {
+					if (!freqset.contains(term)) {
+						sb.append(sb.length() > 0 ? "," : "").append(term);
+					}
+				}
+				// A record left with no terms is skipped rather than written as a blank line.
+				if (sb.length() > 0) {
+					bw.write(sb + "\n");
+				}
+			}
+		}
+	}
+
+}