Diff of /dataprep/WidthToArff.java [000000] .. [4fba4e]

Switch to side-by-side view

--- a
+++ b/dataprep/WidthToArff.java
@@ -0,0 +1,116 @@
+package dataprep;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Converts cleaned FAERS "width" data into a sparse-format arff file.
+ * <p>
+ * Every input line after the first (the first line is skipped, presumably a
+ * header row) is split on commas. Each distinct value becomes one
+ * <code>{0,1}</code> attribute, and each line becomes one sparse data row
+ * listing the attributes that occur on it.
+ *
+ * @author zhengc
+ */
+public class WidthToArff {
+
+	/** Input path used when no command-line argument is supplied. */
+	private static final String DEFAULT_WIDTH_FILE = "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_filtered";
+
+	/** Output path used when fewer than two command-line arguments are supplied. */
+	private static final String DEFAULT_ARFF_FILE = "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_filtered_sp.arff";
+
+	/**
+	 * Writes the arff header (one attribute per distinct item, sorted) followed
+	 * by one sparse row per input line.
+	 *
+	 * @param args optional; args[0] is the input width file and args[1] the
+	 *             output arff file. Missing entries fall back to the defaults.
+	 * @throws IOException if the input cannot be read or the output cannot be written
+	 */
+	public static void main(String[] args) throws IOException {
+		String widthfile = (args != null && args.length > 0) ? args[0] : DEFAULT_WIDTH_FILE;
+		String arfffile = (args != null && args.length > 1) ? args[1] : DEFAULT_ARFF_FILE;
+
+		// First pass: collect and sort the attribute names.
+		Set<String> itemset = getItemSet(widthfile);
+		System.out.println(itemset.size());
+		List<String> itemlist = new ArrayList<String>(itemset);
+		Collections.sort(itemlist);
+
+		// Map each attribute to its position once, instead of a linear
+		// List.indexOf search for every value of every row.
+		Map<String, Integer> position = new HashMap<String, Integer>();
+		for (int i = 0; i < itemlist.size(); i++) {
+			position.put(itemlist.get(i), i);
+		}
+
+		// Second pass: write header and data. try/finally guarantees both
+		// streams are closed even when reading or writing fails.
+		BufferedReader br = new BufferedReader(new FileReader(new File(widthfile)));
+		try {
+			BufferedWriter bw = new BufferedWriter(new FileWriter(new File(arfffile)));
+			try {
+				// write header info
+				bw.write("@relation " + "widthfile" + "\n");
+				bw.write("\n");
+				for (String item : itemlist) {
+					bw.write("@attribute " + "'" + escapeQuoted(item) + "'" + " {0,1}" + "\n");
+				}
+
+				// write data
+				bw.write("\n");
+				bw.write("@data" + "\n");
+				br.readLine(); // skip the first line, exactly as getItemSet does
+				String line;
+				while ((line = br.readLine()) != null) {
+					bw.write(toSparseRow(line, position) + "\n");
+				}
+			} finally {
+				bw.close();
+			}
+		} finally {
+			br.close();
+		}
+	}
+
+	/**
+	 * Builds one sparse data row of the form <code>{i 1,j 1,...}</code> from the
+	 * distinct comma-separated values of a line.
+	 *
+	 * @param line     one input record
+	 * @param position attribute name to attribute index
+	 * @return the sparse row, without a trailing newline
+	 * @throws IllegalStateException if a value has no attribute index
+	 */
+	private static String toSparseRow(String line, Map<String, Integer> position) {
+		// get unique indications
+		Set<String> unique = new HashSet<String>();
+		Collections.addAll(unique, line.split(","));
+		// Sorting the names the same way the attribute list was sorted
+		// yields attribute indices in ascending order.
+		List<String> sorted = new ArrayList<String>(unique);
+		Collections.sort(sorted);
+
+		StringBuilder sb = new StringBuilder("{");
+		for (int i = 0; i < sorted.size(); i++) {
+			Integer index = position.get(sorted.get(i));
+			if (index == null) {
+				throw new IllegalStateException("No attribute for item: " + sorted.get(i));
+			}
+			if (i > 0) {
+				sb.append(",");
+			}
+			sb.append(index).append(" ").append("1");
+		}
+		return sb.append("}").toString();
+	}
+
+	/**
+	 * Backslash-escapes backslashes and single quotes so the name can be placed
+	 * between single quotes; an unescaped quote would end the quoted name early.
+	 *
+	 * @param name raw attribute name
+	 * @return the name, safe to wrap in single quotes
+	 */
+	private static String escapeQuoted(String name) {
+		return name.replace("\\", "\\\\").replace("'", "\\'");
+	}
+
+	/**
+	 * Assigns each distinct comma-separated value an index in order of first
+	 * appearance. Currently not called from main. Note that, unlike
+	 * getItemSet, this method does NOT skip the first line of the file.
+	 *
+	 * @param filename path of the file to scan
+	 * @return map from value to its first-appearance index, starting at 0
+	 * @throws IOException if the file cannot be read
+	 */
+	private static Map<String, Integer> getItemIndex(String filename) throws IOException {
+		Map<String, Integer> itemindex = new HashMap<String, Integer>();
+		BufferedReader br = new BufferedReader(new FileReader(new File(filename)));
+		try {
+			String line;
+			int next = 0;
+			while ((line = br.readLine()) != null) {
+				for (String part : line.split(",")) {
+					if (!itemindex.containsKey(part)) {
+						itemindex.put(part, next);
+						next++;
+					}
+				}
+			}
+		} finally {
+			br.close();
+		}
+		return itemindex;
+	}
+
+	/**
+	 * Collects the distinct comma-separated values found on every line except
+	 * the first, which is skipped.
+	 *
+	 * @param filename path of the file to scan
+	 * @return the set of distinct values
+	 * @throws IOException if the file cannot be read
+	 */
+	private static Set<String> getItemSet(String filename) throws IOException {
+		Set<String> itemset = new HashSet<String>();
+		BufferedReader br = new BufferedReader(new FileReader(new File(filename)));
+		try {
+			br.readLine(); // skip the first line
+			String line;
+			while ((line = br.readLine()) != null) {
+				Collections.addAll(itemset, line.split(","));
+			}
+		} finally {
+			br.close();
+		}
+		return itemset;
+	}
+}
+