import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Provides methods for converting cleaned FAERS data into an arff file.
 *
 * <p>The input is a text file whose first line is skipped and whose remaining
 * lines are comma-separated item lists. The output declares one {@code {0,1}}
 * attribute per distinct item (sorted by name) and writes every input line as
 * one sparse data row listing the attribute indices present on that line.
 *
 * @author zhengc
 */
public class WidthToArff {

    /** Input path used when no command-line arguments are supplied. */
    private static final String DEFAULT_WIDTH_FILE =
            "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_filtered";

    /** Output path used when no command-line arguments are supplied. */
    private static final String DEFAULT_ARFF_FILE =
            "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_filtered_sp.arff";

    /**
     * Command-line entry point.
     *
     * @param args optional; when at least two values are given, {@code args[0]}
     *             is the input file and {@code args[1]} the arff file to write.
     *             Otherwise the built-in default paths are used.
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        boolean pathsGiven = args != null && args.length >= 2;
        String widthfile = pathsGiven ? args[0] : DEFAULT_WIDTH_FILE;
        String arfffile = pathsGiven ? args[1] : DEFAULT_ARFF_FILE;
        convert(widthfile, arfffile);
    }

    /**
     * Reads {@code widthfile} and writes the corresponding sparse arff file.
     * The number of distinct items found is printed to standard output.
     *
     * @param widthfile path of the comma-separated input file (first line is skipped)
     * @param arfffile  path of the arff file to create or overwrite
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void convert(String widthfile, String arfffile) throws IOException {
        // First pass: collect the distinct items and fix their attribute order.
        Set<String> itemset = getItemSet(widthfile);
        System.out.println(itemset.size());
        List<String> itemlist = new ArrayList<String>(itemset);
        Collections.sort(itemlist);

        // Name -> attribute index lookup, so rows do not need a linear search per item.
        Map<String, Integer> positions = new HashMap<String, Integer>();
        for (int i = 0; i < itemlist.size(); i++) {
            positions.put(itemlist.get(i), i);
        }

        // Second pass: both streams are closed even if reading or writing fails.
        try (BufferedReader br = new BufferedReader(new FileReader(new File(widthfile)));
             BufferedWriter bw = new BufferedWriter(new FileWriter(new File(arfffile)))) {

            // write header info
            bw.write("@relation " + "widthfile" + "\n");
            bw.write("\n");
            for (String item : itemlist) {
                bw.write("@attribute " + quoteAttribute(item) + " {0,1}" + "\n");
            }

            // write data
            bw.write("\n");
            bw.write("@data" + "\n");

            br.readLine(); // the first line of the input is skipped, as in getItemSet
            String line;
            while ((line = br.readLine()) != null) {
                bw.write(toSparseRow(line, positions) + "\n");
            }
        }
    }

    /**
     * Builds one sparse data row, e.g. <code>{0 1,4 1}</code>, for an input line.
     * Duplicate items on the line are emitted once and indices are ascending.
     */
    private static String toSparseRow(String line, Map<String, Integer> positions) {
        Set<Integer> present = new HashSet<Integer>();
        for (String part : line.split(",")) {
            Integer index = positions.get(part);
            if (index == null) {
                // Only possible if the file changed between the two passes.
                throw new IllegalStateException("Item not seen while collecting attributes: " + part);
            }
            present.add(index);
        }
        List<Integer> ordered = new ArrayList<Integer>(present);
        Collections.sort(ordered);

        StringBuilder row = new StringBuilder("{");
        for (int i = 0; i < ordered.size(); i++) {
            if (i > 0) {
                row.append(',');
            }
            row.append(ordered.get(i)).append(" 1");
        }
        return row.append('}').toString();
    }

    /**
     * Wraps an attribute name in single quotes, backslash-escaping any
     * backslash or single quote inside it so the quoted name stays well formed.
     */
    private static String quoteAttribute(String name) {
        return "'" + name.replace("\\", "\\\\").replace("'", "\\'") + "'";
    }

    /**
     * Maps every distinct comma-separated item in the file to the order in
     * which it was first seen (0-based). Unlike {@link #getItemSet(String)},
     * this method does not skip the first line. Currently not called.
     *
     * @param filename path of the file to scan
     * @return item to first-appearance index
     * @throws IOException if the file cannot be read
     */
    private static Map<String, Integer> getItemIndex(String filename) throws IOException {
        Map<String, Integer> itemindex = new HashMap<String, Integer>();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
            String line;
            int m = 0;
            while ((line = br.readLine()) != null) {
                for (String part : line.split(",")) {
                    if (!itemindex.containsKey(part)) {
                        itemindex.put(part, m);
                        m++;
                    }
                }
            }
        }
        return itemindex;
    }

    /**
     * Collects the distinct comma-separated items found in the file,
     * skipping its first line.
     *
     * @param filename path of the file to scan
     * @return the set of distinct items
     * @throws IOException if the file cannot be read
     */
    private static Set<String> getItemSet(String filename) throws IOException {
        Set<String> itemset = new HashSet<String>();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
            br.readLine(); // skip the first line
            String line;
            while ((line = br.readLine()) != null) {
                for (String part : line.split(",")) {
                    itemset.add(part);
                }
            }
        }
        return itemset;
    }
}