[4fba4e]: / dataprep / WidthToArff.java

Download this file

117 lines (103 with data), 3.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
package dataprep;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Converts cleaned FAERS data stored as a comma-separated "width" file (one
 * record per line, one item per field) into a sparse ARFF file that declares
 * one binary {0,1} attribute per distinct item.
 *
 * <p>The first line of the input is skipped, both when collecting the items and
 * when writing the data rows (presumably a header row -- confirm against the data).
 *
 * @author zhengc
 */
public class WidthToArff {

    /** Input path used when no paths are supplied on the command line. */
    private static final String DEFAULT_WIDTH_FILE =
            "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_filtered";

    /** Output path used when no paths are supplied on the command line. */
    private static final String DEFAULT_ARFF_FILE =
            "/Users/zhengc/workspace/FARES/data/FARES/UMLS_map_data/ID_indications_all_clean_width_umls_id_diso_AD_filtered_sp.arff";

    /**
     * Reads the width file and writes the corresponding sparse ARFF file.
     *
     * @param args optional; when at least two arguments are given, {@code args[0]}
     *             is the input width file and {@code args[1]} the output ARFF
     *             file. Otherwise the built-in default paths are used.
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        String widthfile = DEFAULT_WIDTH_FILE;
        String arfffile = DEFAULT_ARFF_FILE;
        if (args != null && args.length >= 2) {
            widthfile = args[0];
            arfffile = args[1];
        }

        // First pass: collect every distinct item and fix its attribute position.
        Set<String> itemset = getItemSet(widthfile);
        System.out.println(itemset.size());
        List<String> itemlist = new ArrayList<String>(itemset);
        Collections.sort(itemlist);

        // Name -> attribute index, so each row token is resolved in O(1)
        // instead of a linear List.indexOf scan.
        Map<String, Integer> itemindex = new HashMap<String, Integer>();
        for (int i = 0; i < itemlist.size(); i++) {
            itemindex.put(itemlist.get(i), i);
        }

        // Second pass: emit header and data. try-with-resources closes both
        // streams even when reading or writing fails part-way through.
        try (BufferedReader br = new BufferedReader(new FileReader(new File(widthfile)));
             BufferedWriter bw = new BufferedWriter(new FileWriter(new File(arfffile)))) {
            bw.write("@relation " + "widthfile" + "\n");
            bw.write("\n");
            for (String item : itemlist) {
                bw.write("@attribute " + quoteAttribute(item) + " {0,1}" + "\n");
            }
            bw.write("\n");
            bw.write("@data" + "\n");

            br.readLine(); // skip the first line, exactly as getItemSet does
            String line;
            while ((line = br.readLine()) != null) {
                bw.write(toSparseRow(line, itemindex) + "\n");
            }
        }
    }

    /**
     * Wraps an attribute name in single quotes, backslash-escaping any backslash
     * or single quote inside it so the declaration stays well-formed.
     */
    private static String quoteAttribute(String name) {
        return "'" + name.replace("\\", "\\\\").replace("'", "\\'") + "'";
    }

    /**
     * Builds one sparse ARFF data row, e.g. {@code {0 1,3 1}}, from a
     * comma-separated input line. Duplicate items on the line are collapsed.
     *
     * @param line      one comma-separated input record
     * @param itemindex attribute index of every known item
     * @return the sparse row, without a line terminator
     * @throws IllegalStateException if the line holds an item missing from
     *                               {@code itemindex} (the input changed between passes)
     */
    private static String toSparseRow(String line, Map<String, Integer> itemindex) {
        Set<String> unique = new HashSet<String>();
        Collections.addAll(unique, line.split(","));

        // Attribute indices follow the lexicographic order of the item names,
        // so sorting the names yields ascending indices within the row.
        List<String> sorted = new ArrayList<String>(unique);
        Collections.sort(sorted);

        StringBuilder row = new StringBuilder("{");
        for (int i = 0; i < sorted.size(); i++) {
            Integer index = itemindex.get(sorted.get(i));
            if (index == null) {
                throw new IllegalStateException("Unknown item in data row: " + sorted.get(i));
            }
            if (i > 0) {
                row.append(',');
            }
            row.append(index.intValue()).append(" 1");
        }
        return row.append('}').toString();
    }

    /**
     * Assigns each distinct item in the file a running index in order of first
     * appearance. Unlike {@link #getItemSet(String)}, this reads every line,
     * including the first one. Currently not called from {@code main}.
     *
     * @param filename path of the comma-separated input file
     * @return map from item to its first-appearance index
     * @throws IOException if the file cannot be read
     */
    private static Map<String, Integer> getItemIndex(String filename) throws IOException {
        Map<String, Integer> itemindex = new HashMap<String, Integer>();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
            String line;
            int next = 0;
            while ((line = br.readLine()) != null) {
                for (String part : line.split(",")) {
                    if (!itemindex.containsKey(part)) {
                        itemindex.put(part, next);
                        next++;
                    }
                }
            }
        }
        return itemindex;
    }

    /**
     * Collects the distinct items found in the file, skipping its first line.
     *
     * @param filename path of the comma-separated input file
     * @return the set of distinct items
     * @throws IOException if the file cannot be read
     */
    private static Set<String> getItemSet(String filename) throws IOException {
        Set<String> itemset = new HashSet<String>();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
            br.readLine(); // skip the first line
            String line;
            while ((line = br.readLine()) != null) {
                Collections.addAll(itemset, line.split(","));
            }
        }
        return itemset;
    }
}