|
a |
|
b/assoc_rules/ProcRules.java |
|
|
1 |
package assoc_rules; |
|
|
2 |
import java.io.BufferedReader; |
|
|
3 |
import java.io.BufferedWriter; |
|
|
4 |
import java.io.File; |
|
|
5 |
import java.io.FileReader; |
|
|
6 |
import java.io.FileWriter; |
|
|
7 |
import java.io.IOException; |
|
|
8 |
import java.util.regex.Matcher; |
|
|
9 |
import java.util.regex.Pattern; |
|
|
10 |
|
|
|
11 |
/**
 * Converts mined association rules (Weka FPGrowth text output) into a
 * pipe-delimited table that is easier to consume in downstream analysis.
 *
 * <p>Every rule line is rewritten in four steps:
 * <ol>
 *   <li>strip decoration such as {@code =1}, {@code <conf:}, {@code ==>}, parentheses
 *       and the leading rule counter;</li>
 *   <li>protect the spaces that occur inside bracketed feature lists;</li>
 *   <li>turn every remaining run of spaces into a single {@code |};</li>
 *   <li>restore the protected spaces.</li>
 * </ol>
 *
 * @author zhengc
 */
public class ProcRules {

    /** Step 1: decoration removed from every rule line before it is split into columns. */
    private static final Pattern NOISE =
            Pattern.compile("=1|\\<?[a-z]+:|\\>|:|=|\\(|\\)|\\s*\\d+\\. +");

    /** Step 2: bracketed feature lists containing whitespace that must not act as a column break. */
    private static final Pattern SPACED_FEATURES = Pattern.compile(
            "(\\[\\s+\\w+\\]|\\[([\\w\\-\\s/]+\\s+)+[\\w\\-/]+\\])|(\\[([\\w\\s\\-/]+,\\s)+([\\w\\s\\-/])+\\])");

    /** Step 3: one or more spaces separate two output columns. */
    private static final Pattern COLUMN_GAP = Pattern.compile(" +");

    /**
     * Temporary stand-in for a space inside a feature list. A control character is used
     * instead of {@code _} so that underscores already present in the data survive the
     * protect/restore round trip unchanged. Assumes the input never contains U+0001.
     */
    private static final char SPACE_GUARD = '\u0001';

    /**
     * Reads the rules file, skips its first two lines, and writes a header line followed by
     * one pipe-delimited line per remaining input line. Each converted line is also echoed
     * to standard output.
     *
     * @param args {@code args[0]} path of the rules file to read, {@code args[1]} path of the
     *             file to write, {@code args[2]} category name used as the prefix of the two
     *             item-set column names in the header
     * @throws IOException if the input cannot be read or the output cannot be written
     * @throws IllegalArgumentException if fewer than three arguments are supplied
     */
    public static void main(String[] args) throws IOException {
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: ProcRules <rules-file> <output-file> <category>");
        }
        String rules = args[0];
        String out = args[1];
        String category = args[2];

        // NOTE(review): "Support_1 " carries a trailing space. It looks accidental, but it is
        // kept byte-for-byte because downstream consumers may already key on this column name.
        String header = category + "_1" + "|" + "Support_1 " + "|" + category + "_2" + "|" + "Support_2" + "|"
                + "Confidence" + "|" + "Lift" + "|" + "Leverage" + "|" + "Conviction" + "\n";

        // try-with-resources closes (and flushes) both streams even when a read or write fails.
        // The reader is opened first so a missing input file does not create an empty output file.
        try (BufferedReader br = new BufferedReader(new FileReader(new File(rules)));
             BufferedWriter bw = new BufferedWriter(new FileWriter(new File(out)))) {

            bw.write(header);

            // The first two input lines are not rules and are discarded.
            br.readLine();
            br.readLine();

            String line;
            while ((line = br.readLine()) != null) {
                String procline = processLine(line);
                System.out.println(procline);
                bw.write(procline + "\n");
            }
        }
    }

    /** Converts one raw rule line into its pipe-delimited form (steps 1-4 of the class comment). */
    private static String processLine(String line) {
        // Step 1: remove decoration and the rule counter.
        String cleaned = NOISE.matcher(line).replaceAll("");

        // Step 2: swap spaces inside matched feature lists for the guard character.
        // StringBuffer (not StringBuilder) keeps appendReplacement usable on Java 8.
        Matcher features = SPACED_FEATURES.matcher(cleaned);
        StringBuffer guarded = new StringBuffer();
        while (features.find()) {
            String protectedText = features.group().replace(' ', SPACE_GUARD);
            features.appendReplacement(guarded, Matcher.quoteReplacement(protectedText));
        }
        features.appendTail(guarded);

        // Step 3: every remaining run of spaces is a column boundary.
        String[] parts = COLUMN_GAP.split(guarded.toString());
        String joined = String.join("|", parts);

        // Step 4: bring back the spaces that belong to the feature lists.
        return joined.replace(SPACE_GUARD, ' ');
    }
}