|
a |
|
b/scripts/pdrscraping.java |
|
|
1 |
import java.util.*; |
|
|
2 |
import java.io.*; |
|
|
3 |
import java.net.*; |
|
|
4 |
/**
 * One-shot scraper for pdr.net drug pages.
 *
 * <p>The program fetches the drug-information page for every label id in a fixed range,
 * follows the "Drug Summary" link found on each page, and writes one CSV row per summary
 * page containing: an id looked up in {@code cleanedcsv.csv}, the drug name taken from the
 * page title, the "For ..." indication headings and the contraindication headings.
 *
 * <p>All extraction is done with fixed string markers and fixed character offsets, so it is
 * tied to the exact markup the pages had when this was written. Pages that do not contain
 * the expected markers are skipped instead of aborting the run.
 */
public class pdrscraping {

    /** First and last {@code druglabelid} values requested (inclusive). */
    private static final int FIRST_LABEL_ID = 1901;
    private static final int LAST_LABEL_ID = 2682;

    private static final String LABEL_URL_PREFIX = "http://www.pdr.net/drug-information/?druglabelid=";
    private static final String LOOKUP_CSV = "cleanedcsv.csv";
    private static final String OUTPUT_CSV = "pdrdata1901-2682.csv";

    private static final String SUMMARY_MARKER = "title=\"Drug Summary\"";
    private static final String ANCHOR_OPEN = "<li><a href=\"";
    private static final String TITLE_OPEN = "<title>";
    private static final String STRONG_OPEN = "<strong>";
    private static final String STRONG_CLOSE = "</strong>";
    private static final String INDICATION_OPEN = "<strong>For";
    private static final String CONTRA_HEADING =
            "<h3 class=\"drugSummary\">CONTRAINDICATIONS / PRECAUTIONS</h3>";
    private static final String SECTION_END = " </div";
    private static final String ENTRY_SEPARATOR = "***";

    /**
     * Runs the whole scrape and writes {@code pdrdata1901-2682.csv} in the working directory.
     *
     * @param args ignored
     * @throws Exception if the lookup CSV cannot be read, the output file cannot be written,
     *                   or a label page URL cannot be opened
     */
    public static void main(String[] args) throws Exception {
        // Read the local lookup table before any network work so that a missing or unreadable
        // file is reported immediately rather than after hundreds of requests.
        Map<String, String> nameToId = loadNameToId(new File(LOOKUP_CSV));

        Set<String> labelPages = new HashSet<>();
        for (int labelId = FIRST_LABEL_ID; labelId <= LAST_LABEL_ID; labelId++) {
            String page = getURLSource(LABEL_URL_PREFIX + labelId);
            if (!page.isEmpty()) {
                labelPages.add(page);
            }
        }

        Set<String> summaryUrls = new HashSet<>();
        for (String page : labelPages) {
            String url = extractSummaryUrl(page);
            if (url != null) {
                summaryUrls.add(url);
            }
        }

        Set<String> summaryPages = new HashSet<>();
        for (String url : summaryUrls) {
            String page;
            try {
                page = getURLSource(url);
            } catch (MalformedURLException e) {
                // The URL was cut out of scraped markup; one bad link must not end the run.
                System.err.println("Skipping malformed summary URL: " + url);
                continue;
            }
            if (!page.isEmpty()) {
                summaryPages.add(page);
            }
        }

        try (PrintWriter out = new PrintWriter(new File(OUTPUT_CSV))) {
            for (String page : summaryPages) {
                String row = buildRow(page, nameToId);
                if (row != null) {
                    out.println(row);
                }
            }
            // PrintWriter never throws on write failure; surface it explicitly.
            if (out.checkError()) {
                throw new IOException("Failed writing " + OUTPUT_CSV);
            }
        }
    }

    /** Builds the lower-cased-name to id map from columns 1 and 0 of the lookup CSV. */
    private static Map<String, String> loadNameToId(File csv) throws FileNotFoundException {
        Map<String, String> nameToId = new HashMap<>();
        try (Scanner in = new Scanner(csv)) {
            while (in.hasNextLine()) {
                String[] fields = parse(in.nextLine());
                if (fields.length > 1) {
                    nameToId.put(fields[1].toLowerCase(Locale.ROOT), fields[0]);
                }
            }
        }
        return nameToId;
    }

    /**
     * Returns the href of the list-item link that carries the "Drug Summary" title attribute,
     * or {@code null} when the page has no such link.
     */
    private static String extractSummaryUrl(String page) {
        int marker = page.indexOf(SUMMARY_MARKER);
        if (marker < 0) {
            return null;
        }
        // Nearest list-item anchor that starts at or before the marker.
        int anchor = page.lastIndexOf(ANCHOR_OPEN, marker);
        if (anchor < 0) {
            return null;
        }
        int start = anchor + ANCHOR_OPEN.length();
        // The two characters before the marker are dropped from the href; presumably the
        // closing quote and a space -- confirm against a live page.
        int end = marker - 2;
        if (start >= end) {
            return null;
        }
        return page.substring(start, end);
    }

    /** Produces the CSV row for one summary page, or {@code null} if no name can be extracted. */
    private static String buildRow(String page, Map<String, String> nameToId) {
        String name = extractName(page);
        if (name == null) {
            return null;
        }
        String id = resolveId(name, nameToId);
        String indications = collectIndications(page);
        String contraindications = collectContraindications(page);
        // Row layout is unchanged: id and name are written bare, the two lists are quoted.
        return id + "," + name + ",\"" + indications + "\"" + ",\"" + contraindications + "\"";
    }

    /**
     * Extracts the drug name from the page title: the text between the title tag and the first
     * following occurrence of "dose", reduced to whatever follows the first "(" if there is one.
     */
    private static String extractName(String page) {
        int title = page.indexOf(TITLE_OPEN);
        if (title < 0) {
            return null;
        }
        // Search from the title onwards so an earlier "dose" elsewhere in the page is ignored.
        int dose = page.indexOf("dose", title);
        if (dose < 0) {
            return null;
        }
        // Offsets kept from the original: skip the 7-character tag plus one more character,
        // and stop two characters before "dose".
        int start = title + 8;
        int end = dose - 2;
        if (start > end) {
            return null;
        }
        String titleText = page.substring(start, end);
        return titleText.substring(titleText.indexOf("(") + 1);
    }

    /**
     * Looks up an id for a drug name. Progressively longer suffixes of the space-separated name
     * are tried (words are joined without spaces, as before); if none matches and the first word
     * contains "/", the ids of its first two components are concatenated.
     *
     * @return the id, or an empty string when nothing matches
     */
    private static String resolveId(String name, Map<String, String> nameToId) {
        String[] words = name.split(" ");
        String suffix = "";
        for (int i = words.length - 1; i >= 0; i--) {
            suffix = words[i] + suffix;
            String id = lookup(nameToId, suffix);
            if (id != null && !id.isEmpty()) {
                return id;
            }
        }

        StringBuilder combined = new StringBuilder();
        if (words.length > 0 && words[0].contains("/")) {
            String[] components = words[0].split("/");
            // NOTE(review): only the first two components are consulted, as in the original.
            for (int i = 0; i < components.length && i < 2; i++) {
                String id = lookup(nameToId, components[i]);
                if (id != null) {
                    combined.append(id);
                }
            }
        }
        return combined.toString();
    }

    /** Map lookup that lower-cases the key the same way the map keys were lower-cased. */
    private static String lookup(Map<String, String> nameToId, String key) {
        return nameToId.get(key.toLowerCase(Locale.ROOT));
    }

    /**
     * Concatenates the text of every strong element whose text starts with "For", each entry
     * prefixed with "***" and with its last character dropped.
     */
    private static String collectIndications(String page) {
        StringBuilder result = new StringBuilder();
        int open = page.indexOf(INDICATION_OPEN);
        while (open >= 0) {
            int close = page.indexOf(STRONG_CLOSE, open + 1);
            if (close < 0) {
                break; // unterminated tag: no later entry can be terminated either
            }
            result.append(ENTRY_SEPARATOR).append(page, open + STRONG_OPEN.length(), close - 1);
            open = page.indexOf(INDICATION_OPEN, open + 1);
        }
        return result.toString();
    }

    /**
     * Concatenates the text of every strong element between the CONTRAINDICATIONS / PRECAUTIONS
     * heading and the next {@code " </div"}, each entry prefixed with "***". Returns an empty
     * string when the heading is absent.
     */
    private static String collectContraindications(String page) {
        int heading = page.indexOf(CONTRA_HEADING);
        if (heading < 0) {
            // Without this guard the scan would start at the top of the page and pick up
            // unrelated strong elements.
            return "";
        }
        StringBuilder result = new StringBuilder();
        int open = page.indexOf(STRONG_OPEN, heading);
        int sectionEnd = page.indexOf(SECTION_END, open);
        while (open >= 0 && open < sectionEnd) {
            int close = page.indexOf(STRONG_CLOSE, open + 1);
            if (close < 0) {
                break;
            }
            result.append(ENTRY_SEPARATOR).append(page, open + STRONG_OPEN.length(), close);
            open = page.indexOf(STRONG_OPEN, open + 1);
        }
        return result.toString();
    }

    /**
     * Splits one CSV line into fields.
     *
     * <p>Commas outside double quotes separate fields. Double-quote characters toggle the quoted
     * state and are not copied into the field. Every field, including the last one, is returned;
     * an unterminated quote simply extends the final field to the end of the line.
     *
     * @param in the line to split; must not be {@code null}
     * @return the fields in order; always at least one element
     */
    static String[] parse(String in) {
        List<String> fields = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        boolean inQuotes = false;
        for (int i = 0; i < in.length(); i++) {
            char c = in.charAt(i);
            if (c == '"') {
                inQuotes = !inQuotes;
            } else if (c == ',' && !inQuotes) {
                fields.add(current.toString());
                current.setLength(0);
            } else {
                current.append(c);
            }
        }
        // The text after the final separator is a field too.
        fields.add(current.toString());
        return fields.toArray(new String[0]);
    }

    /**
     * Downloads a URL with a browser-like User-Agent and returns its body as one string.
     *
     * <p>No connect or read timeout is configured.
     *
     * @param url the address to fetch
     * @return the body with line terminators removed, or an empty string if reading fails
     * @throws IOException if the URL is malformed or the connection object cannot be created
     */
    public static String getURLSource(String url) throws IOException {
        URL urlObject = new URL(url);
        URLConnection connection = urlObject.openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        try {
            return toString(connection.getInputStream());
        } catch (Exception ignored) {
            // Deliberately best-effort: a page that cannot be read is reported as empty and
            // filtered out by the caller.
            return "";
        }
    }

    /**
     * Reads the stream as UTF-8 text and returns all lines concatenated without separators.
     * The stream is closed before returning.
     */
    private static String toString(InputStream inputStream) throws IOException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line);
            }
            return text.toString();
        }
    }
}