[c09aa8]: / scripts / pdrscraping.java

Download this file

140 lines (138 with data), 5.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import java.util.*;
import java.io.*;
import java.net.*;
/**
 * One-off scraper for pdr.net drug-information pages.
 *
 * <p>{@link #main} downloads the label pages for a fixed range of
 * {@code druglabelid} values, follows each page's "Drug Summary" link, pulls the
 * drug name, indications and contraindications out of the summary HTML, and
 * writes one CSV row per summary page to {@code pdrdata1901-2682.csv}. Drug
 * names are mapped to ids using {@code cleanedcsv.csv} (column 0 = id,
 * column 1 = name).
 */
public class pdrscraping {
    private static final String LABEL_URL_PREFIX = "http://www.pdr.net/drug-information/?druglabelid=";
    private static final int FIRST_LABEL_ID = 1901;
    private static final int LAST_LABEL_ID = 2682;

    private static final String LINK_PREFIX = "<li><a href=\"";
    private static final String SUMMARY_MARKER = "title=\"Drug Summary\"";
    private static final String CONTRA_HEADING =
            "<h3 class=\"drugSummary\">CONTRAINDICATIONS / PRECAUTIONS</h3>";
    private static final String STRONG_OPEN = "<strong>";
    private static final String STRONG_CLOSE = "</strong>";

    /**
     * Runs the whole scrape: load the name-to-id table, fetch label pages, fetch
     * the linked summary pages, and write the result CSV.
     *
     * @param args unused
     * @throws Exception if the input CSV cannot be read, the output file cannot be
     *         created, or a label URL cannot be opened
     */
    public static void main(String[] args) throws Exception {
        // Load the lookup table first so a missing file fails before any network traffic.
        Map<String, String> nameToId = loadNameToId(new File("cleanedcsv.csv"));

        Set<String> labelPages = new HashSet<>();
        for (int labelId = FIRST_LABEL_ID; labelId <= LAST_LABEL_ID; labelId++) {
            String page = getURLSource(LABEL_URL_PREFIX + labelId);
            if (!page.isEmpty()) {
                labelPages.add(page);
            }
        }

        Set<String> summaryUrls = new HashSet<>();
        for (String page : labelPages) {
            String url = extractSummaryUrl(page);
            if (url != null) {
                summaryUrls.add(url);
            }
        }

        Set<String> summaryPages = new HashSet<>();
        for (String url : summaryUrls) {
            try {
                String page = getURLSource(url);
                if (!page.isEmpty()) {
                    summaryPages.add(page);
                }
            } catch (IOException e) {
                // A scraped href that is not a usable URL should not abort the whole run.
                System.err.println("Skipping summary URL " + url + ": " + e);
            }
        }

        // try-with-resources so the output file is closed even if a page fails mid-run.
        try (PrintWriter out = new PrintWriter(new File("pdrdata1901-2682.csv"))) {
            for (String page : summaryPages) {
                String row = buildRow(page, nameToId);
                if (row != null) {
                    out.println(row);
                }
            }
        }
    }

    /**
     * Reads the id/name CSV and returns a map from lower-cased name (column 1) to
     * id (column 0). Rows with fewer than two columns are ignored.
     */
    private static Map<String, String> loadNameToId(File csv) throws FileNotFoundException {
        Map<String, String> nameToId = new HashMap<>();
        try (Scanner in = new Scanner(csv)) {
            while (in.hasNextLine()) {
                String[] fields = parse(in.nextLine());
                if (fields.length > 1) {
                    nameToId.put(fields[1].toLowerCase(Locale.ROOT), fields[0]);
                }
            }
        }
        return nameToId;
    }

    /**
     * Returns the href of the list link that carries the "Drug Summary" title, or
     * {@code null} when the page has no such link.
     */
    private static String extractSummaryUrl(String page) {
        int marker = page.indexOf(SUMMARY_MARKER);
        if (marker < 0) {
            return null;
        }
        // The link we want is the closest "<li><a href=" that starts before the marker.
        int anchor = page.lastIndexOf(LINK_PREFIX, marker);
        if (anchor < 0) {
            return null;
        }
        int start = anchor + LINK_PREFIX.length();
        // The marker is preceded by the href's closing quote and one more character.
        int end = marker - 2;
        if (end <= start) {
            return null;
        }
        return page.substring(start, end);
    }

    /**
     * Builds one output row ({@code id,name,"indications","contraindications"}) for
     * a summary page, or returns {@code null} when the drug name cannot be located.
     */
    private static String buildRow(String page, Map<String, String> nameToId) {
        String name = extractName(page);
        if (name == null) {
            System.err.println("Skipping summary page: title/dose markers not found");
            return null;
        }
        String id = lookupId(name, nameToId);
        // Only "<strong>For..." spans count as indications; the final character before
        // the closing tag is dropped, as in the original extraction.
        String indications = collectStrong(page, STRONG_OPEN + "For", 0, Integer.MAX_VALUE, 1);
        String contraindications = extractContraindications(page);
        // NOTE(review): fields are written without escaping embedded quotes or commas.
        return id + "," + name + ",\"" + indications + "\",\"" + contraindications + "\"";
    }

    /**
     * Extracts the drug name from the page title: the text between the title tag
     * and the first following "dose", keeping only what comes after the first "(".
     * Returns {@code null} when either marker is missing or out of order.
     */
    private static String extractName(String page) {
        int title = page.indexOf("<title>");
        if (title < 0) {
            return null;
        }
        // Offset 8 is one past the 7-character tag; presumably skips a leading
        // character in the title text — TODO confirm against a live page.
        int start = title + 8;
        int dose = page.indexOf("dose", title);
        if (dose < 0 || dose - 2 < start) {
            return null;
        }
        String drugName = page.substring(start, dose - 2);
        return drugName.substring(drugName.indexOf("(") + 1);
    }

    /**
     * Resolves a drug name to an id. Tries progressively longer suffixes of the
     * space-separated words (joined without spaces); if none matches and the first
     * word is a "a/b" combination, concatenates the ids of its first two components.
     * Returns the empty string when nothing matches.
     */
    private static String lookupId(String name, Map<String, String> nameToId) {
        String[] parts = name.split(" ");
        String id = "";
        String suffix = "";
        for (int i = parts.length - 1; i >= 0 && id.isEmpty(); i--) {
            suffix = parts[i] + suffix;
            if (nameToId.containsKey(suffix)) {
                id = nameToId.get(suffix);
            }
        }
        // split() can return an empty array (all-space name) or a single element
        // ("a/"), so both lengths are checked before indexing.
        if (id.isEmpty() && parts.length > 0 && parts[0].contains("/")) {
            String[] components = parts[0].split("/");
            for (int i = 0; i < components.length && i < 2; i++) {
                if (nameToId.containsKey(components[i])) {
                    id = id + nameToId.get(components[i]);
                }
            }
        }
        return id;
    }

    /**
     * Collects the bold entries under the CONTRAINDICATIONS / PRECAUTIONS heading,
     * up to the first {@code " </div"} after the first entry. Returns the empty
     * string when the heading is absent, rather than scanning from the page start.
     */
    private static String extractContraindications(String page) {
        int heading = page.indexOf(CONTRA_HEADING);
        if (heading < 0) {
            return "";
        }
        int firstStrong = page.indexOf(STRONG_OPEN, heading);
        if (firstStrong < 0) {
            return "";
        }
        int sectionEnd = page.indexOf(" </div", firstStrong);
        return collectStrong(page, STRONG_OPEN, firstStrong, sectionEnd, 0);
    }

    /**
     * Concatenates the text of every {@code openTag ... </strong>} span that starts
     * in {@code [from, limit)}, each prefixed with {@code "***"}.
     *
     * @param trimTail number of characters to drop from the end of each span's text
     */
    private static String collectStrong(String page, String openTag, int from, int limit, int trimTail) {
        StringBuilder joined = new StringBuilder();
        int start = page.indexOf(openTag, from);
        while (start >= 0 && start < limit) {
            int close = page.indexOf(STRONG_CLOSE, start + 1);
            if (close < 0) {
                // Unterminated tag: nothing after this point can be closed either.
                break;
            }
            joined.append("***").append(page, start + STRONG_OPEN.length(), close - trimTail);
            start = page.indexOf(openTag, start + 1);
        }
        return joined.toString();
    }

    /**
     * Splits one CSV line into fields. Commas inside double quotes do not split;
     * the quote characters themselves are dropped from the output.
     *
     * @param in the line to split
     * @return every field of the line, including the last one; never contains
     *         {@code null}, and always has at least one element
     */
    static String[] parse(String in) {
        List<String> fields = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        boolean inQuotes = false;
        for (int i = 0; i < in.length(); i++) {
            char c = in.charAt(i);
            if (c == '"') {
                inQuotes = !inQuotes;
            } else if (c == ',' && !inQuotes) {
                fields.add(current.toString());
                current.setLength(0);
            } else {
                current.append(c);
            }
        }
        // The final field has no trailing comma, so it must be flushed explicitly.
        fields.add(current.toString());
        return fields.toArray(new String[fields.size()]);
    }

    /**
     * Downloads a URL with a browser-like User-Agent and returns the body with all
     * line terminators removed.
     *
     * @param url the address to fetch
     * @return the page text, or the empty string if the content could not be read
     * @throws IOException if {@code url} is malformed or the connection cannot be opened
     */
    public static String getURLSource(String url) throws IOException {
        URL urlObject = new URL(url);
        URLConnection urlConnection = urlObject.openConnection();
        urlConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        try {
            return toString(urlConnection.getInputStream());
        } catch (IOException | RuntimeException e) {
            // Best effort: a page that cannot be read is reported and treated as empty.
            System.err.println("Could not read " + url + ": " + e);
            return "";
        }
    }

    /**
     * Reads the stream as UTF-8 text and returns its lines concatenated with no
     * separator. The stream is closed when reading finishes.
     */
    private static String toString(InputStream inputStream) throws IOException {
        try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
            StringBuilder stringBuilder = new StringBuilder();
            String inputLine;
            while ((inputLine = bufferedReader.readLine()) != null) {
                stringBuilder.append(inputLine);
            }
            return stringBuilder.toString();
        }
    }
}