--- a +++ b/scripts/pdrscraping.java @@ -0,0 +1,139 @@ +import java.util.*; +import java.io.*; +import java.net.*; +public class pdrscraping { + public static void main(String[] args) throws Exception { + HashSet<String> rawsources = new HashSet<String>(); + for (int i = 1901; i <= 2682; i++) { + String k = getURLSource("http://www.pdr.net/drug-information/?druglabelid=" + i); + if (!k.equals("")) rawsources.add(k); + } + HashSet<String> summaryurls = new HashSet<String>(); + for (String source : rawsources) { + int index = source.indexOf("title=\"Drug Summary\""); + if (index >= 0) { + ArrayList<Integer> indices = new ArrayList<Integer>(); + int ind = source.indexOf("<li><a href=\""); + while (ind >= 0) { + indices.add(ind); + ind = source.indexOf("<li><a href=\"", ind+1); + } + int findex = 0; + if (indices.size() > 0) { + int prev = 0; + for (int j : indices) { + if (j > index) break; + else prev = j; + } + findex = prev; + } + summaryurls.add(source.substring(findex + 13, index-2)); + } + } + HashSet<String> summarysources = new HashSet<String>(); + for (String url : summaryurls) { + String k = getURLSource(url); + if (!k.equals("")) summarysources.add(k); + }//summarysources.add(getURLSource(url)); + Scanner in = new Scanner(new File("cleanedcsv.csv")); + HashMap<String, String> nametoid = new HashMap<String, String>(); + while (in.hasNext()) { + String[] line = parse(in.nextLine()); + if (line.length > 1 && line[1] != null) nametoid.put(line[1].toLowerCase(), line[0]); + } + PrintWriter out = new PrintWriter(new File("pdrdata1901-2682.csv")); + for (String source : summarysources) { + int index = source.indexOf("<title>"); + int lindex = source.indexOf("dose"); + String drugname = source.substring(index+8, lindex-2); + int paren = drugname.indexOf("("); + String name = drugname.substring(paren+1); + String[] parts = name.split(" "); + String id = ""; + for (int i = 0; i < parts.length && id.equals(""); i++) { + String check = ""; + for (int j = 0; j <= i; j++) check = parts[parts.length - j - 1] + check; + if (nametoid.containsKey(check)) id = nametoid.get(check); + } + if (id.equals("") && parts[0].contains("/")) { + String[] slash = parts[0].split("/"); + if (nametoid.containsKey(slash[0])) id = nametoid.get(slash[0]); + if (nametoid.containsKey(slash[1])) id = id + nametoid.get(slash[1]); + } + ArrayList<Integer> indices = new ArrayList<Integer>(); + ArrayList<Integer> endindices = new ArrayList<Integer>(); + int ind = source.indexOf("<strong>For"); + while (ind >= 0) { + indices.add(ind); + endindices.add(source.indexOf("</strong>", ind+1)); + ind = source.indexOf("<strong>For", ind+1); + } + String indications = ""; + for (int i = 0; i < indices.size(); i++) indications = indications + "***" + source.substring(indices.get(i)+8, endindices.get(i)-1); + int contraindex = source.indexOf("<h3 class=\"drugSummary\">CONTRAINDICATIONS / PRECAUTIONS</h3>"); + int contraind = source.indexOf("<strong>", contraindex); + int endind = source.indexOf(" </div", contraind); + ArrayList<Integer> contraindices = new ArrayList<Integer>(); + ArrayList<Integer> contraend = new ArrayList<Integer>(); + while (contraind >= 0 && contraind < endind) { + contraindices.add(contraind); + contraend.add(source.indexOf("</strong>", contraind+1)); + contraind = source.indexOf("<strong>", contraind+1); + } + String contraindications = ""; + for (int i = 0; i < contraindices.size(); i++) contraindications = contraindications + "***" + source.substring(contraindices.get(i)+8, contraend.get(i)); + out.println(id + "," + name + ",\"" + indications + "\"" + ",\"" + contraindications + "\""); + } + in.close(); + out.close(); + } + static String[] parse(String in) { + boolean inQuotes=false; + int size = 1; + for (int i = 0; i < in.length(); i++) { + char j = in.charAt(i); + if (j == '\"') { + inQuotes=!inQuotes; + } + if (!inQuotes && j == ',') size++; + } + String[] out = new String[size]; + String cur = ""; + int index = 0; + for (int i = 0; i < in.length(); i++) { + char j = in.charAt(i); + if (j == '\"') { + inQuotes=!inQuotes; + } + else if (!inQuotes && j == ',') { + out[index] = cur; + index++; + cur = ""; + } + else cur = cur + j; + } + return out; + } + public static String getURLSource(String url) throws IOException + { + URL urlObject = new URL(url); + URLConnection urlConnection = urlObject.openConnection(); + urlConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"); + try { + return toString(urlConnection.getInputStream());}catch(Exception e) {return "";} + } + private static String toString(InputStream inputStream) throws IOException + { + try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) + { + String inputLine; + StringBuilder stringBuilder = new StringBuilder(); + while ((inputLine = bufferedReader.readLine()) != null) + { + stringBuilder.append(inputLine); + } + + return stringBuilder.toString(); + } + } +}