Diff of /scripts/pdrscraping.java [000000] .. [c09aa8]

Switch to unified view

a b/scripts/pdrscraping.java
1
import java.util.*;
2
import java.io.*;
3
import java.net.*;
4
/**
 * Command-line scraper that walks a fixed range of {@code druglabelid} pages on
 * www.pdr.net, follows each page's "Drug Summary" link, and writes one CSV row per
 * distinct summary page to {@code pdrdata1901-2682.csv}.
 *
 * <p>Each row contains: an id looked up in {@code cleanedcsv.csv} (column 1 = name,
 * column 0 = id), the drug name taken from the page {@code <title>}, the text of every
 * {@code <strong>For...</strong>} element, and the text of every {@code <strong>}
 * element in the "CONTRAINDICATIONS / PRECAUTIONS" section. Multiple texts within one
 * column are each prefixed with {@code ***}.
 *
 * <p>All extraction is done with plain substring searches on the page source with line
 * breaks removed (see {@link #toString(InputStream)}); pages that do not match the
 * expected layout are skipped with a message on standard error instead of aborting.
 */
public class pdrscraping {
  private static final int FIRST_LABEL_ID = 1901;
  private static final int LAST_LABEL_ID = 2682;
  private static final String LABEL_URL_PREFIX = "http://www.pdr.net/drug-information/?druglabelid=";
  private static final String NAME_CSV = "cleanedcsv.csv";
  private static final String OUTPUT_CSV = "pdrdata1901-2682.csv";

  private static final String LINK_PREFIX = "<li><a href=\"";
  private static final String SUMMARY_MARKER = "title=\"Drug Summary\"";
  private static final String STRONG_OPEN = "<strong>";
  private static final String STRONG_CLOSE = "</strong>";
  private static final String CONTRA_HEADING =
      "<h3 class=\"drugSummary\">CONTRAINDICATIONS / PRECAUTIONS</h3>";
  private static final String ENTRY_PREFIX = "***";

  /** Upper bound for connecting and for each read, so one stalled request cannot hang the run. */
  private static final int TIMEOUT_MILLIS = 30000;

  /**
   * Runs the scrape end to end: load the name-to-id table, fetch the label pages, resolve
   * and fetch the summary pages, then write the CSV.
   *
   * @param args ignored
   * @throws Exception if the name CSV cannot be read or the output file cannot be written
   */
  public static void main(String[] args) throws Exception {
    // Loaded first so a missing input file fails before any network traffic is generated.
    Map<String, String> nameToId = loadNameToId(NAME_CSV);

    // Insertion-ordered sets keep the output order reproducible (label-id order) while
    // still de-duplicating identical pages and URLs.
    Set<String> rawSources = new LinkedHashSet<>();
    for (int labelId = FIRST_LABEL_ID; labelId <= LAST_LABEL_ID; labelId++) {
      String page = getURLSource(LABEL_URL_PREFIX + labelId);
      if (!page.isEmpty()) {
        rawSources.add(page);
      }
    }

    Set<String> summaryUrls = new LinkedHashSet<>();
    for (String source : rawSources) {
      String url = extractSummaryUrl(source);
      if (url != null) {
        summaryUrls.add(url);
      }
    }

    Set<String> summarySources = new LinkedHashSet<>();
    for (String url : summaryUrls) {
      String page;
      try {
        page = getURLSource(url);
      } catch (MalformedURLException e) {
        // The href is taken verbatim from the page; a relative or garbled one is skipped.
        System.err.println("Skipping unusable summary URL '" + url + "': " + e);
        continue;
      }
      if (!page.isEmpty()) {
        summarySources.add(page);
      }
    }

    try (PrintWriter out = new PrintWriter(new File(OUTPUT_CSV))) {
      for (String source : summarySources) {
        String row = buildRow(source, nameToId);
        if (row == null) {
          System.err.println("Skipping summary page without a recognisable <title>...dose header");
        } else {
          out.println(row);
        }
      }
      // PrintWriter never throws on write failure; surface it explicitly.
      if (out.checkError()) {
        throw new IOException("Failed writing " + OUTPUT_CSV);
      }
    }
  }

  /**
   * Reads the name-to-id table: for every line with at least two fields, maps the
   * lower-cased second field to the first field.
   */
  private static Map<String, String> loadNameToId(String path) throws FileNotFoundException {
    Map<String, String> nameToId = new HashMap<>();
    try (Scanner in = new Scanner(new File(path))) {
      while (in.hasNextLine()) {
        String[] fields = parse(in.nextLine());
        if (fields.length > 1) {
          nameToId.put(fields[1].toLowerCase(Locale.ROOT), fields[0]);
        }
      }
    }
    return nameToId;
  }

  /**
   * Returns the href of the last {@code <li><a href="} that starts at or before the
   * "Drug Summary" title attribute, or {@code null} when the page has no such link.
   */
  private static String extractSummaryUrl(String source) {
    int marker = source.indexOf(SUMMARY_MARKER);
    if (marker < 0) {
      return null;
    }
    int link = source.lastIndexOf(LINK_PREFIX, marker);
    if (link < 0) {
      return null;
    }
    int start = link + LINK_PREFIX.length();
    // The two characters before the marker are skipped; presumably the href's closing
    // quote and the following space - verify against a live page.
    int end = marker - 2;
    return end > start ? source.substring(start, end) : null;
  }

  /**
   * Builds the CSV row for one summary page, or returns {@code null} when the
   * {@code <title>} / "dose" anchors used to locate the drug name are missing.
   */
  private static String buildRow(String source, Map<String, String> nameToId) {
    int title = source.indexOf("<title>");
    if (title < 0) {
      return null;
    }
    // Search from the title onwards so an earlier "dose" elsewhere in the page is ignored.
    int dose = source.indexOf("dose", title);
    // Offsets kept from the original extraction: one character beyond "<title>" is
    // skipped at the front and the two characters before "dose" are dropped at the back.
    int nameStart = title + 8;
    int nameEnd = dose - 2;
    if (dose < 0 || nameEnd < nameStart) {
      return null;
    }
    String drugName = source.substring(nameStart, nameEnd);
    // Keep only what follows the first '('; the whole text when there is none.
    String name = drugName.substring(drugName.indexOf('(') + 1);

    String id = lookupId(name, nameToId);
    String indications = collectStrongTexts(source, "<strong>For", 0, source.length(), 1);

    String contraindications = "";
    int heading = source.indexOf(CONTRA_HEADING);
    if (heading >= 0) {
      // Without this guard a missing heading (-1) would make the search start at the top
      // of the page and pick up unrelated <strong> elements.
      int firstStrong = source.indexOf(STRONG_OPEN, heading);
      int sectionEnd = source.indexOf(" </div", firstStrong);
      contraindications = collectStrongTexts(source, STRONG_OPEN, heading, sectionEnd, 0);
    }

    return csvField(id, false) + "," + csvField(name, false) + ","
        + csvField(indications, true) + "," + csvField(contraindications, true);
  }

  /**
   * Resolves an id for {@code name}. Tries progressively longer suffixes of the
   * space-separated words; if none match and the first word contains '/', concatenates
   * the ids of its first two '/'-separated components. Returns "" when nothing matches.
   */
  private static String lookupId(String name, Map<String, String> nameToId) {
    String[] parts = name.split(" ");
    String id = "";
    String key = "";
    for (int i = parts.length - 1; i >= 0 && id.isEmpty(); i--) {
      // NOTE(review): words are joined WITHOUT a space, as in the original code. This only
      // matches multi-word names if cleanedcsv.csv stores them space-less - confirm.
      key = parts[i] + key;
      String hit = nameToId.get(key.toLowerCase(Locale.ROOT));
      if (hit != null) {
        id = hit;
      }
    }
    if (id.isEmpty() && parts.length > 0 && parts[0].contains("/")) {
      String[] components = parts[0].split("/");
      StringBuilder combined = new StringBuilder();
      for (int i = 0; i < components.length && i < 2; i++) {
        String hit = nameToId.get(components[i].toLowerCase(Locale.ROOT));
        if (hit != null) {
          combined.append(hit);
        }
      }
      id = combined.toString();
    }
    return id;
  }

  /**
   * Concatenates, each prefixed with {@code ***}, the text between every occurrence of
   * {@code marker} (which must begin with {@code <strong>}) found in
   * {@code [from, limit)} and its following {@code </strong>}, dropping the last
   * {@code tailTrim} characters of each text. Stops at the first unterminated element.
   */
  private static String collectStrongTexts(
      String source, String marker, int from, int limit, int tailTrim) {
    StringBuilder joined = new StringBuilder();
    int at = source.indexOf(marker, from);
    while (at >= 0 && at < limit) {
      int close = source.indexOf(STRONG_CLOSE, at + 1);
      if (close < 0) {
        break;
      }
      joined.append(ENTRY_PREFIX).append(source, at + STRONG_OPEN.length(), close - tailTrim);
      at = source.indexOf(marker, at + 1);
    }
    return joined.toString();
  }

  /**
   * Formats one CSV field. The field is wrapped in double quotes when {@code alwaysQuote}
   * is set or when it contains a comma or quote; embedded quotes are doubled.
   */
  private static String csvField(String value, boolean alwaysQuote) {
    boolean quote = alwaysQuote || value.indexOf(',') >= 0 || value.indexOf('"') >= 0;
    if (!quote) {
      return value;
    }
    return "\"" + value.replace("\"", "\"\"") + "\"";
  }

  /**
   * Splits one CSV line on commas that are outside double quotes. Quote characters are
   * removed from the returned fields and are not otherwise interpreted.
   *
   * @param in the line to split; must not be {@code null}
   * @return the fields in order; always at least one element, never containing {@code null}
   */
  static String[] parse(String in) {
    List<String> fields = new ArrayList<>();
    StringBuilder current = new StringBuilder();
    boolean inQuotes = false;
    for (int i = 0; i < in.length(); i++) {
      char c = in.charAt(i);
      if (c == '"') {
        inQuotes = !inQuotes;
      } else if (!inQuotes && c == ',') {
        fields.add(current.toString());
        current.setLength(0);
      } else {
        current.append(c);
      }
    }
    // The text after the last separator is a field too.
    fields.add(current.toString());
    return fields.toArray(new String[fields.size()]);
  }

  /**
   * Fetches {@code url} with a browser-like User-Agent and returns the response body with
   * line breaks removed.
   *
   * @param url absolute URL to fetch
   * @return the page text, or "" if the request or the read fails
   * @throws IOException if {@code url} is malformed or the connection object cannot be created
   */
  public static String getURLSource(String url) throws IOException {
    URL urlObject = new URL(url);
    URLConnection urlConnection = urlObject.openConnection();
    urlConnection.setConnectTimeout(TIMEOUT_MILLIS);
    urlConnection.setReadTimeout(TIMEOUT_MILLIS);
    urlConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
    try {
      return toString(urlConnection.getInputStream());
    } catch (Exception e) {
      // Deliberately best-effort: callers treat "" as "page unavailable". The failure is
      // reported so that missing rows in the output can be traced.
      System.err.println("Failed to fetch " + url + ": " + e);
      return "";
    }
  }

  /**
   * Reads the stream as UTF-8 text and returns all lines concatenated with their line
   * terminators dropped; the stream is closed afterwards.
   */
  private static String toString(InputStream inputStream) throws IOException {
    try (BufferedReader bufferedReader =
        new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
      StringBuilder stringBuilder = new StringBuilder();
      String inputLine;
      while ((inputLine = bufferedReader.readLine()) != null) {
        stringBuilder.append(inputLine);
      }
      return stringBuilder.toString();
    }
  }
}