Diff of /PTAB_download_briefs.py [000000] .. [8c057c]

Switch to unified view

a b/PTAB_download_briefs.py
1
# pip install pdfplumber -q
2
3
import pandas as pd
4
import itertools
5
from pandas import DataFrame
6
import requests
7
import io
8
import pdfplumber
9
import sys
10
11
# Command-line arguments (sys.argv[0] is the script name):
#   infile   - CSV file with a header row; only its first column is read
#   outfile  - path handed to DataFrame.to_csv at the end of the script
#   startind - first index in Proceedings to process
#   endind   - last index in Proceedings to process, inclusive; -1 means
#              "go to the end" (0 is still accepted with the same meaning,
#              as in earlier versions of this script)
#   onlydo   - optional; 'Petitions', 'Responses', or 'Decisions' to fetch
#              only that kind of document. When omitted, all three are fetched.
infile = sys.argv[1]
outfile = sys.argv[2]
startind = int(sys.argv[3])
endind = int(sys.argv[4])
try:
    onlydo = sys.argv[5]
except IndexError:
    onlydo = False    # sentinel: the rest of the script tests `onlydo is False`

# Reject a misspelled kind up front. Otherwise no section of the main loop
# matches, nothing is downloaded, and the script only fails at the very end.
if onlydo is not False and onlydo not in ('Petitions', 'Responses', 'Decisions'):
    sys.exit(f"onlydo must be 'Petitions', 'Responses', or 'Decisions', not {onlydo!r}")

# Keep only the first column, then cut it down to [startind, endind].
Proceedings = pd.read_csv(infile)
Proceedings = Proceedings[Proceedings.columns[0]].to_list()
if endind in (-1, 0):
    # -1 is the documented "to the end" value; it used to produce an empty
    # slice because only 0 was tested.
    Proceedings = Proceedings[startind:]
else:
    Proceedings = Proceedings[startind:endind + 1]
26
27
28
def extractPDFText(pdfFile, max_pages=31):
    """Return the text of the leading pages of an open PDF as one string.

    Args:
        pdfFile: Object exposing a ``pages`` sequence whose items provide an
            ``extract_text()`` method (the script passes the object returned
            by ``pdfplumber.open``).
        max_pages: Maximum number of pages, counted from the first page, to
            read. The default of 31 matches the previous hard-coded limit.

    Returns:
        The extracted page texts concatenated in page order with every
        newline character removed. Pages that yield no text, or whose
        extraction raises, contribute nothing. An empty document gives ''.
    """
    pages = pdfFile.pages
    page_count = min(len(pages), max_pages)

    chunks = []
    for index in range(page_count):
        try:
            text = pages[index].extract_text()
        except Exception:
            # Best effort: one unreadable page must not lose the whole brief.
            continue
        if text:
            chunks.append(text)

    # NOTE(review): newlines are dropped without inserting a space, so words
    # on either side of a line break are joined. Kept as-is so existing
    # output does not change.
    return ''.join(chunks).replace('\n', '')
49
50
# Per-proceeding outcome for each document kind: the main loop stores either
# the extracted text or an ERROR_/WARNING_ marker string under the proceeding
# number.
Petitions, Responses, Decisions = {}, {}, {}

# Raw 'results' lists from the PTAB API queries, keyed the same way.
Petitions_json, Responses_json, Decisions_json = {}, {}, {}
56
57
for procnum in Proceedings:
    print('Processing', procnum)

    if (onlydo == 'Petitions') or (onlydo is False):
        # Query the documents filed under type 'Petition' for this proceeding.
        rdoc = requests.get(f"https://developer.uspto.gov/ptab-api/documents?documentTypeName=Petition&proceedingNumber={procnum}")
        rdoc_json = rdoc.json()    # parse the body once and reuse it
        Petitions_json[procnum] = rdoc_json['results']

        download_id = None                     # set once a single document is singled out
        status = 'ERROR_NO_RECORD_RETURNED'    # marker stored when nothing is downloaded

        if rdoc_json['recordTotalQuantity'] == 0:
            status = 'ERROR_NO_RECORDS_FOUND'
        else:
            # First choice: documents whose type is exactly 'Petition'.
            doc_id = [result['documentIdentifier']
                      for result in Petitions_json[procnum]
                      if result['documentTypeName'] == 'Petition']
            num_records = len(doc_id)
            if num_records == 0:
                # Fall back to the title, skipping rehearing-related filings.
                doc_id = [result['documentIdentifier']
                          for result in Petitions_json[procnum]
                          if 'Petition' in result['documentTitleText']
                          and 'Rehear' not in result['documentTitleText']]
                num_records = len(doc_id)
            if num_records > 1:
                # Several candidates: require both the type and the title to match.
                doc_id = [result['documentIdentifier']
                          for result in Petitions_json[procnum]
                          if result['documentTypeName'] == 'Petition'
                          and 'Petition' in result['documentTitleText']]
                num_records = len(doc_id)

            if num_records == 1:
                download_id = doc_id[0]
            elif num_records > 1:
                # Still ambiguous: look for a single 'Corrected' petition.
                num_candidates = num_records
                doc_id = [result['documentIdentifier']
                          for result in Petitions_json[procnum]
                          if result['documentTypeName'] == 'Petition'
                          and 'Petition' in result['documentTitleText']
                          and 'Corrected' in result['documentTitleText']]
                if not doc_id:
                    doc_id = [result['documentIdentifier']
                              for result in Petitions_json[procnum]
                              if 'Petition' in result['documentTitleText']
                              and 'Corrected' in result['documentTitleText']]
                num_records = len(doc_id)
                if num_records == 1:
                    download_id = doc_id[0]
                else:
                    # With no corrected version, report how many petitions were
                    # ambiguous rather than a misleading "..._RETURNED_0".
                    status = f'ERROR_TOO_MANY_RECORDS_RETURNED_{num_records or num_candidates}'
            # num_records == 0 leaves status at 'ERROR_NO_RECORD_RETURNED'

        if download_id is None:
            Petitions[procnum] = status
        else:
            doc = requests.get(f"https://developer.uspto.gov/ptab-api/documents/{download_id}/download",stream=True)
            # The context manager closes the PDF once its text has been read.
            with pdfplumber.open(io.BytesIO(doc.content)) as pdfFile:
                Petitions[procnum] = extractPDFText(pdfFile)
108
109
    if (onlydo == 'Responses') or (onlydo is False):
        # Query both the 'Response' and 'Opposition' document types and pool
        # the two result lists.
        rdoc1 = requests.get(f"https://developer.uspto.gov/ptab-api/documents?documentTypeName=Response&proceedingNumber={procnum}")
        rdoc2 = requests.get(f"https://developer.uspto.gov/ptab-api/documents?documentTypeName=Opposition&proceedingNumber={procnum}")
        rdoc1_json = rdoc1.json()    # parse each body once and reuse it
        rdoc2_json = rdoc2.json()
        Responses_json[procnum] = rdoc1_json['results'] + rdoc2_json['results']

        download_id = None                     # set once a document is chosen
        prefix = ''                            # WARNING_ prefix when the choice is the first of several
        status = 'ERROR_NO_RECORD_RETURNED'    # marker stored when nothing is downloaded

        if (rdoc1_json['recordTotalQuantity'] == 0) and (rdoc2_json['recordTotalQuantity'] == 0):
            status = 'ERROR_NO_RECORDS_FOUND'
        else:
            # First choice: the document type itself names a preliminary response.
            doc_id = [result['documentIdentifier']
                      for result in Responses_json[procnum]
                      if 'Preliminary Response' in result['documentTypeName']]
            num_records = len(doc_id)
            if num_records == 0:
                # Otherwise go by title among response/opposition/notice types.
                doc_id = [result['documentIdentifier']
                          for result in Responses_json[procnum]
                          if ('Response' in result['documentTypeName']
                              or 'Opposition' in result['documentTypeName']
                              or result['documentTypeName'] == 'Notice')
                          and 'Preliminary Response' in result['documentTitleText']
                          and 'Motion' not in result['documentTitleText']]
                num_records = len(doc_id)

            if num_records == 1:
                download_id = doc_id[0]
            elif num_records > 1:
                # Several candidates: exact type match, no motions or exhibits.
                doc_id = [result['documentIdentifier']
                          for result in Responses_json[procnum]
                          if (result['documentTypeName'] == 'Response'
                              or result['documentTypeName'] == 'Opposition')
                          and 'Preliminary Response' in result['documentTitleText']
                          and 'Motion' not in result['documentTitleText']
                          and 'Exhibit' not in result['documentTitleText']]
                num_records = len(doc_id)

                if num_records == 1:
                    # A trailing comma used to store a 1-tuple here instead of the text.
                    download_id = doc_id[0]
                elif num_records > 1:
                    # Still ambiguous: prefer a 'Corrected' preliminary response.
                    num_candidates = num_records
                    doc_id = [result['documentIdentifier']
                              for result in Responses_json[procnum]
                              if 'Preliminary Response' in result['documentTitleText']
                              and 'Corrected' in result['documentTitleText']]
                    num_records = len(doc_id)
                    if num_records > 1:
                        prefix = f'WARNING_TOO_MANY_RECORDS_RETURNED_{num_records}_'
                        download_id = doc_id[0]
                    elif num_records == 1:
                        # Previously fell through with nothing stored.
                        download_id = doc_id[0]
                    else:
                        # Previously fell through with nothing stored; record
                        # the unresolved ambiguity instead.
                        status = f'ERROR_TOO_MANY_RECORDS_RETURNED_{num_candidates}'
                else:
                    # The strict filter removed everything: relax step by step.
                    doc_id = [result['documentIdentifier']
                              for result in Responses_json[procnum]
                              if 'Preliminary Response' in result['documentTitleText']]
                    num_records = len(doc_id)
                    if num_records == 0:
                        doc_id = [result['documentIdentifier']
                                  for result in Responses_json[procnum]
                                  if 'Response' in result['documentTypeName']
                                  and 'Motion' not in result['documentTitleText']
                                  and 'Exhibit' not in result['documentTitleText']]
                        num_records = len(doc_id)
                        if num_records > 1:
                            doc_id = [result['documentIdentifier']
                                      for result in Responses_json[procnum]
                                      if 'Response' in result['documentTypeName']
                                      and 'Response' in result['documentTitleText']
                                      and 'Exhibit' not in result['documentTitleText']]
                            num_records = len(doc_id)

                    if num_records == 1:
                        download_id = doc_id[0]
                    elif num_records > 1:
                        num_candidates = num_records
                        doc_id = [result['documentIdentifier']
                                  for result in Responses_json[procnum]
                                  if 'Preliminary Response' in result['documentTitleText']
                                  and 'Corrected' in result['documentTitleText']]
                        num_records = len(doc_id)
                        if num_records == 0:
                            doc_id = [result['documentIdentifier']
                                      for result in Responses_json[procnum]
                                      if 'Response' in result['documentTypeName']
                                      and 'Response' in result['documentTitleText']
                                      and 'Exhibit' not in result['documentTitleText']]
                            num_records = len(doc_id)
                        if num_records > 1:
                            prefix = f'WARNING_TOO_MANY_RECORDS_RETURNED_{num_records}_'
                            download_id = doc_id[0]
                        elif num_records == 1:
                            download_id = doc_id[0]
                        else:
                            # Previously fell through with nothing stored.
                            status = f'ERROR_TOO_MANY_RECORDS_RETURNED_{num_candidates}'
                    # num_records == 0 leaves status at 'ERROR_NO_RECORD_RETURNED'
            # num_records == 0 leaves status at 'ERROR_NO_RECORD_RETURNED'

        # Every path above ends here, so Responses[procnum] is always assigned.
        if download_id is None:
            Responses[procnum] = status
        else:
            doc = requests.get(f"https://developer.uspto.gov/ptab-api/documents/{download_id}/download",stream=True)
            # The context manager closes the PDF once its text has been read.
            with pdfplumber.open(io.BytesIO(doc.content)) as pdfFile:
                Responses[procnum] = prefix + extractPDFText(pdfFile)
206
207
    if (onlydo == 'Decisions') or (onlydo is False):
        # Query the documents filed under type 'Decision' for this proceeding.
        rdoc = requests.get(f"https://developer.uspto.gov/ptab-api/documents?documentTypeName=Decision&proceedingNumber={procnum}")
        rdoc_json = rdoc.json()    # parse the body once and reuse it
        Decisions_json[procnum] = rdoc_json['results']

        download_id = None                     # set once a single document is singled out
        status = 'ERROR_NO_RECORD_RETURNED'    # marker stored when nothing is downloaded

        if rdoc_json['recordTotalQuantity'] == 0:
            # Nothing typed 'Decision': look among 'Notice' documents for an
            # institution decision, going by title.
            rdoc = requests.get(f"https://developer.uspto.gov/ptab-api/documents?documentTypeName=Notice&proceedingNumber={procnum}")
            # Fixes relative to the earlier version of this fallback: iterate
            # the parsed 'results' list (not the Response object), use the
            # correctly spelled 'documentIdentifier' key, and combine the
            # exclusions with `and` so that any one excluded word rejects a
            # title (with `or` they almost never rejected anything).
            doc_id = [result['documentIdentifier']
                      for result in rdoc.json()['results']
                      if 'Institution' in result['documentTitleText']
                      and 'Decision' in result['documentTitleText']
                      and 'Denying Rehearing' not in result['documentTitleText']
                      and 'Seal' not in result['documentTitleText']
                      and 'Motion' not in result['documentTitleText']]
            num_records = len(doc_id)
            if num_records == 1:
                download_id = doc_id[0]
            else:
                # Zero or several matches both use this marker, as before.
                status = 'ERROR_NO_RECORDS_FOUND'
        else:
            # First choice: documents whose type is exactly 'Institution Decision'.
            doc_id = [result['documentIdentifier']
                      for result in Decisions_json[procnum]
                      if result['documentTypeName'] == 'Institution Decision']
            num_records = len(doc_id)
            if num_records > 1:
                # Several candidates: also require 'Decision' in the title.
                doc_id = [result['documentIdentifier']
                          for result in Decisions_json[procnum]
                          if (result['documentTypeName'] == 'Institution Decision'
                              or result['documentTypeName'] == 'Decision Granting Institution'
                              or result['documentTypeName'] == 'Decision Denying Institution')
                          and 'Decision' in result['documentTitleText']]
                num_records = len(doc_id)
                if num_records > 1:
                    # Still several: drop rehearing decisions.
                    doc_id = [result['documentIdentifier']
                              for result in Decisions_json[procnum]
                              if (result['documentTypeName'] == 'Institution Decision'
                                  or result['documentTypeName'] == 'Decision Granting Institution'
                                  or result['documentTypeName'] == 'Decision Denying Institution')
                              and 'Decision' in result['documentTitleText']
                              and 'Rehearing' not in result['documentTitleText']]
                    num_records = len(doc_id)

            if num_records == 1:
                download_id = doc_id[0]
            elif num_records > 1:
                status = f'ERROR_TOO_MANY_RECORDS_RETURNED_{num_records}'
            else:
                # No match by type: fall back to the title alone.
                doc_id = [result['documentIdentifier']
                          for result in Decisions_json[procnum]
                          if 'Decision' in result['documentTitleText']
                          and 'Institution' in result['documentTitleText']]
                num_records = len(doc_id)
                if num_records == 1:
                    download_id = doc_id[0]
                elif num_records > 1:
                    num_candidates = num_records
                    doc_id = [result['documentIdentifier']
                              for result in Decisions_json[procnum]
                              if 'Decision' in result['documentTitleText']
                              and 'Institution' in result['documentTitleText']
                              and 'Rehearing' not in result['documentTitleText']]
                    num_records = len(doc_id)
                    if num_records == 1:
                        download_id = doc_id[0]
                    else:
                        # If the rehearing filter removed every candidate,
                        # report how many were ambiguous rather than a
                        # misleading "..._RETURNED_0".
                        status = f'ERROR_TOO_MANY_RECORDS_RETURNED_{num_records or num_candidates}'
                # num_records == 0 leaves status at 'ERROR_NO_RECORD_RETURNED'

        if download_id is None:
            Decisions[procnum] = status
        else:
            doc = requests.get(f"https://developer.uspto.gov/ptab-api/documents/{download_id}/download",stream=True)
            # The context manager closes the PDF once its text has been read.
            with pdfplumber.open(io.BytesIO(doc.content)) as pdfFile:
                Decisions[procnum] = extractPDFText(pdfFile)
278
                
279
# Build one row per proceeding and write the table as tab-separated text.
# With a recognised `onlydo` only that column is emitted; otherwise all three.
_results_by_column = {'Petitions': Petitions,
                      'Responses': Responses,
                      'Decisions': Decisions}
if onlydo in _results_by_column:
    _columns = [onlydo]
else:
    _columns = ['Petitions', 'Responses', 'Decisions']

# Every mode iterates Proceedings (the Responses-only mode used to iterate the
# Responses dict instead). A proceeding with no stored entry gets the file's
# existing "nothing returned" marker rather than raising KeyError, which would
# throw away everything downloaded during the run.
ResultsDF = DataFrame.from_dict(
    {key: [_results_by_column[column].get(key, 'ERROR_NO_RECORD_RETURNED')
           for column in _columns]
     for key in Proceedings},
    orient='index',
    columns=_columns)

# Turn the proceeding-number index into an ordinary 'Proceeding' column.
ResultsDF = ResultsDF.reset_index().rename(columns={'index':'Proceeding'})

ResultsDF.to_csv(outfile, sep='\t', index=False)