|
a |
|
b/PTAB_download_briefs.py |
|
|
1 |
# pip install pdfplumber -q |
|
|
2 |
|
|
|
3 |
import pandas as pd |
|
|
4 |
import itertools |
|
|
5 |
from pandas import DataFrame |
|
|
6 |
import requests |
|
|
7 |
import io |
|
|
8 |
import pdfplumber |
|
|
9 |
import sys |
|
|
10 |
|
|
|
11 |
# Command line (sys.argv[0] is the script name):
#   PTAB_download_briefs.py INFILE OUTFILE STARTIND ENDIND [ONLYDO]
infile = sys.argv[1]  # CSV with a header row; only its first column is read
outfile = sys.argv[2]
startind = int(sys.argv[3])  # first index in Proceedings to process
endind = int(sys.argv[4])  # last index in Proceedings, inclusive (0 or -1: go to end)

# Optional fifth argument: 'Petitions', 'Responses', or 'Decisions' to fetch
# only that kind of document.  False means "fetch all three kinds".
onlydo = sys.argv[5] if len(sys.argv) > 5 else False

Proceedings = pd.read_csv(infile)
Proceedings = Proceedings[Proceedings.columns[0]].to_list()
if endind in (0, -1):
    # 0 is the sentinel the script has always honoured; -1 is the one its
    # comment documented.  Without this, -1 sliced to [startind:0], which is
    # an empty list, so accepting it as "to the end" loses nothing.
    Proceedings = Proceedings[startind:]
else:
    Proceedings = Proceedings[startind:endind + 1]
|
|
26 |
|
|
|
27 |
|
|
|
28 |
def extractPDFText(pdfFile, max_pages=31):
    """Return the text of the leading pages of an opened PDF as a single line.

    Args:
        pdfFile: Object exposing ``pages``, a list whose items provide an
            ``extract_text()`` method (the object ``pdfplumber.open`` returns
            is used this way by the rest of the script).
        max_pages: Maximum number of leading pages to read.  The default of
            31 reproduces the previous hard-coded cap (page indices 0-30).

    Returns:
        The concatenated text of the pages read, with every newline
        character removed.  Pages that yield no text, or whose extraction
        raises, contribute nothing.
    """
    pieces = []
    for page in pdfFile.pages[:max_pages]:
        try:
            text = page.extract_text()
        except Exception:
            # Best effort: one unreadable page must not discard the rest of
            # the document.  Only Exception is caught so that Ctrl-C and
            # SystemExit still stop a long batch run.
            continue
        if text:
            pieces.append(text)
    return ''.join(pieces).replace('\n', '')
|
|
49 |
|
|
|
50 |
# Extracted text (or an ERROR_/WARNING_ marker string) per proceeding number.
Petitions = {}
Responses = {}
Decisions = {}
# Raw 'results' lists returned by the search endpoint, per proceeding number.
Petitions_json = {}
Responses_json = {}
Decisions_json = {}

PTAB_DOCUMENTS_URL = "https://developer.uspto.gov/ptab-api/documents"
INSTITUTION_DECISION_TYPES = (
    'Institution Decision',
    'Decision Granting Institution',
    'Decision Denying Institution',
)

# NOTE(review): the copy of this script that was reviewed had lost all of its
# indentation, so the nesting of the fallback cascades below is a best-effort
# reconstruction -- confirm against the original file before relying on it.


def _search_documents(doc_type, procnum):
    """Query the documents endpoint; return (results, recordTotalQuantity)."""
    reply = requests.get(
        f"{PTAB_DOCUMENTS_URL}?documentTypeName={doc_type}&proceedingNumber={procnum}")
    payload = reply.json()  # decode the body once, not once per field
    return payload['results'], payload['recordTotalQuantity']


def _download_text(doc_id):
    """Download the document with this identifier and return its text."""
    doc = requests.get(f"{PTAB_DOCUMENTS_URL}/{doc_id}/download", stream=True)
    # The context manager closes the parsed PDF; without it one handle per
    # downloaded document stayed open for the whole batch.
    with pdfplumber.open(io.BytesIO(doc.content)) as pdfFile:
        return extractPDFText(pdfFile)


def _first_of_many(ids):
    """Return the text of the first candidate, prefixed by a WARNING marker."""
    return f'WARNING_TOO_MANY_RECORDS_RETURNED_{len(ids)}_' + _download_text(ids[0])


def _kind(result):
    """Return the documentTypeName of one search result."""
    return result['documentTypeName']


def _title(result):
    """Return the documentTitleText of one search result."""
    return result['documentTitleText']


def _matching_ids(results, keep):
    """Return the documentIdentifier of every result accepted by keep."""
    return [result['documentIdentifier'] for result in results if keep(result)]


def _petition_text(results):
    """Select the petition among results; return its text or an ERROR_ marker."""
    ids = _matching_ids(results, lambda r: _kind(r) == 'Petition')
    if not ids:
        # Nothing typed as a petition: fall back to the title.
        ids = _matching_ids(results, lambda r: (
            'Petition' in _title(r) and 'Rehear' not in _title(r)))
    if len(ids) > 1:
        ids = _matching_ids(results, lambda r: (
            _kind(r) == 'Petition' and 'Petition' in _title(r)))

    if len(ids) == 1:
        return _download_text(ids[0])
    if not ids:
        return 'ERROR_NO_RECORD_RETURNED'

    # Still ambiguous: look for a corrected petition.
    ids = _matching_ids(results, lambda r: (
        _kind(r) == 'Petition'
        and 'Petition' in _title(r)
        and 'Corrected' in _title(r)))
    if not ids:
        ids = _matching_ids(results, lambda r: (
            'Petition' in _title(r) and 'Corrected' in _title(r)))
    if len(ids) == 1:
        return _download_text(ids[0])
    return f'ERROR_TOO_MANY_RECORDS_RETURNED_{len(ids)}'


def _response_text(results):
    """Select the preliminary response; return its text or a marker string."""
    preliminary = 'Preliminary Response'

    ids = _matching_ids(results, lambda r: preliminary in _kind(r))
    if not ids:
        ids = _matching_ids(results, lambda r: (
            ('Response' in _kind(r) or 'Opposition' in _kind(r) or _kind(r) == 'Notice')
            and preliminary in _title(r)
            and 'Motion' not in _title(r)))

    if len(ids) == 1:
        return _download_text(ids[0])

    if len(ids) > 1:
        ids = _matching_ids(results, lambda r: (
            _kind(r) in ('Response', 'Opposition')
            and preliminary in _title(r)
            and 'Motion' not in _title(r)
            and 'Exhibit' not in _title(r)))
        if len(ids) == 1:
            # A stray trailing comma used to store a 1-tuple here, not text.
            return _download_text(ids[0])
        if not ids:
            # Previously no entry was stored on this path, which made the
            # final table fail with KeyError for this proceeding.
            return 'ERROR_NO_RECORD_RETURNED'
        candidates = len(ids)
        ids = _matching_ids(results, lambda r: (
            preliminary in _title(r) and 'Corrected' in _title(r)))
        if len(ids) > 1:
            return _first_of_many(ids)
        if len(ids) == 1:
            # Exactly one corrected response is unambiguous; this case used
            # to fall through without storing anything.
            return _download_text(ids[0])
        # Several candidates and no corrected one to prefer; also a path
        # that previously stored nothing.
        return f'ERROR_TOO_MANY_RECORDS_RETURNED_{candidates}'

    # No candidate so far: relax the filters step by step.
    ids = _matching_ids(results, lambda r: preliminary in _title(r))
    if not ids:
        ids = _matching_ids(results, lambda r: (
            'Response' in _kind(r)
            and 'Motion' not in _title(r)
            and 'Exhibit' not in _title(r)))
    if len(ids) > 1:
        ids = _matching_ids(results, lambda r: (
            'Response' in _kind(r)
            and 'Response' in _title(r)
            and 'Exhibit' not in _title(r)))

    if len(ids) == 1:
        return _download_text(ids[0])
    if not ids:
        return 'ERROR_NO_RECORD_RETURNED'

    ids = _matching_ids(results, lambda r: (
        preliminary in _title(r) and 'Corrected' in _title(r)))
    if not ids:
        ids = _matching_ids(results, lambda r: (
            'Response' in _kind(r)
            and 'Response' in _title(r)
            and 'Exhibit' not in _title(r)))
    if len(ids) > 1:
        return _first_of_many(ids)
    if len(ids) == 1:
        return _download_text(ids[0])
    return 'ERROR_NO_RECORD_RETURNED'


def _decision_text(results, total, procnum):
    """Select the institution decision; return its text or an ERROR_ marker."""
    if total == 0:
        # No 'Decision' documents at all: look for the decision among the
        # proceeding's 'Notice' documents instead.
        notices, _ = _search_documents('Notice', procnum)
        # Three defects are fixed in this lookup: it iterated the HTTP
        # response object rather than its parsed results, it misspelled the
        # identifier key, and it joined the exclusions with `or`, which let
        # almost every title through.
        ids = _matching_ids(notices, lambda r: (
            'Institution' in _title(r)
            and 'Decision' in _title(r)
            and 'Denying Rehearing' not in _title(r)
            and 'Seal' not in _title(r)
            and 'Motion' not in _title(r)))
        if len(ids) != 1:
            return 'ERROR_NO_RECORDS_FOUND'
        return _download_text(ids[0])

    ids = _matching_ids(results, lambda r: _kind(r) == 'Institution Decision')
    if len(ids) > 1:
        ids = _matching_ids(results, lambda r: (
            _kind(r) in INSTITUTION_DECISION_TYPES and 'Decision' in _title(r)))
    if len(ids) > 1:
        ids = _matching_ids(results, lambda r: (
            _kind(r) in INSTITUTION_DECISION_TYPES
            and 'Decision' in _title(r)
            and 'Rehearing' not in _title(r)))

    if len(ids) == 1:
        return _download_text(ids[0])
    if len(ids) > 1:
        return f'ERROR_TOO_MANY_RECORDS_RETURNED_{len(ids)}'

    # Nothing matched by type: fall back to the title.
    ids = _matching_ids(results, lambda r: (
        'Decision' in _title(r) and 'Institution' in _title(r)))
    if not ids:
        return 'ERROR_NO_RECORD_RETURNED'
    if len(ids) == 1:
        return _download_text(ids[0])

    ids = _matching_ids(results, lambda r: (
        'Decision' in _title(r)
        and 'Institution' in _title(r)
        and 'Rehearing' not in _title(r)))
    if len(ids) == 1:
        return _download_text(ids[0])
    return f'ERROR_TOO_MANY_RECORDS_RETURNED_{len(ids)}'


for procnum in Proceedings:
    print('Processing', procnum)

    if (onlydo == 'Petitions') or (onlydo is False):
        results, total = _search_documents('Petition', procnum)
        Petitions_json[procnum] = results
        if total == 0:
            Petitions[procnum] = 'ERROR_NO_RECORDS_FOUND'
        else:
            Petitions[procnum] = _petition_text(results)

    if (onlydo == 'Responses') or (onlydo is False):
        # Preliminary responses may be filed under either document type.
        responses, response_total = _search_documents('Response', procnum)
        oppositions, opposition_total = _search_documents('Opposition', procnum)
        Responses_json[procnum] = responses + oppositions
        if response_total == 0 and opposition_total == 0:
            Responses[procnum] = 'ERROR_NO_RECORDS_FOUND'
        else:
            Responses[procnum] = _response_text(Responses_json[procnum])

    if (onlydo == 'Decisions') or (onlydo is False):
        results, total = _search_documents('Decision', procnum)
        Decisions_json[procnum] = results
        Decisions[procnum] = _decision_text(results, total, procnum)
|
|
278 |
|
|
|
279 |
# Build one row per requested proceeding and write the table as TSV.
collected = {'Petitions': Petitions, 'Responses': Responses, 'Decisions': Decisions}
# A single recognised document kind gives a one-column table; anything else
# (including onlydo being False) gives all three columns.
result_columns = [onlydo] if onlydo in collected else list(collected)

# .get() with an explicit marker keeps a proceeding that ended up without an
# entry from raising KeyError, and from being silently dropped as the
# Responses-only table used to do by iterating its own keys.
ResultsDF = DataFrame.from_dict(
    {key: [collected[column].get(key, 'ERROR_NO_RECORD_RETURNED')
           for column in result_columns]
     for key in Proceedings},
    orient='index',
    columns=result_columns)

# Turn the proceeding-number index into a regular, named first column.
ResultsDF = ResultsDF.reset_index().rename(columns={'index': 'Proceeding'})

ResultsDF.to_csv(outfile, sep='\t', index=False)