|
a |
|
b/upload_images_aws.py |
|
|
1 |
from gcloud import storage |
|
|
2 |
import argparse |
|
|
3 |
from enum import Enum |
|
|
4 |
import io |
|
|
5 |
from google.cloud import vision |
|
|
6 |
from google.cloud.vision import types |
|
|
7 |
from PIL import Image, ImageDraw |
|
|
8 |
import os |
|
|
9 |
import tempfile |
|
|
10 |
from pdf2image import convert_from_path, convert_from_bytes |
|
|
11 |
import pdf2image |
|
|
12 |
|
|
|
13 |
def convert_pdf(file_path, output_path=None):
    """Convert an input document to a single JPEG image.

    Input kind is detected from the file name:
      * JPEG (.jpg / .jpeg, any case): re-saved as JPEG at quality 80.
      * PNG: flattened onto a white background (alpha removed) and
        saved as a maximum-quality JPEG.
      * Anything else is treated as a PDF: each page is rendered and
        the pages are stacked vertically into one tall JPEG.

    Args:
        file_path: Path to the source .jpg/.png/.pdf file.
        output_path: Destination for the converted JPEG. Required in
            practice -- every branch saves to it.

    Returns:
        The resulting PIL.Image object (also written to output_path).
    """
    print(file_path)

    lower_path = file_path.lower()

    if lower_path.endswith((".jpg", ".jpeg")):
        # Bug fix: the original used a case-sensitive substring test
        # (".JPG" in file_path), so lowercase .jpg files fell through
        # to the PDF branch and crashed.
        jpg = Image.open(file_path)
        jpg.save(output_path, 'JPEG', quality=80)
        return jpg

    if lower_path.endswith(".png"):
        png = Image.open(file_path)
        png.load()  # force-load so split() sees all bands

        background = Image.new("RGB", png.size, (255, 255, 255))
        if png.mode == "RGBA":
            # Composite onto white using the alpha channel as the mask.
            # Bug fix: the original indexed band 3 unconditionally and
            # raised IndexError on PNGs without an alpha channel.
            background.paste(png, mask=png.split()[3])
        else:
            background.paste(png.convert("RGB"))

        background.save(output_path, 'JPEG', subsampling=0, quality=100)
        return background

    # Treat everything else as a PDF. Page images go into a temporary
    # directory that is deleted once we are finished.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Render each PDF page to a PIL image.
        pages = convert_from_path(file_path, output_folder=temp_dir)

        # Save each rendered page, then reopen via PIL.
        temp_images = []
        for i, page in enumerate(pages):
            image_path = f'{temp_dir}/{i}.jpg'
            page.save(image_path, 'JPEG')
            temp_images.append(image_path)
        imgs = [Image.open(path) for path in temp_images]

        # The merged canvas is as wide as the narrowest page and as
        # tall as all pages stacked together.
        min_img_width = min(img.width for img in imgs)
        total_height = sum(img.height for img in imgs)
        merged_image = Image.new(imgs[0].mode, (min_img_width, total_height))

        # Paste pages top-to-bottom.
        y = 0
        for img in imgs:
            merged_image.paste(img, (0, y))
            y += img.height

        merged_image.save(output_path, 'JPEG', subsampling=0, quality=100)
        return merged_image
|
|
59 |
|
|
|
60 |
|
|
|
61 |
if __name__ == '__main__':

    # data_path = '/Users/rhettd/Documents/Fall2019/MED_CONSULT/Data/fwdfacesheets/'
    data_path = '/Users/rhettd/Documents/Fall2019/MED_CONSULT/Data/XWP - ARCHANA WAGLE PC/'

    # client = storage.Client(project='medical-extraction')
    # bucket = client.get_bucket('report-ap')

    # Convert every source document in data_path to a JPEG under Done/.
    for file_name in os.listdir(data_path):
        # Skip macOS metadata and the output directory itself.
        if file_name in (".DS_Store", "Done"):
            continue

        # Bug fix: split('.')[0] truncated names containing extra dots;
        # splitext strips only the final extension.
        image_name = os.path.splitext(file_name)[0]

        # os.path.join instead of string concatenation for path safety.
        image = convert_pdf(
            os.path.join(data_path, file_name),
            os.path.join(data_path, "Done", image_name + '.jpg'),
        )

        # blob = bucket.blob("face_sheet_images/" + image_name + '.jpg')
        # blob.upload_from_filename(data_path + "Done/"+ image_name+'.jpg')