processing_pytorch.py
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 8 10:27:47 2021

@author: m.beuque
"""
from matplotlib.image import imread
import cv2
import tqdm
import pandas as pd
import os
import numpy as np
from torch.utils import data
import torchvision.transforms as transforms
from sklearn.utils import shuffle
import torch
# albumentations provides the dict-style transforms (transform(image=...)["image"])
# expected by CancerDataset and df_dl_features below
import albumentations as A
from albumentations.pytorch import ToTensorV2


def generate_dataset_tissue_type(main_path, path_data, SEED):
    # path_data is the path to the csv describing the training, testing or validation split
    # main_path contains the folder "Slides" where the H&E tiles are stored
    data = pd.read_csv(path_data, sep=',')

    X = []
    y = []
    paths = []
    for slide in tqdm.tqdm(os.listdir(os.path.join(main_path, 'Slides'))):
        tile_path = os.path.join(main_path, 'Slides', slide, 'tiles')
        gland = data[(data['labels'] == 'stroma') & (data['dataset_name'] == slide)]['image_name']
        tissue = data[(data['labels'] == 'epithelial tissue') & (data['dataset_name'] == slide)]['image_name']
        gland = list(gland)
        tissue = list(tissue)
        # stroma tiles -> class 0
        for image_path in gland:
            if os.path.isfile(os.path.join(tile_path, image_path)):
                X.append(imread(os.path.join(tile_path, image_path)))
                y.append(0)
                paths.append(os.path.join(tile_path, image_path))
        # epithelial tissue tiles -> class 1
        for image_path in tissue:
            if os.path.isfile(os.path.join(tile_path, image_path)):
                X.append(imread(os.path.join(tile_path, image_path)))
                y.append(1)
                paths.append(os.path.join(tile_path, image_path))

    # rescale the images to the same size before stacking them into a single array
    for j, elmt in enumerate(X):
        if elmt.shape != (96, 96, 3):
            X[j] = cv2.resize(elmt, (96, 96), interpolation=cv2.INTER_CUBIC)
    X = np.array(X)
    X, y = shuffle(X, y, random_state=SEED)

    return X, y, paths


def generate_dataset_grade(main_path, path_data):
    # path_data is the path to the csv describing the training, testing or validation dataset
    # main_path contains the folder "Slides" where the H&E tiles are stored
    data = pd.read_csv(path_data, sep=',')
    X = []
    y = []
    paths = []

    for slide in tqdm.tqdm(os.listdir(os.path.join(main_path, 'Slides'))):
        tile_path = os.path.join(main_path, 'Slides', slide, 'tiles')
        healthy = data[(data['labels'] == 'non-dysplasia') & (data['dataset_name'] == slide)]['image_name']
        lowgrade = data[(data['labels'] == 'low grade') & (data['dataset_name'] == slide)]['image_name']
        highgrade = data[(data['labels'] == 'high grade') & (data['dataset_name'] == slide)]['image_name']
        healthy = list(healthy)
        lowgrade = list(lowgrade)
        highgrade = list(highgrade)
        for image_path in healthy:
            if os.path.isfile(os.path.join(tile_path, image_path)):
                X.append(imread(os.path.join(tile_path, image_path)))
                y.append("non-dysplasia")
                paths.append(os.path.join(tile_path, image_path))
            else:
                print("error for non-dysplasia")
        for image_path in lowgrade:
            if os.path.isfile(os.path.join(tile_path, image_path)):
                X.append(imread(os.path.join(tile_path, image_path)))
                y.append("low grade")
                paths.append(os.path.join(tile_path, image_path))
            else:
                print("error for low grade")
        for image_path in highgrade:
            if os.path.isfile(os.path.join(tile_path, image_path)):
                X.append(imread(os.path.join(tile_path, image_path)))
                y.append("high grade")
                paths.append(os.path.join(tile_path, image_path))
            else:
                print("error for high grade")

    # rescale the images to the same size
    for j, elmt in enumerate(X):
        if elmt.shape != (96, 96, 3):
            X[j] = cv2.resize(elmt, (96, 96), interpolation=cv2.INTER_CUBIC)
    X = np.array(X)
    y = np.array(y)
    return X, y, paths
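
# Example usage of the two dataset builders (hypothetical paths, shown only for
# illustration; the csv files are assumed to contain the columns 'image_name',
# 'labels' and 'dataset_name' used above):
#
#     X_t, y_t, paths_t = generate_dataset_tissue_type(
#         "/path/to/project", "/path/to/project/tissue_split.csv", SEED=42)
#     X_g, y_g, paths_g = generate_dataset_grade(
#         "/path/to/project", "/path/to/project/grade_split.csv")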


# regular dataset generation
class CancerDataset(data.Dataset):
    'Characterizes a dataset for PyTorch'

    def __init__(self, X_all, y_all,
                 transform=A.Compose([A.CenterCrop(64, 64), ToTensorV2()])):
        'Initialization'
        # the transform is called albumentations-style in __getitem__
        # (transform(image=...)["image"]), so the default is an albumentations
        # Compose rather than a torchvision one
        self.labels = y_all
        self.list_IDs = X_all
        self.transform = transform
        self.image_files_list = [str(s) for s in range(len(self.list_IDs))]

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        X = self.list_IDs[index]
        X = self.transform(image=X)
        X = X['image']
        # Load data and get label
        y = self.labels[index]
        return X, y
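
# Example: wrapping the dataset in a PyTorch DataLoader (a sketch; the batch
# size and transform below are arbitrary choices, not taken from the original
# pipeline):
#
#     train_set = CancerDataset(X_t, y_t,
#                               transform=A.Compose([A.CenterCrop(64, 64), ToTensorV2()]))
#     train_loader = data.DataLoader(train_set, batch_size=32, shuffle=True)
#     for batch_images, batch_labels in train_loader:
#         ...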


def df_dl_features(X, paths, data_transforms, classifier):
    # run every tile through the given model and store the flattened output as
    # a feature vector, indexed by the tile path
    features = {}
    for i, temp_X in tqdm.tqdm(enumerate(X)):
        tensor_X = data_transforms(image=temp_X)
        tensor_X = tensor_X["image"]
        tensor_X.unsqueeze_(0)  # add the batch dimension
        output = torch.flatten(classifier(tensor_X)).detach().numpy()
        features[paths[i]] = output.flatten()
    features = pd.DataFrame.from_dict(features)
    features = features.T  # one row per tile, one column per feature
    return features
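
# ---------------------------------------------------------------------------
# Minimal end-to-end sketch (not part of the original pipeline). The paths, the
# choice of ResNet-18 as feature extractor, and the transform are assumptions
# made for illustration; depending on how the tiles are stored, the images may
# need casting or normalization before being fed to the network.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import torchvision.models as models

    MAIN_PATH = "/path/to/project"                   # hypothetical
    CSV_SPLIT = "/path/to/project/tissue_split.csv"  # hypothetical

    X, y, paths = generate_dataset_tissue_type(MAIN_PATH, CSV_SPLIT, SEED=42)

    # ResNet-18 with its final fully connected layer removed, used here as a
    # generic feature extractor for df_dl_features
    backbone = models.resnet18(pretrained=True)
    feature_extractor = torch.nn.Sequential(*list(backbone.children())[:-1])
    feature_extractor.eval()

    feature_transform = A.Compose([A.CenterCrop(64, 64), ToTensorV2()])
    features = df_dl_features(X, paths, feature_transform, feature_extractor)
    print(features.shape)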