# extract_features.py
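"""
Extract patch-level feature embeddings from bags of whole-slide-image patches.

For each slide listed in the input csv, the script reads its .h5 bag of
patches, embeds every patch with a ResNet-50 baseline pretrained on
ImageNet, and saves the features as both .h5 (with coordinates) and .pt
files for downstream training.
"""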

import torch
import torch.nn as nn
import os
import time
import argparse

import h5py

from torch.utils.data import DataLoader

from datasets.dataset_h5 import Dataset_All_Bags, Whole_Slide_Bag
from models.resnet_custom import resnet50_baseline
from utils.utils import print_network, collate_features
from utils.file_utils import save_hdf5

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def compute_w_loader(file_path, output_path, model, batch_size=8, verbose=0,
                     print_every=20, pretrained=True, target_patch_size=-1):
    """
    args:
        file_path: path to a bag of patches (.h5 file)
        output_path: path to save computed features (.h5 file)
        model: pytorch model used to embed each patch
        batch_size: batch size for computing features in batches
        verbose: level of feedback
        print_every: print progress every `print_every` batches
        pretrained: use weights pretrained on ImageNet
        target_patch_size: optionally rescale patches to this size before embedding (-1 disables rescaling)
    """
    dataset = Whole_Slide_Bag(file_path=file_path, pretrained=pretrained,
                              target_patch_size=target_patch_size)
    kwargs = {'num_workers': 4, 'pin_memory': True} if device.type == "cuda" else {}
    # collate_features yields (stacked image batch, patch coordinates) per batch
    loader = DataLoader(dataset=dataset, batch_size=batch_size, **kwargs, collate_fn=collate_features)

    if verbose > 0:
        print('processing {}: total of {} batches'.format(file_path, len(loader)))

    mode = 'w'  # overwrite on the first batch, then append
    for count, (batch, coords) in enumerate(loader):
        with torch.no_grad():
            if count % print_every == 0:
                print('batch {}/{}, {} files processed'.format(count, len(loader), count * batch_size))
            batch = batch.to(device, non_blocking=True)

            features = model(batch)
            features = features.cpu().numpy()

            asset_dict = {'features': features, 'coords': coords}
            save_hdf5(output_path, asset_dict, attr_dict=None, mode=mode)
            mode = 'a'

    return output_path
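
# NOTE: save_hdf5 (from utils.file_utils) is assumed here to create the
# datasets when called with mode='w' and to append rows along the first axis
# when called with mode='a'; that is why `mode` flips to 'a' after the first batch.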

parser = argparse.ArgumentParser(description='Feature Extraction')
parser.add_argument('--data_dir', type=str, help='directory containing the patches/ folder of .h5 bags')
parser.add_argument('--csv_path', type=str, help='csv file listing the slide filenames to process')
parser.add_argument('--feat_dir', type=str, help='directory to save the extracted features')
parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--slide_ext', type=str, default='.svs')
parser.add_argument('--no_auto_skip', default=False, action='store_true',
                    help='recompute features even when a .pt file already exists for the slide')
parser.add_argument('--target_patch_size', type=int, default=-1,
                    help='the desired size of patches for optional scaling before feature embedding')
args = parser.parse_args()
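
# Example invocation (all paths are hypothetical placeholders):
#   python extract_features.py --data_dir RESULTS_DIRECTORY --csv_path process_list.csv \
#       --feat_dir FEATURES_DIRECTORY --batch_size 512 --slide_ext .svs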


if __name__ == '__main__':

    print('initializing dataset')
    csv_path = args.csv_path
    bags_dataset = Dataset_All_Bags(csv_path)

    # create the output directories for the .h5 and .pt files up front
    os.makedirs(args.feat_dir, exist_ok=True)
    os.makedirs(os.path.join(args.feat_dir, 'pt_files'), exist_ok=True)
    os.makedirs(os.path.join(args.feat_dir, 'h5_files'), exist_ok=True)
    # auto-skip checks for finished .pt files, so list that subdirectory
    dest_files = os.listdir(os.path.join(args.feat_dir, 'pt_files'))

    print('loading model checkpoint')
    model = resnet50_baseline(pretrained=True)
    model = model.to(device)

    # print_network(model)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    model.eval()  # inference only: freeze batchnorm statistics, disable dropout
    total = len(bags_dataset)
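
    # Expected input layout under --data_dir (presumably produced by an
    # upstream patching step):
    #   patches/<slide_id>.h5   bag of image patches with their coordinates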

    for bag_candidate_idx in range(total):
        slide_id = bags_dataset[bag_candidate_idx].split(args.slide_ext)[0]
        bag_name = slide_id + '.h5'
        bag_candidate = os.path.join(args.data_dir, 'patches', bag_name)

        print('\nprogress: {}/{}'.format(bag_candidate_idx, total))
        print(bag_name)
        # skip slides whose features were already extracted, unless --no_auto_skip is set
        if not args.no_auto_skip and slide_id + '.pt' in dest_files:
            print('skipped {}'.format(slide_id))
            continue

        output_path = os.path.join(args.feat_dir, 'h5_files', bag_name)
        file_path = bag_candidate
        time_start = time.time()
        output_file_path = compute_w_loader(file_path, output_path,
                                            model=model, batch_size=args.batch_size,
                                            verbose=1, print_every=20,
                                            target_patch_size=args.target_patch_size)
        time_elapsed = time.time() - time_start
        print('\ncomputing features for {} took {:.2f} s'.format(output_file_path, time_elapsed))

        # read the saved features back and also store them as a .pt tensor
        with h5py.File(output_file_path, "r") as file:
            features = file['features'][:]
            print('features size: ', features.shape)
            print('coordinates size: ', file['coords'].shape)

        features = torch.from_numpy(features)
        bag_base, _ = os.path.splitext(bag_name)
        torch.save(features, os.path.join(args.feat_dir, 'pt_files', bag_base + '.pt'))
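
# Resulting layout under --feat_dir:
#   h5_files/<slide_id>.h5   features + patch coordinates
#   pt_files/<slide_id>.pt   features only, for downstream training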