Diff of /extract_features.py [000000] .. [0fdc30]

import torch
import torch.nn as nn
from math import floor
import os
import random
import numpy as np
import pdb
import time
from datasets.dataset_h5 import Dataset_All_Bags, Whole_Slide_Bag
from torch.utils.data import DataLoader
from models.resnet_custom import resnet50_baseline
import argparse
from utils.utils import print_network, collate_features
from utils.file_utils import save_hdf5
from PIL import Image
import h5py

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def compute_w_loader(file_path, output_path, model, batch_size=8, verbose=0,
                     print_every=20, pretrained=True, target_patch_size=-1):
    """
    args:
        file_path: path to the bag of patches (.h5 file)
        output_path: path for saving the computed features (.h5 file)
        model: pytorch model used to embed each patch
        batch_size: batch size for computing features in batches
        verbose: level of feedback
        print_every: report progress every print_every batches
        pretrained: use weights pretrained on imagenet
        target_patch_size: optionally rescale patches to this size before embedding (-1 disables rescaling)
    """
    dataset = Whole_Slide_Bag(file_path=file_path, pretrained=pretrained,
                              target_patch_size=target_patch_size)
    x, y = dataset[0]  # probe the first item (values otherwise unused)
    kwargs = {'num_workers': 4, 'pin_memory': True} if device.type == "cuda" else {}
    loader = DataLoader(dataset=dataset, batch_size=batch_size, **kwargs, collate_fn=collate_features)

    if verbose > 0:
        print('processing {}: total of {} batches'.format(file_path, len(loader)))

    mode = 'w'  # overwrite on the first batch, then append
    for count, (batch, coords) in enumerate(loader):
        with torch.no_grad():
            if count % print_every == 0:
                print('batch {}/{}, {} files processed'.format(count, len(loader), count * batch_size))
            batch = batch.to(device, non_blocking=True)

            features = model(batch)
            features = features.cpu().numpy()

            asset_dict = {'features': features, 'coords': coords}
            save_hdf5(output_path, asset_dict, attr_dict=None, mode=mode)
            mode = 'a'

    return output_path


parser = argparse.ArgumentParser(description='Feature Extraction')
parser.add_argument('--data_dir', type=str)
parser.add_argument('--csv_path', type=str)
parser.add_argument('--feat_dir', type=str)
parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--slide_ext', type=str, default='.svs')
parser.add_argument('--no_auto_skip', default=False, action='store_true')
parser.add_argument('--target_patch_size', type=int, default=-1,
                    help='the desired size of patches for optional scaling before feature embedding')
args = parser.parse_args()
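
# Example invocation (a sketch; DATA_DIR, CSV_PATH and FEAT_DIR are placeholder paths,
# where DATA_DIR is expected to contain a 'patches' subfolder of .h5 patch bags):
#   python extract_features.py --data_dir DATA_DIR --csv_path CSV_PATH --feat_dir FEAT_DIR \
#       --batch_size 256 --slide_ext .svs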

if __name__ == '__main__':

    print('initializing dataset')
    csv_path = args.csv_path
    bags_dataset = Dataset_All_Bags(csv_path)

    # make sure the output folders for the .h5 and .pt files exist
    os.makedirs(args.feat_dir, exist_ok=True)
    os.makedirs(os.path.join(args.feat_dir, 'h5_files'), exist_ok=True)
    os.makedirs(os.path.join(args.feat_dir, 'pt_files'), exist_ok=True)
    # existing .pt files, used to skip slides that were already processed (unless --no_auto_skip)
    dest_files = os.listdir(os.path.join(args.feat_dir, 'pt_files'))

    print('loading model checkpoint')
    model = resnet50_baseline(pretrained=True)
    model = model.to(device)

    # print_network(model)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    model.eval()
    total = len(bags_dataset)

    for bag_candidate_idx in range(total):
        slide_id = bags_dataset[bag_candidate_idx].split(args.slide_ext)[0]
        bag_name = slide_id + '.h5'
        bag_candidate = os.path.join(args.data_dir, 'patches', bag_name)

        print('\nprogress: {}/{}'.format(bag_candidate_idx, total))
        print(bag_name)
        if not args.no_auto_skip and slide_id + '.pt' in dest_files:
            print('skipped {}'.format(slide_id))
            continue

        output_path = os.path.join(args.feat_dir, 'h5_files', bag_name)
        file_path = bag_candidate
        time_start = time.time()
        output_file_path = compute_w_loader(file_path, output_path,
                                            model=model, batch_size=args.batch_size,
                                            verbose=1, print_every=20,
                                            target_patch_size=args.target_patch_size)
        time_elapsed = time.time() - time_start
        print('\ncomputing features for {} took {} s'.format(output_file_path, time_elapsed))

        # re-open the saved .h5 file to report its contents and export the features as a .pt tensor
        with h5py.File(output_file_path, "r") as file:
            features = file['features'][:]
            print('features size: ', features.shape)
            print('coordinates size: ', file['coords'].shape)
        features = torch.from_numpy(features)
        bag_base, _ = os.path.splitext(bag_name)
        torch.save(features, os.path.join(args.feat_dir, 'pt_files', bag_base + '.pt'))
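
# The features saved above can be read back downstream, e.g. (an illustrative sketch,
# with 'SLIDE_ID' standing in for an actual slide id; not part of this script):
#   feats = torch.load(os.path.join(args.feat_dir, 'pt_files', 'SLIDE_ID.pt'))
#   with h5py.File(os.path.join(args.feat_dir, 'h5_files', 'SLIDE_ID.h5'), 'r') as f:
#       coords = f['coords'][:]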