# im4MEC.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math

# Adapted from https://github.com/AIRMEC/im4MEC


class Attn_Net_Gated(nn.Module):
    """Gated attention network: one unnormalized attention score per tile and per class."""

    def __init__(self, L=1024, D=256, dropout=False, p_dropout_atn=0.25, n_classes=1):
        super(Attn_Net_Gated, self).__init__()

        # Two parallel branches: a tanh branch and a sigmoid branch that acts as a gate.
        self.attention_a = [nn.Linear(L, D), nn.Tanh()]
        self.attention_b = [nn.Linear(L, D), nn.Sigmoid()]

        if dropout:
            self.attention_a.append(nn.Dropout(p_dropout_atn))
            self.attention_b.append(nn.Dropout(p_dropout_atn))

        self.attention_a = nn.Sequential(*self.attention_a)
        self.attention_b = nn.Sequential(*self.attention_b)
        self.attention_c = nn.Linear(D, n_classes)

    def forward(self, x):
        a = self.attention_a(x)
        b = self.attention_b(x)
        A = a.mul(b)             # element-wise gating of the tanh branch
        A = self.attention_c(A)  # N x n_classes
        return A

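# Background note (added for context): Attn_Net_Gated implements the gated attention
# of Ilse et al., "Attention-based Deep Multiple Instance Learning" (2018). Ignoring
# bias terms and dropout, the unnormalized score of tile embedding h_k for attention
# branch c (one branch per class in Im4MEC below) is
#     a_{k,c} = w_c^T ( tanh(V h_k) * sigmoid(U h_k) ),
# where V and U are the linear layers inside attention_a / attention_b, * is the
# element-wise product, and w_c is row c of attention_c.
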
class Im4MEC(nn.Module):
    def __init__(
        self,
        input_feature_size=1024,
        precompression_layer=True,
        feature_size_comp=512,
        feature_size_attn=256,
        dropout=True,
        p_dropout_fc=0.25,
        p_dropout_atn=0.25,
        n_classes=4,
    ):
        super(Im4MEC, self).__init__()

        self.n_classes = n_classes

        # Optional MLP that compresses the tile features before attention pooling.
        if precompression_layer:
            self.compression_layer = nn.Sequential(
                nn.Linear(input_feature_size, feature_size_comp * 4),
                nn.ReLU(),
                nn.Dropout(p_dropout_fc),
                nn.Linear(feature_size_comp * 4, feature_size_comp * 2),
                nn.ReLU(),
                nn.Dropout(p_dropout_fc),
                nn.Linear(feature_size_comp * 2, feature_size_comp),
                nn.ReLU(),
                nn.Dropout(p_dropout_fc),
            )
            dim_post_compression = feature_size_comp
        else:
            self.compression_layer = nn.Identity()
            dim_post_compression = input_feature_size

        # Gated attention network with one attention branch per class.
        self.attention_net = Attn_Net_Gated(
            L=dim_post_compression,
            D=feature_size_attn,
            dropout=dropout,
            p_dropout_atn=p_dropout_atn,
            n_classes=self.n_classes,
        )

        # Classification head: one linear classifier per class, applied to that
        # class's attention-pooled slide embedding.
        self.classifiers = nn.ModuleList(
            [nn.Linear(dim_post_compression, 1) for _ in range(self.n_classes)]
        )

        # Init weights.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_normal_(module.weight)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward_attention(self, h):
        A_ = self.attention_net(h)         # h: N_tiles x dim -> A_: N_tiles x n_classes
        A_raw = torch.transpose(A_, 1, 0)  # n_classes x N_tiles
        A = F.softmax(A_raw, dim=-1)       # normalize attention scores over the tiles
        return A_raw, A

    def forward(self, h):
        h = self.compression_layer(h)

        # Attention MIL: pool the tile embeddings into one slide-level embedding per class.
        A_raw, A = self.forward_attention(h)  # n_classes x N_tiles
        M = A @ h  # n_classes x dim_post_compression; row c is the attention-weighted sum of tile embeddings

        logits = torch.empty(1, self.n_classes).float().to(h.device)
        for c in range(self.n_classes):
            logits[0, c] = self.classifiers[c](M[c])
        Y_hat = torch.topk(logits, 1, dim=1)[1]
        Y_prob = F.softmax(logits, dim=1)

        return logits, Y_prob, Y_hat, A_raw, M
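

# Minimal usage sketch (illustrative only; the bag size of 500 tiles and the
# 1024-dim tile features below are assumptions, not values fixed by the model):
if __name__ == "__main__":
    model = Im4MEC(input_feature_size=1024, n_classes=4)
    model.eval()
    with torch.no_grad():
        bag = torch.randn(500, 1024)  # one slide = a bag of 500 tile feature vectors
        logits, Y_prob, Y_hat, A_raw, M = model(bag)
    print(logits.shape)  # torch.Size([1, 4])
    print(Y_prob.shape)  # torch.Size([1, 4])
    print(Y_hat.item())  # index of the predicted class
    print(A_raw.shape)   # torch.Size([4, 500]) -- per-class attention over tiles
    print(M.shape)       # torch.Size([4, 512]) -- per-class slide embeddings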