b/SNN_model.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class BERT_Arch(nn.Module):
+    def __init__(self, bert):
+      super(BERT_Arch, self).__init__()
+      self.bert = bert
+      self.conv1 = nn.Conv1d(in_channels=768, out_channels=128, kernel_size=3, stride=1) # kernal_size=3 == three-grams
+      self.avg_pooling = nn.AvgPool1d(kernel_size=2)
+      self.conv2 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, stride=1)
+      self.flatten = nn.Flatten()
+      self.fc = nn.Linear(64,128)
+      self.dropout = nn.Dropout(0.2)
+    def forward(self, seq, mask):
+      hs, cls_hs = self.bert(seq, attention_mask=mask, return_dict=False)
+      x = hs.permute(0, 2, 1).contiguous()          # Permute `hs` to match input shape requirement of `nn.Conv1d`
+                                                    # The contiguous() ensures the memory of the tensor is stored contiguously
+                                                    # which helps avoid potential issues during processing.
+                                                    # Output shape: (b, 768, 70) = (b, embed_dim, max_len_seq).
+      x = F.relu(self.conv1(x))                     # Output shape: (b, 128, *)  * depends on kernel size and padding
+      x = self.avg_pooling(x)                       # Output shape: (b, 128, *)
+      x = F.relu(self.conv2(x))                     # Output shape: (b, 128, *)
+      x = F.max_pool1d(x, kernel_size=x.shape[2])   # Output shape: (b, 128, 1) # trick: we use kernel of size x.shape[2] to reduce from * to 1
+      x = self.flatten(x)                           # Output shape: (b, 128)
+      x = self.fc(x)                                # Output shape: (b, 128)
+      x = self.dropout(x)
+      return x
+class SiameseNeuralNetwork(nn.Module):
+    def __init__(self, bert_arch):
+        super().__init__()
+        self.bert_arch = bert_arch
+        self.distance_layer = nn.Sequential(nn.Linear(128, 1), nn.Sigmoid())  # if we would use BCEWithLogitsLoss as loss function, we should delte the sigmoid since we dont need it after the linear layer a sigmoid layer
+    def forward(self, seq1, seq2, mask1, mask2):
+        feature_vec1 = self.bert_arch(seq1, mask1) # feature_vec1 shape:  [batch_size, embedding_size]
+        feature_vec2 = self.bert_arch(seq2, mask2)
+        difference = torch.abs(feature_vec1 - feature_vec2)
+        out = self.distance_layer(difference)
+        return out
+class ContrastiveLoss(nn.Module):
+    """
+    Takes embeddings of two samples and a target label == 1 if samples are from the same class and label == 0 otherwise
+    """
+    def __init__(self, margin):
+        super(ContrastiveLoss, self).__init__()
+        self.margin = margin
+        self.eps = 1e-9
+    def forward(self, output1, output2, target, size_average=True):
+        distances = (output2 - output1).pow(2).sum(1)  # squared distances
+        losses = 0.5 * (target.float() * distances +
+                        (1 + -1 * target).float() *  torch.nn.functional.relu(self.margin - (distances + self.eps).sqrt()).pow(2))
+        return losses.mean() if size_average else losses.sum()