SHEPHERD / Git / [db6163] /shepherd/samplers.py

Models:
MarcoTheBlack/
SHEPHERD
Downloads: 1
[db6163]: / shepherd / samplers.py
History
Download this file
691 lines (585 with data), 38.0 kB

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch_sparse import SparseTensor
from torch_cluster import random_walk
from torch_geometric.data.sampler import EdgeIndex, Adj
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from torch_geometric.utils import add_self_loops, add_remaining_self_loops
from torch_geometric.data import Data, DataLoader, NeighborSampler

from typing import List, Optional, Tuple, NamedTuple, Union, Callable, Dict
from collections import defaultdict
import time
import random
import pickle
from collections import Counter
from operator import itemgetter
import copy
import numpy as np
from utils.pretrain_utils import get_indices_into_edge_index, HeterogeneousEdgeIndex 
from sklearn.preprocessing import label_binarize

import project_config


class NeighborSampler(torch.utils.data.DataLoader):
    r"""The neighbor sampler from the `"Inductive Representation Learning on
    Large Graphs" <https://arxiv.org/abs/1706.02216>`_ paper, which allows
    for mini-batch training of GNNs on large-scale graphs where full-batch
    training is not feasible.
    Given a GNN with :math:`L` layers and a specific mini-batch of nodes
    :obj:`node_idx` for which we want to compute embeddings, this module
    iteratively samples neighbors and constructs bipartite graphs that simulate
    the actual computation flow of GNNs.
    More specifically, :obj:`sizes` denotes how much neighbors we want to
    sample for each node in each layer.
    This module then takes in these :obj:`sizes` and iteratively samples
    :obj:`sizes[l]` for each node involved in layer :obj:`l`.
    In the next layer, sampling is repeated for the union of nodes that were
    already encountered.
    The actual computation graphs are then returned in reverse-mode, meaning
    that we pass messages from a larger set of nodes to a smaller one, until we
    reach the nodes for which we originally wanted to compute embeddings.
    Hence, an item returned by :class:`NeighborSampler` holds the current
    :obj:`batch_size`, the IDs :obj:`n_id` of all nodes involved in the
    computation, and a list of bipartite graph objects via the tuple
    :obj:`(edge_index, e_id, size)`, where :obj:`edge_index` represents the
    bipartite edges between source and target nodes, :obj:`e_id` denotes the
    IDs of original edges in the full graph, and :obj:`size` holds the shape
    of the bipartite graph.
    For each bipartite graph, target nodes are also included at the beginning
    of the list of source nodes so that one can easily apply skip-connections
    or add self-loops.
    .. note::
        For an example of using :obj:`NeighborSampler`, see
        `examples/reddit.py
        <https://github.com/rusty1s/pytorch_geometric/blob/master/examples/
        reddit.py>`_ or
        `examples/ogbn_products_sage.py
        <https://github.com/rusty1s/pytorch_geometric/blob/master/examples/
        ogbn_products_sage.py>`_.
    Args:
        edge_index (Tensor or SparseTensor): A :obj:`torch.LongTensor` or a
            :obj:`torch_sparse.SparseTensor` that defines the underlying graph
            connectivity/message passing flow.
            :obj:`edge_index` holds the indices of a (sparse) symmetric
            adjacency matrix.
            If :obj:`edge_index` is of type :obj:`torch.LongTensor`, its shape
            must be defined as :obj:`[2, num_edges]`, where messages from nodes
            :obj:`edge_index[0]` are sent to nodes in :obj:`edge_index[1]`
            (in case :obj:`flow="source_to_target"`).
            If :obj:`edge_index` is of type :obj:`torch_sparse.SparseTensor`,
            its sparse indices :obj:`(row, col)` should relate to
            :obj:`row = edge_index[1]` and :obj:`col = edge_index[0]`.
            The major difference between both formats is that we need to input
            the *transposed* sparse adjacency matrix.
        sizes ([int]): The number of neighbors to sample for each node in each
            layer. If set to :obj:`sizes[l] = -1`, all neighbors are included
            in layer :obj:`l`.
        node_idx (LongTensor, optional): The nodes that should be considered
            for creating mini-batches. If set to :obj:`None`, all nodes will be
            considered.
        num_nodes (int, optional): The number of nodes in the graph.
            (default: :obj:`None`)
        return_e_id (bool, optional): If set to :obj:`False`, will not return
            original edge indices of sampled edges. This is only useful in case
            when operating on graphs without edge features to save memory.
            (default: :obj:`True`)
        transform (callable, optional): A function/transform that takes in
            an a sampled mini-batch and returns a transformed version.
            (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :class:`torch.utils.data.DataLoader`, such as :obj:`batch_size`,
            :obj:`shuffle`, :obj:`drop_last` or :obj:`num_workers`.
    """
    def __init__(self, dataset_type: str, edge_index: Union[Tensor, SparseTensor], 
                sample_edge_index: Union[Tensor, SparseTensor],
                 sizes: List[int],
                 node_idx: Optional[Tensor] = None,
                 num_nodes: Optional[int] = None, return_e_id: bool = True,
                 transform: Callable = None,
                 do_filter_edges: bool = True, 
                 **kwargs):

        edge_index = edge_index.to('cpu')
        sample_edge_index = sample_edge_index.to('cpu')

        # add self loops
        sample_edge_index, _ = add_self_loops(sample_edge_index)


        if 'collate_fn' in kwargs:
            del kwargs['collate_fn']

        # Save for Pytorch Lightning...
        self.dataset_type = dataset_type
        self.edge_index = edge_index #always train edge index
        self.sample_edge_index = sample_edge_index # depends on train/val/test
        self.node_idx = node_idx
        self.num_nodes = num_nodes

        self.sizes = sizes
        self.return_e_id = return_e_id
        self.transform = transform
        self.is_sparse_tensor = isinstance(edge_index, SparseTensor)
        self.__val__ = None
        self.do_filter_edges = do_filter_edges

        # Obtain a *transposed* `SparseTensor` instance.
        if not self.is_sparse_tensor:
            if (num_nodes is None and node_idx is not None
                    and node_idx.dtype == torch.bool):
                num_nodes = node_idx.size(0)
                sample_num_nodes = num_nodes
            if (num_nodes is None and node_idx is not None
                    and node_idx.dtype == torch.long):
                num_nodes = max(int(edge_index.max()), int(node_idx.max())) + 1
                sample_num_nodes = num_nodes
            if num_nodes is None:
                num_nodes = int(edge_index.max()) + 1
                sample_num_nodes = int(sample_edge_index.max()) + 1

            value = torch.arange(edge_index.size(1)) if return_e_id else None
            sample_value = torch.arange(sample_edge_index.size(1)) if return_e_id else None
            self.adj_t = SparseTensor(row=edge_index[0], col=edge_index[1],
                                      value=value,
                                      sparse_sizes=(num_nodes, num_nodes)).t()
            self.adj_t_sample = SparseTensor(row=sample_edge_index[0], col=sample_edge_index[1],
                                      value=sample_value,
                                      sparse_sizes=(sample_num_nodes, sample_num_nodes)).t()
        else:
            adj_t = edge_index
            adj_t_sample = sample_edge_index
            if return_e_id:
                self.__val__ = adj_t.storage.value()
                value = torch.arange(adj_t.nnz())
                adj_t = adj_t.set_value(value, layout='coo')
                adj_t_sample = adj_t_sample.set_value(torch.arange(adj_t_sample.nnz()), layout='coo')
            self.adj_t = adj_t
            self.adj_t_sample = adj_t_sample

        self.adj_t.storage.rowptr()
        self.adj_t_sample.storage.rowptr()

        if node_idx is None:
            node_idx = torch.arange(self.adj_t_sample.sparse_size(0)) 
        elif node_idx.dtype == torch.bool:
            node_idx = node_idx.nonzero(as_tuple=False).view(-1)

        super(NeighborSampler, self).__init__(
            node_idx.view(-1).tolist(), collate_fn=self.sample, **kwargs)

    

    def filter_edges(self, edge_index, e_id, source_nodes, target_nodes):
        '''
        Filter out the edges we're trying to predict in the current batch from the edge index
        NOTE: edge_index here is re-indexed
        '''
        reindex_source_nodes = torch.arange(source_nodes.size(0))
        reindex_target_nodes = torch.arange(start = source_nodes.size(0), end = source_nodes.size(0) + target_nodes.size(0))

        # get reverse edges to filter as well
        all_source_nodes = torch.cat([reindex_source_nodes, reindex_target_nodes])
        all_target_nodes = torch.cat([reindex_target_nodes, reindex_source_nodes])
        ind_to_edge_index, ind_to_nodes = get_indices_into_edge_index(edge_index, all_source_nodes, all_target_nodes) #get index into the original edge index (this returns e_ids)
        mask = torch.ones(edge_index.size(1), dtype=torch.bool)
        mask[ind_to_edge_index] = False

        return edge_index[:, mask], e_id[mask]


    def sample(self, source_batch):
        
        #convert to tensor
        if not isinstance(source_batch, Tensor):
            source_batch = torch.tensor(source_batch)

        # sample nodes to form positive edges. we will try to predict these edges
        row, col, e_id = self.adj_t_sample.coo()    
        target_batch = random_walk(row, col, source_batch, walk_length=1, coalesced=False)[:, 1] #NOTE: only does self loops when no edges in the current partition of the dataset
        batch = torch.cat([source_batch, target_batch], dim=0) 

        batch_size: int = len(batch)
        adjs = []
        n_id = batch
        for size in self.sizes:
            adj_t, n_id = self.adj_t.sample_adj(n_id, size, replace=False) 
            e_id = adj_t.storage.value()
            size = adj_t.sparse_sizes()[::-1]
            if self.__val__ is not None:
                adj_t.set_value_(self.__val__[e_id], layout='coo')

            if self.is_sparse_tensor: #TODO: implement filter_edges if sparse tensor
                adjs.append(Adj(adj_t, e_id, size))
            else:
                row, col, _ = adj_t.coo()
                edge_index = torch.stack([col, row], dim=0)

                if self.do_filter_edges and self.dataset_type == 'train':
                    edge_index, e_id = self.filter_edges(edge_index, e_id, source_batch, target_batch)
                adjs.append(EdgeIndex(edge_index, e_id, size))

        adjs = adjs[0] if len(adjs) == 1 else adjs[::-1]
        out = (batch_size, n_id, adjs)
        out = self.transform(*out) if self.transform is not None else out
        return out

    def __repr__(self):
        return '{}(sizes={})'.format(self.__class__.__name__, self.sizes)

class PatientNeighborSampler(torch.utils.data.DataLoader):
   
    def __init__(self, dataset_type: str, edge_index: Union[Tensor, SparseTensor], 
                 sample_edge_index: Union[Tensor, SparseTensor],
                 sizes: List[int], 
                 patient_dataset,
                 all_edge_attributes,
                 n_nodes: int,
                 relevant_node_idx = None,
                 do_filter_edges: Optional[bool] = False,
                 num_nodes: Optional[int] = None, 
                 return_e_id: bool = True,
                 sparse_sample: Optional[int] = 0,
                 train_phenotype_counter: Dict = None,
                 train_gene_counter: Dict = None,
                 sample_edges_from_train_patients=False,
                 upsample_cand: Optional[int] = 0,
                 n_cand_diseases=-1,
                 use_diseases=False,
                 nid_to_spl_dict = None,
                 gp_spl = None,
                 spl_indexing_dict=None,

                 gene_similarity_dict=None,
                 gene_deg_dict = None,

                 hparams=None,
                 transform: Callable = None, 
                 **kwargs):

        edge_index = edge_index.to('cpu')
        sample_edge_index = sample_edge_index.to('cpu')

        # add self loops
        sample_edge_index = torch.cat((sample_edge_index, torch.stack([edge_index.unique(), edge_index.unique()])),1 )
        sample_edge_index, _ = add_remaining_self_loops(sample_edge_index)

        if 'collate_fn' in kwargs:
            del kwargs['collate_fn']

        # Save for Pytorch Lightning...
        self.do_filter_edges = do_filter_edges
        self.relevant_node_idx = relevant_node_idx
        self.n_nodes = n_nodes
        self.all_edge_attr = all_edge_attributes
        self.dataset_type = dataset_type
        self.sparse_sample = sparse_sample
        self.edge_index = edge_index #always train edge index
        self.sample_edge_index = sample_edge_index # depends on train/val/test
        self.patient_dataset = patient_dataset
        self.num_nodes = num_nodes
        self.train_phenotype_counter = train_phenotype_counter
        self.train_gene_counter = train_gene_counter
        self.sample_edges_from_train_patients = sample_edges_from_train_patients
        self.sizes = sizes
        self.return_e_id = return_e_id
        self.transform = transform
        self.is_sparse_tensor = isinstance(edge_index, SparseTensor)
        self.__val__ = None

        # For SPL
        self.nid_to_spl_dict = nid_to_spl_dict 
        if hparams["alpha"] < 1: self.gp_spl = gp_spl
        else: self.gp_spl = None
        self.spl_indexing_dict = spl_indexing_dict

        # Up-sample candidate genes
        self.upsample_cand = upsample_cand
        self.cand_gene_freq = Counter([])
        with open(str(project_config.KG_DIR  / f'ensembl_to_idx_dict_{project_config.CURR_KG}.pkl'), 'rb') as handle:
            ensembl_to_idx_dict = pickle.load(handle) # create ensembl to node_idx map
        idx_to_ensembl_dict = {v: k for k, v in ensembl_to_idx_dict.items()}
        self.cand_gene_freq = Counter([k for k in nid_to_spl_dict if k in idx_to_ensembl_dict]) # Upsample from all gene nodes in the KG
        
        self.n_cand_diseases = n_cand_diseases
        self.use_diseases = use_diseases
        self.hparams = hparams

        self.gene_similarity_dict = gene_similarity_dict
        self.gene_deg_dict = gene_deg_dict

        # Obtain a *transposed* `SparseTensor` instance.
        if not self.is_sparse_tensor:
            if num_nodes is None:
                num_nodes = int(edge_index.max()) + 1
                sample_num_nodes = int(sample_edge_index.max()) + 1

            value = torch.arange(edge_index.size(1)) if return_e_id else None
            sample_value = torch.arange(sample_edge_index.size(1)) if return_e_id else None
            self.adj_t = SparseTensor(row=edge_index[0], col=edge_index[1],
                                      value=value,
                                      sparse_sizes=(num_nodes, num_nodes)).t()
            self.adj_t_sample = SparseTensor(row=sample_edge_index[0], col=sample_edge_index[1],
                                      value=sample_value,
                                      sparse_sizes=(sample_num_nodes, sample_num_nodes)).t()
        else:
            adj_t = edge_index
            adj_t_sample = sample_edge_index
            if return_e_id:
                self.__val__ = adj_t.storage.value()
                value = torch.arange(adj_t.nnz())
                adj_t = adj_t.set_value(value, layout='coo')
                adj_t_sample = adj_t_sample.set_value(torch.arange(adj_t_sample.nnz()), layout='coo')
            self.adj_t = adj_t
            self.adj_t_sample = adj_t_sample

        self.adj_t.storage.rowptr()
        self.adj_t_sample.storage.rowptr()



        super(PatientNeighborSampler, self).__init__(
            self.patient_dataset, collate_fn=self.collate, **kwargs)

    def filter_edges(self, edge_index, e_id, source_nodes, target_nodes):
        '''
        Filter out the edges we're trying to predict in the current batch from the edge index
        NOTE: edge_index here is re-indexed
        '''
        reindex_source_nodes = torch.arange(source_nodes.size(0))
        reindex_target_nodes = torch.arange(start = source_nodes.size(0), end = source_nodes.size(0) + target_nodes.size(0))

        # get reverse edges to filter as well
        all_source_nodes = torch.cat([reindex_source_nodes, reindex_target_nodes])
        all_target_nodes = torch.cat([reindex_target_nodes, reindex_source_nodes])
        ind_to_edge_index, ind_to_nodes = get_indices_into_edge_index(edge_index, all_source_nodes, all_target_nodes) #get index into the original edge index (this returns e_ids)
        mask = torch.ones(edge_index.size(1), dtype=torch.bool)
        mask[ind_to_edge_index] = False

        return edge_index[:, mask], e_id[mask]

    def get_source_nodes(self, phenotype_node_idx, candidate_gene_node_idx, correct_genes_node_idx, disease_node_idx, candidate_disease_node_idx, sim_gene_node_idx): 
        
        # Get batch node indices based on patient phenotypes and genes
        if sim_gene_node_idx is not None:
            source_batch = torch.cat(phenotype_node_idx +  candidate_gene_node_idx +  correct_genes_node_idx + disease_node_idx + candidate_disease_node_idx + sim_gene_node_idx)
        else:
            source_batch = torch.cat(phenotype_node_idx +  candidate_gene_node_idx +  correct_genes_node_idx + disease_node_idx + candidate_disease_node_idx)

         # Randomly sample nodes in KG 
        if self.sparse_sample > 0:
            if self.relevant_node_idx == None:
                rand_idx = torch.randint(high=self.n_nodes, size=(self.sparse_sample,)) # NOTE that this can sample duplicates, but has the benefit of randomly sampling new nodes each epoch
            else:
                rand_idx = self.relevant_node_idx[torch.randint(high=self.relevant_node_idx.size(0), size=(self.sparse_sample,))]
            
            source_batch = torch.cat([source_batch, rand_idx])
            source_batch = torch.unique(source_batch)
            sparse_idx = torch.unique(rand_idx)
        else:
            source_batch = torch.unique(source_batch)
            sparse_idx = torch.Tensor([])

        return source_batch, sparse_idx

    def sample_target_nodes(self, source_batch):
        row, col, e_id = self.adj_t_sample.coo() 
        
        if self.sample_edges_from_train_patients:
            train_patient_nodes = torch.tensor(list(self.train_phenotype_counter.keys()) + list(self.train_gene_counter.keys())) 
            ind_with_train_patient_nodes = (col == train_patient_nodes.unsqueeze(-1)).nonzero(as_tuple=True)[1]
            subset_row = row[ind_with_train_patient_nodes]
            subset_col = col[ind_with_train_patient_nodes]
            try:
                # first try to find an edge that connects back to the training set patient data
                targets = random_walk(subset_row, subset_col, source_batch, walk_length=1, coalesced=False)[:, 1] #NOTE: only does self loops when no edges in the current partition of the dataset
                source_batch_1 = source_batch[~torch.eq(source_batch, targets)]
                targets_1 = targets[~torch.eq(source_batch, targets)]

                # if no edges are found, use all available edges in this split of the data
                source_batch_2 = source_batch[torch.eq(source_batch, targets)]
                targets_2 = random_walk(row, col, source_batch_2, walk_length=1, coalesced=False)[:, 1] #NOTE: only does self loops when no edges in the current partition of the dataset

                #concat the two together
                source_batch = torch.cat([source_batch_1, source_batch_2])
                targets = torch.cat([targets_1, targets_2])

            except:
                targets = random_walk(row, col, source_batch, walk_length=1, coalesced=False)[:, 1] #NOTE: only does self loops when no edges in the current partition of the dataset
        else:
            targets = random_walk(row, col, source_batch, walk_length=1, coalesced=False)[:, 1] #NOTE: only does self loops when no edges in the current partition of the dataset
        return source_batch, targets

    def add_patient_information(self, patient_ids, phenotype_node_idx, candidate_gene_node_idx, correct_genes_node_idx, sim_gene_node_idx, gene_sims, gene_degs, disease_node_idx, candidate_disease_node_idx, labels, disease_labels, patient_labels, additional_labels, adjs, batch_size, n_id, sparse_idx, target_batch): #candidate_disease_node_idx

        # Create Data Object & Add patient level information
        adjs = [HeterogeneousEdgeIndex(adj.edge_index, adj.e_id, self.all_edge_attr[adj.e_id], adj.size) for adj in adjs] 
        max_n_candidates = max([len(l) for l in candidate_gene_node_idx])
        data = Data(adjs = adjs, 
                batch_size = batch_size,
                patient_ids = patient_ids,
                n_id = n_id
                )
        if self.hparams['loss'] != 'patient_disease_NCA' and self.hparams['loss'] != 'patient_patient_NCA':
            if None in list(labels): data['one_hot_labels'] = None
            else: data['one_hot_labels'] = torch.LongTensor(label_binarize(labels, classes = list(range(max_n_candidates))))

        if self.use_diseases:
            data['disease_one_hot_labels'] = disease_labels 

        if self.hparams['loss'] == 'patient_patient_NCA':
            if patient_labels is None: data['patient_labels'] = None
            else: data['patient_labels'] = torch.stack(patient_labels)

        # Get candidate genes to phenotypes SPL
        if not self.gp_spl is None:
            if not self.spl_indexing_dict is None:
                patient_ids = np.vectorize(self.spl_indexing_dict.get)(patient_ids).astype(int)
            gene_to_phenotypes_spl = -torch.Tensor(self.gp_spl[patient_ids,:])
            # get gene idx to spl information
            cand_gene_idx_to_spl = [torch.LongTensor(np.vectorize(self.nid_to_spl_dict.get)(cand_genes)) for cand_genes in list(candidate_gene_node_idx)]
            # get SPLs for each patient's candidate genes
            batch_cand_gene_to_phenotypes_spl = [gene_spls[cand_genes] for cand_genes, gene_spls in zip(cand_gene_idx_to_spl, gene_to_phenotypes_spl)]
            # pad to same # of candidate genes
            data['batch_cand_gene_to_phenotypes_spl'] = pad_sequence(batch_cand_gene_to_phenotypes_spl, batch_first=True, padding_value=0)
            # get unique gene idx across all patients in the batch
            cand_gene_idx_flattened_unique = torch.unique(torch.cat(cand_gene_idx_to_spl)).flatten()
            # get SPLs for unique genes in the batch
            data['batch_concat_cand_gene_to_phenotypes_spl'] = gene_to_phenotypes_spl[:, cand_gene_idx_flattened_unique]
        else:
            data['batch_cand_gene_to_phenotypes_spl'] = None
            data['batch_concat_cand_gene_to_phenotypes_spl'] = None


        # Create mapping from KG node IDs to batch indices
        node2batch = {n+1: int(i+1) for i, n in enumerate(data.n_id.tolist())}
        node2batch[0] = 0

        # add phenotype / gene / disease names
        data['phenotype_names'] = [[(self.patient_dataset.node_idx_to_name(p.item()), self.patient_dataset.node_idx_to_degree(p.item())) for p in p_list] for p_list in phenotype_node_idx ]
        data['cand_gene_names'] = [[self.patient_dataset.node_idx_to_name(g.item()) for g in g_list] for g_list in candidate_gene_node_idx ]
        data['corr_gene_names'] = [[self.patient_dataset.node_idx_to_name(g.item()) for g in g_list] for g_list in correct_genes_node_idx  ]
        data['disease_names'] = [[self.patient_dataset.node_idx_to_name(d.item()) for d in d_list] for d_list in disease_node_idx ]

        if self.use_diseases:
            data['cand_disease_names'] = [[self.patient_dataset.node_idx_to_name(d.item()) for d in d_list] for d_list in candidate_disease_node_idx ]


        #reindex nodes to make room for padding
        phenotype_node_idx = [p + 1 for p in phenotype_node_idx]
        candidate_gene_node_idx = [g + 1 for g in candidate_gene_node_idx]
        correct_genes_node_idx = [g + 1 for g in correct_genes_node_idx]
        if self.use_diseases:
            disease_node_idx = [d + 1 for d in disease_node_idx]
            candidate_disease_node_idx = [d + 1 for d in candidate_disease_node_idx]
        if 'augment_genes' in self.hparams and self.hparams['augment_genes']:
            sim_gene_node_idx = [g + 1 for g in sim_gene_node_idx]

        # if there aren't any disease idx in the batch, we add filler
        if self.use_diseases:
            if all(len(t) == 0 for t in disease_node_idx):
                disease_node_idx = [torch.LongTensor([0]) for i in range(len(disease_node_idx))]
            if all(len(t) == 0 for t in candidate_disease_node_idx):
                candidate_disease_node_idx = [torch.LongTensor([0]) for i in range(len(candidate_disease_node_idx))]

        # add padding to patient phenotype and gene node idx
        data['batch_pheno_nid'] = pad_sequence(phenotype_node_idx, batch_first=True, padding_value=0) 
        if len(candidate_gene_node_idx[0]) > 0:
            data['batch_cand_gene_nid'] = pad_sequence(candidate_gene_node_idx, batch_first=True, padding_value=0) 
        data['batch_corr_gene_nid'] = pad_sequence(correct_genes_node_idx, batch_first=True, padding_value=0) 
        if self.use_diseases:
            data['batch_disease_nid'] = pad_sequence(disease_node_idx, batch_first=True, padding_value=0) 
            data['batch_cand_disease_nid'] = pad_sequence(candidate_disease_node_idx, batch_first=True, padding_value=0) 
        if 'augment_genes' in self.hparams and self.hparams['augment_genes']:
            data['batch_cand_gene_degs'] = pad_sequence(gene_degs, batch_first=True, padding_value=0) 
            data['batch_sim_gene_nid'] = pad_sequence(sim_gene_node_idx, batch_first=True, padding_value=0) 
            data['batch_sim_gene_sims'] = pad_sequence(gene_sims, batch_first=True, padding_value=0)
            # Normalize
            data['batch_sim_gene_sims'] = data['batch_sim_gene_sims'] / torch.sum(data['batch_sim_gene_sims'], dim=1, keepdim=True)
        else:
            if len(candidate_gene_node_idx[0]) > 0:
                data['batch_cand_gene_nid'] = pad_sequence(candidate_gene_node_idx, batch_first=True, padding_value=0) 

        # Convert KG node IDs to batch IDs
        # When performing inference (i.e., predict.py), use the original node IDs because the full KG is used in forward pass of node model
        if self.dataset_type != "predict":
            data['batch_pheno_nid']  = torch.LongTensor(np.vectorize(node2batch.get)(data['batch_pheno_nid']))
            if len(candidate_gene_node_idx[0]) > 0:
                data['batch_cand_gene_nid'] = torch.LongTensor(np.vectorize(node2batch.get)(data['batch_cand_gene_nid']))
            if len(correct_genes_node_idx[0]) > 0:
                data['batch_corr_gene_nid'] = torch.LongTensor(np.vectorize(node2batch.get)(data['batch_corr_gene_nid']))
            if self.use_diseases:
                data['batch_disease_nid'] = torch.LongTensor(np.vectorize(node2batch.get)(data['batch_disease_nid']))
                data['batch_cand_disease_nid'] = torch.LongTensor(np.vectorize(node2batch.get)(data['batch_cand_disease_nid']))
            if 'augment_genes' in self.hparams and self.hparams['augment_genes']:
                data['batch_sim_gene_nid'] = torch.LongTensor(np.vectorize(node2batch.get)(data['batch_sim_gene_nid']))
        return data

    def get_candidate_diseases(self, disease_node_idx, candidate_gene_node_idx):
        cand_diseases = self.patient_dataset.get_candidate_diseases(cand_type=self.hparams['candidate_disease_type'])
        if self.n_cand_diseases != -1: cand_diseases = cand_diseases[torch.randperm(len(cand_diseases))][0:self.n_cand_diseases] 
        
        if self.hparams['only_hard_distractors']: #add candidates to every patient
            candidate_disease_node_idx = tuple(torch.unique(torch.cat([corr_dis, cand_diseases ]), sorted=False) for corr_dis in disease_node_idx)
            candidate_disease_node_idx = tuple(torch.unique(dis[torch.randperm(len(dis))], sorted=False, return_inverse=False, return_counts=False) for dis in candidate_disease_node_idx)
        else: # split candidates across all patients in the batch
            all_correct_diseases = torch.cat(disease_node_idx)
            all_diseases = torch.unique(torch.cat([all_correct_diseases, cand_diseases]))
            all_diseases = all_diseases[torch.randperm(len(all_diseases))]
            candidate_disease_node_idx = np.array_split(all_diseases, len(candidate_gene_node_idx))
            candidate_disease_node_idx = tuple(candidate_disease_node_idx)
        max_n_dis_candidates = max([len(l) for l in candidate_disease_node_idx])
        if max_n_dis_candidates == 0: 
            max_n_dis_candidates = 1
            print('WARNING: there are no disease candidates')

        disease_ind = [(dis.unsqueeze(1) == corr_dis.unsqueeze(0)).nonzero(as_tuple=True)[0] if len(corr_dis) > 0 else torch.tensor(-1) for dis, corr_dis in zip(candidate_disease_node_idx, disease_node_idx)]
        disease_labels = torch.zeros((len(candidate_disease_node_idx), max_n_dis_candidates))
        for i, ind in enumerate(disease_ind): disease_labels[i,ind[ind != -1]] = 1
        return candidate_disease_node_idx, disease_labels

    def get_candidate_patients(self, patient_ids):
        # get patients with the same disease/gene
        similar_pat_ids = [self.patient_dataset.get_similar_patients(p_id, similarity_type=self.hparams['patient_similarity_type']) for p_id in patient_ids]
        # shuffle patients & subset to n_sim_pats so we have X similar patients per patient in the batch
        similar_pat_ids = [p[:self.hparams['n_similar_patients']] for p in similar_pat_ids] #[torch.randperm(len(p))]
        # Retrieve the patients for each of the sampled patient ids if they aren't already in the batch
        patient_ids = list(patient_ids) 
        similar_pats = [self.patient_dataset[self.patient_dataset.patient_id_to_index[p_id.item()]] for p_ids in similar_pat_ids for p_id in p_ids if p_id.item() not in patient_ids]
        return similar_pats
    
    def sample(self, batch, source_batch, target_batch):
        batch_size: int = len(batch)
        adjs = []
        n_id = batch
        for size in self.sizes:

            adj_t, n_id = self.adj_t.sample_adj(n_id, size, replace=False)
            e_id = adj_t.storage.value()
            size = adj_t.sparse_sizes()[::-1]
            if self.__val__ is not None:
                adj_t.set_value_(self.__val__[e_id], layout='coo')

            if self.is_sparse_tensor: #TODO: implement filter_edges if sparse tensor
                adjs.append(Adj(adj_t, e_id, size))
            else:
                row, col, _ = adj_t.coo()
                edge_index = torch.stack([col, row], dim=0)
                if self.do_filter_edges and self.dataset_type == 'train':
                    edge_index, e_id = self.filter_edges(edge_index, e_id, source_batch, target_batch)
                adjs.append(EdgeIndex(edge_index, e_id, size))

        adjs = [adjs[0]] if len(adjs) == 1 else adjs[::-1]
        return adjs, batch_size, n_id
    
    def get_similar_genes(self, patient_ids, candidate_gene_node_idx):
        k = self.hparams['n_sim_genes']
        gene_ids = []
        sims = []
        degs = []
        assert len(patient_ids) == len(candidate_gene_node_idx)
        for p, p_cand_genes in zip(patient_ids, candidate_gene_node_idx):
            p_genes = []
            p_sims = []
            p_degs = []
            for g in p_cand_genes:
                p_genes.append(torch.LongTensor([idx for idx, sim in list(self.gene_similarity_dict[int(g)])[:k]]))
                p_sims.append(torch.LongTensor([sim for idx, sim in list(self.gene_similarity_dict[int(g)])[:k]]))
                p_degs.append(self.gene_deg_dict[int(g)])
            gene_ids.append(torch.stack(p_genes))
            sims.append(torch.stack(p_sims))
            degs.append(torch.LongTensor(p_degs))
        assert len(gene_ids) == len(patient_ids)
        assert len(sims) == len(patient_ids)
        unique_genes = torch.unique(torch.cat(gene_ids).flatten()).unsqueeze(-1)
        return tuple(gene_ids), tuple(sims), tuple(degs), tuple(unique_genes)

    def collate(self, batch):
        t00 = time.time()
        phenotype_node_idx, candidate_gene_node_idx, correct_genes_node_idx, disease_node_idx, labels, additional_labels, patient_ids = zip(*batch)

        # Up-sample under-represented candidate genes
        t0 = time.time()
        if self.upsample_cand > 0:
            curr_cand_gene_freq = Counter(torch.cat(candidate_gene_node_idx).flatten().tolist())
            self.cand_gene_freq += curr_cand_gene_freq
            num_patients = len(candidate_gene_node_idx) * self.upsample_cand
            lowest_k_cand = self.cand_gene_freq.most_common()[:-num_patients-1:-1]
            lowest_k_cand = np.array_split([g[0] for g in lowest_k_cand], len(candidate_gene_node_idx))
            
            upsampled_candidate_gene_node_idx = []
            added_cand_gene = []
            for patient, cand_gene, corr_gene_idx in zip(candidate_gene_node_idx, lowest_k_cand, labels):
                
                # Remove correct genes from list of upsampled candidate genes
                corr_gene_nid = patient[corr_gene_idx]
                cand_gene = cand_gene[~np.isin(cand_gene, corr_gene_nid)].flatten()
                
                # Remove duplicates
                unique_cand_genes, new_cand_genes_freq = torch.unique(torch.tensor(patient.tolist() + list(cand_gene)), return_counts = True)
                unique_cand_genes = unique_cand_genes[new_cand_genes_freq == 1]
                cand_gene = cand_gene[np.isin(cand_gene, unique_cand_genes)]                
                
                # Add upsampled candidate genes
                added_cand_gene.extend(list(cand_gene))
                new_cand_list = torch.tensor(patient.tolist() + list(cand_gene))
                upsampled_candidate_gene_node_idx.append(new_cand_list)
            
            candidate_gene_node_idx = tuple(upsampled_candidate_gene_node_idx)
            self.cand_gene_freq += Counter(added_cand_gene)

        
        # Add similar patients to batch (for "patients like me" head)
        if self.hparams['add_similar_patients']:
            similar_pats = self.get_candidate_patients(patient_ids)
            # merge original batch with sampled patients
            phenotype_node_idx_sim, candidate_gene_node_idx_sim, correct_genes_node_idx_sim, disease_node_idx_sim, labels_sim, additional_labels_sim, patient_ids_sim = zip(*similar_pats)
            phenotype_node_idx = phenotype_node_idx + phenotype_node_idx_sim
            candidate_gene_node_idx = candidate_gene_node_idx + candidate_gene_node_idx_sim
            correct_genes_node_idx = correct_genes_node_idx + correct_genes_node_idx_sim
            disease_node_idx = disease_node_idx + disease_node_idx_sim
            labels = labels + labels_sim
            additional_labels = additional_labels + additional_labels_sim
            patient_ids = patient_ids + patient_ids_sim
        
        # get patient labels
        patient_labels = correct_genes_node_idx
        
        # Add candidate diseases to batch
        if self.hparams['add_cand_diseases']:
            candidate_disease_node_idx, disease_labels = self.get_candidate_diseases(disease_node_idx, candidate_gene_node_idx)
        else: 
            candidate_disease_node_idx = disease_node_idx
            disease_labels = torch.tensor([1] * len(candidate_disease_node_idx))

        if self.hparams['augment_genes']:
            sim_gene_node_idx, gene_sims, gene_degs, unique_sim_genes = self.get_similar_genes(patient_ids, candidate_gene_node_idx)
        else:
            unique_sim_genes = gene_degs = gene_sims = sim_gene_node_idx = None

        t1 = time.time()

        # get nodes from patients + randomly sampled nodes
        source_batch, sparse_idx = self.get_source_nodes(phenotype_node_idx, candidate_gene_node_idx, correct_genes_node_idx, disease_node_idx, candidate_disease_node_idx, unique_sim_genes)
       
        # sample nodes to form positive edges
        source_batch, target_batch = self.sample_target_nodes(source_batch) 
        batch = torch.cat([source_batch, target_batch], dim=0) 
        t2 = time.time()

        # get k hop adj graph
        adjs, batch_size, n_id = self.sample(batch, source_batch, target_batch)
        t3 = time.time()

        # add patient information to data object
        data = self.add_patient_information(patient_ids, phenotype_node_idx, candidate_gene_node_idx, correct_genes_node_idx, sim_gene_node_idx, gene_sims, gene_degs, disease_node_idx, candidate_disease_node_idx, labels, disease_labels, patient_labels, additional_labels, adjs, batch_size, n_id, sparse_idx, target_batch) #candidate_disease_node_idx
        t4 = time.time()
        
        if self.hparams['time']:
            print(f'It takes {t0-t00:0.4f}s to unzip batch, {t1-t0:0.4f}s to upsample candidate gene nodes, {t2-t1:0.4f}s to sample positive nodes, {t3-t2:0.4f}s to get k-hop adjs, and {t4-t3:0.4f}s to add patient information')
        return data        

    def __repr__(self):
        return '{}(sizes={})'.format(self.__class__.__name__, self.sizes)