Diff of /data_prep/preprocess.py [000000] .. [db6163]

Switch to unified view

a b/data_prep/preprocess.py
1
2
import sys
3
import obonet
4
import networkx
5
import re
6
import time
7
import numpy as np
8
import pandas as pd
9
from pathlib import Path
10
from multiprocessing import Pool
11
sys.path.insert(0, '..') # add project_config to path
12
import project_config as config
13
import logging
14
import pickle
15
logging.basicConfig(level=logging.INFO)
16
#############
17
# SOURCES
18
# HGNC (DOWNLOAD) - https://www.genenames.org/download/custom/
19
# ENSEMBL (BIOMART) - http://useast.ensembl.org/biomart/martview/b11081b5e1087424087a575e2792953e
20
    # biomart_ensembl_ncbi_map
21
# NCBI (FTP) - https://ftp.ncbi.nih.gov/gene/DATA/
22
    # gene info
23
    # gene history
24
    # gene2refseq
25
    # gene2ensembl
26
27
28
29
class Preprocessor():
30
31
    def write_pkl(self, d, filename):
32
        file_path = Path(config.PREPROCESS_PATH / filename)
33
        with open(str(file_path), 'wb') as f: 
34
            pickle.dump(d, f, protocol=pickle.HIGHEST_PROTOCOL)
35
36
    def load_pkl(self, filename):
37
        file_path = Path(config.PREPROCESS_PATH / filename)
38
        with open(str(file_path), 'rb') as f: 
39
            return pickle.load(f)
40
41
42
    def read_hgnc_mappings(self):
43
        # Read in hgnc data
44
        hgnc = pd.read_csv(config.HGNC_PATH / "hgnc_complete_set-12-31-19.tsv", sep='\t', dtype=str)
45
        # hgnc = hgnc.query("symbol != 'LINC00856'")
46
        hgnc = hgnc[hgnc.symbol.notnull()]
47
        hgnc.loc[hgnc.alias_symbol == 'WISP-3', 'alias_symbol'] = 'WISP3' #other sources don't have a dash in the gene name
48
        self.hgnc = hgnc
49
50
        self.hgnc_biomart = pd.read_csv(config.HGNC_PATH / "hgnc_biomart_hgnc_ensembl_ncbi.txt", sep='\t', dtype=str).drop(columns=['Previous name']).drop_duplicates()
51
52
53
        if Path(config.PREPROCESS_PATH / 'prev_to_curr_dict.pkl').exists():
54
            self.prev_to_curr_dict = self.load_pkl('prev_to_curr_dict.pkl')
55
            self.curr_to_prev_dict = self.load_pkl('curr_to_prev_dict.pkl')
56
            self.alias_to_curr_dict = self.load_pkl('alias_to_curr_dict.pkl')
57
            self.curr_to_alias_dict = self.load_pkl('curr_to_alias_dict.pkl')
58
        else:
59
            alias_genes = hgnc['alias_symbol'].str.split('|').tolist()
60
            hgnc_no_null_prev = hgnc.loc[~pd.isnull(hgnc.prev_symbol)]
61
            # map from previous gene symbol to current gene symbol
62
            self.prev_to_curr_dict = {p:symbol for symbol, prev_symbols in zip(hgnc_no_null_prev.symbol, hgnc_no_null_prev.prev_symbol) for p in prev_symbols.split('|')}
63
            self.write_pkl(self.prev_to_curr_dict, 'prev_to_curr_dict.pkl')
64
            # map from current gene symbol to previous gene symbol
65
            self.curr_to_prev_dict = {symbol:prev_symbols.split('|') for symbol, prev_symbols in zip(hgnc_no_null_prev.symbol, hgnc_no_null_prev.prev_symbol) }
66
            self.write_pkl(self.curr_to_prev_dict, 'curr_to_prev_dict.pkl')
67
68
            hgnc_no_null_alias = hgnc.loc[~pd.isnull(hgnc.alias_symbol)]
69
            # map from alias gene symbol to current gene symbol
70
            self.alias_to_curr_dict = {a:symbol for symbol, alias_symbols in zip(hgnc_no_null_alias.symbol, hgnc_no_null_alias.alias_symbol) for a in alias_symbols.split('|')}
71
            self.write_pkl(self.alias_to_curr_dict, 'alias_to_curr_dict.pkl')
72
            # map from current gene symbol to alias gene symbol
73
            self.curr_to_alias_dict = {symbol:alias_symbols.split('|') for symbol, alias_symbols in zip(hgnc_no_null_alias.symbol, hgnc_no_null_alias.alias_symbol) }
74
            self.write_pkl(self.curr_to_alias_dict, 'curr_to_alias_dict.pkl')
75
     
76
    def read_shilpa_mappings(self):
77
        self.ensembl_to_hgnc_shilpa = pd.read_csv(config.HGNC_PATH / "GRCh38_ensembl_gene_list_from_shilpa.tsv", sep='\t', skiprows=4, dtype=str)
78
        if Path(config.PREPROCESS_PATH / 'synonym_to_ensembl_dict.pkl').exists():
79
            self.synonym_to_ensembl_dict = self.load_pkl('synonym_to_ensembl_dict.pkl')
80
        else:            
81
            # map gene synonyms to ensembl ids using Shilpa's curated mapping
82
            synonym_genes = self.ensembl_to_hgnc_shilpa['gene_synonyms'].str.split(',').tolist()
83
            ensembl_ids = self.ensembl_to_hgnc_shilpa['ensembl_gene_id'].tolist() 
84
            self.synonym_to_ensembl_dict =  {s.split('.')[0]:ensembl for synonyms, ensembl in zip(synonym_genes, ensembl_ids) for s in synonyms}
85
            self.write_pkl(self.synonym_to_ensembl_dict, 'synonym_to_ensembl_dict.pkl')
86
87
    def read_biomart_mappings(self):
88
        #hgnc-ensembl-ncbi mappings from Ensembl Biomart
89
        if Path(config.BIOMART_PATH / 'combined_ensembl_biomart.txt').exists():
90
            self.ensembl_biomart_mapping = pd.read_csv(config.BIOMART_PATH / 'combined_ensembl_biomart.txt', sep='\t', dtype=str)
91
            #self.entrez_to_ensembl_dict = self.load_pkl('entrez_to_ensembl_dict.pkl')
92
93
        else:
94
            ensembl_to_hgnc = pd.read_csv(config.HGNC_PATH / "ensembl_to_hgnc_map.txt", sep='\t').drop(columns=['HGNC ID', 'EntrezGene ID']) #unknown origin
95
            ensembl_to_hgnc['NCBI gene ID'] = None
96
            biomart_mapping = pd.read_csv(config.BIOMART_PATH / 'biomart_ensembl_genesymb_entrez.txt', delimiter = '\t', dtype=str)
97
            biomart_mapping = biomart_mapping.drop(columns=['Transcript stable ID', 'Transcript stable ID version']).drop_duplicates()
98
            
99
            # combine two mappings
100
            all_biomart_mapping = pd.concat([ensembl_to_hgnc, biomart_mapping], sort=False).drop(columns=['Gene stable ID version'])
101
            all_biomart_mapping = all_biomart_mapping.drop_duplicates()
102
            self.ensembl_biomart_mapping = all_biomart_mapping.sort_values(['NCBI gene ID'], na_position='last').drop_duplicates(subset=['Gene stable ID', 'Gene name', 'HGNC symbol'], keep='first')
103
            self.ensembl_biomart_mapping['NCBI gene ID'] = self.ensembl_biomart_mapping['NCBI gene ID'].astype(str)
104
            self.ensembl_biomart_mapping.to_csv(config.BIOMART_PATH / 'combined_ensembl_biomart.txt', sep='\t', index=False)
105
106
    def read_ncbi_mappings(self):
107
        self.gene2ensembl = pd.read_csv(config.NCBI_PATH / 'gene2ensembl', sep='\t', dtype='str')
108
109
110
        if Path(config.PREPROCESS_PATH / 'disc_symbol_to_geneid.pkl').exists():
111
            self.disc_symbol_to_geneid = self.load_pkl('disc_symbol_to_geneid.pkl')
112
113
            self.disc_geneid_to_geneid = self.load_pkl('disc_geneid_to_geneid.pkl')
114
115
            self.synonym_to_entrez_symbol_dict = self.load_pkl('synonym_to_entrez_symbol_dict.pkl')
116
            self.geneid_to_entrez_symbol_dict = self.load_pkl('geneid_to_entrez_symbol_dict.pkl')
117
            self.entrez_symbol_to_geneid_dict = self.load_pkl('entrez_symbol_to_geneid_dict.pkl')
118
            self.refseq_geneid_to_symbol_map = self.load_pkl('refseq_geneid_to_symbol_map.pkl')
119
            self.refseq_symbol_to_geneid_map = self.load_pkl('refseq_symbol_to_geneid_map.pkl')
120
121
        else:
122
            # NCBI GENEID HISTORY
123
            gene_history = pd.read_csv(config.NCBI_PATH / 'gene_history', sep='\t', dtype='str')
124
            # map from discontinued symbol to current NCBI gene ID
125
            self.disc_symbol_to_geneid = {disc_symbol:geneid for disc_symbol, geneid in zip(gene_history['Discontinued_Symbol'],gene_history['GeneID']) if geneid != '-'}
126
            self.write_pkl(self.disc_symbol_to_geneid, 'disc_symbol_to_geneid.pkl')
127
            # map from discontinued NCBI gene ID to current NCBI gene ID
128
            self.disc_geneid_to_geneid = {disc_geneid:geneid for disc_geneid, geneid in zip(gene_history['Discontinued_GeneID'],gene_history['GeneID'] ) if geneid != '-'}
129
            self.write_pkl(self.disc_geneid_to_geneid, 'disc_geneid_to_geneid.pkl')
130
131
132
            # GENE INFO
133
            # map from synonym to gene symbol
134
            self.gene_info = pd.read_csv(config.NCBI_PATH / 'gene_info', sep='\t', dtype = 'str')
135
            gene_info_with_synonyms = self.gene_info.loc[~pd.isnull(self.gene_info.Synonyms)]
136
            self.synonym_to_entrez_symbol_dict = {s:symbol for symbol, synonyms in zip(gene_info_with_synonyms.Symbol, gene_info_with_synonyms.Synonyms) for s in synonyms.split('|')}
137
            self.write_pkl(self.synonym_to_entrez_symbol_dict, 'synonym_to_entrez_symbol_dict.pkl')
138
139
            # map from NCBI gene ID to Gene Symbol
140
            self.geneid_to_entrez_symbol_dict = {geneid:symbol for geneid, symbol in zip(self.gene_info.GeneID, self.gene_info.Symbol) }
141
            self.write_pkl(self.geneid_to_entrez_symbol_dict, 'geneid_to_entrez_symbol_dict.pkl')
142
143
            self.entrez_symbol_to_geneid_dict = {symbol:geneid for geneid, symbol in zip(gene_info.GeneID, gene_info.Symbol) }
144
            self.write_pkl(self.entrez_symbol_to_geneid_dict, 'entrez_symbol_to_geneid_dict.pkl')
145
146
            # # GENE2 REFSEQ
147
            gene2refseq = pd.read_csv(config.NCBI_PATH / 'gene2refseq', sep='\t', dtype='str')
148
            # map from NCBI gene id to symbol
149
            self.refseq_geneid_to_symbol_map = {gene_id:symbol for gene_id, symbol in zip(gene2refseq['GeneID'], gene2refseq['Symbol'])}
150
            self.write_pkl(self.refseq_geneid_to_symbol_map, 'refseq_geneid_to_symbol_map.pkl')
151
152
            # map from symbol to NCBI gene ID
153
            self.refseq_symbol_to_geneid_map = {symbol:gene_id for gene_id, symbol in zip(gene2refseq['GeneID'], gene2refseq['Symbol'])}
154
            self.write_pkl(self.refseq_symbol_to_geneid_map, 'refseq_symbol_to_geneid_map.pkl')
155
          
156
    def get_unique_ensembl_ids(self):
157
        ensembl_ids = self.hgnc['ensembl_gene_id'].tolist() + self.hgnc_biomart['Ensembl gene ID'].tolist() \
158
            + self.ensembl_biomart_mapping['Gene stable ID'].tolist() \
159
            + self.ensembl_to_hgnc_shilpa['ensembl_gene_id'].tolist()
160
        ensembl_ids = set(ensembl_ids)
161
        return [e for e in ensembl_ids if not pd.isnull(e)]
162
163
    def get_gene_symbols(self):
164
        symbols = self.hgnc['symbol'].unique().tolist() + self.hgnc_biomart['Approved symbol'].tolist()
165
        symbols = set(symbols)
166
        return [s for s in symbols if not pd.isnull(s)]
167
168
    def get_ncbi_ids(self):
169
        #TODO: make sure this works
170
        if Path(config.NCBI_PATH / 'all_ncbi_ids.txt').exists():
171
            ncbi = pd.read_csv(config.NCBI_PATH / 'all_ncbi_ids.txt', sep='\t', dtype=str)
172
            
173
        else:
174
            ncbi = self.hgnc_biomart['NCBI gene ID'].tolist() + self.ensembl_biomart_mapping['NCBI gene ID'].tolist() + self.gene_info['GeneID'].tolist()
175
            ncbi = set(ncbi)
176
            ncbi = pd.DataFrame({'ids':[s for s in ncbi if not pd.isnull(s)]})
177
            ncbi['ids'] = ncbi['ids'].astype(str)
178
            ncbi.to_csv(config.NCBI_PATH / 'all_ncbi_ids.txt', sep='\t', index=False)
179
        return ncbi['ids'].tolist()
180
181
182
    def __init__(self, process_phenotypes=True, process_genes=True, hpo_source_year='2019'):
183
        if process_phenotypes:
184
            # read in hpo hierarchy
185
            HPO_2015_DIR = config.UDN_DATA / 'hpo' / 'Jan_2015' 
186
            HPO_2019_DIR = config.UDN_DATA / 'hpo' / '2019' 
187
            HPO_2020_DIR = config.UDN_DATA / 'hpo' / '8_2020' 
188
            HPO_2021_DIR = config.UDN_DATA / 'hpo' / '2021' 
189
190
            self.hpo_2019 = obonet.read_obo(HPO_2019_DIR / 'hp.obo') 
191
            self.hpo_2015 = obonet.read_obo(HPO_2015_DIR / 'hp.obo') 
192
            self.hpo_2020 = obonet.read_obo(HPO_2020_DIR / 'hp.obo') 
193
            self.hpo_2021 = obonet.read_obo(HPO_2021_DIR / 'hp.obo') 
194
195
            self.hpo_source_year = hpo_source_year
196
            if hpo_source_year == '2019': self.hpo_ontology = self.hpo_2019
197
            elif hpo_source_year == '2020': self.hpo_ontology = self.hpo_2020
198
            elif hpo_source_year == '2021': self.hpo_ontology = self.hpo_2021
199
            else: raise NotImplementedError
200
201
            # create dictionary mapping alternate HPO IDs to current HPO IDs
202
            alt_to_curr_hpo_path = Path(config.PREPROCESS_PATH / f'alt_to_curr_hpo_dict_{hpo_source_year}.pkl')
203
            if alt_to_curr_hpo_path.exists():
204
                with open(str(alt_to_curr_hpo_path), 'rb') as f:
205
                    self.alt_to_curr_hpo_dict = pickle.load(f)
206
            else:
207
                self.alt_to_curr_hpo_dict = {}
208
                for node_id in self.hpo_ontology.nodes():
209
                    node_dict = self.hpo_ontology.nodes[node_id]
210
                    if 'alt_id' in node_dict:
211
                        for alt_id in node_dict['alt_id']:
212
                            self.alt_to_curr_hpo_dict[alt_id] = node_id
213
                with open(str(alt_to_curr_hpo_path), 'wb') as f:
214
                    pickle.dump(self.alt_to_curr_hpo_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
215
                    
216
217
218
            #map from outdated HPO codes to codes in 2019 hierarchy
219
            # Not in the 2015 or 2019 HPO
220
            # HP:0500014 (Abnormal test result) -> None, throw out
221
            # HP:0040290 (Abnormality of skeletal muscles) -> 'HP:0003011'
222
            # HP:0030963 (obsolete Abnormal aortic morphology) -> 'HP:0001679'
223
            # HP:0030971 (obsolete Abnormal vena cava morphology) -> 'HP:0005345'
224
225
            # in 2015 HPO, but not 2019. Parent is also not in 2019
226
            # HP:0005111 (Dilatation of the ascending aorta) -> 'HP:0005128'
227
            # HP:0001146 (Pigmentary retinal degeneration) -> 'HP:0000580'
228
            # 'HP:0007757' (choroid hypoplasia) -> 'HP:0000610' (Abnormal choroid morphology)
229
            # HP:0009620 (obsolete Radial deviation of the thumb) -> HP:0040021 (Radial deviation of the thumb)
230
            # HP:0009448 (obsolete Aplasia of the phalanges of the 3rd finger) -> HP:0009447 (Aplasia/Hypoplasia of the phalanges of the 3rd finger)
231
            # HP:0004110 (obsolete Radially deviated index finger phalanges) -> HP:0009467 (Radial deviation of the 2nd finger)
232
            # HP:3000026 obsolete Abnormality of common carotid artery plus branches -> HP:0430021 Abnormal common carotid artery morphology 
233
            self.OLD_TO_2019_HPO_DICT = {'HP:0040290':'HP:0003011', 'HP:0030963':'HP:0001679', 
234
                        'HP:0030971':'HP:0005345', 'HP:0005111':'HP:0004970',
235
                        'HP:0001146':'HP:0000580', 'HP:0100637':'HP:0012720', 
236
                        'HP:0007757':'HP:0000610', 'HP:0009620': 'HP:0040021',
237
                        'HP:0009448':'HP:0009447', 'HP:0004110': 'HP:0009467',
238
                        'HP:3000026': 'HP:0430021'}
239
240
        if process_genes:
241
            logging.info('Initializing Gene Mappings....')
242
243
            # manually created from gene cards
244
            self.genecards_alias_dict = {'RARS':'RARS1', 'C11ORF80':'C11orf80', 'KIF1BP':'KIFBP', 'ICK':'CILK1', 'AARS':'AARS1', 'SPG23':'DSTYK', 
245
                            'C9ORF72':'C9orf72', 'C8ORF37':'C8orf37', 'HARS':'HARS1', 'GARS':'GARS1', 'C19ORF12':'C19orf12', 
246
                            'C12ORF57':'C12orf57', 'ADSSL1':'ADSS1', 'C12ORF65':'C12orf65', 'MARS':'MARS1', 'CXORF56':'STEEP1', 
247
                            'SARS':'SARS1', 'C12ORF4':'C12orf4', 'MUT':'MMUT', 'LOR':'LORICRIN'}
248
249
            self.miscapitalized_gene_dict = {'C10ORF10':'C10orf10','C21ORF59':'C21orf59', 'C4ORF26':'C4orf26', 'C9ORF72':'C9orf72', 'C8ORF37':'C8orf37', 'CXORF56':'CXorf56', 'C12ORF4':'C12orf4', 'C2ORF43':'C2orf43',
250
                            'C11ORF80':'C11orf80', 'C19ORF12': 'C19orf12', 'C12ORF57': 'C12orf57', 'C17ORF96':'C17orf96', 'C19ORF68':'C19orf68', 'C19ORF10':'C19orf10', 'C9ORF3':'C9orf3', 'C11ORF73':'C11orf73',
251
                            'C12ORF65':'C12orf65', 'C5ORF42': 'C5orf42', 'C2ORF71': 'C2orf71', 'C10ORF2': 'C10orf2', 'mTOR': 'MTOR', 'C12ORF55':'C12orf55', 'C19ORF18':'C19orf18', 'C12ORF66':'C12orf66', 'C5ORF63':'C5orf63',
252
                            'C11ORF95': 'C11orf95', 'C15ORF41': 'C15orf41', 'C16ORF57': 'C16orf57', 'C20orff78': 'C20orf78', 'trnM(cau)': 'trnM-CAU', 'C10ORF82':'C10orf82', 'CXORF27':'CXorf27'}
253
            self.misc_gene_to_ensembl_map = {'RP13-996F3.4':'ENSG00000259243', 'NRXN1':'ENSG00000179915', 'LOC401010':'ENSG00000217950',
254
                            'TXNB': 'ENSG00000168477', 'TREF1': 'ENSG00000124496', 'DLCRE1C': 'ENSG00000152457',
255
                            'CDC37L1C':'ENSG00000106993', 'UNQ2560':'ENSG00000145476', 'COLA11A2':'ENSG00000204248',
256
                            'SL12A2':'ENSG00000064651', 'TNFRS1B': 'ENSG00000028137', 'HUWE': 'ENSG00000086758', 
257
                            'PARRC2C': 'ENSG00000117523', 'AK055785':'ENSG00000153443', 'TRNC18':'ENSG00000182095',
258
                            'ZYFVE26':'ENSG00000072121', 'DPMI':'ENSG00000000419', 'SRD53A':'ENSG00000128039',
259
                            'VLDR':'ENSG00000147852', 'MENT1':'ENSG00000112759', 'DRPL-A':'ENSG00000111676',
260
                            'FRDA1':'ENSG00000165060', 'A1640G':'ENSG00000163554', 'GS1-541M1.1':'ENSG00000220216',
261
                            'MT-TT':'ENSG00000210195', '4576': 'ENSG00000210195', 'ZMYDN12': 'ENSG00000066185', 'NRNX1':'ENSG00000179915',
262
                            'NPNP4': 'ENSG00000131697', 'STEEP1': 'ENSG00000018610', 'ENSG00000288642':'ENSG00000288642'
263
                            } 
264
            logging.info('Reading hgnc mappings....')
265
            self.read_hgnc_mappings()
266
            logging.info('Reading biomart mappings....')
267
            self.read_biomart_mappings()
268
            logging.info('Reading shilpa mappings....')
269
            self.read_shilpa_mappings()
270
            logging.info('Reading ncbi mappings....')
271
            self.read_ncbi_mappings()
272
273
274
            # # get list of withdrawn hgnc symbols
275
            self.withdrawn_hgnc = pd.read_csv(config.HGNC_PATH / "withdrawn-12-31-19.tsv", sep='\t')
276
277
            logging.info('Retrieving unique lists of genes....')
278
279
            self.all_unique_ensembl_ids = self.get_unique_ensembl_ids()
280
            self.all_gene_symbols = self.get_gene_symbols()
281
            self.all_ncbi_ids = self.get_ncbi_ids()
282
          
283
            # remove entries that map to NULL
284
            self.hgnc = self.hgnc.loc[~pd.isnull(self.hgnc['ensembl_gene_id'])]
285
            self.ensembl_biomart_mapping = self.ensembl_biomart_mapping.loc[~pd.isnull(self.ensembl_biomart_mapping['Gene stable ID'])]
286
            self.ensembl_to_hgnc_shilpa = self.ensembl_to_hgnc_shilpa.loc[~pd.isnull(self.ensembl_to_hgnc_shilpa['ensembl_gene_id'])]
287
            self.hgnc_biomart = self.hgnc_biomart.loc[~pd.isnull(self.hgnc_biomart['Ensembl gene ID'])]
288
            self.gene2ensembl = self.gene2ensembl.loc[~pd.isnull(self.gene2ensembl['Ensembl_gene_identifier'])]
289
290
    def map_phenotype_to_hpo(self, hpo_id) -> str:
291
        '''
292
        HP:0500014 : abnormal test result -> can't map to anything
293
294
        '''
295
        # if the hpo is already in 2019 HPO, return it
296
        if hpo_id in self.hpo_ontology.nodes():
297
            return hpo_id
298
        # else map alternate hpo ids to the current version
299
        elif hpo_id in self.alt_to_curr_hpo_dict:
300
            return self.alt_to_curr_hpo_dict[hpo_id]
301
        elif hpo_id in self.OLD_TO_2019_HPO_DICT: #TODO: generalize beyond 2019 hpo dict
302
            return self.OLD_TO_2019_HPO_DICT[hpo_id]
303
        
304
        # if all else fails, map the hpo to its parents & return that
305
        else:
306
            for ontology in [self.hpo_2015, self.hpo_2019, self.hpo_2020, self.hpo_2021]:
307
                if hpo_id in ontology.nodes():
308
                    ancestors  = list(ontology.successors(hpo_id))
309
                    for ancestor in ancestors:
310
                        if ancestor in self.hpo_ontology.nodes():
311
                            return ancestor
312
                    logging.error(f'{hpo_id} can not be converted to its equivalent in the 2019 HPO hierarchy')
313
                    return hpo_id
314
315
            logging.error(f'{hpo_id} can not be converted to its equivalent in the 2019 HPO hierarchy')
316
            return hpo_id
317
318
    def convert_gene_to_ensembl_helper(self, g, log=False):
319
        #convert gene symbols to ensembl IDs
320
321
322
323
        if g in self.all_unique_ensembl_ids:
324
            return g
325
326
        # HGNC
327
        if g in self.hgnc['symbol'].tolist():
328
            return self.hgnc.loc[self.hgnc['symbol'] == g, 'ensembl_gene_id'].iloc[0]
329
        if g in self.hgnc['entrez_id'].tolist():
330
            return self.hgnc.loc[self.hgnc['entrez_id'] == g, 'ensembl_gene_id'].iloc[0]
331
332
        # ENSEMBL BIOMART
333
        if g in self.ensembl_biomart_mapping['HGNC symbol'].tolist():
334
            return self.ensembl_biomart_mapping.loc[self.ensembl_biomart_mapping['HGNC symbol'] == g, 'Gene stable ID'].iloc[0]
335
        if g in self.ensembl_biomart_mapping['Gene name'].tolist():
336
            return self.ensembl_biomart_mapping.loc[self.ensembl_biomart_mapping['Gene name'] == g, 'Gene stable ID'].iloc[0]
337
        if g in self.ensembl_biomart_mapping['NCBI gene ID'].tolist():
338
            return self.ensembl_biomart_mapping.loc[self.ensembl_biomart_mapping['NCBI gene ID'] == g, 'Gene stable ID'].iloc[0]   
339
        
340
        # SHILPA
341
        if g in self.ensembl_to_hgnc_shilpa['primary_gene_names'].tolist():
342
            return self.ensembl_to_hgnc_shilpa.loc[self.ensembl_to_hgnc_shilpa['primary_gene_names'] == g, 'ensembl_gene_id'].iloc[0]
343
344
        # HGNC BIOMART
345
        if g in self.hgnc_biomart['Approved symbol'].tolist():
346
            return self.hgnc_biomart.loc[self.hgnc_biomart['Approved symbol'] == g, 'Ensembl gene ID'].iloc[0]
347
        if g in self.hgnc_biomart['Alias symbol'].tolist():
348
            return self.hgnc_biomart.loc[self.hgnc_biomart['Alias symbol'] == g, 'Ensembl gene ID'].iloc[0]
349
        if g in self.hgnc_biomart['Previous symbol'].tolist():
350
            return self.hgnc_biomart.loc[self.hgnc_biomart['Previous symbol'] == g, 'Ensembl gene ID'].iloc[0]
351
        if g in self.hgnc_biomart['NCBI gene ID'].tolist():
352
            return self.hgnc_biomart.loc[self.hgnc_biomart['NCBI gene ID'] == g, 'Ensembl gene ID'].iloc[0]
353
354
        # Shilpa's mapping from synonyms to Ensembl IDs
355
        if g in self.synonym_to_ensembl_dict:
356
            return self.synonym_to_ensembl_dict[g]
357
358
        if g in self.gene2ensembl['GeneID'].tolist(): #NCBI gene ID to Ensembl ID
359
            return self.gene2ensembl.loc[self.gene2ensembl['GeneID'] == g,'Ensembl_gene_identifier'].iloc[0]
360
361
        if g in self.misc_gene_to_ensembl_map:
362
            return self.misc_gene_to_ensembl_map[g]
363
        else:
364
            return None
365
366
    def map_gene_to_ensembl_id(self, g, log=False):
367
        # check if null or empty string
368
        if pd.isnull(g) or g == '':
369
            return None, 'did not map' 
370
371
        # special case where two ensembl IDs are synonymous
372
        if g == 'ENSG00000262919':
373
            return 'ENSG00000147382', 'mapped to ensembl'
374
375
        # TODO: how do we know the ensembl IDs themselves aren't expired / out of date
376
        # check if already is an ensembl ID
377
        if g in self.all_unique_ensembl_ids:
378
            return g, 'mapped to ensembl'
379
380
        if g in self.misc_gene_to_ensembl_map:
381
            return self.misc_gene_to_ensembl_map[g], 'mapped to ensembl'
382
383
        # convert to hgnc alias
384
        if g in self.genecards_alias_dict:
385
            g = self.genecards_alias_dict[g]
386
387
        # fix miscapitalization issues
388
        if g in self.miscapitalized_gene_dict:
389
            g = self.miscapitalized_gene_dict[g]
390
391
        
392
393
        g = re.sub(r'([0-9]|X)(ORF)([0-9])', r'\1orf\3', g)
394
395
        
396
        # first look if the original gene name has a mapping to ensembl IDs
397
        new_g = self.convert_gene_to_ensembl_helper(g, log)
398
        
399
        # if not, convert previous/alias symbols to their current ones & check if that has mapping to ensembl ID
400
        if g in self.prev_to_curr_dict and not new_g:
401
            new_g =  self.convert_gene_to_ensembl_helper(self.prev_to_curr_dict[g], log)
402
        if g in self.alias_to_curr_dict and not new_g:
403
            new_g =  self.convert_gene_to_ensembl_helper(self.alias_to_curr_dict[g], log)
404
405
        # then look if any alias has a mapping to an Ensembl ID
406
        if g in self.curr_to_alias_dict and not new_g:
407
            for alias in self.curr_to_alias_dict[g]:
408
                new_g = self.convert_gene_to_ensembl_helper(alias, log)
409
                if new_g: break
410
        
411
        # finally look if a previous symbol has a mapping to an Ensembl ID
412
        if g in self.curr_to_prev_dict and not new_g:
413
            for prev in self.curr_to_prev_dict[g]:
414
                new_g = self.convert_gene_to_ensembl_helper(prev, log)
415
                if new_g: break
416
417
        # from NCBI: "Symbols beginning with LOC. When a published symbol 
418
        # is not available, and orthologs have not yet been determined, 
419
        # Gene will provide a symbol that is constructed as 'LOC' + the GeneID. 
420
        # This is not retained when a replacement symbol has been identified, 
421
        # although queries by the LOC term are still supported. In other words, 
422
        # a record with the symbol LOC12345 is equivalent to GeneID = 12345. 
423
        # So if the symbol changes, the record can still be retrieved on the web 
424
        # using LOC12345 as a query, or from any file using GeneID = 12345."
425
        if g.startswith('LOC') and not new_g:
426
            new_g = self.convert_gene_to_ensembl_helper(g.replace('LOC', ''), log)
427
428
        # map discontinued entrez symbols to ensembl
429
        if g in self.disc_symbol_to_geneid and not new_g: 
430
            new_g =  self.convert_gene_to_ensembl_helper(self.disc_symbol_to_geneid[g], log)
431
432
        # map discontinued entrez geneIDs to ensembl
433
        if g in self.disc_geneid_to_geneid and not new_g: 
434
            new_g =  self.convert_gene_to_ensembl_helper(self.disc_geneid_to_geneid[g], log)
435
    
436
        # # convert entrez gene id to symbol & see if there's a mapping to ensembl ID
437
        if g in self.geneid_to_entrez_symbol_dict and not new_g:
438
            new_g =  self.convert_gene_to_ensembl_helper(self.geneid_to_entrez_symbol_dict[g], log)
439
        if g in self.refseq_geneid_to_symbol_map and not new_g:
440
            new_g =  self.convert_gene_to_ensembl_helper(self.refseq_geneid_to_symbol_map[g], log)
441
        # # convert entrez symbol to entrez gene id & see if there's a mapping to ensembl ID
442
        if g in self.entrez_symbol_to_geneid_dict and not new_g:
443
            new_g =  self.convert_gene_to_ensembl_helper(self.entrez_symbol_to_geneid_dict[g], log)
444
        if g in self.refseq_symbol_to_geneid_map and not new_g:
445
            new_g =  self.convert_gene_to_ensembl_helper(self.refseq_symbol_to_geneid_map[g], log)
446
447
448
        # convert gene synonym to most common name (according to entrez gene_info)
449
        if g in self.synonym_to_entrez_symbol_dict and not new_g:
450
            new_g =  self.convert_gene_to_ensembl_helper(self.synonym_to_entrez_symbol_dict[g], log)
451
452
        if new_g:
453
            g = new_g
454
455
        if g == 'ENSG00000262919':
456
            return 'ENSG00000147382', 'mapped to ensembl'
457
458
459
        if g in self.all_unique_ensembl_ids:
460
            return g, 'mapped to ensembl'
461
        
462
        else: # we return the original gene (mapped to a current symbol) if we're not able to convert it
463
464
            if not pd.isnull(g) and g.startswith('LOC'): g = g.replace('LOC', '')
465
            if g in self.prev_to_curr_dict: g = self.prev_to_curr_dict[g]
466
            if g in self.alias_to_curr_dict: g =  self.alias_to_curr_dict[g]
467
468
            if g in self.disc_symbol_to_geneid: g = self.disc_symbol_to_geneid[g]
469
            if g in self.disc_geneid_to_geneid: g = self.disc_geneid_to_geneid[g]
470
            if g in self.entrez_symbol_to_geneid_dict: g = self.entrez_symbol_to_geneid_dict[g]
471
472
            if g in self.synonym_to_entrez_symbol_dict: g = self.synonym_to_entrez_symbol_dict[g]
473
            if not pd.isnull(g) and g.startswith('LOC'): g = g.replace('LOC', '')
474
475
        
476
            if g in self.all_gene_symbols:
477
                # if log:
478
                #     logging.error(f'There is likely a gene symbol, but no Ensembl ID for gene: {g}')
479
                return g, 'mapped to symbol/ncbi'
480
            # elif g in self.withdrawn_hgnc['WITHDRAWN_SYMBOL'].tolist():
481
            #     logging.error(f'The following gene symbol has been withdrawn: {g}')
482
            elif g in self.all_ncbi_ids:
483
                # if log:
484
                #     logging.error(f'There is likely a NCBI gene ID, but no Ensembl ID for gene: {g}')
485
                return g, 'mapped to symbol/ncbi'
486
            else:
487
                if log:
488
                    logging.error(f'The following gene can not be converted to an Ensembl ID: {g}')
489
                return g, 'did not map'
490
491
    def map_phenotypes(self, df, col_name = 'HPO_ID'):
492
        hpo_map = {p:self.map_phenotype_to_hpo(p) for p in df[col_name].unique()}
493
        df['standardized_hpo_id'] = df[col_name].replace(hpo_map) #TODO: generalize this so it's not always 2019
494
        assert len(df.loc[pd.isnull(df['standardized_hpo_id'])].index) == 0
495
        return df
496
497
    def map_genes(self, df, col_names, log=True, n_processes=4):
498
        genes_to_map = list(set([g for col_name in col_names for g in df[col_name].tolist()]))
499
500
        gene_to_ensembl_map = {}
501
        gene_to_status_map = {}
502
        for g in genes_to_map:
503
            ensembl_id, mapped_status = self.map_gene_to_ensembl_id(g, log=log)
504
            gene_to_ensembl_map[g] = ensembl_id
505
            gene_to_status_map[g] = mapped_status
506
507
        for col_name in col_names:
508
            df[(col_name + '_ensembl')] = df[col_name].replace(gene_to_ensembl_map)
509
            df[(col_name + '_mapping_status')] = df[col_name].replace(gene_to_status_map)
510
        return df
511
512
513
514
if __name__ == "__main__":
515
    pass