|
a |
|
b/data_prep/preprocess.py |
|
|
1 |
|
|
|
2 |
import sys |
|
|
3 |
import obonet |
|
|
4 |
import networkx |
|
|
5 |
import re |
|
|
6 |
import time |
|
|
7 |
import numpy as np |
|
|
8 |
import pandas as pd |
|
|
9 |
from pathlib import Path |
|
|
10 |
from multiprocessing import Pool |
|
|
11 |
sys.path.insert(0, '..') # add project_config to path |
|
|
12 |
import project_config as config |
|
|
13 |
import logging |
|
|
14 |
import pickle |
|
|
15 |
logging.basicConfig(level=logging.INFO) |
|
|
16 |
############# |
|
|
17 |
# SOURCES |
|
|
18 |
# HGNC (DOWNLOAD) - https://www.genenames.org/download/custom/ |
|
|
19 |
# ENSEMBL (BIOMART) - http://useast.ensembl.org/biomart/martview/b11081b5e1087424087a575e2792953e |
|
|
20 |
# biomart_ensembl_ncbi_map |
|
|
21 |
# NCBI (FTP) - https://ftp.ncbi.nih.gov/gene/DATA/ |
|
|
22 |
# gene info |
|
|
23 |
# gene history |
|
|
24 |
# gene2refseq |
|
|
25 |
# gene2ensembl |
|
|
26 |
|
|
|
27 |
|
|
|
28 |
|
|
|
29 |
class Preprocessor(): |
|
|
30 |
|
|
|
31 |
def write_pkl(self, d, filename): |
|
|
32 |
file_path = Path(config.PREPROCESS_PATH / filename) |
|
|
33 |
with open(str(file_path), 'wb') as f: |
|
|
34 |
pickle.dump(d, f, protocol=pickle.HIGHEST_PROTOCOL) |
|
|
35 |
|
|
|
36 |
def load_pkl(self, filename): |
|
|
37 |
file_path = Path(config.PREPROCESS_PATH / filename) |
|
|
38 |
with open(str(file_path), 'rb') as f: |
|
|
39 |
return pickle.load(f) |
|
|
40 |
|
|
|
41 |
|
|
|
42 |
def read_hgnc_mappings(self): |
|
|
43 |
# Read in hgnc data |
|
|
44 |
hgnc = pd.read_csv(config.HGNC_PATH / "hgnc_complete_set-12-31-19.tsv", sep='\t', dtype=str) |
|
|
45 |
# hgnc = hgnc.query("symbol != 'LINC00856'") |
|
|
46 |
hgnc = hgnc[hgnc.symbol.notnull()] |
|
|
47 |
hgnc.loc[hgnc.alias_symbol == 'WISP-3', 'alias_symbol'] = 'WISP3' #other sources don't have a dash in the gene name |
|
|
48 |
self.hgnc = hgnc |
|
|
49 |
|
|
|
50 |
self.hgnc_biomart = pd.read_csv(config.HGNC_PATH / "hgnc_biomart_hgnc_ensembl_ncbi.txt", sep='\t', dtype=str).drop(columns=['Previous name']).drop_duplicates() |
|
|
51 |
|
|
|
52 |
|
|
|
53 |
if Path(config.PREPROCESS_PATH / 'prev_to_curr_dict.pkl').exists(): |
|
|
54 |
self.prev_to_curr_dict = self.load_pkl('prev_to_curr_dict.pkl') |
|
|
55 |
self.curr_to_prev_dict = self.load_pkl('curr_to_prev_dict.pkl') |
|
|
56 |
self.alias_to_curr_dict = self.load_pkl('alias_to_curr_dict.pkl') |
|
|
57 |
self.curr_to_alias_dict = self.load_pkl('curr_to_alias_dict.pkl') |
|
|
58 |
else: |
|
|
59 |
alias_genes = hgnc['alias_symbol'].str.split('|').tolist() |
|
|
60 |
hgnc_no_null_prev = hgnc.loc[~pd.isnull(hgnc.prev_symbol)] |
|
|
61 |
# map from previous gene symbol to current gene symbol |
|
|
62 |
self.prev_to_curr_dict = {p:symbol for symbol, prev_symbols in zip(hgnc_no_null_prev.symbol, hgnc_no_null_prev.prev_symbol) for p in prev_symbols.split('|')} |
|
|
63 |
self.write_pkl(self.prev_to_curr_dict, 'prev_to_curr_dict.pkl') |
|
|
64 |
# map from current gene symbol to previous gene symbol |
|
|
65 |
self.curr_to_prev_dict = {symbol:prev_symbols.split('|') for symbol, prev_symbols in zip(hgnc_no_null_prev.symbol, hgnc_no_null_prev.prev_symbol) } |
|
|
66 |
self.write_pkl(self.curr_to_prev_dict, 'curr_to_prev_dict.pkl') |
|
|
67 |
|
|
|
68 |
hgnc_no_null_alias = hgnc.loc[~pd.isnull(hgnc.alias_symbol)] |
|
|
69 |
# map from alias gene symbol to current gene symbol |
|
|
70 |
self.alias_to_curr_dict = {a:symbol for symbol, alias_symbols in zip(hgnc_no_null_alias.symbol, hgnc_no_null_alias.alias_symbol) for a in alias_symbols.split('|')} |
|
|
71 |
self.write_pkl(self.alias_to_curr_dict, 'alias_to_curr_dict.pkl') |
|
|
72 |
# map from current gene symbol to alias gene symbol |
|
|
73 |
self.curr_to_alias_dict = {symbol:alias_symbols.split('|') for symbol, alias_symbols in zip(hgnc_no_null_alias.symbol, hgnc_no_null_alias.alias_symbol) } |
|
|
74 |
self.write_pkl(self.curr_to_alias_dict, 'curr_to_alias_dict.pkl') |
|
|
75 |
|
|
|
76 |
def read_shilpa_mappings(self): |
|
|
77 |
self.ensembl_to_hgnc_shilpa = pd.read_csv(config.HGNC_PATH / "GRCh38_ensembl_gene_list_from_shilpa.tsv", sep='\t', skiprows=4, dtype=str) |
|
|
78 |
if Path(config.PREPROCESS_PATH / 'synonym_to_ensembl_dict.pkl').exists(): |
|
|
79 |
self.synonym_to_ensembl_dict = self.load_pkl('synonym_to_ensembl_dict.pkl') |
|
|
80 |
else: |
|
|
81 |
# map gene synonyms to ensembl ids using Shilpa's curated mapping |
|
|
82 |
synonym_genes = self.ensembl_to_hgnc_shilpa['gene_synonyms'].str.split(',').tolist() |
|
|
83 |
ensembl_ids = self.ensembl_to_hgnc_shilpa['ensembl_gene_id'].tolist() |
|
|
84 |
self.synonym_to_ensembl_dict = {s.split('.')[0]:ensembl for synonyms, ensembl in zip(synonym_genes, ensembl_ids) for s in synonyms} |
|
|
85 |
self.write_pkl(self.synonym_to_ensembl_dict, 'synonym_to_ensembl_dict.pkl') |
|
|
86 |
|
|
|
87 |
def read_biomart_mappings(self): |
|
|
88 |
#hgnc-ensembl-ncbi mappings from Ensembl Biomart |
|
|
89 |
if Path(config.BIOMART_PATH / 'combined_ensembl_biomart.txt').exists(): |
|
|
90 |
self.ensembl_biomart_mapping = pd.read_csv(config.BIOMART_PATH / 'combined_ensembl_biomart.txt', sep='\t', dtype=str) |
|
|
91 |
#self.entrez_to_ensembl_dict = self.load_pkl('entrez_to_ensembl_dict.pkl') |
|
|
92 |
|
|
|
93 |
else: |
|
|
94 |
ensembl_to_hgnc = pd.read_csv(config.HGNC_PATH / "ensembl_to_hgnc_map.txt", sep='\t').drop(columns=['HGNC ID', 'EntrezGene ID']) #unknown origin |
|
|
95 |
ensembl_to_hgnc['NCBI gene ID'] = None |
|
|
96 |
biomart_mapping = pd.read_csv(config.BIOMART_PATH / 'biomart_ensembl_genesymb_entrez.txt', delimiter = '\t', dtype=str) |
|
|
97 |
biomart_mapping = biomart_mapping.drop(columns=['Transcript stable ID', 'Transcript stable ID version']).drop_duplicates() |
|
|
98 |
|
|
|
99 |
# combine two mappings |
|
|
100 |
all_biomart_mapping = pd.concat([ensembl_to_hgnc, biomart_mapping], sort=False).drop(columns=['Gene stable ID version']) |
|
|
101 |
all_biomart_mapping = all_biomart_mapping.drop_duplicates() |
|
|
102 |
self.ensembl_biomart_mapping = all_biomart_mapping.sort_values(['NCBI gene ID'], na_position='last').drop_duplicates(subset=['Gene stable ID', 'Gene name', 'HGNC symbol'], keep='first') |
|
|
103 |
self.ensembl_biomart_mapping['NCBI gene ID'] = self.ensembl_biomart_mapping['NCBI gene ID'].astype(str) |
|
|
104 |
self.ensembl_biomart_mapping.to_csv(config.BIOMART_PATH / 'combined_ensembl_biomart.txt', sep='\t', index=False) |
|
|
105 |
|
|
|
106 |
def read_ncbi_mappings(self): |
|
|
107 |
self.gene2ensembl = pd.read_csv(config.NCBI_PATH / 'gene2ensembl', sep='\t', dtype='str') |
|
|
108 |
|
|
|
109 |
|
|
|
110 |
if Path(config.PREPROCESS_PATH / 'disc_symbol_to_geneid.pkl').exists(): |
|
|
111 |
self.disc_symbol_to_geneid = self.load_pkl('disc_symbol_to_geneid.pkl') |
|
|
112 |
|
|
|
113 |
self.disc_geneid_to_geneid = self.load_pkl('disc_geneid_to_geneid.pkl') |
|
|
114 |
|
|
|
115 |
self.synonym_to_entrez_symbol_dict = self.load_pkl('synonym_to_entrez_symbol_dict.pkl') |
|
|
116 |
self.geneid_to_entrez_symbol_dict = self.load_pkl('geneid_to_entrez_symbol_dict.pkl') |
|
|
117 |
self.entrez_symbol_to_geneid_dict = self.load_pkl('entrez_symbol_to_geneid_dict.pkl') |
|
|
118 |
self.refseq_geneid_to_symbol_map = self.load_pkl('refseq_geneid_to_symbol_map.pkl') |
|
|
119 |
self.refseq_symbol_to_geneid_map = self.load_pkl('refseq_symbol_to_geneid_map.pkl') |
|
|
120 |
|
|
|
121 |
else: |
|
|
122 |
# NCBI GENEID HISTORY |
|
|
123 |
gene_history = pd.read_csv(config.NCBI_PATH / 'gene_history', sep='\t', dtype='str') |
|
|
124 |
# map from discontinued symbol to current NCBI gene ID |
|
|
125 |
self.disc_symbol_to_geneid = {disc_symbol:geneid for disc_symbol, geneid in zip(gene_history['Discontinued_Symbol'],gene_history['GeneID']) if geneid != '-'} |
|
|
126 |
self.write_pkl(self.disc_symbol_to_geneid, 'disc_symbol_to_geneid.pkl') |
|
|
127 |
# map from discontinued NCBI gene ID to current NCBI gene ID |
|
|
128 |
self.disc_geneid_to_geneid = {disc_geneid:geneid for disc_geneid, geneid in zip(gene_history['Discontinued_GeneID'],gene_history['GeneID'] ) if geneid != '-'} |
|
|
129 |
self.write_pkl(self.disc_geneid_to_geneid, 'disc_geneid_to_geneid.pkl') |
|
|
130 |
|
|
|
131 |
|
|
|
132 |
# GENE INFO |
|
|
133 |
# map from synonym to gene symbol |
|
|
134 |
self.gene_info = pd.read_csv(config.NCBI_PATH / 'gene_info', sep='\t', dtype = 'str') |
|
|
135 |
gene_info_with_synonyms = self.gene_info.loc[~pd.isnull(self.gene_info.Synonyms)] |
|
|
136 |
self.synonym_to_entrez_symbol_dict = {s:symbol for symbol, synonyms in zip(gene_info_with_synonyms.Symbol, gene_info_with_synonyms.Synonyms) for s in synonyms.split('|')} |
|
|
137 |
self.write_pkl(self.synonym_to_entrez_symbol_dict, 'synonym_to_entrez_symbol_dict.pkl') |
|
|
138 |
|
|
|
139 |
# map from NCBI gene ID to Gene Symbol |
|
|
140 |
self.geneid_to_entrez_symbol_dict = {geneid:symbol for geneid, symbol in zip(self.gene_info.GeneID, self.gene_info.Symbol) } |
|
|
141 |
self.write_pkl(self.geneid_to_entrez_symbol_dict, 'geneid_to_entrez_symbol_dict.pkl') |
|
|
142 |
|
|
|
143 |
self.entrez_symbol_to_geneid_dict = {symbol:geneid for geneid, symbol in zip(gene_info.GeneID, gene_info.Symbol) } |
|
|
144 |
self.write_pkl(self.entrez_symbol_to_geneid_dict, 'entrez_symbol_to_geneid_dict.pkl') |
|
|
145 |
|
|
|
146 |
# # GENE2 REFSEQ |
|
|
147 |
gene2refseq = pd.read_csv(config.NCBI_PATH / 'gene2refseq', sep='\t', dtype='str') |
|
|
148 |
# map from NCBI gene id to symbol |
|
|
149 |
self.refseq_geneid_to_symbol_map = {gene_id:symbol for gene_id, symbol in zip(gene2refseq['GeneID'], gene2refseq['Symbol'])} |
|
|
150 |
self.write_pkl(self.refseq_geneid_to_symbol_map, 'refseq_geneid_to_symbol_map.pkl') |
|
|
151 |
|
|
|
152 |
# map from symbol to NCBI gene ID |
|
|
153 |
self.refseq_symbol_to_geneid_map = {symbol:gene_id for gene_id, symbol in zip(gene2refseq['GeneID'], gene2refseq['Symbol'])} |
|
|
154 |
self.write_pkl(self.refseq_symbol_to_geneid_map, 'refseq_symbol_to_geneid_map.pkl') |
|
|
155 |
|
|
|
156 |
def get_unique_ensembl_ids(self): |
|
|
157 |
ensembl_ids = self.hgnc['ensembl_gene_id'].tolist() + self.hgnc_biomart['Ensembl gene ID'].tolist() \ |
|
|
158 |
+ self.ensembl_biomart_mapping['Gene stable ID'].tolist() \ |
|
|
159 |
+ self.ensembl_to_hgnc_shilpa['ensembl_gene_id'].tolist() |
|
|
160 |
ensembl_ids = set(ensembl_ids) |
|
|
161 |
return [e for e in ensembl_ids if not pd.isnull(e)] |
|
|
162 |
|
|
|
163 |
def get_gene_symbols(self): |
|
|
164 |
symbols = self.hgnc['symbol'].unique().tolist() + self.hgnc_biomart['Approved symbol'].tolist() |
|
|
165 |
symbols = set(symbols) |
|
|
166 |
return [s for s in symbols if not pd.isnull(s)] |
|
|
167 |
|
|
|
168 |
def get_ncbi_ids(self): |
|
|
169 |
#TODO: make sure this works |
|
|
170 |
if Path(config.NCBI_PATH / 'all_ncbi_ids.txt').exists(): |
|
|
171 |
ncbi = pd.read_csv(config.NCBI_PATH / 'all_ncbi_ids.txt', sep='\t', dtype=str) |
|
|
172 |
|
|
|
173 |
else: |
|
|
174 |
ncbi = self.hgnc_biomart['NCBI gene ID'].tolist() + self.ensembl_biomart_mapping['NCBI gene ID'].tolist() + self.gene_info['GeneID'].tolist() |
|
|
175 |
ncbi = set(ncbi) |
|
|
176 |
ncbi = pd.DataFrame({'ids':[s for s in ncbi if not pd.isnull(s)]}) |
|
|
177 |
ncbi['ids'] = ncbi['ids'].astype(str) |
|
|
178 |
ncbi.to_csv(config.NCBI_PATH / 'all_ncbi_ids.txt', sep='\t', index=False) |
|
|
179 |
return ncbi['ids'].tolist() |
|
|
180 |
|
|
|
181 |
|
|
|
182 |
def __init__(self, process_phenotypes=True, process_genes=True, hpo_source_year='2019'): |
|
|
183 |
if process_phenotypes: |
|
|
184 |
# read in hpo hierarchy |
|
|
185 |
HPO_2015_DIR = config.UDN_DATA / 'hpo' / 'Jan_2015' |
|
|
186 |
HPO_2019_DIR = config.UDN_DATA / 'hpo' / '2019' |
|
|
187 |
HPO_2020_DIR = config.UDN_DATA / 'hpo' / '8_2020' |
|
|
188 |
HPO_2021_DIR = config.UDN_DATA / 'hpo' / '2021' |
|
|
189 |
|
|
|
190 |
self.hpo_2019 = obonet.read_obo(HPO_2019_DIR / 'hp.obo') |
|
|
191 |
self.hpo_2015 = obonet.read_obo(HPO_2015_DIR / 'hp.obo') |
|
|
192 |
self.hpo_2020 = obonet.read_obo(HPO_2020_DIR / 'hp.obo') |
|
|
193 |
self.hpo_2021 = obonet.read_obo(HPO_2021_DIR / 'hp.obo') |
|
|
194 |
|
|
|
195 |
self.hpo_source_year = hpo_source_year |
|
|
196 |
if hpo_source_year == '2019': self.hpo_ontology = self.hpo_2019 |
|
|
197 |
elif hpo_source_year == '2020': self.hpo_ontology = self.hpo_2020 |
|
|
198 |
elif hpo_source_year == '2021': self.hpo_ontology = self.hpo_2021 |
|
|
199 |
else: raise NotImplementedError |
|
|
200 |
|
|
|
201 |
# create dictionary mapping alternate HPO IDs to current HPO IDs |
|
|
202 |
alt_to_curr_hpo_path = Path(config.PREPROCESS_PATH / f'alt_to_curr_hpo_dict_{hpo_source_year}.pkl') |
|
|
203 |
if alt_to_curr_hpo_path.exists(): |
|
|
204 |
with open(str(alt_to_curr_hpo_path), 'rb') as f: |
|
|
205 |
self.alt_to_curr_hpo_dict = pickle.load(f) |
|
|
206 |
else: |
|
|
207 |
self.alt_to_curr_hpo_dict = {} |
|
|
208 |
for node_id in self.hpo_ontology.nodes(): |
|
|
209 |
node_dict = self.hpo_ontology.nodes[node_id] |
|
|
210 |
if 'alt_id' in node_dict: |
|
|
211 |
for alt_id in node_dict['alt_id']: |
|
|
212 |
self.alt_to_curr_hpo_dict[alt_id] = node_id |
|
|
213 |
with open(str(alt_to_curr_hpo_path), 'wb') as f: |
|
|
214 |
pickle.dump(self.alt_to_curr_hpo_dict, f, protocol=pickle.HIGHEST_PROTOCOL) |
|
|
215 |
|
|
|
216 |
|
|
|
217 |
|
|
|
218 |
#map from outdated HPO codes to codes in 2019 hierarchy |
|
|
219 |
# Not in the 2015 or 2019 HPO |
|
|
220 |
# HP:0500014 (Abnormal test result) -> None, throw out |
|
|
221 |
# HP:0040290 (Abnormality of skeletal muscles) -> 'HP:0003011' |
|
|
222 |
# HP:0030963 (obsolete Abnormal aortic morphology) -> 'HP:0001679' |
|
|
223 |
# HP:0030971 (obsolete Abnormal vena cava morphology) -> 'HP:0005345' |
|
|
224 |
|
|
|
225 |
# in 2015 HPO, but not 2019. Parent is also not in 2019 |
|
|
226 |
# HP:0005111 (Dilatation of the ascending aorta) -> 'HP:0005128' |
|
|
227 |
# HP:0001146 (Pigmentary retinal degeneration) -> 'HP:0000580' |
|
|
228 |
# 'HP:0007757' (choroid hypoplasia) -> 'HP:0000610' (Abnormal choroid morphology) |
|
|
229 |
# HP:0009620 (obsolete Radial deviation of the thumb) -> HP:0040021 (Radial deviation of the thumb) |
|
|
230 |
# HP:0009448 (obsolete Aplasia of the phalanges of the 3rd finger) -> HP:0009447 (Aplasia/Hypoplasia of the phalanges of the 3rd finger) |
|
|
231 |
# HP:0004110 (obsolete Radially deviated index finger phalanges) -> HP:0009467 (Radial deviation of the 2nd finger) |
|
|
232 |
# HP:3000026 obsolete Abnormality of common carotid artery plus branches -> HP:0430021 Abnormal common carotid artery morphology |
|
|
233 |
self.OLD_TO_2019_HPO_DICT = {'HP:0040290':'HP:0003011', 'HP:0030963':'HP:0001679', |
|
|
234 |
'HP:0030971':'HP:0005345', 'HP:0005111':'HP:0004970', |
|
|
235 |
'HP:0001146':'HP:0000580', 'HP:0100637':'HP:0012720', |
|
|
236 |
'HP:0007757':'HP:0000610', 'HP:0009620': 'HP:0040021', |
|
|
237 |
'HP:0009448':'HP:0009447', 'HP:0004110': 'HP:0009467', |
|
|
238 |
'HP:3000026': 'HP:0430021'} |
|
|
239 |
|
|
|
240 |
if process_genes: |
|
|
241 |
logging.info('Initializing Gene Mappings....') |
|
|
242 |
|
|
|
243 |
# manually created from gene cards |
|
|
244 |
self.genecards_alias_dict = {'RARS':'RARS1', 'C11ORF80':'C11orf80', 'KIF1BP':'KIFBP', 'ICK':'CILK1', 'AARS':'AARS1', 'SPG23':'DSTYK', |
|
|
245 |
'C9ORF72':'C9orf72', 'C8ORF37':'C8orf37', 'HARS':'HARS1', 'GARS':'GARS1', 'C19ORF12':'C19orf12', |
|
|
246 |
'C12ORF57':'C12orf57', 'ADSSL1':'ADSS1', 'C12ORF65':'C12orf65', 'MARS':'MARS1', 'CXORF56':'STEEP1', |
|
|
247 |
'SARS':'SARS1', 'C12ORF4':'C12orf4', 'MUT':'MMUT', 'LOR':'LORICRIN'} |
|
|
248 |
|
|
|
249 |
self.miscapitalized_gene_dict = {'C10ORF10':'C10orf10','C21ORF59':'C21orf59', 'C4ORF26':'C4orf26', 'C9ORF72':'C9orf72', 'C8ORF37':'C8orf37', 'CXORF56':'CXorf56', 'C12ORF4':'C12orf4', 'C2ORF43':'C2orf43', |
|
|
250 |
'C11ORF80':'C11orf80', 'C19ORF12': 'C19orf12', 'C12ORF57': 'C12orf57', 'C17ORF96':'C17orf96', 'C19ORF68':'C19orf68', 'C19ORF10':'C19orf10', 'C9ORF3':'C9orf3', 'C11ORF73':'C11orf73', |
|
|
251 |
'C12ORF65':'C12orf65', 'C5ORF42': 'C5orf42', 'C2ORF71': 'C2orf71', 'C10ORF2': 'C10orf2', 'mTOR': 'MTOR', 'C12ORF55':'C12orf55', 'C19ORF18':'C19orf18', 'C12ORF66':'C12orf66', 'C5ORF63':'C5orf63', |
|
|
252 |
'C11ORF95': 'C11orf95', 'C15ORF41': 'C15orf41', 'C16ORF57': 'C16orf57', 'C20orff78': 'C20orf78', 'trnM(cau)': 'trnM-CAU', 'C10ORF82':'C10orf82', 'CXORF27':'CXorf27'} |
|
|
253 |
self.misc_gene_to_ensembl_map = {'RP13-996F3.4':'ENSG00000259243', 'NRXN1':'ENSG00000179915', 'LOC401010':'ENSG00000217950', |
|
|
254 |
'TXNB': 'ENSG00000168477', 'TREF1': 'ENSG00000124496', 'DLCRE1C': 'ENSG00000152457', |
|
|
255 |
'CDC37L1C':'ENSG00000106993', 'UNQ2560':'ENSG00000145476', 'COLA11A2':'ENSG00000204248', |
|
|
256 |
'SL12A2':'ENSG00000064651', 'TNFRS1B': 'ENSG00000028137', 'HUWE': 'ENSG00000086758', |
|
|
257 |
'PARRC2C': 'ENSG00000117523', 'AK055785':'ENSG00000153443', 'TRNC18':'ENSG00000182095', |
|
|
258 |
'ZYFVE26':'ENSG00000072121', 'DPMI':'ENSG00000000419', 'SRD53A':'ENSG00000128039', |
|
|
259 |
'VLDR':'ENSG00000147852', 'MENT1':'ENSG00000112759', 'DRPL-A':'ENSG00000111676', |
|
|
260 |
'FRDA1':'ENSG00000165060', 'A1640G':'ENSG00000163554', 'GS1-541M1.1':'ENSG00000220216', |
|
|
261 |
'MT-TT':'ENSG00000210195', '4576': 'ENSG00000210195', 'ZMYDN12': 'ENSG00000066185', 'NRNX1':'ENSG00000179915', |
|
|
262 |
'NPNP4': 'ENSG00000131697', 'STEEP1': 'ENSG00000018610', 'ENSG00000288642':'ENSG00000288642' |
|
|
263 |
} |
|
|
264 |
logging.info('Reading hgnc mappings....') |
|
|
265 |
self.read_hgnc_mappings() |
|
|
266 |
logging.info('Reading biomart mappings....') |
|
|
267 |
self.read_biomart_mappings() |
|
|
268 |
logging.info('Reading shilpa mappings....') |
|
|
269 |
self.read_shilpa_mappings() |
|
|
270 |
logging.info('Reading ncbi mappings....') |
|
|
271 |
self.read_ncbi_mappings() |
|
|
272 |
|
|
|
273 |
|
|
|
274 |
# # get list of withdrawn hgnc symbols |
|
|
275 |
self.withdrawn_hgnc = pd.read_csv(config.HGNC_PATH / "withdrawn-12-31-19.tsv", sep='\t') |
|
|
276 |
|
|
|
277 |
logging.info('Retrieving unique lists of genes....') |
|
|
278 |
|
|
|
279 |
self.all_unique_ensembl_ids = self.get_unique_ensembl_ids() |
|
|
280 |
self.all_gene_symbols = self.get_gene_symbols() |
|
|
281 |
self.all_ncbi_ids = self.get_ncbi_ids() |
|
|
282 |
|
|
|
283 |
# remove entries that map to NULL |
|
|
284 |
self.hgnc = self.hgnc.loc[~pd.isnull(self.hgnc['ensembl_gene_id'])] |
|
|
285 |
self.ensembl_biomart_mapping = self.ensembl_biomart_mapping.loc[~pd.isnull(self.ensembl_biomart_mapping['Gene stable ID'])] |
|
|
286 |
self.ensembl_to_hgnc_shilpa = self.ensembl_to_hgnc_shilpa.loc[~pd.isnull(self.ensembl_to_hgnc_shilpa['ensembl_gene_id'])] |
|
|
287 |
self.hgnc_biomart = self.hgnc_biomart.loc[~pd.isnull(self.hgnc_biomart['Ensembl gene ID'])] |
|
|
288 |
self.gene2ensembl = self.gene2ensembl.loc[~pd.isnull(self.gene2ensembl['Ensembl_gene_identifier'])] |
|
|
289 |
|
|
|
290 |
def map_phenotype_to_hpo(self, hpo_id) -> str: |
|
|
291 |
''' |
|
|
292 |
HP:0500014 : abnormal test result -> can't map to anything |
|
|
293 |
|
|
|
294 |
''' |
|
|
295 |
# if the hpo is already in 2019 HPO, return it |
|
|
296 |
if hpo_id in self.hpo_ontology.nodes(): |
|
|
297 |
return hpo_id |
|
|
298 |
# else map alternate hpo ids to the current version |
|
|
299 |
elif hpo_id in self.alt_to_curr_hpo_dict: |
|
|
300 |
return self.alt_to_curr_hpo_dict[hpo_id] |
|
|
301 |
elif hpo_id in self.OLD_TO_2019_HPO_DICT: #TODO: generalize beyond 2019 hpo dict |
|
|
302 |
return self.OLD_TO_2019_HPO_DICT[hpo_id] |
|
|
303 |
|
|
|
304 |
# if all else fails, map the hpo to its parents & return that |
|
|
305 |
else: |
|
|
306 |
for ontology in [self.hpo_2015, self.hpo_2019, self.hpo_2020, self.hpo_2021]: |
|
|
307 |
if hpo_id in ontology.nodes(): |
|
|
308 |
ancestors = list(ontology.successors(hpo_id)) |
|
|
309 |
for ancestor in ancestors: |
|
|
310 |
if ancestor in self.hpo_ontology.nodes(): |
|
|
311 |
return ancestor |
|
|
312 |
logging.error(f'{hpo_id} can not be converted to its equivalent in the 2019 HPO hierarchy') |
|
|
313 |
return hpo_id |
|
|
314 |
|
|
|
315 |
logging.error(f'{hpo_id} can not be converted to its equivalent in the 2019 HPO hierarchy') |
|
|
316 |
return hpo_id |
|
|
317 |
|
|
|
318 |
def convert_gene_to_ensembl_helper(self, g, log=False): |
|
|
319 |
#convert gene symbols to ensembl IDs |
|
|
320 |
|
|
|
321 |
|
|
|
322 |
|
|
|
323 |
if g in self.all_unique_ensembl_ids: |
|
|
324 |
return g |
|
|
325 |
|
|
|
326 |
# HGNC |
|
|
327 |
if g in self.hgnc['symbol'].tolist(): |
|
|
328 |
return self.hgnc.loc[self.hgnc['symbol'] == g, 'ensembl_gene_id'].iloc[0] |
|
|
329 |
if g in self.hgnc['entrez_id'].tolist(): |
|
|
330 |
return self.hgnc.loc[self.hgnc['entrez_id'] == g, 'ensembl_gene_id'].iloc[0] |
|
|
331 |
|
|
|
332 |
# ENSEMBL BIOMART |
|
|
333 |
if g in self.ensembl_biomart_mapping['HGNC symbol'].tolist(): |
|
|
334 |
return self.ensembl_biomart_mapping.loc[self.ensembl_biomart_mapping['HGNC symbol'] == g, 'Gene stable ID'].iloc[0] |
|
|
335 |
if g in self.ensembl_biomart_mapping['Gene name'].tolist(): |
|
|
336 |
return self.ensembl_biomart_mapping.loc[self.ensembl_biomart_mapping['Gene name'] == g, 'Gene stable ID'].iloc[0] |
|
|
337 |
if g in self.ensembl_biomart_mapping['NCBI gene ID'].tolist(): |
|
|
338 |
return self.ensembl_biomart_mapping.loc[self.ensembl_biomart_mapping['NCBI gene ID'] == g, 'Gene stable ID'].iloc[0] |
|
|
339 |
|
|
|
340 |
# SHILPA |
|
|
341 |
if g in self.ensembl_to_hgnc_shilpa['primary_gene_names'].tolist(): |
|
|
342 |
return self.ensembl_to_hgnc_shilpa.loc[self.ensembl_to_hgnc_shilpa['primary_gene_names'] == g, 'ensembl_gene_id'].iloc[0] |
|
|
343 |
|
|
|
344 |
# HGNC BIOMART |
|
|
345 |
if g in self.hgnc_biomart['Approved symbol'].tolist(): |
|
|
346 |
return self.hgnc_biomart.loc[self.hgnc_biomart['Approved symbol'] == g, 'Ensembl gene ID'].iloc[0] |
|
|
347 |
if g in self.hgnc_biomart['Alias symbol'].tolist(): |
|
|
348 |
return self.hgnc_biomart.loc[self.hgnc_biomart['Alias symbol'] == g, 'Ensembl gene ID'].iloc[0] |
|
|
349 |
if g in self.hgnc_biomart['Previous symbol'].tolist(): |
|
|
350 |
return self.hgnc_biomart.loc[self.hgnc_biomart['Previous symbol'] == g, 'Ensembl gene ID'].iloc[0] |
|
|
351 |
if g in self.hgnc_biomart['NCBI gene ID'].tolist(): |
|
|
352 |
return self.hgnc_biomart.loc[self.hgnc_biomart['NCBI gene ID'] == g, 'Ensembl gene ID'].iloc[0] |
|
|
353 |
|
|
|
354 |
# Shilpa's mapping from synonyms to Ensembl IDs |
|
|
355 |
if g in self.synonym_to_ensembl_dict: |
|
|
356 |
return self.synonym_to_ensembl_dict[g] |
|
|
357 |
|
|
|
358 |
if g in self.gene2ensembl['GeneID'].tolist(): #NCBI gene ID to Ensembl ID |
|
|
359 |
return self.gene2ensembl.loc[self.gene2ensembl['GeneID'] == g,'Ensembl_gene_identifier'].iloc[0] |
|
|
360 |
|
|
|
361 |
if g in self.misc_gene_to_ensembl_map: |
|
|
362 |
return self.misc_gene_to_ensembl_map[g] |
|
|
363 |
else: |
|
|
364 |
return None |
|
|
365 |
|
|
|
366 |
def map_gene_to_ensembl_id(self, g, log=False): |
|
|
367 |
# check if null or empty string |
|
|
368 |
if pd.isnull(g) or g == '': |
|
|
369 |
return None, 'did not map' |
|
|
370 |
|
|
|
371 |
# special case where two ensembl IDs are synonymous |
|
|
372 |
if g == 'ENSG00000262919': |
|
|
373 |
return 'ENSG00000147382', 'mapped to ensembl' |
|
|
374 |
|
|
|
375 |
# TODO: how do we know the ensembl IDs themselves aren't expired / out of date |
|
|
376 |
# check if already is an ensembl ID |
|
|
377 |
if g in self.all_unique_ensembl_ids: |
|
|
378 |
return g, 'mapped to ensembl' |
|
|
379 |
|
|
|
380 |
if g in self.misc_gene_to_ensembl_map: |
|
|
381 |
return self.misc_gene_to_ensembl_map[g], 'mapped to ensembl' |
|
|
382 |
|
|
|
383 |
# convert to hgnc alias |
|
|
384 |
if g in self.genecards_alias_dict: |
|
|
385 |
g = self.genecards_alias_dict[g] |
|
|
386 |
|
|
|
387 |
# fix miscapitalization issues |
|
|
388 |
if g in self.miscapitalized_gene_dict: |
|
|
389 |
g = self.miscapitalized_gene_dict[g] |
|
|
390 |
|
|
|
391 |
|
|
|
392 |
|
|
|
393 |
g = re.sub(r'([0-9]|X)(ORF)([0-9])', r'\1orf\3', g) |
|
|
394 |
|
|
|
395 |
|
|
|
396 |
# first look if the original gene name has a mapping to ensembl IDs |
|
|
397 |
new_g = self.convert_gene_to_ensembl_helper(g, log) |
|
|
398 |
|
|
|
399 |
# if not, convert previous/alias symbols to their current ones & check if that has mapping to ensembl ID |
|
|
400 |
if g in self.prev_to_curr_dict and not new_g: |
|
|
401 |
new_g = self.convert_gene_to_ensembl_helper(self.prev_to_curr_dict[g], log) |
|
|
402 |
if g in self.alias_to_curr_dict and not new_g: |
|
|
403 |
new_g = self.convert_gene_to_ensembl_helper(self.alias_to_curr_dict[g], log) |
|
|
404 |
|
|
|
405 |
# then look if any alias has a mapping to an Ensembl ID |
|
|
406 |
if g in self.curr_to_alias_dict and not new_g: |
|
|
407 |
for alias in self.curr_to_alias_dict[g]: |
|
|
408 |
new_g = self.convert_gene_to_ensembl_helper(alias, log) |
|
|
409 |
if new_g: break |
|
|
410 |
|
|
|
411 |
# finally look if a previous symbol has a mapping to an Ensembl ID |
|
|
412 |
if g in self.curr_to_prev_dict and not new_g: |
|
|
413 |
for prev in self.curr_to_prev_dict[g]: |
|
|
414 |
new_g = self.convert_gene_to_ensembl_helper(prev, log) |
|
|
415 |
if new_g: break |
|
|
416 |
|
|
|
417 |
# from NCBI: "Symbols beginning with LOC. When a published symbol |
|
|
418 |
# is not available, and orthologs have not yet been determined, |
|
|
419 |
# Gene will provide a symbol that is constructed as 'LOC' + the GeneID. |
|
|
420 |
# This is not retained when a replacement symbol has been identified, |
|
|
421 |
# although queries by the LOC term are still supported. In other words, |
|
|
422 |
# a record with the symbol LOC12345 is equivalent to GeneID = 12345. |
|
|
423 |
# So if the symbol changes, the record can still be retrieved on the web |
|
|
424 |
# using LOC12345 as a query, or from any file using GeneID = 12345." |
|
|
425 |
if g.startswith('LOC') and not new_g: |
|
|
426 |
new_g = self.convert_gene_to_ensembl_helper(g.replace('LOC', ''), log) |
|
|
427 |
|
|
|
428 |
# map discontinued entrez symbols to ensembl |
|
|
429 |
if g in self.disc_symbol_to_geneid and not new_g: |
|
|
430 |
new_g = self.convert_gene_to_ensembl_helper(self.disc_symbol_to_geneid[g], log) |
|
|
431 |
|
|
|
432 |
# map discontinued entrez geneIDs to ensembl |
|
|
433 |
if g in self.disc_geneid_to_geneid and not new_g: |
|
|
434 |
new_g = self.convert_gene_to_ensembl_helper(self.disc_geneid_to_geneid[g], log) |
|
|
435 |
|
|
|
436 |
# # convert entrez gene id to symbol & see if there's a mapping to ensembl ID |
|
|
437 |
if g in self.geneid_to_entrez_symbol_dict and not new_g: |
|
|
438 |
new_g = self.convert_gene_to_ensembl_helper(self.geneid_to_entrez_symbol_dict[g], log) |
|
|
439 |
if g in self.refseq_geneid_to_symbol_map and not new_g: |
|
|
440 |
new_g = self.convert_gene_to_ensembl_helper(self.refseq_geneid_to_symbol_map[g], log) |
|
|
441 |
# # convert entrez symbol to entrez gene id & see if there's a mapping to ensembl ID |
|
|
442 |
if g in self.entrez_symbol_to_geneid_dict and not new_g: |
|
|
443 |
new_g = self.convert_gene_to_ensembl_helper(self.entrez_symbol_to_geneid_dict[g], log) |
|
|
444 |
if g in self.refseq_symbol_to_geneid_map and not new_g: |
|
|
445 |
new_g = self.convert_gene_to_ensembl_helper(self.refseq_symbol_to_geneid_map[g], log) |
|
|
446 |
|
|
|
447 |
|
|
|
448 |
# convert gene synonym to most common name (according to entrez gene_info) |
|
|
449 |
if g in self.synonym_to_entrez_symbol_dict and not new_g: |
|
|
450 |
new_g = self.convert_gene_to_ensembl_helper(self.synonym_to_entrez_symbol_dict[g], log) |
|
|
451 |
|
|
|
452 |
if new_g: |
|
|
453 |
g = new_g |
|
|
454 |
|
|
|
455 |
if g == 'ENSG00000262919': |
|
|
456 |
return 'ENSG00000147382', 'mapped to ensembl' |
|
|
457 |
|
|
|
458 |
|
|
|
459 |
if g in self.all_unique_ensembl_ids: |
|
|
460 |
return g, 'mapped to ensembl' |
|
|
461 |
|
|
|
462 |
else: # we return the original gene (mapped to a current symbol) if we're not able to convert it |
|
|
463 |
|
|
|
464 |
if not pd.isnull(g) and g.startswith('LOC'): g = g.replace('LOC', '') |
|
|
465 |
if g in self.prev_to_curr_dict: g = self.prev_to_curr_dict[g] |
|
|
466 |
if g in self.alias_to_curr_dict: g = self.alias_to_curr_dict[g] |
|
|
467 |
|
|
|
468 |
if g in self.disc_symbol_to_geneid: g = self.disc_symbol_to_geneid[g] |
|
|
469 |
if g in self.disc_geneid_to_geneid: g = self.disc_geneid_to_geneid[g] |
|
|
470 |
if g in self.entrez_symbol_to_geneid_dict: g = self.entrez_symbol_to_geneid_dict[g] |
|
|
471 |
|
|
|
472 |
if g in self.synonym_to_entrez_symbol_dict: g = self.synonym_to_entrez_symbol_dict[g] |
|
|
473 |
if not pd.isnull(g) and g.startswith('LOC'): g = g.replace('LOC', '') |
|
|
474 |
|
|
|
475 |
|
|
|
476 |
if g in self.all_gene_symbols: |
|
|
477 |
# if log: |
|
|
478 |
# logging.error(f'There is likely a gene symbol, but no Ensembl ID for gene: {g}') |
|
|
479 |
return g, 'mapped to symbol/ncbi' |
|
|
480 |
# elif g in self.withdrawn_hgnc['WITHDRAWN_SYMBOL'].tolist(): |
|
|
481 |
# logging.error(f'The following gene symbol has been withdrawn: {g}') |
|
|
482 |
elif g in self.all_ncbi_ids: |
|
|
483 |
# if log: |
|
|
484 |
# logging.error(f'There is likely a NCBI gene ID, but no Ensembl ID for gene: {g}') |
|
|
485 |
return g, 'mapped to symbol/ncbi' |
|
|
486 |
else: |
|
|
487 |
if log: |
|
|
488 |
logging.error(f'The following gene can not be converted to an Ensembl ID: {g}') |
|
|
489 |
return g, 'did not map' |
|
|
490 |
|
|
|
491 |
def map_phenotypes(self, df, col_name = 'HPO_ID'): |
|
|
492 |
hpo_map = {p:self.map_phenotype_to_hpo(p) for p in df[col_name].unique()} |
|
|
493 |
df['standardized_hpo_id'] = df[col_name].replace(hpo_map) #TODO: generalize this so it's not always 2019 |
|
|
494 |
assert len(df.loc[pd.isnull(df['standardized_hpo_id'])].index) == 0 |
|
|
495 |
return df |
|
|
496 |
|
|
|
497 |
def map_genes(self, df, col_names, log=True, n_processes=4): |
|
|
498 |
genes_to_map = list(set([g for col_name in col_names for g in df[col_name].tolist()])) |
|
|
499 |
|
|
|
500 |
gene_to_ensembl_map = {} |
|
|
501 |
gene_to_status_map = {} |
|
|
502 |
for g in genes_to_map: |
|
|
503 |
ensembl_id, mapped_status = self.map_gene_to_ensembl_id(g, log=log) |
|
|
504 |
gene_to_ensembl_map[g] = ensembl_id |
|
|
505 |
gene_to_status_map[g] = mapped_status |
|
|
506 |
|
|
|
507 |
for col_name in col_names: |
|
|
508 |
df[(col_name + '_ensembl')] = df[col_name].replace(gene_to_ensembl_map) |
|
|
509 |
df[(col_name + '_mapping_status')] = df[col_name].replace(gene_to_status_map) |
|
|
510 |
return df |
|
|
511 |
|
|
|
512 |
|
|
|
513 |
|
|
|
514 |
if __name__ == "__main__": |
|
|
515 |
pass |