|
a |
|
b/openomics/database/annotation.py |
|
|
1 |
import os |
|
|
2 |
from io import StringIO |
|
|
3 |
from os.path import expanduser |
|
|
4 |
|
|
|
5 |
import dask.dataframe as dd |
|
|
6 |
import pandas as pd |
|
|
7 |
from bioservices import BioMart |
|
|
8 |
from pandas.errors import ParserError |
|
|
9 |
|
|
|
10 |
from .base import Database |
|
|
11 |
|
|
|
12 |
# Local cache directory for downloaded files and BioMart query results (~/.openomics).
DEFAULT_CACHE_PATH = os.path.join(expanduser("~"), ".openomics")
# Default location for downloaded database archives (~/.openomics/databases).
DEFAULT_LIBRARY_PATH = os.path.join(expanduser("~"), ".openomics", "databases")

# Public API of this module.
__all__ = ['ProteinAtlas', 'GTEx', 'NONCODE', 'EnsemblGenes', 'EnsemblGeneSequences', 'EnsemblTranscriptSequences',
           'EnsemblSNP', 'EnsemblSomaticVariation', 'TANRIC']
|
|
17 |
|
|
|
18 |
class ProteinAtlas(Database):
    """Loads the Protein Atlas database from https://www.proteinatlas.org .

    Default path: "https://www.proteinatlas.org/download/" .
    Default file_resources: {
        "proteinatlas.tsv.zip": "proteinatlas.tsv.zip",
    }
    """
    COLUMNS_RENAME_DICT = {
        "Gene": "protein_name",
        "Ensembl": "gene_id",
    }

    def __init__(self, path="https://www.proteinatlas.org/download/", file_resources=None,
                 col_rename=COLUMNS_RENAME_DICT, blocksize=0, verbose=False, **kwargs):
        """
        Args:
            path: Base URL to download Protein Atlas files from.
            file_resources: Mapping of local resource name to remote file name/URL.
            col_rename: Column-rename mapping forwarded to the base class.
            blocksize: If truthy, tables are loaded with dask; an int sets the partition size.
            verbose: Forwarded to the base class.
        """
        if file_resources is None:
            file_resources = {"proteinatlas.tsv.zip": "proteinatlas.tsv.zip"}

        super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        """Read the main "proteinatlas.tsv" table as a pandas or dask DataFrame.

        Args:
            file_resources: Mapping of resource name to a readable path/buffer.
            blocksize: If truthy, read with dask; an int sets the partition size.
        """
        source = file_resources["proteinatlas.tsv"]
        if not blocksize:
            return pd.read_table(source)
        return dd.read_table(source, blocksize=None if isinstance(blocksize, bool) else blocksize)

    def get_expressions(self, index="gene_name", type="Tissue RNA"):
        """Return (N x tissue) expression columns from the proteinatlas.tsv table.

        Args:
            index: Column name to index by. If the column contains multiple
                values per key, rows are aggregated by median.
            type: One of {"Tissue RNA", "Cell RNA", "Blood RNA", "Brain RNA",
                "RNA - "}. If "RNA - ", selects all types of expressions.

        Returns:
            expressions (pd.DataFrame):
        """
        pattern = "|".join([type, index])
        selected = self.data.filter(regex=pattern)
        return selected.groupby(index).median()
|
|
81 |
|
|
|
82 |
|
|
|
83 |
class GTEx(Database):
    """Loads the database from https://www.gtexportal.org/home/ .

    Default path: "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/" .
    Default file_resources: {
        "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct": "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz",
        "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt": "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
        "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct": "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz",
    }
    """
    COLUMNS_RENAME_DICT = {
        "Name": "gene_id",
        "Description": "gene_name"
    }

    def __init__(self, path="https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/",
                 file_resources=None, col_rename=None, blocksize=0, verbose=False, **kwargs):
        """
        Args:
            path: Base URL for the GTEx v8 RNA-seq data files.
            file_resources: Mapping of local resource name to remote file name/URL.
            col_rename: Column-rename mapping forwarded to the base class.
            blocksize: If truthy, load with dask; an int sets the partition size.
            verbose: Forwarded to the base class.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"] = \
                "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
            file_resources["GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"] = \
                "https://storage.googleapis.com/gtex_analysis_v8/annotations/" \
                "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
            file_resources["GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz"] = \
                "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz"

        # BUGFIX: forward the caller's col_rename (was hard-coded to None, which
        # silently discarded the parameter). Default None preserves old behavior:
        # COLUMNS_RENAME_DICT is applied inside load_dataframe instead.
        super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None) -> pd.DataFrame:
        """Build the gene-level median-TPM table indexed by (gene_id, gene_name).

        Args:
            file_resources: Mapping of resource name to a readable path/buffer.
            blocksize: Unused; the median-TPM table is small enough for pandas.

        Returns:
            pd.DataFrame: median TPM per tissue, MultiIndex (gene_id, gene_name).
        """
        # BUGFIX: read from the `file_resources` parameter (previously ignored in
        # favor of self.file_resources).
        # .gct files carry a 2-line version/dimension preamble before the header.
        gene_exp_medians = pd.read_csv(
            file_resources["GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct"],
            sep='\t', header=1, skiprows=1)
        # Strip the ".<version>" suffix from Ensembl gene IDs (raw string avoids
        # an invalid-escape warning).
        gene_exp_medians["Name"] = gene_exp_medians["Name"].str.replace(r"[.]\d*", "", regex=True)
        # Rename must happen here because super().__init__ may be given col_rename=None.
        gene_exp_medians = gene_exp_medians.rename(columns=self.COLUMNS_RENAME_DICT)
        gene_exp_medians.set_index(["gene_id", "gene_name"], inplace=True)

        return gene_exp_medians
|
|
163 |
|
|
|
164 |
|
|
|
165 |
class NONCODE(Database):
    """Loads the NONCODE database from http://noncode.org .

    Default path: "http://www.noncode.org/datadownload" .
    Default file_resources: {
        "NONCODEv5_source": "<path>/NONCODEv5_source",
        "NONCODEv5_Transcript2Gene": "<path>/NONCODEv5_Transcript2Gene",
        "NONCODEv5_human.func": "<path>/NONCODEv5_human.func",
    }
    """

    def __init__(self, path="http://www.noncode.org/datadownload", file_resources=None, col_rename=None, verbose=False,
                 blocksize=None, **kwargs):
        """
        Args:
            path: Base URL of the NONCODE download site.
            file_resources: Mapping of local resource name to remote file path.
            col_rename: Column-rename mapping forwarded to the base class.
            verbose: Forwarded to the base class.
            blocksize: If truthy, the .func table is loaded with dask.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["NONCODEv5_source"] = os.path.join(path, "NONCODEv5_source")
            file_resources["NONCODEv5_Transcript2Gene"] = os.path.join(path, "NONCODEv5_Transcript2Gene")
            file_resources["NONCODEv5_human.func"] = os.path.join(path, "NONCODEv5_human.func")

        super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        """Build `self.noncode_func_df`: GO annotations per NONCODE gene, joined
        with transcript IDs and gene names from the source/mapping tables.

        Args:
            file_resources: Mapping of resource name to a readable path/buffer.
            blocksize: If truthy, load the .func table with dask; an int sets
                the partition size.
        """
        # Headerless 3-column table: transcript ID, name type, gene ID.
        source_df = pd.read_table(file_resources["NONCODEv5_source"], header=None)
        source_df.columns = ["NONCODE Transcript ID", "name type", "Gene ID"]

        # Headerless 2-column transcript -> gene mapping.
        transcript2gene_df = pd.read_table(file_resources["NONCODEv5_Transcript2Gene"], header=None)
        transcript2gene_df.columns = ["NONCODE Transcript ID", "NONCODE Gene ID"]

        if blocksize:
            self.noncode_func_df = dd.read_table(file_resources["NONCODEv5_human.func"], header=None,
                                                 blocksize=None if isinstance(blocksize, bool) else blocksize)
        else:
            self.noncode_func_df = pd.read_table(file_resources["NONCODEv5_human.func"], header=None)
        self.noncode_func_df.columns = ["NONCODE Gene ID", "GO terms"]
        # NOTE(review): dask DataFrames do not support set_index(..., inplace=True);
        # confirm the blocksize path is exercised/intended.
        self.noncode_func_df.set_index("NONCODE Gene ID", inplace=True)

        # Convert to NONCODE transcript ID for the functional annotation data
        self.noncode_func_df["NONCODE Transcript ID"] = self.noncode_func_df.index.map(
            pd.Series(transcript2gene_df['NONCODE Transcript ID'].values,
                      index=transcript2gene_df['NONCODE Gene ID']).to_dict())

        # Convert NONCODE transcript ID to gene names
        source_gene_names_df = source_df[source_df["name type"] == "NAME"].copy()

        self.noncode_func_df["Gene Name"] = self.noncode_func_df["NONCODE Transcript ID"].map(
            pd.Series(source_gene_names_df['Gene ID'].values,
                      index=source_gene_names_df['NONCODE Transcript ID']).to_dict())
|
|
225 |
|
|
|
226 |
|
|
|
227 |
class BioMartManager:
    """
    A base class with functions to query Ensembl Biomarts "https://www.ensembl.org/biomart".
    """
    # Column dtypes applied when parsing BioMart query results / cached TSVs.
    DTYPES = {
        'entrezgene_id': 'str',
        'gene_biotype': 'category',
        'transcript_biotype': 'category',
        'chromosome_name': 'category',
        'transcript_start': 'int',
        'transcript_end': 'int',
        'transcript_length': 'int',
        'mirbase_id': 'str'}

    def __init__(self, dataset, attributes, host, filename):
        """Not instantiated directly; subclasses perform their own setup.

        Args:
            dataset: BioMart dataset name.
            attributes: Attributes to query.
            host: BioMart server host.
            filename: Cache file basename.
        """
        pass  # Does not instantiate

    def retrieve_dataset(self, host, dataset, attributes, filename, blocksize=None):
        """Load a BioMart dataset from the local cache, querying (and caching)
        on a cache miss.

        Args:
            host: BioMart server host, e.g. "www.ensembl.org".
            dataset: BioMart dataset name.
            attributes: List of attributes to retrieve.
            filename: Cache file basename; stored as "<filename>.tsv" under
                DEFAULT_CACHE_PATH.
            blocksize: If truthy, read the cached file with dask; an int sets
                the partition size.

        Returns:
            The dataset as a pandas or dask DataFrame.
        """
        # BUGFIX: the cache path was hard-coded to the literal "(unknown).tsv",
        # so every dataset collided on a single cache file. Derive it from the
        # `filename` argument instead.
        filename = os.path.join(DEFAULT_CACHE_PATH, f"{filename}.tsv")

        args = dict(
            sep="\t",
            low_memory=True,
            dtype=self.DTYPES,
        )

        if os.path.exists(filename):
            if blocksize:
                df = dd.read_csv(filename, blocksize=None if isinstance(blocksize, bool) else blocksize, **args)
            else:
                df = pd.read_csv(filename, **args)
        else:
            df = self.query_biomart(host=host, dataset=dataset, attributes=attributes,
                                    cache=True, save_filename=filename)
        return df

    def cache_dataset(self, dataset, dataframe, save_filename):
        """Write `dataframe` to a TSV cache file and return its path.

        Args:
            dataset: BioMart dataset name, used for the default file name.
            dataframe: DataFrame to persist.
            save_filename: Target path; defaults to "<dataset>.tsv" under
                DEFAULT_CACHE_PATH when None.

        Returns:
            str: the path the dataframe was written to.
        """
        # makedirs with exist_ok=True is idempotent; no need for a prior
        # exists() check (avoids a race as well).
        os.makedirs(DEFAULT_CACHE_PATH, exist_ok=True)

        if save_filename is None:
            save_filename = os.path.join(DEFAULT_CACHE_PATH, "{}.tsv".format(dataset))

        dataframe.to_csv(save_filename, sep="\t", index=False)
        return save_filename

    def query_biomart(self, dataset, attributes, host="www.ensembl.org", cache=True, save_filename=None,
                      blocksize=None):
        """Run a BioMart XML query and parse the TSV response.

        Args:
            dataset: BioMart dataset name.
            attributes: Attributes to request; become the result column names.
            host: BioMart server host.
            cache: If True, persist the result via cache_dataset().
            save_filename: Cache file path forwarded to cache_dataset().
            blocksize: If truthy, parse the response with dask; an int sets the
                partition size.

        Returns:
            The query result as a pandas or dask DataFrame.

        Raises:
            ParserError: if the response cannot be parsed as TSV.
        """
        bm = BioMart(host=host)
        bm.new_query()
        bm.add_dataset_to_xml(dataset)
        for at in attributes:
            bm.add_attribute_to_xml(at)
        xml_query = bm.get_xml()

        print("Querying {} from {} with attributes {}...".format(dataset, host, attributes))
        results = bm.query(xml_query)

        try:
            if blocksize:
                df = dd.read_csv(StringIO(results), header=None, names=attributes, sep="\t", low_memory=True,
                                 dtype=self.DTYPES, blocksize=None if isinstance(blocksize, bool) else blocksize)
            else:
                df = pd.read_csv(StringIO(results), header=None, names=attributes, sep="\t", low_memory=True,
                                 dtype=self.DTYPES)
        except Exception as e:
            # BUGFIX: chain the original parse error instead of discarding it.
            raise ParserError(f'BioMart Query Result: {results}') from e

        if cache:
            self.cache_dataset(dataset, df, save_filename)
        return df
|
|
328 |
|
|
|
329 |
|
|
|
330 |
class EnsemblGenes(BioMartManager, Database):
    """Gene/transcript annotations queried from an Ensembl BioMart dataset."""

    COLUMNS_RENAME_DICT = {'ensembl_gene_id': 'gene_id',
                           'external_gene_name': 'gene_name',
                           'ensembl_transcript_id': 'transcript_id',
                           'external_transcript_name': 'transcript_name',
                           'rfam': 'Rfams'}

    def __init__(self, biomart="hsapiens_gene_ensembl",
                 attributes=None, host="www.ensembl.org", blocksize=None):
        # Intentionally does not call super().__init__(); BioMartManager does
        # not instantiate and Database setup is bypassed.
        """
        Args:
            biomart: BioMart dataset name to query.
            attributes: BioMart attributes to retrieve; defaults to the common
                gene/transcript annotation fields.
            host: BioMart server host.
            blocksize: If truthy, load with dask; an int sets the partition size.
        """
        if attributes is None:
            attributes = ['ensembl_gene_id', 'external_gene_name', 'ensembl_transcript_id',
                          'external_transcript_name',
                          'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length',
                          'gene_biotype', 'transcript_biotype', ]
        self.filename = "{}.{}".format(biomart, self.__class__.__name__)
        self.biomart = biomart
        self.host = host

        dataset = self.load_data(dataset=biomart, attributes=attributes, host=self.host,
                                 filename=self.filename, blocksize=blocksize)
        self.data = dataset.rename(columns=self.COLUMNS_RENAME_DICT)

    def name(self):
        """Database name suffixed with the BioMart dataset name."""
        return f"{super().name()} {self.biomart}"

    def load_data(self, dataset, attributes, host, filename=None, blocksize=None):
        """Fetch the dataset via the BioMartManager cache/query machinery.

        Args:
            dataset: BioMart dataset name.
            attributes: Attributes to retrieve.
            host: BioMart server host.
            filename: Cache file basename.
            blocksize: If truthy, load with dask; an int sets the partition size.
        """
        return self.retrieve_dataset(host, dataset, attributes, filename, blocksize=blocksize)
|
|
375 |
|
|
|
376 |
class EnsemblGeneSequences(EnsemblGenes):
    """Gene-level sequence attributes from an Ensembl BioMart dataset."""

    def __init__(self, biomart="hsapiens_gene_ensembl",
                 attributes=None, host="www.ensembl.org", blocksize=None):
        """
        Args:
            biomart: BioMart dataset name to query.
            attributes: BioMart attributes to retrieve; defaults to gene
                sequence fields.
            host: BioMart server host.
            blocksize: If truthy, load with dask; an int sets the partition size.
        """
        if attributes is None:
            attributes = ['ensembl_gene_id', 'gene_exon_intron', 'gene_flank', 'coding_gene_flank', 'gene_exon',
                          'coding']
        self.filename = "{}.{}".format(biomart, self.__class__.__name__)

        self.biomart = biomart
        self.host = host
        self.df = self.load_data(dataset=biomart, attributes=attributes, host=self.host,
                                 filename=self.filename, blocksize=blocksize)
        # BUGFIX: rename the freshly loaded frame (self.df). Previously this
        # read `self.data`, which was never assigned, raising AttributeError.
        self.data = self.df.rename(columns=self.COLUMNS_RENAME_DICT)
|
|
396 |
|
|
|
397 |
|
|
|
398 |
class EnsemblTranscriptSequences(EnsemblGenes):
    """Transcript-level sequence attributes from an Ensembl BioMart dataset."""

    def __init__(self, biomart="hsapiens_gene_ensembl",
                 attributes=None, host="www.ensembl.org", blocksize=None):
        """
        Args:
            biomart: BioMart dataset name to query.
            attributes: BioMart attributes to retrieve; defaults to transcript
                sequence fields.
            host: BioMart server host.
            blocksize: If truthy, load with dask; an int sets the partition size.
        """
        if attributes is None:
            attributes = ['ensembl_transcript_id', 'transcript_exon_intron', 'transcript_flank',
                          'coding_transcript_flank',
                          '5utr', '3utr']
        self.filename = "{}.{}".format(biomart, self.__class__.__name__)

        self.biomart = biomart
        self.host = host
        self.df = self.load_data(dataset=biomart, attributes=attributes, host=self.host,
                                 filename=self.filename, blocksize=blocksize)
        # BUGFIX: rename the freshly loaded frame (self.df). Previously this
        # read `self.data`, which was never assigned, raising AttributeError.
        self.data = self.df.rename(columns=self.COLUMNS_RENAME_DICT)
|
|
419 |
|
|
|
420 |
class EnsemblSNP(EnsemblGenes):
    """SNP annotations from the Ensembl BioMart "hsapiens_snp" dataset."""

    def __init__(self, biomart="hsapiens_snp",
                 attributes=None, host="www.ensembl.org", blocksize=None):
        """
        Args:
            biomart: BioMart dataset name to query.
            attributes: BioMart attributes to retrieve; defaults to SNP fields.
            host: BioMart server host.
            blocksize: If truthy, load with dask; an int sets the partition size.
        """
        if attributes is None:
            attributes = ['synonym_name', 'variation_names', 'minor_allele',
                          'associated_variant_risk_allele',
                          'ensembl_gene_stable_id', 'ensembl_transcript_stable_id',
                          'phenotype_name',
                          'chr_name', 'chrom_start', 'chrom_end']
        self.filename = "{}.{}".format(biomart, self.__class__.__name__)

        self.biomart = biomart
        self.host = host
        # BUGFIX: the dataset was never loaded before renaming, so `self.data`
        # did not exist and __init__ raised AttributeError. Load it first, as
        # the sibling Ensembl classes do.
        self.data = self.load_data(dataset=biomart, attributes=attributes, host=self.host,
                                   filename=self.filename, blocksize=blocksize)
        self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)
|
|
441 |
|
|
|
442 |
|
|
|
443 |
class EnsemblSomaticVariation(EnsemblGenes):
    """Somatic variation annotations from the Ensembl BioMart "hsapiens_snp_som" dataset."""

    def __init__(self, biomart="hsapiens_snp_som",
                 attributes=None, host="www.ensembl.org", blocksize=None):
        """
        Args:
            biomart: BioMart dataset name to query.
            attributes: BioMart attributes to retrieve; defaults to somatic
                variation fields.
            host: BioMart server host.
            blocksize: If truthy, load with dask; an int sets the partition size.
        """
        if attributes is None:
            attributes = ['somatic_variation_name', 'somatic_source_name', 'somatic_allele', 'somatic_minor_allele',
                          'somatic_clinical_significance', 'somatic_validated', 'somatic_transcript_location',
                          'somatic_mapweight',
                          'somatic_chromosome_start', 'somatic_chromosome_end']
        self.filename = "{}.{}".format(biomart, self.__class__.__name__)

        self.biomart = biomart
        self.host = host
        # BUGFIX: the dataset was never loaded before renaming, so `self.data`
        # did not exist and __init__ raised AttributeError. Load it first, as
        # the sibling Ensembl classes do.
        self.data = self.load_data(dataset=biomart, attributes=attributes, host=self.host,
                                   filename=self.filename, blocksize=blocksize)
        self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)
|
|
463 |
|
|
|
464 |
|
|
|
465 |
class TANRIC(Database):
    """Loads lncRNA expression data from TANRIC (MD Anderson)."""

    def __init__(self, path, file_resources=None, col_rename=None, blocksize=0, verbose=False, **kwargs):
        """
        Args:
            path: Directory or URL containing the TANRIC expression tables.
            file_resources: Mapping of resource name to file path.
            col_rename: Column-rename mapping forwarded to the base class.
            blocksize: If truthy, load with dask; an int sets the partition size.
            verbose: Forwarded to the base class.
        """
        # **kwargs pass-through added for consistency with the other Database
        # subclasses in this module (backward-compatible).
        super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        """No combined annotation table is built for TANRIC; data is accessed
        on demand via get_expressions().

        Args:
            file_resources:
            blocksize:
        """
        pass

    def get_expressions(self, genes_index):
        """Preprocess the lncRNA expression file obtained from TANRIC MDAnderson,
        and replace ENSEMBL gene IDs with HUGO gene names (HGNC). This function
        overwrites the GenomicData.process_expression_table() function which
        processes TCGA-Assembler data. TANRIC lncRNA expression values are log2
        transformed.

        Args:
            genes_index: Name of the column holding gene identifiers.

        Returns:
            pd.DataFrame: patients (rows) x genes (columns) expression matrix.
        """
        df = pd.read_table(self.file_resources["TCGA-LUAD-rnaexpr.tsv"])
        # BUGFIX: pass regex=True explicitly — pandas >= 2.0 defaults
        # str.replace to literal matching, which would silently stop stripping
        # the ".<version>" suffix from ENSG gene IDs. Raw string avoids an
        # invalid-escape warning.
        df[genes_index] = df[genes_index].str.replace(r"[.]\d*", "", regex=True)
        df = df[~df[genes_index].duplicated(keep='first')]  # Remove duplicate genes

        # Drop NA gene rows
        df.dropna(axis=0, inplace=True)

        # Transpose matrix to patients rows and genes columns
        df.index = df[genes_index]
        df = df.T.iloc[1:, :]

        # Change index string to bcr_sample_barcode standard
        def change_patient_barcode(s):
            if "Normal" in s:
                return s[s.find('TCGA'):] + "-11A"
            elif "Tumor" in s:
                return s[s.find('TCGA'):] + "-01A"
            else:
                return s

        df.index = df.index.map(change_patient_barcode)
        df.index.name = "gene_id"

        return df