
openomics/database/annotation.py
import os
from io import StringIO
from os.path import expanduser

import dask.dataframe as dd
import pandas as pd
from bioservices import BioMart
from pandas.errors import ParserError

from .base import Database

DEFAULT_CACHE_PATH = os.path.join(expanduser("~"), ".openomics")
DEFAULT_LIBRARY_PATH = os.path.join(expanduser("~"), ".openomics", "databases")

__all__ = ['ProteinAtlas', 'GTEx', 'NONCODE', 'EnsemblGenes', 'EnsemblGeneSequences', 'EnsemblTranscriptSequences',
           'EnsemblSNP', 'EnsemblSomaticVariation', 'TANRIC']

class ProteinAtlas(Database):
    """Loads the Protein Atlas database from https://www.proteinatlas.org/ .

    Default path: "https://www.proteinatlas.org/download/" .
    Default file_resources: {
        "proteinatlas.tsv.zip": "proteinatlas.tsv.zip",
    }
    """
    COLUMNS_RENAME_DICT = {
        "Gene": "protein_name",
        "Ensembl": "gene_id",
    }

    def __init__(self, path="https://www.proteinatlas.org/download/", file_resources=None,
                 col_rename=COLUMNS_RENAME_DICT, blocksize=0, verbose=False, **kwargs):
        """
        Args:
            path:
            file_resources:
            col_rename:
            blocksize:
            verbose:
        """
        if file_resources is None:
            file_resources = {}
            file_resources["proteinatlas.tsv.zip"] = "proteinatlas.tsv.zip"

        super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        """
        Args:
            file_resources:
            blocksize:
        """
        if blocksize:
            df = dd.read_table(file_resources["proteinatlas.tsv"],
                               blocksize=None if isinstance(blocksize, bool) else blocksize)
        else:
            df = pd.read_table(file_resources["proteinatlas.tsv"])

        return df

    def get_expressions(self, index="gene_name", type="Tissue RNA"):
        """Returns expression values from the proteinatlas.tsv table.

        Args:
            index: a column name to group rows by. If the column contains
                multiple rows per value, expressions are aggregated by median.
            type: one of {"Tissue RNA", "Cell RNA", "Blood RNA", "Brain RNA",
                "RNA - "}. If "RNA - ", select all RNA expression columns.

        Returns:
            expressions (pd.DataFrame):
        """
        columns = "|".join([type, index])
        expressions = self.data.filter(regex=columns).groupby(
            index).median()
        return expressions
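# Usage sketch (assumes network access to the default Protein Atlas download URL and that
# the base Database class exposes the loaded table as `self.data`, which get_expressions() uses):
#
#   atlas = ProteinAtlas()
#   tissue_rna = atlas.get_expressions(index="gene_name", type="Tissue RNA")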


class GTEx(Database):
    """Loads the GTEx database from https://www.gtexportal.org/home/ .

    Default path: "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/" .
    Default file_resources: {
        "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct": "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz",
        "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt": "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
        "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct": "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz",
    }
    """
    COLUMNS_RENAME_DICT = {
        "Name": "gene_id",
        "Description": "gene_name"
    }

    def __init__(self, path="https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/",
                 file_resources=None, col_rename=None, blocksize=0, verbose=False, **kwargs):
        """
        Args:
            path:
            file_resources:
            col_rename:
            blocksize:
            verbose:
        """
        if file_resources is None:
            file_resources = {}

            file_resources["GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"] = \
                "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
            file_resources["GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"] = \
                "https://storage.googleapis.com/gtex_analysis_v8/annotations/" \
                "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
            file_resources["GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz"] = \
                "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz"

        super().__init__(path, file_resources, col_rename=None, blocksize=blocksize, verbose=verbose, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None) -> pd.DataFrame:
        """
        Args:
            file_resources:
            blocksize:
        """
        gene_exp_medians = pd.read_csv(
            self.file_resources["GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct"],
            sep='\t', header=1, skiprows=1)
        gene_exp_medians["Name"] = gene_exp_medians["Name"].str.replace(r"[.]\d*", "", regex=True)
        gene_exp_medians = gene_exp_medians.rename(columns=self.COLUMNS_RENAME_DICT)  # Rename before setting the gene_id/gene_name index below
        gene_exp_medians.set_index(["gene_id", "gene_name"], inplace=True)

        # # Sample attributes (needed to get tissue type)
        # SampleAttributes = pd.read_table(
        #     self.file_resources["GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"],
        # )
        # SampleAttributes.set_index("SAMPID", inplace=True)
        #
        # # Transcript expression for all samples
        # transcript_exp = pd.read_csv(
        #     self.file_resources["GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct"],
        #     sep='\t', header=1, skiprows=1)
        # print("transcript_exp", transcript_exp.columns)
        # transcript_exp["gene_id"] = transcript_exp["gene_id"].str.replace("[.]\d*", "")
        # transcript_exp["transcript_id"] = transcript_exp["transcript_id"].str.replace("[.]\d*", "")
        # transcript_exp.set_index(["gene_id", "transcript_id"], inplace=True)
        #
        # # Join by sample with tissue type, group expressions by tissue type, and compute medians for each
        # transcript_exp_medians = transcript_exp.T \
        #     .join(SampleAttributes["SMTSD"], how="left") \
        #     .groupby("SMTSD") \
        #     .median()
        #
        # # Reset multilevel index
        # transcript_exp_medians.index.rename(name=None, inplace=True)
        # transcript_exp_medians = transcript_exp_medians.T.set_index(
        #     pd.MultiIndex.from_tuples(tuples=transcript_exp_medians.T.index, names=["gene_id", "transcript_id"]))
        #
        # gene_transcript_exp_medians = pd.concat([gene_exp_medians, transcript_exp_medians], join="inner", copy=True)
        # print("gene_transcript_exp_medians \n", gene_transcript_exp_medians)
        return gene_exp_medians
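# Usage sketch (assumes the default GTEx v8 file_resources can be downloaded, and that the
# base Database class exposes the loaded gene median-TPM table as `self.data`):
#
#   gtex = GTEx()
#   gene_medians = gtex.data  # indexed by (gene_id, gene_name), one column per tissue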


class NONCODE(Database):
    """Loads the NONCODE database from http://noncode.org .

    Default path: "http://www.noncode.org/datadownload" .
    Default file_resources: {
        "NONCODEv5_source": "NONCODEv5_source",
        "NONCODEv5_Transcript2Gene": "NONCODEv5_Transcript2Gene",
        "NONCODEv5_human.func": "NONCODEv5_human.func",
    }
    """

    def __init__(self, path="http://www.noncode.org/datadownload", file_resources=None, col_rename=None, verbose=False,
                 blocksize=None, **kwargs):
        """
        Args:
            path:
            file_resources:
            col_rename:
            verbose:
            blocksize:
        """
        if file_resources is None:
            file_resources = {}
            file_resources["NONCODEv5_source"] = os.path.join(path, "NONCODEv5_source")
            file_resources["NONCODEv5_Transcript2Gene"] = os.path.join(path, "NONCODEv5_Transcript2Gene")
            file_resources["NONCODEv5_human.func"] = os.path.join(path, "NONCODEv5_human.func")

        super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        """
        Args:
            file_resources:
            blocksize:
        """
        source_df = pd.read_table(file_resources["NONCODEv5_source"], header=None)
        source_df.columns = ["NONCODE Transcript ID", "name type", "Gene ID"]

        transcript2gene_df = pd.read_table(file_resources["NONCODEv5_Transcript2Gene"], header=None)
        transcript2gene_df.columns = ["NONCODE Transcript ID", "NONCODE Gene ID"]

        if blocksize:
            self.noncode_func_df = dd.read_table(file_resources["NONCODEv5_human.func"], header=None,
                                                 blocksize=None if isinstance(blocksize, bool) else blocksize)
        else:
            self.noncode_func_df = pd.read_table(file_resources["NONCODEv5_human.func"], header=None)
        self.noncode_func_df.columns = ["NONCODE Gene ID", "GO terms"]
        self.noncode_func_df.set_index("NONCODE Gene ID", inplace=True)

        # Convert to NONCODE transcript ID for the functional annotation data
        self.noncode_func_df["NONCODE Transcript ID"] = self.noncode_func_df.index.map(
            pd.Series(transcript2gene_df['NONCODE Transcript ID'].values,
                      index=transcript2gene_df['NONCODE Gene ID']).to_dict())

        # Convert NONCODE transcript ID to gene names
        source_gene_names_df = source_df[source_df["name type"] == "NAME"].copy()

        self.noncode_func_df["Gene Name"] = self.noncode_func_df["NONCODE Transcript ID"].map(
            pd.Series(source_gene_names_df['Gene ID'].values,
                      index=source_gene_names_df['NONCODE Transcript ID']).to_dict())
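# Usage sketch (assumes the NONCODEv5 default file_resources are reachable under `path`;
# load_dataframe() fills `noncode_func_df` with GO terms, transcript IDs, and gene names):
#
#   noncode = NONCODE()
#   go_annotations = noncode.noncode_func_df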


class BioMartManager:
    """
    A base class with functions to query Ensembl BioMarts (https://www.ensembl.org/biomart).
    """
    DTYPES = {
        'entrezgene_id': 'str',
        'gene_biotype': 'category',
        'transcript_biotype': 'category',
        'chromosome_name': 'category',
        'transcript_start': 'int',
        'transcript_end': 'int',
        'transcript_length': 'int',
        'mirbase_id': 'str'}

    def __init__(self, dataset, attributes, host, filename):
        """
        Args:
            dataset:
            attributes:
            host:
            filename:
        """
        pass  # Does not instantiate

    def retrieve_dataset(self, host, dataset, attributes, filename, blocksize=None):
        """
        Args:
            host:
            dataset:
            attributes:
            filename:
            blocksize:
        """
        filename = os.path.join(DEFAULT_CACHE_PATH, f"{filename}.tsv")

        args = dict(
            sep="\t",
            low_memory=True,
            dtype=self.DTYPES,
        )

        if os.path.exists(filename):
            if blocksize:
                df = dd.read_csv(filename, blocksize=None if isinstance(blocksize, bool) else blocksize, **args)
            else:
                df = pd.read_csv(filename, **args)
        else:
            df = self.query_biomart(host=host, dataset=dataset, attributes=attributes,
                                    cache=True, save_filename=filename)
        return df

    def cache_dataset(self, dataset, dataframe, save_filename):
        """
        Args:
            dataset:
            dataframe:
            save_filename:
        """
        if not os.path.exists(DEFAULT_CACHE_PATH):
            os.makedirs(DEFAULT_CACHE_PATH, exist_ok=True)

        if save_filename is None:
            save_filename = os.path.join(DEFAULT_CACHE_PATH, "{}.tsv".format(dataset))

        dataframe.to_csv(save_filename, sep="\t", index=False)
        return save_filename

    def query_biomart(self, dataset, attributes, host="www.ensembl.org", cache=True, save_filename=None,
                      blocksize=None):
        """
        Args:
            dataset:
            attributes:
            host:
            cache:
            save_filename:
            blocksize:
        """
        bm = BioMart(host=host)
        bm.new_query()
        bm.add_dataset_to_xml(dataset)
        for at in attributes:
            bm.add_attribute_to_xml(at)
        xml_query = bm.get_xml()

        print("Querying {} from {} with attributes {}...".format(dataset, host, attributes))
        results = bm.query(xml_query)

        try:
            if blocksize:
                df = dd.read_csv(StringIO(results), header=None, names=attributes, sep="\t", low_memory=True,
                                 dtype=self.DTYPES, blocksize=None if isinstance(blocksize, bool) else blocksize)
            else:
                df = pd.read_csv(StringIO(results), header=None, names=attributes, sep="\t", low_memory=True,
                                 dtype=self.DTYPES)
        except Exception as e:
            raise ParserError(f'BioMart query result could not be parsed: {results}') from e

        if cache:
            self.cache_dataset(dataset, df, save_filename)
        return df
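# Note on the caching flow above: retrieve_dataset() first looks for a cached
# ~/.openomics/<filename>.tsv and only falls back to a live query (query_biomart(), which
# writes the cache via cache_dataset()) on a cache miss, so the Ensembl* classes below
# avoid re-querying BioMart on repeated instantiation.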


class EnsemblGenes(BioMartManager, Database):
    COLUMNS_RENAME_DICT = {'ensembl_gene_id': 'gene_id',
                           'external_gene_name': 'gene_name',
                           'ensembl_transcript_id': 'transcript_id',
                           'external_transcript_name': 'transcript_name',
                           'rfam': 'Rfams'}

    def __init__(self, biomart="hsapiens_gene_ensembl",
                 attributes=None, host="www.ensembl.org", blocksize=None):
        # Do not call super().__init__()
        """
        Args:
            biomart:
            attributes:
            host:
            blocksize:
        """
        if attributes is None:
            attributes = ['ensembl_gene_id', 'external_gene_name', 'ensembl_transcript_id',
                          'external_transcript_name',
                          'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length',
                          'gene_biotype', 'transcript_biotype', ]
        self.filename = "{}.{}".format(biomart, self.__class__.__name__)

        self.biomart = biomart
        self.host = host
        self.data = self.load_data(dataset=biomart, attributes=attributes, host=self.host,
                                   filename=self.filename, blocksize=blocksize)

        self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)

    def name(self):
        return f"{super().name()} {self.biomart}"

    def load_data(self, dataset, attributes, host, filename=None, blocksize=None):
        """
        Args:
            dataset:
            attributes:
            host:
            filename:
            blocksize:
        """
        df = self.retrieve_dataset(host, dataset, attributes, filename, blocksize=blocksize)
        return df
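# Usage sketch (runs a live BioMart query on first use, then reuses the cached .tsv):
#
#   ensembl = EnsemblGenes(biomart="hsapiens_gene_ensembl")
#   annotations = ensembl.data  # columns include gene_id, gene_name, transcript_id, transcript_name, ...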

class EnsemblGeneSequences(EnsemblGenes):
    def __init__(self, biomart="hsapiens_gene_ensembl",
                 attributes=None, host="www.ensembl.org", blocksize=None):
        """
        Args:
            biomart:
            attributes:
            host:
            blocksize:
        """
        if attributes is None:
            attributes = ['ensembl_gene_id', 'gene_exon_intron', 'gene_flank', 'coding_gene_flank', 'gene_exon',
                          'coding']
        self.filename = "{}.{}".format(biomart, self.__class__.__name__)

        self.biomart = biomart
        self.host = host
        self.data = self.load_data(dataset=biomart, attributes=attributes, host=self.host,
                                   filename=self.filename, blocksize=blocksize)
        self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)


class EnsemblTranscriptSequences(EnsemblGenes):
    def __init__(self, biomart="hsapiens_gene_ensembl",
                 attributes=None, host="www.ensembl.org", blocksize=None):
        """
        Args:
            biomart:
            attributes:
            host:
            blocksize:
        """
        if attributes is None:
            attributes = ['ensembl_transcript_id', 'transcript_exon_intron', 'transcript_flank',
                          'coding_transcript_flank',
                          '5utr', '3utr']
        self.filename = "{}.{}".format(biomart, self.__class__.__name__)

        self.biomart = biomart
        self.host = host
        self.data = self.load_data(dataset=biomart, attributes=attributes, host=self.host,
                                   filename=self.filename, blocksize=blocksize)
        self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)

class EnsemblSNP(EnsemblGenes):
    def __init__(self, biomart="hsapiens_snp",
                 attributes=None, host="www.ensembl.org", blocksize=None):
        """
        Args:
            biomart:
            attributes:
            host:
            blocksize:
        """
        if attributes is None:
            attributes = ['synonym_name', 'variation_names', 'minor_allele',
                          'associated_variant_risk_allele',
                          'ensembl_gene_stable_id', 'ensembl_transcript_stable_id',
                          'phenotype_name',
                          'chr_name', 'chrom_start', 'chrom_end']
        self.filename = "{}.{}".format(biomart, self.__class__.__name__)

        self.biomart = biomart
        self.host = host
        self.data = self.load_data(dataset=biomart, attributes=attributes, host=self.host,
                                   filename=self.filename, blocksize=blocksize)
        self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)


class EnsemblSomaticVariation(EnsemblGenes):
    def __init__(self, biomart="hsapiens_snp_som",
                 attributes=None, host="www.ensembl.org", blocksize=None):
        """
        Args:
            biomart:
            attributes:
            host:
            blocksize:
        """
        if attributes is None:
            attributes = ['somatic_variation_name', 'somatic_source_name', 'somatic_allele', 'somatic_minor_allele',
                          'somatic_clinical_significance', 'somatic_validated', 'somatic_transcript_location',
                          'somatic_mapweight',
                          'somatic_chromosome_start', 'somatic_chromosome_end']
        self.filename = "{}.{}".format(biomart, self.__class__.__name__)

        self.biomart = biomart
        self.host = host
        self.data = self.load_data(dataset=biomart, attributes=attributes, host=self.host,
                                   filename=self.filename, blocksize=blocksize)
        self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT)


class TANRIC(Database):
    def __init__(self, path, file_resources=None, col_rename=None, blocksize=0, verbose=False):
        """
        Args:
            path:
            file_resources:
            col_rename:
            blocksize:
            verbose:
        """
        super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose)

    def load_dataframe(self, file_resources, blocksize=None):
        """
        Args:
            file_resources:
            blocksize:
        """
        pass

    def get_expressions(self, genes_index):
        """Preprocesses the lncRNA expression file obtained from TANRIC (MD Anderson)
        and replaces Ensembl gene IDs with HUGO (HGNC) gene names. This function
        overrides GenomicData.process_expression_table(), which processes
        TCGA-Assembler data. TANRIC lncRNA expression values are log2 transformed.

        Args:
            genes_index:
        """
        df = pd.read_table(self.file_resources["TCGA-LUAD-rnaexpr.tsv"])
        df[genes_index] = df[genes_index].str.replace(r"[.]\d*", "", regex=True)  # Remove the ".#" Ensembl gene version suffix
        df = df[~df[genes_index].duplicated(keep='first')]  # Remove duplicate genes

        # Drop NA gene rows
        df.dropna(axis=0, inplace=True)

        # Transpose matrix to patients rows and genes columns
        df.index = df[genes_index]
        df = df.T.iloc[1:, :]

        # Change index string to bcr_sample_barcode standard
        def change_patient_barcode(s):
            if "Normal" in s:
                return s[s.find('TCGA'):] + "-11A"
            elif "Tumor" in s:
                return s[s.find('TCGA'):] + "-01A"
            else:
                return s

        df.index = df.index.map(change_patient_barcode)
        df.index.name = "gene_id"

        return df
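# Usage sketch (the path, file name mapping, and genes_index column are hypothetical
# placeholders; get_expressions() expects a "TCGA-LUAD-rnaexpr.tsv" entry in file_resources):
#
#   tanric = TANRIC(path="data/TANRIC/", file_resources={"TCGA-LUAD-rnaexpr.tsv": "TCGA-LUAD-rnaexpr.tsv"})
#   lncrna_expr = tanric.get_expressions(genes_index="Gene_ID")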