--- a +++ b/openomics/database/annotation.py @@ -0,0 +1,518 @@ +import os +from io import StringIO +from os.path import expanduser + +import dask.dataframe as dd +import pandas as pd +from bioservices import BioMart +from pandas.errors import ParserError + +from .base import Database + +DEFAULT_CACHE_PATH = os.path.join(expanduser("~"), ".openomics") +DEFAULT_LIBRARY_PATH = os.path.join(expanduser("~"), ".openomics", "databases") + +__all__ = ['ProteinAtlas', 'GTEx', 'NONCODE', 'EnsemblGenes', 'EnsemblGeneSequences', 'EnsemblTranscriptSequences', + 'EnsemblSNP', 'EnsemblSomaticVariation', 'TANRIC'] + +class ProteinAtlas(Database): + """Loads the database from . + + Default path: . + Default file_resources: { + "": "", + "": "", + "": "", + } + """ + COLUMNS_RENAME_DICT = { + "Gene": "protein_name", + "Ensembl": "gene_id", + } + + def __init__(self, path="https://www.proteinatlas.org/download/", file_resources=None, + col_rename=COLUMNS_RENAME_DICT, blocksize=0, verbose=False, **kwargs): + """ + Args: + path: + file_resources: + col_rename: + blocksize: + verbose: + """ + if file_resources is None: + file_resources = {} + file_resources["proteinatlas.tsv.zip"] = "proteinatlas.tsv.zip" + + super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose, **kwargs) + + def load_dataframe(self, file_resources, blocksize=None): + """ + Args: + file_resources: + blocksize: + """ + if blocksize: + df = dd.read_table(file_resources["proteinatlas.tsv"], + blocksize=None if isinstance(blocksize, bool) else blocksize) + else: + df = pd.read_table(file_resources["proteinatlas.tsv"]) + + return df + + def get_expressions(self, index="gene_name", type="Tissue RNA"): + """Returns (NX) expressions from the proteinatlas.tsv table. :param + index: a column name to index by. If column contain multiple values, + then aggregate by median values. :param type: one of {"Tissue RNA", + "Cell RNA", "Blood RNA", "Brain RNA", "RNA - "}. If "RNA - ", then + select all types of expressions. + + Args: + index: + type: + + Returns: + expressions (pd.DataFrame): + """ + columns = "|".join([type, index]) + expressions = self.data.filter(regex=columns).groupby( + index).median() + return expressions + + +class GTEx(Database): + """Loads the database from https://www.gtexportal.org/home/ . + + Default path: "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/" . + Default file_resources: { + "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct": "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz", + "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt": "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", + "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct": "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz", + } + """ + COLUMNS_RENAME_DICT = { + "Name": "gene_id", + "Description": "gene_name" + } + + def __init__(self, path="https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/", + file_resources=None, col_rename=None, blocksize=0, verbose=False, **kwargs): + """ + Args: + path: + file_resources: + col_rename: + blocksize: + verbose: + """ + if file_resources is None: + file_resources = {} + + file_resources["GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"] = \ + "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz" + file_resources["GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"] = \ + "https://storage.googleapis.com/gtex_analysis_v8/annotations/" \ + "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt" + file_resources["GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz"] = \ + "GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz" + + super().__init__(path, file_resources, col_rename=None, blocksize=blocksize, verbose=verbose, **kwargs) + + def load_dataframe(self, file_resources, blocksize=None) -> pd.DataFrame: + """ + Args: + file_resources: + blocksize: + """ + gene_exp_medians = pd.read_csv( + self.file_resources["GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct"], + sep='\t', header=1, skiprows=1) + gene_exp_medians["Name"] = gene_exp_medians["Name"].str.replace("[.]\d*", "", regex=True) + gene_exp_medians = gene_exp_medians.rename(columns=self.COLUMNS_RENAME_DICT) # Must be done here + gene_exp_medians.set_index(["gene_id", "gene_name"], inplace=True) + + # # Sample attributes (needed to get tissue type) + # SampleAttributes = pd.read_table( + # self.file_resources["GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"], + # ) + # SampleAttributes.set_index("SAMPID", inplace=True) + # + # # Transcript expression for all samples + # transcript_exp = pd.read_csv( + # self.file_resources["GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct"], + # sep='\t', header=1, skiprows=1) + # print("transcript_exp", transcript_exp.columns) + # transcript_exp["gene_id"] = transcript_exp["gene_id"].str.replace("[.]\d*", "") + # transcript_exp["transcript_id"] = transcript_exp["transcript_id"].str.replace("[.]\d*", "") + # transcript_exp.set_index(["gene_id", "transcript_id"], inplace=True) + # + # # Join by sample with tissue type, group expressions by tissue type, and compute medians for each + # transcript_exp_medians = transcript_exp.T \ + # .join(SampleAttributes["SMTSD"], how="left") \ + # .groupby("SMTSD") \ + # .median() + # + # # Reset multilevel index + # transcript_exp_medians.index.rename(name=None, inplace=True) + # transcript_exp_medians = transcript_exp_medians.T.set_index( + # pd.MultiIndex.from_tuples(tuples=transcript_exp_medians.T.index, names=["gene_id", "transcript_id"])) + # + # gene_transcript_exp_medians = pd.concat([gene_exp_medians, transcript_exp_medians], join="inner", copy=True) + # print("gene_transcript_exp_medians \n", gene_transcript_exp_medians) + return gene_exp_medians + + +class NONCODE(Database): + """Loads the NONCODE database from http://noncode.org . + + Default path: "http://www.noncode.org/datadownload" . + Default file_resources: { + "NONCODEv6_human.fa": "NONCODEv6_human.fa.gz", + "": "", + "": "", + } + """ + + def __init__(self, path="http://www.noncode.org/datadownload", file_resources=None, col_rename=None, verbose=False, + blocksize=None, **kwargs): + """ + Args: + path: + file_resources: + col_rename: + verbose: + blocksize: + """ + if file_resources is None: + file_resources = {} + file_resources["NONCODEv5_source"] = os.path.join(path, "NONCODEv5_source") + file_resources["NONCODEv5_Transcript2Gene"] = os.path.join(path, "NONCODEv5_Transcript2Gene") + file_resources["NONCODEv5_human.func"] = os.path.join(path, "NONCODEv5_human.func") + + super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose, **kwargs) + + def load_dataframe(self, file_resources, blocksize=None): + """ + Args: + file_resources: + blocksize: + """ + source_df = pd.read_table(file_resources["NONCODEv5_source"], header=None) + source_df.columns = ["NONCODE Transcript ID", "name type", "Gene ID"] + + transcript2gene_df = pd.read_table(file_resources["NONCODEv5_Transcript2Gene"], header=None) + transcript2gene_df.columns = ["NONCODE Transcript ID", "NONCODE Gene ID"] + + if blocksize: + self.noncode_func_df = dd.read_table(file_resources["NONCODEv5_human.func"], header=None, + blocksize=None if isinstance(blocksize, bool) else blocksize) + else: + self.noncode_func_df = pd.read_table(file_resources["NONCODEv5_human.func"], header=None) + self.noncode_func_df.columns = ["NONCODE Gene ID", "GO terms"] + self.noncode_func_df.set_index("NONCODE Gene ID", inplace=True) + + # Convert to NONCODE transcript ID for the functional annotation data + self.noncode_func_df["NONCODE Transcript ID"] = self.noncode_func_df.index.map( + pd.Series(transcript2gene_df['NONCODE Transcript ID'].values, + index=transcript2gene_df['NONCODE Gene ID']).to_dict()) + + # Convert NONCODE transcript ID to gene names + source_gene_names_df = source_df[source_df["name type"] == "NAME"].copy() + + self.noncode_func_df["Gene Name"] = self.noncode_func_df["NONCODE Transcript ID"].map( + pd.Series(source_gene_names_df['Gene ID'].values, + index=source_gene_names_df['NONCODE Transcript ID']).to_dict()) + + +class BioMartManager: + """ + A base class with functions to query Ensembl Biomarts "https://www.ensembl.org/biomart". + """ + DTYPES = { + 'entrezgene_id': 'str', + 'gene_biotype': 'category', + 'transcript_biotype': 'category', + 'chromosome_name': 'category', + 'transcript_start': 'int', + 'transcript_end': 'int', + 'transcript_length': 'int', + 'mirbase_id': 'str'} + + def __init__(self, dataset, attributes, host, filename): + """ + Args: + dataset: + attributes: + host: + filename: + """ + pass # Does not instantiate + + def retrieve_dataset(self, host, dataset, attributes, filename, blocksize=None): + """ + Args: + host: + dataset: + attributes: + filename: + blocksize: + """ + filename = os.path.join(DEFAULT_CACHE_PATH, f"{filename}.tsv") + + args = dict( + sep="\t", + low_memory=True, + dtype=self.DTYPES, + ) + + if os.path.exists(filename): + if blocksize: + df = dd.read_csv(filename, blocksize=None if isinstance(blocksize, bool) else blocksize, **args) + else: + df = pd.read_csv(filename, **args) + else: + df = self.query_biomart(host=host, dataset=dataset, attributes=attributes, + cache=True, save_filename=filename) + return df + + def cache_dataset(self, dataset, dataframe, save_filename): + """ + Args: + dataset: + dataframe: + save_filename: + """ + if not os.path.exists(DEFAULT_CACHE_PATH): + os.makedirs(DEFAULT_CACHE_PATH, exist_ok=True) + + if save_filename is None: + save_filename = os.path.join(DEFAULT_CACHE_PATH, "{}.tsv".format(dataset)) + + dataframe.to_csv(save_filename, sep="\t", index=False) + return save_filename + + def query_biomart(self, dataset, attributes, host="www.ensembl.org", cache=True, save_filename=None, + blocksize=None): + """ + Args: + dataset: + attributes: + host: + cache: + save_filename: + blocksize: + """ + bm = BioMart(host=host) + bm.new_query() + bm.add_dataset_to_xml(dataset) + for at in attributes: + bm.add_attribute_to_xml(at) + xml_query = bm.get_xml() + + print("Querying {} from {} with attributes {}...".format(dataset, host, attributes)) + results = bm.query(xml_query) + + try: + if blocksize: + df = dd.read_csv(StringIO(results), header=None, names=attributes, sep="\t", low_memory=True, + dtype=self.DTYPES, blocksize=None if isinstance(blocksize, bool) else blocksize) + else: + df = pd.read_csv(StringIO(results), header=None, names=attributes, sep="\t", low_memory=True, + dtype=self.DTYPES) + except Exception as e: + raise ParserError(f'BioMart Query Result: {results}') + + if cache: + self.cache_dataset(dataset, df, save_filename) + return df + + +class EnsemblGenes(BioMartManager, Database): + COLUMNS_RENAME_DICT = {'ensembl_gene_id': 'gene_id', + 'external_gene_name': 'gene_name', + 'ensembl_transcript_id': 'transcript_id', + 'external_transcript_name': 'transcript_name', + 'rfam': 'Rfams'} + + def __init__(self, biomart="hsapiens_gene_ensembl", + attributes=None, host="www.ensembl.org", blocksize=None): + # Do not call super().__init__() + """ + Args: + biomart: + attributes: + host: + blocksize: + """ + if attributes is None: + attributes = ['ensembl_gene_id', 'external_gene_name', 'ensembl_transcript_id', + 'external_transcript_name', + 'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length', + 'gene_biotype', 'transcript_biotype', ] + self.filename = "{}.{}".format(biomart, self.__class__.__name__) + + self.biomart = biomart + self.host = host + self.data = self.load_data(dataset=biomart, attributes=attributes, host=self.host, + filename=self.filename, blocksize=blocksize) + + self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT) + + def name(self): + return f"{super().name()} {self.biomart}" + + def load_data(self, dataset, attributes, host, filename=None, blocksize=None): + """ + Args: + dataset: + attributes: + host: + filename: + blocksize: + """ + df = self.retrieve_dataset(host, dataset, attributes, filename, blocksize=blocksize) + return df + +class EnsemblGeneSequences(EnsemblGenes): + def __init__(self, biomart="hsapiens_gene_ensembl", + attributes=None, host="www.ensembl.org", blocksize=None): + """ + Args: + biomart: + attributes: + host: + blocksize: + """ + if attributes is None: + attributes = ['ensembl_gene_id', 'gene_exon_intron', 'gene_flank', 'coding_gene_flank', 'gene_exon', + 'coding'] + self.filename = "{}.{}".format(biomart, self.__class__.__name__) + + self.biomart = biomart + self.host = host + self.df = self.load_data(dataset=biomart, attributes=attributes, host=self.host, + filename=self.filename, blocksize=blocksize) + self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT) + + +class EnsemblTranscriptSequences(EnsemblGenes): + def __init__(self, biomart="hsapiens_gene_ensembl", + attributes=None, host="www.ensembl.org", blocksize=None): + """ + Args: + biomart: + attributes: + host: + blocksize: + """ + if attributes is None: + attributes = ['ensembl_transcript_id', 'transcript_exon_intron', 'transcript_flank', + 'coding_transcript_flank', + '5utr', '3utr'] + self.filename = "{}.{}".format(biomart, self.__class__.__name__) + + self.biomart = biomart + self.host = host + self.df = self.load_data(dataset=biomart, attributes=attributes, host=self.host, + filename=self.filename, blocksize=blocksize) + self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT) + +class EnsemblSNP(EnsemblGenes): + def __init__(self, biomart="hsapiens_snp", + attributes=None, host="www.ensembl.org", blocksize=None): + """ + Args: + biomart: + attributes: + host: + blocksize: + """ + if attributes is None: + attributes = ['synonym_name', 'variation_names', 'minor_allele', + 'associated_variant_risk_allele', + 'ensembl_gene_stable_id', 'ensembl_transcript_stable_id', + 'phenotype_name', + 'chr_name', 'chrom_start', 'chrom_end'] + self.filename = "{}.{}".format(biomart, self.__class__.__name__) + + self.biomart = biomart + self.host = host + self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT) + + +class EnsemblSomaticVariation(EnsemblGenes): + def __init__(self, biomart="hsapiens_snp_som", + attributes=None, host="www.ensembl.org", blocksize=None): + """ + Args: + biomart: + attributes: + host: + blocksize: + """ + if attributes is None: + attributes = ['somatic_variation_name', 'somatic_source_name', 'somatic_allele', 'somatic_minor_allele', + 'somatic_clinical_significance', 'somatic_validated', 'somatic_transcript_location', + 'somatic_mapweight', + 'somatic_chromosome_start', 'somatic_chromosome_end'] + self.filename = "{}.{}".format(biomart, self.__class__.__name__) + + self.biomart = biomart + self.host = host + self.data = self.data.rename(columns=self.COLUMNS_RENAME_DICT) + + +class TANRIC(Database): + def __init__(self, path, file_resources=None, col_rename=None, blocksize=0, verbose=False): + """ + Args: + path: + file_resources: + col_rename: + blocksize: + verbose: + """ + super().__init__(path, file_resources, col_rename=col_rename, blocksize=blocksize, verbose=verbose) + + def load_dataframe(self, file_resources, blocksize=None): + """ + Args: + file_resources: + blocksize: + """ + pass + + def get_expressions(self, genes_index): + """Preprocess LNCRNA expression file obtained from TANRIC MDAnderson, + and replace ENSEMBL gene ID to HUGO gene names (HGNC). This function + overwrites the GenomicData.process_expression_table() function which + processes TCGA-Assembler data. TANRIC LNCRNA expression values are log2 + transformed + + Args: + genes_index: + """ + df = pd.read_table(self.file_resources["TCGA-LUAD-rnaexpr.tsv"]) + df[genes_index] = df[genes_index].str.replace("[.]\d*", "") # Removing .# ENGS gene version number at the end + df = df[~df[genes_index].duplicated(keep='first')] # Remove duplicate genes + + # Drop NA gene rows + df.dropna(axis=0, inplace=True) + + # Transpose matrix to patients rows and genes columns + df.index = df[genes_index] + df = df.T.iloc[1:, :] + + # Change index string to bcr_sample_barcode standard + def change_patient_barcode(s): + if "Normal" in s: + return s[s.find('TCGA'):] + "-11A" + elif "Tumor" in s: + return s[s.find('TCGA'):] + "-01A" + else: + return s + + df.index = df.index.map(change_patient_barcode) + df.index.name = "gene_id" + + return df