OpenOmics / Git / Diff of /openomics/database/disease.py

Models:
AlyssaS/
OpenOmics
Downloads: 1
Diff of /openomics/database/disease.py [000000] .. [548210]
Switch to side-by-side view

--- a
+++ b/openomics/database/disease.py
@@ -0,0 +1,190 @@
+from abc import abstractmethod
+
+import pandas as pd
+from openomics.database.base import Database, Annotatable
+
+__all__ = ['DiseaseAssociation', 'MalaCards', 'DisGeNet', 'HMDD', 'LncRNADisease']  # names exported by a star-import of this module
+
+class DiseaseAssociation(Database):
+    def __init__(self, path, file_resources=None, **kwargs):
+        """
+        Args:
+            path:
+            file_resources:
+            **kwargs:
+        """
+        super().__init__(path, file_resources, **kwargs)
+
+    @abstractmethod
+    def get_disease_assocs(self, index="gene_name"):
+        """
+        Args:
+            index:
+        """
+        return self.data.groupby(index)[Annotatable.DISEASE_ASSOCIATIONS_COL].unique()
+
+
+class MalaCards(DiseaseAssociation):
+    """Loads the MalaCards database from "http://zdzlab.einstein.yu.edu/1/hedd/" .
+
+    Default path: "http://zdzlab.einstein.yu.edu/1/hedd/" .
+    Default file_resources: {
+        "MalaCards.csv": "download.action.php?filename=DataDownload/MalaCards.csv",
+    }
+    """
+    COLUMNS_RENAME_DICT = {
+        "geneSymbol": "gene_name",
+        "maladyMainName": Annotatable.DISEASE_ASSOCIATIONS_COL
+    }
+
+    def __init__(self, path="http://zdzlab.einstein.yu.edu/1/hedd/", file_resources=None,
+                 col_rename=COLUMNS_RENAME_DICT, **kwargs):
+        """
+        Args:
+            path:
+            file_resources:
+            col_rename:
+            **kwargs:
+        """
+        if file_resources is None:
+            file_resources = {}
+            file_resources["MalaCards.csv"] = "download.action.php?filename=DataDownload/MalaCards.csv"
+
+        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)
+
+    def load_dataframe(self, file_resources, blocksize=None):
+        # type: (dict, int) -> pd.DataFrame
+        """
+        Args:
+            file_resources:
+            blocksize:
+        """
+        df = pd.read_csv(file_resources["MalaCards.csv"])
+        return df
+
+
+class DisGeNet(DiseaseAssociation):
+    """Loads the  database from  .
+
+    Default path:  .
+    Default file_resources: {
+        "": "",
+        "": "",
+        "": "",
+    }
+    """
+    COLUMNS_RENAME_DICT = {"geneSymbol": "gene_name",
+                           "diseaseName": Annotatable.DISEASE_ASSOCIATIONS_COL}
+
+    def __init__(self, path="https://www.disgenet.org/static/disgenet_ap1/files/downloads/",
+                 file_resources=None, curated=True, col_rename=COLUMNS_RENAME_DICT,
+                 **kwargs):
+        """
+        Args:
+            path:
+            file_resources:
+            curated:
+            col_rename:
+            **kwargs:
+        """
+        if file_resources is None:
+            file_resources = {}
+            file_resources["curated_gene_disease_associations.tsv"] = "curated_gene_disease_associations.tsv.gz"
+            file_resources["all_gene_disease_associations.tsv"] = "all_gene_disease_associations.tsv.gz"
+
+        self.curated = curated
+        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)
+
+    def load_dataframe(self, file_resources, blocksize=None):
+        """
+        Args:
+            file_resources:
+            blocksize:
+        """
+        if self.curated:
+            df = pd.read_table(file_resources["curated_gene_disease_associations.tsv"],
+                               usecols=["geneSymbol", "diseaseName", "score"])
+        else:
+            df = pd.read_table(file_resources["all_gene_disease_associations.tsv"],
+                               usecols=["geneSymbol", "diseaseName", "score"])
+
+        df["diseaseName"] = df["diseaseName"].str.lower()
+        return df
+
+
+class HMDD(DiseaseAssociation):
+    """Loads the HMDD database from "http://www.cuilab.cn/static/hmdd3" .
+
+    Default path: "http://www.cuilab.cn/static/hmdd3/data/" .
+    Default file_resources: {
+        "alldata.txt": "alldata.txt",
+    }
+    """
+    COLUMNS_RENAME_DICT = {
+        "mir": "gene_name",
+        "disease": Annotatable.DISEASE_ASSOCIATIONS_COL
+    }
+
+    def __init__(self, path="http://www.cuilab.cn/static/hmdd3/data/",
+                 file_resources=None, col_rename=COLUMNS_RENAME_DICT,
+                 **kwargs):
+        """
+        Args:
+            path:
+            file_resources:
+            col_rename:
+            **kwargs:
+        """
+        if file_resources is None:
+            file_resources = {}
+            file_resources["alldata.txt"] = "alldata.txt"
+
+        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)
+
+    def load_dataframe(self, file_resources, blocksize=None):
+        """
+        Args:
+            file_resources:
+            blocksize:
+        """
+        df = pd.read_csv(file_resources["alldata.txt"], sep="\t", encoding="unicode_escape")
+        df["disease"] = df["disease"].str.lower()
+        return df
+
+
+class LncRNADisease(DiseaseAssociation):
+    COLUMNS_RENAME_DICT = {
+        "LncRNA name": "gene_name",
+        "Disease name": Annotatable.DISEASE_ASSOCIATIONS_COL
+    }
+
+    def __init__(self, path="http://www.cuilab.cn/files/images/ldd/",
+                 file_resources=None, species="Human", col_rename=COLUMNS_RENAME_DICT,
+                 **kwargs):
+        """
+        Args:
+            path:
+            file_resources:
+            species:
+            col_rename:
+            **kwargs:
+        """
+        if file_resources is None:
+            file_resources = {}
+            file_resources["data_v2017.txt"] = "data_v2017.txt"
+
+        self.species = species
+        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)
+
+    def load_dataframe(self, file_resources, blocksize=None):
+        """
+        Args:
+            file_resources:
+            blocksize:
+        """
+        df = pd.read_csv(self.file_resources["data_v2017.txt"], header=None, sep="\t", encoding="unicode_escape")
+        df.columns = ["LncRNA name", "Disease name", "Dysfunction type", "Description", "Chr",
+                      "Start", "End", "Strand", "Species", "Alias", "Sequence", "Reference"]
+        df = df[df["Species"] == self.species]
+        df["Disease name"] = df["Disease name"].str.lower()
+        return df