Switch to unified view

a b/openomics/database/disease.py
1
from abc import abstractmethod
2
3
import pandas as pd
4
from openomics.database.base import Database, Annotatable
5
6
# Names exported by a wildcard import of this module.
__all__ = ['DiseaseAssociation', 'MalaCards', 'DisGeNet', 'HMDD', 'LncRNADisease']
7
8
class DiseaseAssociation(Database):
    """Common base for the disease-association databases in this module.

    Each subclass provides its own ``load_dataframe`` and a column-rename
    mapping that produces a ``gene_name`` column and an
    ``Annotatable.DISEASE_ASSOCIATIONS_COL`` column.
    """

    def __init__(self, path, file_resources=None, **kwargs):
        """
        Args:
            path: Forwarded unchanged to ``Database.__init__``.
            file_resources: Forwarded unchanged to ``Database.__init__``.
            **kwargs: Extra keyword arguments forwarded to ``Database.__init__``.
        """
        super().__init__(path, file_resources, **kwargs)

    # Deliberately NOT decorated with ``@abstractmethod``: this is a complete
    # default implementation that every subclass in this module inherits
    # without overriding it.
    def get_disease_assocs(self, index="gene_name"):
        """Collect the unique disease names associated with each ``index`` value.

        Args:
            index: Column of ``self.data`` to group by. Defaults to
                ``"gene_name"``.

        Returns:
            The result of grouping ``self.data`` by ``index`` and calling
            ``.unique()`` on the ``Annotatable.DISEASE_ASSOCIATIONS_COL``
            column of each group.
        """
        # NOTE(review): ``self.data`` is presumably populated by
        # ``Database.__init__`` -- confirm against the base class.
        return self.data.groupby(index)[Annotatable.DISEASE_ASSOCIATIONS_COL].unique()
25
26
27
class MalaCards(DiseaseAssociation):
    """Loads the MalaCards database from "http://zdzlab.einstein.yu.edu/1/hedd/" .

    Default path: "http://zdzlab.einstein.yu.edu/1/hedd/" .
    Default file_resources: {
        "MalaCards.csv": "download.action.php?filename=DataDownload/MalaCards.csv",
    }
    """
    # Source column name -> column name passed as the default ``col_rename``.
    COLUMNS_RENAME_DICT = {
        "geneSymbol": "gene_name",
        "maladyMainName": Annotatable.DISEASE_ASSOCIATIONS_COL,
    }

    def __init__(self, path="http://zdzlab.einstein.yu.edu/1/hedd/", file_resources=None,
                 col_rename=COLUMNS_RENAME_DICT, **kwargs):
        """
        Args:
            path: Base location of the MalaCards files.
            file_resources: Mapping of resource name to file location. When
                ``None``, a single ``"MalaCards.csv"`` entry is used.
            col_rename: Column-rename mapping forwarded to the parent
                constructor. Defaults to ``COLUMNS_RENAME_DICT``.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructor.
        """
        if file_resources is None:
            file_resources = {
                "MalaCards.csv": "download.action.php?filename=DataDownload/MalaCards.csv",
            }

        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        # type: (dict, int) -> pd.DataFrame
        """Read the ``"MalaCards.csv"`` resource as a comma-separated table.

        Args:
            file_resources: Mapping that must contain a ``"MalaCards.csv"``
                entry readable by ``pandas.read_csv``.
            blocksize: Accepted for interface compatibility; not used here.

        Returns:
            The table exactly as parsed by ``pandas.read_csv``.
        """
        return pd.read_csv(file_resources["MalaCards.csv"])
64
65
66
class DisGeNet(DiseaseAssociation):
    """Loads the DisGeNet database from "https://www.disgenet.org/static/disgenet_ap1/files/downloads/" .

    Default path: "https://www.disgenet.org/static/disgenet_ap1/files/downloads/" .
    Default file_resources: {
        "curated_gene_disease_associations.tsv": "curated_gene_disease_associations.tsv.gz",
        "all_gene_disease_associations.tsv": "all_gene_disease_associations.tsv.gz",
    }
    """
    # Source column name -> column name passed as the default ``col_rename``.
    COLUMNS_RENAME_DICT = {
        "geneSymbol": "gene_name",
        "diseaseName": Annotatable.DISEASE_ASSOCIATIONS_COL,
    }

    def __init__(self, path="https://www.disgenet.org/static/disgenet_ap1/files/downloads/",
                 file_resources=None, curated=True, col_rename=COLUMNS_RENAME_DICT,
                 **kwargs):
        """
        Args:
            path: Base location of the DisGeNet download files.
            file_resources: Mapping of resource name to file location. When
                ``None``, both the curated and the full association files
                are registered.
            curated: When truthy, ``load_dataframe`` reads the curated
                association file; otherwise it reads the full one.
            col_rename: Column-rename mapping forwarded to the parent
                constructor. Defaults to ``COLUMNS_RENAME_DICT``.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructor.
        """
        if file_resources is None:
            file_resources = {
                "curated_gene_disease_associations.tsv": "curated_gene_disease_associations.tsv.gz",
                "all_gene_disease_associations.tsv": "all_gene_disease_associations.tsv.gz",
            }

        # Set before calling the parent constructor, as in the original order.
        self.curated = curated
        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        """Read the selected association file and lower-case the disease names.

        Args:
            file_resources: Mapping containing the entry selected by
                ``self.curated`` (curated or full association file).
            blocksize: Accepted for interface compatibility; not used here.

        Returns:
            A table with the ``geneSymbol``, ``diseaseName`` and ``score``
            columns, where ``diseaseName`` has been lower-cased.
        """
        if self.curated:
            resource_key = "curated_gene_disease_associations.tsv"
        else:
            resource_key = "all_gene_disease_associations.tsv"

        table = pd.read_table(file_resources[resource_key],
                              usecols=["geneSymbol", "diseaseName", "score"])
        table["diseaseName"] = table["diseaseName"].str.lower()
        return table
113
114
115
class HMDD(DiseaseAssociation):
    """Loads the HMDD database from "http://www.cuilab.cn/static/hmdd3" .

    Default path: "http://www.cuilab.cn/static/hmdd3/data/" .
    Default file_resources: {
        "alldata.txt": "alldata.txt",
    }
    """
    # Source column name -> column name passed as the default ``col_rename``.
    COLUMNS_RENAME_DICT = {
        "mir": "gene_name",
        "disease": Annotatable.DISEASE_ASSOCIATIONS_COL,
    }

    def __init__(self, path="http://www.cuilab.cn/static/hmdd3/data/",
                 file_resources=None, col_rename=COLUMNS_RENAME_DICT,
                 **kwargs):
        """
        Args:
            path: Base location of the HMDD data files.
            file_resources: Mapping of resource name to file location. When
                ``None``, a single ``"alldata.txt"`` entry is used.
            col_rename: Column-rename mapping forwarded to the parent
                constructor. Defaults to ``COLUMNS_RENAME_DICT``.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructor.
        """
        if file_resources is None:
            file_resources = {"alldata.txt": "alldata.txt"}

        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        """Read the tab-separated ``"alldata.txt"`` resource and lower-case diseases.

        Args:
            file_resources: Mapping that must contain an ``"alldata.txt"``
                entry readable by ``pandas.read_csv``.
            blocksize: Accepted for interface compatibility; not used here.

        Returns:
            The parsed table with its ``disease`` column lower-cased.
        """
        raw = pd.read_csv(file_resources["alldata.txt"], sep="\t", encoding="unicode_escape")
        return raw.assign(disease=raw["disease"].str.lower())
153
154
155
class LncRNADisease(DiseaseAssociation):
    """Loads the LncRNADisease database from "http://www.cuilab.cn/files/images/ldd/" .

    Default path: "http://www.cuilab.cn/files/images/ldd/" .
    Default file_resources: {
        "data_v2017.txt": "data_v2017.txt",
    }
    """
    # Source column name -> column name passed as the default ``col_rename``.
    COLUMNS_RENAME_DICT = {
        "LncRNA name": "gene_name",
        "Disease name": Annotatable.DISEASE_ASSOCIATIONS_COL
    }

    def __init__(self, path="http://www.cuilab.cn/files/images/ldd/",
                 file_resources=None, species="Human", col_rename=COLUMNS_RENAME_DICT,
                 **kwargs):
        """
        Args:
            path: Base location of the LncRNADisease data files.
            file_resources: Mapping of resource name to file location. When
                ``None``, a single ``"data_v2017.txt"`` entry is used.
            species: Value of the ``Species`` column to keep when loading.
                Defaults to ``"Human"``.
            col_rename: Column-rename mapping forwarded to the parent
                constructor. Defaults to ``COLUMNS_RENAME_DICT``.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructor.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["data_v2017.txt"] = "data_v2017.txt"

        # Must be set before the parent constructor runs, as in the original
        # ordering; ``load_dataframe`` reads ``self.species``.
        self.species = species
        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        """Read the header-less table, keep one species and lower-case diseases.

        Args:
            file_resources: Mapping that must contain a ``"data_v2017.txt"``
                entry readable by ``pandas.read_csv``.
            blocksize: Accepted for interface compatibility; not used here.

        Returns:
            Rows whose ``Species`` equals ``self.species``, with the
            ``Disease name`` column lower-cased.
        """
        # Read from the ``file_resources`` argument (not ``self.file_resources``)
        # so the parameter is honoured, matching the other loaders in this module.
        df = pd.read_csv(file_resources["data_v2017.txt"], header=None, sep="\t", encoding="unicode_escape")
        # The file has no header row, so the column names are assigned here.
        df.columns = ["LncRNA name", "Disease name", "Dysfunction type", "Description", "Chr",
                      "Start", "End", "Strand", "Species", "Alias", "Sequence", "Reference"]
        # Take an explicit copy of the filtered rows so the column assignment
        # below writes to an independent frame instead of a slice of ``df``
        # (avoids pandas' SettingWithCopy warning).
        df = df[df["Species"] == self.species].copy()
        df["Disease name"] = df["Disease name"].str.lower()
        return df
        return df