|
a |
|
b/openomics/database/disease.py |
|
|
1 |
from abc import abstractmethod |
|
|
2 |
|
|
|
3 |
import pandas as pd |
|
|
4 |
from openomics.database.base import Database, Annotatable |
|
|
5 |
|
|
|
6 |
__all__ = ['DiseaseAssociation', 'MalaCards', 'DisGeNet', 'HMDD', 'LncRNADisease'] |
|
|
7 |
|
|
|
8 |
class DiseaseAssociation(Database):
    """Common base for the gene/RNA-to-disease association databases below.

    The subclasses in this module each provide ``load_dataframe`` and pass a
    ``col_rename`` mapping whose targets are ``"gene_name"`` and
    ``Annotatable.DISEASE_ASSOCIATIONS_COL``.
    NOTE(review): the renaming itself is presumably applied by ``Database``;
    confirm against ``openomics.database.base``.
    """

    def __init__(self, path, file_resources=None, **kwargs):
        """Forward every argument unchanged to the ``Database`` constructor.

        Args:
            path: Location of the database, passed through as-is.
            file_resources: Mapping describing the files to load, or None;
                passed through as-is.
            **kwargs: Any further keyword arguments for ``Database``.
        """
        super().__init__(path, file_resources, **kwargs)

    # Deliberately NOT decorated with ``@abstractmethod``: this is a complete
    # default implementation and none of the subclasses in this module
    # override it, so marking it abstract would make them un-instantiable
    # under an ABC-enforcing base class (and is a no-op otherwise).
    def get_disease_assocs(self, index="gene_name"):
        """Collect the unique disease associations for each ``index`` value.

        Args:
            index: Column (or columns) of ``self.data`` to group by.
                Defaults to ``"gene_name"``.

        Returns:
            The result of grouping ``self.data`` by ``index`` and calling
            ``.unique()`` on the ``Annotatable.DISEASE_ASSOCIATIONS_COL``
            column (for a pandas DataFrame, a Series keyed by ``index``
            whose values are arrays of unique entries).
        """
        return self.data.groupby(index)[Annotatable.DISEASE_ASSOCIATIONS_COL].unique()
|
|
25 |
|
|
|
26 |
|
|
|
27 |
class MalaCards(DiseaseAssociation):
    """Disease associations read from the MalaCards CSV download.

    Default path: "http://zdzlab.einstein.yu.edu/1/hedd/" .
    Default file_resources: {
        "MalaCards.csv": "download.action.php?filename=DataDownload/MalaCards.csv",
    }
    """
    # Source column name -> name used by the rest of the package.
    COLUMNS_RENAME_DICT = {"geneSymbol": "gene_name",
                           "maladyMainName": Annotatable.DISEASE_ASSOCIATIONS_COL}

    def __init__(self, path="http://zdzlab.einstein.yu.edu/1/hedd/", file_resources=None,
                 col_rename=COLUMNS_RENAME_DICT, **kwargs):
        """Set up the default MalaCards resource and delegate to the parent.

        Args:
            path: Base location of the database files.
            file_resources: Mapping of resource name to file location. When
                None, a mapping containing only the default "MalaCards.csv"
                entry is used.
            col_rename: Column rename mapping forwarded to the parent
                constructor; defaults to ``COLUMNS_RENAME_DICT``.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructor.
        """
        if file_resources is None:
            file_resources = {
                "MalaCards.csv": "download.action.php?filename=DataDownload/MalaCards.csv",
            }

        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        # type: (dict, int) -> pd.DataFrame
        """Read the "MalaCards.csv" resource into a DataFrame.

        Args:
            file_resources: Mapping that must contain a "MalaCards.csv" entry
                readable by ``pd.read_csv``.
            blocksize: Accepted for interface compatibility; not used here.

        Returns:
            The DataFrame parsed from the CSV, with its original columns.
        """
        return pd.read_csv(file_resources["MalaCards.csv"])
|
|
64 |
|
|
|
65 |
|
|
|
66 |
class DisGeNet(DiseaseAssociation):
    """Gene-disease associations read from the DisGeNET download files.

    Default path: "https://www.disgenet.org/static/disgenet_ap1/files/downloads/" .
    Default file_resources: {
        "curated_gene_disease_associations.tsv": "curated_gene_disease_associations.tsv.gz",
        "all_gene_disease_associations.tsv": "all_gene_disease_associations.tsv.gz",
    }
    """
    # Source column name -> name used by the rest of the package.
    COLUMNS_RENAME_DICT = {
        "geneSymbol": "gene_name",
        "diseaseName": Annotatable.DISEASE_ASSOCIATIONS_COL,
    }

    def __init__(self, path="https://www.disgenet.org/static/disgenet_ap1/files/downloads/",
                 file_resources=None, curated=True, col_rename=COLUMNS_RENAME_DICT,
                 **kwargs):
        """Set up the default DisGeNET resources and delegate to the parent.

        Args:
            path: Base location of the database files.
            file_resources: Mapping of resource name to file location. When
                None, the two default entries (curated and all associations)
                are used.
            curated: When truthy, ``load_dataframe`` reads the curated
                associations file; otherwise it reads the file with all
                associations.
            col_rename: Column rename mapping forwarded to the parent
                constructor; defaults to ``COLUMNS_RENAME_DICT``.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructor.
        """
        if file_resources is None:
            file_resources = {
                "curated_gene_disease_associations.tsv": "curated_gene_disease_associations.tsv.gz",
                "all_gene_disease_associations.tsv": "all_gene_disease_associations.tsv.gz",
            }

        # Must be set before the parent constructor runs, since
        # load_dataframe() reads it.
        self.curated = curated
        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        """Read the selected associations table and lower-case disease names.

        Args:
            file_resources: Mapping containing the entry selected by
                ``self.curated`` ("curated_gene_disease_associations.tsv" or
                "all_gene_disease_associations.tsv").
            blocksize: Accepted for interface compatibility; not used here.

        Returns:
            A DataFrame with the "geneSymbol", "diseaseName" and "score"
            columns, where "diseaseName" has been lower-cased.
        """
        resource_key = ("curated_gene_disease_associations.tsv" if self.curated
                        else "all_gene_disease_associations.tsv")
        associations = pd.read_table(file_resources[resource_key],
                                     usecols=["geneSymbol", "diseaseName", "score"])

        associations["diseaseName"] = associations["diseaseName"].str.lower()
        return associations
|
|
113 |
|
|
|
114 |
|
|
|
115 |
class HMDD(DiseaseAssociation):
    """Disease associations read from the HMDD "alldata.txt" table.

    Database home: "http://www.cuilab.cn/static/hmdd3" .
    Default path: "http://www.cuilab.cn/static/hmdd3/data/" .
    Default file_resources: {
        "alldata.txt": "alldata.txt",
    }
    """
    # Source column name -> name used by the rest of the package.
    COLUMNS_RENAME_DICT = {"mir": "gene_name",
                           "disease": Annotatable.DISEASE_ASSOCIATIONS_COL}

    def __init__(self, path="http://www.cuilab.cn/static/hmdd3/data/",
                 file_resources=None, col_rename=COLUMNS_RENAME_DICT,
                 **kwargs):
        """Set up the default HMDD resource and delegate to the parent.

        Args:
            path: Base location of the database files.
            file_resources: Mapping of resource name to file location. When
                None, a mapping containing only the default "alldata.txt"
                entry is used.
            col_rename: Column rename mapping forwarded to the parent
                constructor; defaults to ``COLUMNS_RENAME_DICT``.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructor.
        """
        if file_resources is None:
            file_resources = {"alldata.txt": "alldata.txt"}

        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        """Read the tab-separated "alldata.txt" and lower-case disease names.

        Args:
            file_resources: Mapping that must contain an "alldata.txt" entry
                readable by ``pd.read_csv``.
            blocksize: Accepted for interface compatibility; not used here.

        Returns:
            The parsed DataFrame with its "disease" column lower-cased.
        """
        table = pd.read_csv(
            file_resources["alldata.txt"],
            sep="\t",
            encoding="unicode_escape",
        )
        table["disease"] = table["disease"].str.lower()
        return table
|
|
153 |
|
|
|
154 |
|
|
|
155 |
class LncRNADisease(DiseaseAssociation):
    """Disease associations read from the LncRNADisease "data_v2017.txt" table.

    Default path: "http://www.cuilab.cn/files/images/ldd/" .
    Default file_resources: {
        "data_v2017.txt": "data_v2017.txt",
    }
    """
    # Source column name -> name used by the rest of the package.
    COLUMNS_RENAME_DICT = {
        "LncRNA name": "gene_name",
        "Disease name": Annotatable.DISEASE_ASSOCIATIONS_COL
    }

    def __init__(self, path="http://www.cuilab.cn/files/images/ldd/",
                 file_resources=None, species="Human", col_rename=COLUMNS_RENAME_DICT,
                 **kwargs):
        """Set up the default LncRNADisease resource and delegate to the parent.

        Args:
            path: Base location of the database files.
            file_resources: Mapping of resource name to file location. When
                None, a mapping containing only the default "data_v2017.txt"
                entry is used.
            species: Only rows whose "Species" column equals this value are
                kept by ``load_dataframe``. Defaults to "Human".
            col_rename: Column rename mapping forwarded to the parent
                constructor; defaults to ``COLUMNS_RENAME_DICT``.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructor.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["data_v2017.txt"] = "data_v2017.txt"

        # Must be set before the parent constructor runs, since
        # load_dataframe() reads it.
        self.species = species
        super().__init__(path, file_resources, col_rename=col_rename, **kwargs)

    def load_dataframe(self, file_resources, blocksize=None):
        """Read "data_v2017.txt", keep one species, lower-case disease names.

        Args:
            file_resources: Mapping that must contain a "data_v2017.txt"
                entry readable by ``pd.read_csv``.
            blocksize: Accepted for interface compatibility; not used here.

        Returns:
            A DataFrame restricted to rows where "Species" equals
            ``self.species``, with the "Disease name" column lower-cased.

        Raises:
            ValueError: Raised by pandas if the file does not have exactly
                the twelve columns named below.
        """
        # Read from the mapping that was passed in (as the other loaders in
        # this module do) rather than from self.file_resources.
        df = pd.read_csv(file_resources["data_v2017.txt"], header=None, sep="\t", encoding="unicode_escape")
        # The file has no header row, so the column names are assigned here.
        df.columns = ["LncRNA name", "Disease name", "Dysfunction type", "Description", "Chr",
                      "Start", "End", "Strand", "Species", "Alias", "Sequence", "Reference"]
        # Copy the filtered rows so the column assignment below writes to an
        # independent frame instead of a slice of the original
        # (avoids pandas' SettingWithCopyWarning).
        df = df[df["Species"] == self.species].copy()
        df["Disease name"] = df["Disease name"].str.lower()
        return df