|
a |
|
b/openomics/database/interaction.py |
|
|
1 |
import copy |
|
|
2 |
import os |
|
|
3 |
from abc import abstractmethod |
|
|
4 |
from collections.abc import Iterable |
|
|
5 |
from typing import List, Dict, Any, Union, Optional |
|
|
6 |
|
|
|
7 |
import dask.dataframe as dd |
|
|
8 |
import networkx as nx |
|
|
9 |
import pandas as pd |
|
|
10 |
import scipy.sparse as ssp |
|
|
11 |
from Bio import SeqIO |
|
|
12 |
from logzero import logger |
|
|
13 |
from pandas.core.dtypes.common import is_numeric_dtype |
|
|
14 |
|
|
|
15 |
from openomics.database.base import Database |
|
|
16 |
from openomics.database.sequence import SequenceDatabase, UniProt |
|
|
17 |
from openomics.transforms.df import filter_rows |
|
|
18 |
|
|
|
19 |
# Names exported by a wildcard import of this module. The `Interactions` base class
# is not listed here, so it is not part of the wildcard export.
__all__ = ['STRING', 'GeneMania', 'IntAct', 'BioGRID', 'MiRTarBase', 'LncBase', 'TargetScan', 'TarBase',
           'LncReg', 'LncRNA2Target', 'lncRNome', 'NPInter', 'RNAInter', 'StarBase']
|
|
21 |
|
|
|
22 |
class Interactions(Database):
    """Base class for interaction databases exposed as a NetworkX graph.

    The constructor stores the edge-list settings, delegates resource handling to
    `Database.__init__`, then calls the subclass' `load_network()` and keeps the
    result in `self.network`.
    """
    # Edge-list table. This base class never assigns it; subclasses that keep the
    # table (pandas or dask) set it themselves.
    edges: Optional[Union[pd.DataFrame, dd.DataFrame]]

    def __init__(self, path, file_resources: Dict, source_col_name: str = None, target_col_name: str = None,
                 edge_attr: List[str] = None, filters: Union[str, Dict[str, Union[str, List[str]]]] = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (str):
                The folder path containing the data files.
            file_resources (dict):
                A dictionary where keys are required filenames and values are file paths. Subclasses
                build a default dictionary when the caller passes None.
            source_col_name (str):
                Column name of DataFrame to be used as the source node names.
            target_col_name (str):
                Column name of DataFrame to be used as the target node names.
            edge_attr (list):
                A list of column names to be included as attributes for each edge (source-target pairs).
            filters (dict):
                Optional. A dict with keys matching the data table columns and values used for the
                filtering on that column. Passed through to `load_network()`.
            directed (bool): default True,
                Whether to create a directed or an undirected network.
            relabel_nodes (dict): default None,
                A dictionary to rename nodes in the network, where the nodes with name <dict[key]> will be
                renamed to <dict[value]>.
            blocksize ():
                Forwarded to `Database.__init__` and to `load_network()`.
            **kwargs:
                Forwarded unchanged to `Database.__init__`.
        """
        self.filters = filters
        self.source_col_name = source_col_name
        self.target_col_name = target_col_name
        self.directed = directed
        self.edge_attr = edge_attr

        super().__init__(path=path, file_resources=file_resources, blocksize=blocksize, **kwargs)
        self.network = self.load_network(file_resources=self.file_resources, source_col_name=source_col_name,
                                         target_col_name=target_col_name, edge_attr=edge_attr, directed=directed,
                                         filters=filters, blocksize=blocksize)

        # A subclass may return None when it finds no edge files; only relabel a real graph.
        if relabel_nodes is not None and self.network is not None:
            self.network = nx.relabel_nodes(self.network, mapping=relabel_nodes)

        self.close()

    @classmethod
    def name(cls):
        """Return the class name, used as the database's display name."""
        return cls.__name__

    @abstractmethod
    def load_network(self, file_resources: Dict, source_col_name: str, target_col_name: str,
                     edge_attr: Union[str, List[str]], directed: bool, filters: Dict[str, Any], blocksize=None) \
        -> nx.Graph:
        """
        Handles data processing from `file_resources` to a Pandas DataFrame which contains edgelist data, then
        constructs and returns a NetworkX Graph.

        Args:
            file_resources: a dict of file name and file path/object
            source_col_name (str): column name of the dataframe for source in the edge
            target_col_name (str): column name of the dataframe for target in the edge
            edge_attr (list): list of str for column data to include in each edge
            directed (bool): True to return a DiGraph(), else Graph()
            filters: A dict of {column name: column values} to filter the dataframe
            blocksize ():

        Returns:
            network: a NetworkX Graph or DiGraph
        """
        raise NotImplementedError

    def get_interactions(self, nodelist=None, data=False, inclusive=True, relabel_nodes: Dict[str, str] = None):
        """Return edges of the loaded network, optionally restricted to `nodelist`.

        Args:
            nodelist (list):
                A list of nodes to fetch edges from. When None, all edges are returned.
            data (bool): default False
                Whether to include edge attributes.
            inclusive (bool): default True
                If True, only return edges whose both endpoints are in `nodelist` (edges of the induced
                subgraph). If False, return every edge incident to a node in `nodelist`.
            relabel_nodes (dict): default None
                Optional {old name: new name} mapping applied to the node names of the returned edges.

        Returns:
            edges (OutEdgeView): a NetworkX edgelist

        Raises:
            Exception: if no network has been loaded (attribute missing or None).
        """
        g = getattr(self, "network", None)
        if g is None:
            raise Exception(
                "{} does not have network interaction data yet. Must run load_network() and assign self.network field first.".format(
                    self.name()))

        if relabel_nodes:
            # Relabel a copy: an in-place relabel would permanently rename the nodes of
            # `self.network` as a side effect of what is meant to be a read-only query.
            g = nx.relabel_nodes(g, relabel_nodes, copy=True)

        if nodelist is None:
            return g.edges(data=data)

        if inclusive:
            return g.subgraph(nodelist).edges(data=data)
        return g.edges(nbunch=nodelist, data=data)
|
|
118 |
|
|
|
119 |
|
|
|
120 |
class STRING(Interactions, SequenceDatabase):
    """Loads the STRING database from https://string-db.org/ .

    Default path: "https://stringdb-static.org/download/" .
    Default file_resources, generated once per `species_id` ({kind} is each of protein.info,
    protein.links, protein.links.detailed, protein.homology, clusters.proteins, protein.aliases,
    enrichment.terms): {
        "{species_id}.{kind}.txt.gz": f"{kind}.{version}/{species_id}.{kind}.{version}.txt.gz",
        "{species_id}.protein.sequences.fa.gz": f"protein.sequences.{version}/{species_id}.protein.sequences.{version}.fa.gz"
    }

    Edge attributes for protein.actions.txt include ["mode", "action", "is_directional", "a_is_acting", "score"]
    Edge attributes for protein.links.txt include ["combined_score"]
    """
    COLUMNS_RENAME_DICT = {
        "#string_protein_id": "string_protein_id",
        "protein_external_id": "protein_id",
        "preferred_name": "gene_name",
        '#ncbi_taxid': 'species_id',
        'string_protein_id_2': 'homologous_protein_id',
    }

    def __init__(self, path="https://stringdb-static.org/download/", file_resources=None,
                 species_id: Union[str, List[str]] = "9606", version="v11.0",
                 source_col_name="protein1", target_col_name="protein2",
                 edge_attr: Union[str, List[str]] = 'combined_score', directed=False,
                 relabel_nodes=None,
                 index_col='#string_protein_id',
                 keys=None,
                 alias_types={'Ensembl_UniProt', 'Ensembl_UniProt_AC'},
                 blocksize=None, **kwargs):
        """
        Args:
            path (): Base URL or folder the default file paths are joined onto.
            file_resources (): Optional dict of {file name: path}. When given together with a
                non-empty `species_id`, only entries whose name starts with one of the species ids are kept.
            species_id (): A species_id string or a list of species_id's.
                Provide a species_id string or a list of species_id's to download the species-specific STRING
                dataset, and integrate them. If species_id is None (or empty), then download the full-dataset
                version of STRING, which is very time-consuming.
            version (): STRING release tag embedded in the default file paths.
            source_col_name (): Edge-list column holding the source node.
            target_col_name (): Edge-list column holding the target node.
            edge_attr (): Name of the edge-list column used as edge weight. Must be a str.
            directed (): Whether to build a DiGraph instead of a Graph.
            relabel_nodes (): Optional {old: new} node renaming applied after loading.
            index_col (): Column used to index the protein info table; forwarded to the base class.
            keys (): Optional collection of ids to restrict rows to; forwarded to the base class.
            alias_types (): `source` values of the aliases file to keep (see `load_accessory_data`).
            blocksize (): If truthy, tables are read with dask instead of pandas.
        """
        self.version = version
        self.species_id = copy.copy(species_id)
        self.alias_types = alias_types
        assert isinstance(edge_attr, str)

        # Normalise `species_id` to a list once. Iterating a bare string would yield its
        # characters, which previously made the filter below match on '9', '6', '0', ...
        if isinstance(species_id, str):
            species_list = [species_id] if species_id else []
        elif isinstance(species_id, Iterable):
            species_list = list(species_id)
        else:
            species_list = []

        if file_resources is None:
            file_resources = {}
            if species_list:
                per_species_files = (
                    ("protein.info", "txt.gz"),
                    ("protein.links", "txt.gz"),
                    ("protein.links.detailed", "txt.gz"),
                    ("protein.homology", "txt.gz"),
                    ("clusters.proteins", "txt.gz"),
                    ("protein.aliases", "txt.gz"),
                    ("enrichment.terms", "txt.gz"),
                    ("protein.sequences", "fa.gz"),
                )
                for species in species_list:
                    for kind, ext in per_species_files:
                        file_resources[f"{species}.{kind}.{ext}"] = \
                            os.path.join(path, f"{kind}.{version}/{species}.{kind}.{version}.{ext}")
            else:
                file_resources["protein.info.txt.gz"] = os.path.join(path, f"protein.info.{version}.txt.gz")
                file_resources["protein.links.txt.gz"] = os.path.join(path, f"protein.links.{version}.txt.gz")
                file_resources["protein.sequences.fa.gz"] = os.path.join(path, f"protein.sequences.{version}.fa.gz")
        elif species_list:
            file_resources = {fn: fp for fn, fp in file_resources.items()
                              if any(fn.startswith(str(species)) for species in species_list)}

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, directed=directed,
                         relabel_nodes=relabel_nodes, blocksize=blocksize, index_col=index_col, keys=keys,
                         col_rename=STRING.COLUMNS_RENAME_DICT, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Load the protein info table(s) and join the per-species accessory annotations onto them.

        Args:
            file_resources (): dict of {file name: path or file object}.
            blocksize (): if truthy, read with dask (only entries whose value is a str path are used);
                otherwise read with pandas the entries whose name ends with "protein.info.txt".

        Returns:
            The concatenation of all loaded info tables (a dask DataFrame when `blocksize` is truthy).

        Raises:
            Exception: if no protein info file was found.
        """
        # `protein_size` is read as int32: the previous int8 tops out at 127.
        # NOTE(review): assumes protein_size is a sequence length that can exceed 127 — confirm
        # against the STRING protein.info schema.
        info_dtype = {'protein_size': 'int32'}
        dfs = []
        if blocksize:
            info_files = [fn for fn, path in file_resources.items()
                          if 'info.txt' in fn and isinstance(path, str)]
            for filename in info_files:
                compression = 'gzip' if filename.endswith(".gz") else None
                info_df = dd.read_table(file_resources[filename], na_values=['annotation not available'],
                                        low_memory=True, compression=compression,
                                        dtype=info_dtype,
                                        blocksize=None if isinstance(blocksize, bool) else blocksize)

                if self.keys is not None:
                    info_df = info_df.loc[info_df[self.index_col].isin(self.keys)]

                if self.index_col:
                    info_df = info_df.set_index(self.index_col, sorted=True)

                # Join other attributes to node_info (accessory tables are loaded with pandas)
                species_id = filename.split(".")[0]
                attrs = self.load_accessory_data(file_resources, species_id=species_id,
                                                 alias_types=self.alias_types, blocksize=False)
                if attrs is not None:
                    new_cols = attrs.columns.difference(info_df.columns)
                    info_df = info_df.join(attrs[new_cols], on=self.index_col)

                dfs.append(info_df)
        else:
            for filename in file_resources:
                if not filename.endswith("protein.info.txt"):
                    continue
                info_df = pd.read_table(file_resources[filename], na_values=['annotation not available'],
                                        dtype=info_dtype,
                                        index_col=self.index_col, low_memory=True)

                # With the default index_col the id column has been moved into the index by
                # read_table, so it must be read from there rather than from the columns.
                if '#string_protein_id' in info_df.columns:
                    protein_ids = info_df['#string_protein_id']
                elif info_df.index.name == '#string_protein_id':
                    protein_ids = info_df.index.to_series()
                else:
                    raise KeyError('#string_protein_id')
                index_split = protein_ids.str.split(".", expand=True, n=1)
                info_df = info_df.assign(species_id=index_split[0], protein_embl_id=index_split[1])

                # Join other attributes to node_info
                species_id = filename.split(".")[0]
                attrs = self.load_accessory_data(file_resources, species_id=species_id,
                                                 alias_types=self.alias_types,
                                                 blocksize=blocksize)
                if attrs is not None:
                    new_cols = attrs.columns.difference(info_df.columns)
                    info_df = info_df.join(attrs[new_cols], on=self.index_col)
                dfs.append(info_df)

        if not dfs:
            raise Exception("Must provide at least one 'protein.info.txt' file.")

        if blocksize:
            protein_info = dd.concat(dfs, axis=0, interleave_partitions=True)
        else:
            protein_info = pd.concat(dfs, axis=0)

        return protein_info

    def load_accessory_data(self, file_resources: Dict[str, str], species_id: str,
                            accessory_files=['protein.aliases', 'protein.homology', 'protein.enrichment',
                                             'clusters.proteins'],
                            alias_types={'Ensembl_UniProt', 'Ensembl_UniProt_AC'}, blocksize=False, ) \
        -> Union[pd.DataFrame, dd.DataFrame]:
        """
        Stack the annotations files for the provided `species_id`, such that rows in the annotations are filtered by
        `keys` (if not null), indexed by "#string_protein_id", and with attributes transformed to dataframe columns.

        Args:
            file_resources (): a dict of filename and filepath
            species_id (str): the species_id string which is used to select only files that have the same prefix.
            accessory_files (List[str]):
                A list of strings that specify which types of annotation files to integrate, i.e., only select files
                having a substring matching one of these.
                Default ['protein.aliases', 'protein.homology', 'protein.enrichment', 'clusters.proteins'].
            alias_types (): a set of string, default {'Ensembl_UniProt', 'Ensembl_UniProt_AC'}
                A set of `source` values in the `protein.aliases` annotation to aggregate `alias`'s for.
                Must be a subset of {'Ensembl_Source', 'Ensembl_gene', 'Ensembl_transcript', 'Ensembl_UniGene',
                'Ensembl_RefSeq_short', 'Ensembl_RefSeq', 'Ensembl_OTTG', 'Ensembl_OTTP', 'Ensembl_UCSC',
                'Ensembl_UniProt', 'Ensembl_UniProt_AC', 'Ensembl_EntrezGene', 'Ensembl_EMBL', 'Ensembl_protein_id'}
            blocksize (bool): Recommended to use Pandas to avoid unnecessary overhead.

        Returns:
            dd.DataFrame or pd.DataFrame with one row per protein id, or None if no annotation was loaded.
        """
        allowed_prefixes = {'protein.aliases', 'protein.homology', 'protein.enrichment', 'clusters.proteins'}
        if not set(accessory_files).issubset(allowed_prefixes):
            logger.warning(f'{set(accessory_files).difference(allowed_prefixes)} files are not supported')

        select_files = [fn for fn in file_resources
                        if fn.startswith(species_id) and any(ftype in fn for ftype in accessory_files)]

        dfs = []
        for filename in select_files:
            args = dict(
                low_memory=True,
                dtype={'cluster_id': 'category', '#ncbi_taxid': 'category', 'category': 'category',
                       'source': 'category'})
            compression = 'gzip' if filename.endswith(".gz") else None
            if blocksize:
                # dask needs a path string; skip already-opened file objects
                if not isinstance(file_resources[filename], str):
                    continue
                df = dd.read_table(file_resources[filename], compression=compression, **args)
            else:
                df = pd.read_table(file_resources[filename], **args)

            # Set index for df using the first id column that is present
            for col in ['#string_protein_id', 'protein_id', '#string_protein_1']:
                if col in df.columns:
                    df = df.set_index(col, sorted=True) if blocksize else df.set_index(col)
                    break

            # Files without any known id column cannot be joined; otherwise align the index name
            if df.index.name is None:
                continue
            elif self.index_col and df.index.name != self.index_col:
                df.index = df.index.rename(self.index_col)
            if blocksize:
                assert df.known_divisions

            # Filter rows
            if self.keys is not None:
                df = df.loc[df.index.isin(self.keys)]

            # Groupby on index and perform appropriate transforms depending on the annotation type
            if 'protein.homology' in filename:
                df = df.loc[df.index != df['string_protein_id_2']]
                df = df.groupby(self.index_col)['string_protein_id_2'].unique().to_frame()
                # TODO ignored column of size of homologous regions

            elif 'clusters.protein' in filename:
                # `unique()` only exists on a single-column groupby, so aggregate each
                # column separately and stitch the results back together.
                grouped = df.groupby(self.index_col)
                per_column = [grouped[col].unique() for col in ('cluster_id', '#ncbi_taxid')]
                df = dd.concat(per_column, axis=1) if blocksize else pd.concat(per_column, axis=1)

            elif 'protein.enrichment' in filename:
                df = df.groupby(self.index_col)['term'].unique().to_frame()

            elif 'protein.aliases' in filename:
                df = df.loc[df['source'].isin(alias_types)]
                # assign() instead of item assignment: avoids writing into a filtered slice
                df = df.assign(source=df['source'].cat.set_categories(alias_types))
                if blocksize:
                    # Set alias values to lists so pivot_table(..., aggfunc='sum') will concatenate them
                    df = df.assign(alias=df['alias'].map(lambda x: [x], meta=pd.Series([[""]])))
                    df = dd.pivot_table(df.reset_index(),
                                        index='#string_protein_id', columns='source', values='alias', aggfunc='sum')
                else:
                    df = df.reset_index().groupby([self.index_col, 'source'])['alias'].unique().unstack(level=1)

            if blocksize and not df.known_divisions:
                df.divisions = df.compute_current_divisions()

            if not len(df.index):
                continue

            dfs.append(df)

        if dfs:
            attrs = dd.concat(dfs, axis=1) if blocksize else pd.concat(dfs, axis=1)
        else:
            attrs = None

        return attrs

    def load_network(self, file_resources, source_col_name='protein1', target_col_name='protein2',
                     edge_attr: Union[str, List[str]] = 'combined_score', directed=False, filters=None, blocksize=None):
        """Build the protein-protein graph from every file whose name contains "links".

        Edges are kept only when both endpoints are in the index of `self.data`. The filtered
        edge table is stored on `self.edges`.

        Args:
            file_resources (): dict of {file name: path or file object}.
            source_col_name (): edge-list column holding the source node.
            target_col_name (): edge-list column holding the target node.
            edge_attr (): column name (or list/tuple of names on the pandas path) to attach to edges.
                Numeric attribute columns are divided by 1000.
            directed (): build a DiGraph if True, else a Graph.
            filters (): passed to `filter_rows` to filter the edge table.
            blocksize (): if truthy, read with dask and build the graph through a sparse adjacency matrix.

        Returns:
            A NetworkX graph, or None when no edge file could be loaded.
        """
        keys = self.data.index.compute() if isinstance(self.data, dd.DataFrame) else self.data.index
        select_files = [fn for fn in file_resources if "links" in fn]

        # Scores are 3-digit integers (see the division by 1000 below) and therefore do
        # not fit into uint8 (max 255); uint16 holds them without overflow.
        score_cols = ['neighborhood', 'fusion', 'cooccurence', 'coexpression', 'experimental', 'database',
                      'textmining', 'combined_score']
        dtypes = {'protein1': 'category', 'protein2': 'category'}
        dtypes.update({col: 'uint16' for col in score_cols})

        # Load edges
        edges_dfs = []
        for filename in select_files:
            args = dict(sep=" ", low_memory=True, dtype=dtypes)
            if blocksize:
                if not isinstance(file_resources[filename], str):
                    continue
                compression = 'gzip' if filename.endswith(".gz") else None
                df = dd.read_table(file_resources[filename], compression=compression, **args,
                                   blocksize=None if isinstance(blocksize, bool) else blocksize)

                # Compressed files are read with blocksize=None above, so split them afterwards.
                # A boolean blocksize carries no size to repartition to.
                if compression and not isinstance(blocksize, bool):
                    logger.info(f"Repartitioning {filename} from {df.npartitions} "
                                f"partitions to {blocksize}-size partitions")
                    df = df.repartition(partition_size=blocksize)

            else:
                df = pd.read_table(file_resources[filename], **args)

            df = df.loc[df[source_col_name].isin(keys) & df[target_col_name].isin(keys)]
            edges_dfs.append(df)

        if not edges_dfs:
            return None

        # Concatenate multiple edgelists into dataframe
        edges_df = dd.concat(edges_dfs, axis=0) if blocksize else pd.concat(edges_dfs, axis=0)
        edges_df = edges_df.rename(columns=self.COLUMNS_RENAME_DICT)
        logger.info(f"{self.name()}-{self.species_id}: {edges_df.columns.tolist()}, {edges_df.shape}")

        # Convert edge_attr (edge weights) from 3 digit integer to float
        attr_cols = list(edge_attr) if isinstance(edge_attr, (list, tuple)) else [edge_attr]
        assignfunc = {}
        for col in attr_cols:
            if col in edges_df.columns and is_numeric_dtype(edges_df[col]):
                assignfunc[col] = edges_df[col].astype('float16') / 1000
        if assignfunc:
            edges_df = edges_df.assign(**assignfunc)

        edges_df = filter_rows(edges_df, filters=filters)

        self.edges = edges_df
        # Set ordering for rows and columns
        node2idx = {node: i for i, node in enumerate(keys)}

        if isinstance(edges_df, dd.DataFrame):
            def edgelist2adj(df: pd.DataFrame) -> ssp.coo_matrix:
                # Skip the single-row placeholder frame ('foo') this reduction may be probed with
                if df.shape[0] == 1 and df.iloc[0, 0] == 'foo':
                    return None

                df = df.assign(row=df[source_col_name].map(node2idx).astype('int'),
                               col=df[target_col_name].map(node2idx).astype('int'))
                df = df.dropna(subset=['row', 'col'])

                if df.shape[0] == 0:
                    return None

                coo_adj = ssp.coo_matrix((df[edge_attr], (df['row'], df['col'])),
                                         shape=(len(keys), len(keys)))
                coo_adj.eliminate_zeros()
                return coo_adj

            # Create a sparse adjacency matrix for each partition, then add them to combine
            adj = edges_df.reduction(chunk=edgelist2adj,
                                     aggregate=lambda x: x.dropna().sum() if not x.isna().all() else None,
                                     meta=pd.Series([ssp.coo_matrix])).compute()
            assert len(adj) == 1, f"len(adj) = {len(adj)}"

            # `from_scipy_sparse_matrix` no longer exists in NetworkX 3; use its replacement when present.
            from_sparse = getattr(nx, "from_scipy_sparse_array", None) or nx.from_scipy_sparse_matrix
            G = from_sparse(adj[0], create_using=nx.DiGraph() if directed else nx.Graph(),
                            edge_attribute='weight')
            idx2node = {i: node for i, node in enumerate(keys)}
            G = nx.relabel_nodes(G, mapping=idx2node, copy=True)
            del adj

        else:
            # Determine which edge attr to add
            if isinstance(edge_attr, (list, tuple)):
                cols = edges_df.columns.intersection(list(edge_attr) + [source_col_name, target_col_name])
                edges_df = edges_df[cols]
                use_attrs = True
            elif isinstance(edge_attr, str):
                cols = edges_df.columns.intersection([source_col_name, target_col_name, edge_attr])
                edges_df = edges_df[cols]
                use_attrs = edge_attr
            else:
                use_attrs = False
            G = nx.from_pandas_edgelist(edges_df, source=source_col_name, target=target_col_name,
                                        edge_attr=use_attrs, create_using=nx.DiGraph() if directed else nx.Graph())

        return G

    def get_sequences(self, index="protein_id", omic=None, agg=None):
        """Parse the protein FASTA resource(s) into a {key: sequence string} dict.

        The result is cached on `self.seq_dict`; later calls return the cached dict regardless of
        the `index` they pass.

        Args:
            index (str): "protein_id" to key by the FASTA record name, or "protein_name" to translate
                the record name through `self.protein_id2name`.
            omic (): unused.
            agg (): unused.

        Returns:
            dict mapping the chosen key to the sequence string.

        Raises:
            ValueError: if `index` is not one of the supported values.
            KeyError: if no file resource ending in "protein.sequences.fa" is available.
        """
        if hasattr(self, "seq_dict"):
            return self.seq_dict

        if index not in ("protein_name", "protein_id"):
            raise ValueError(f"`index` must be 'protein_name' or 'protein_id', got {index!r}")

        # Species-specific resources are keyed "{species_id}.protein.sequences.fa", so match by suffix
        # instead of requiring the un-prefixed key.
        fasta_files = [fn for fn in self.file_resources if fn.endswith("protein.sequences.fa")]
        if not fasta_files:
            raise KeyError("protein.sequences.fa")

        # Build into a local dict and cache only on success, so a failed parse does not
        # leave a partial result behind for later calls.
        seq_dict = {}
        collisions = 0
        for filename in fasta_files:
            for record in SeqIO.parse(self.file_resources[filename], "fasta"):
                gene_id = str(record.name)
                key = self.protein_id2name[gene_id] if index == "protein_name" else gene_id

                if key in seq_dict:
                    collisions += 1

                seq_dict[key] = str(record.seq)

        self.seq_dict = seq_dict
        logger.warning("Seq {} collisions: {}".format(index, collisions))
        return self.seq_dict
|
|
493 |
|
|
|
494 |
|
|
|
495 |
class GeneMania(Interactions):
    """Loads the GeneMania database from local files.

    Default path: local_directory .
    Default file_resources: {
        "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt": "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt",
        "identifier_mappings.txt": "identifier_mappings.txt",
    }
    """

    def __init__(self, path, file_resources=None, source_col_name="Gene_A", target_col_name="Gene_B",
                 edge_attr=None, filters=None, directed=True, relabel_nodes=None, **kwargs):
        """
        Args:
            path (): Folder the two default files are looked up in.
            file_resources (): Optional dict of {file name: path}; built from `path` when None.
            source_col_name (): Edge-list column holding the source node.
            target_col_name (): Edge-list column holding the target node.
            edge_attr (): Edge-list columns to attach to edges. Defaults to ["Weight"].
            filters (): Optional row filters applied to the edge table.
            directed (): Whether to build a DiGraph (default) or a Graph.
            relabel_nodes (): Optional {old: new} node renaming applied after loading.
            **kwargs (): Forwarded to `Interactions.__init__`.
        """
        if edge_attr is None:
            edge_attr = ["Weight"]
        if file_resources is None:
            file_resources = {}
            file_resources["COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt"] = os.path.join(
                path, "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt")
            file_resources["identifier_mappings.txt"] = os.path.join(path, "identifier_mappings.txt")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Read the combined network, translate identifiers to gene names and build the graph.

        Args:
            file_resources (): must contain "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt" and
                "identifier_mappings.txt".
            source_col_name (): edge-list column holding the source node.
            target_col_name (): edge-list column holding the target node.
            edge_attr (): edge-list columns to attach to edges.
            directed (): build a DiGraph if True, else a Graph.
            filters (): optional row filters, applied after the identifier translation.
            blocksize (): unused; the tables are always read with pandas.

        Returns:
            A NetworkX DiGraph or Graph.
        """
        interactions = pd.read_table(file_resources["COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt"], low_memory=True)
        identifier = pd.read_table(file_resources["identifier_mappings.txt"])

        # Rename ENSG ID's to gene names: keep only the "Gene Name" rows and map
        # Preferred_Name -> Name across the whole edge table.
        identifier = identifier[identifier["Source"] == "Gene Name"]
        id_mapping = pd.Series(identifier["Name"].values, index=identifier["Preferred_Name"]).to_dict()
        interactions.replace(id_mapping, inplace=True)

        # `filters` used to be accepted but silently ignored.
        if filters:
            interactions = filter_rows(interactions, filters)

        # Honour `directed` (previously a DiGraph was always built), as BioGRID does.
        genemania_RNA_RNA_network = nx.from_pandas_edgelist(interactions, source=source_col_name,
                                                            target=target_col_name,
                                                            edge_attr=edge_attr,
                                                            create_using=nx.DiGraph() if directed else nx.Graph())
        return genemania_RNA_RNA_network
|
|
535 |
|
|
|
536 |
|
|
|
537 |
class IntAct(Interactions):
    """Interaction database stub that only forwards its arguments to `Interactions`.

    No `load_network()` is defined here, so the abstract one from `Interactions` applies.
    """

    def __init__(self, path, file_resources: Dict, source_col_name: str = None, target_col_name: str = None,
                 source_index: str = None, target_index: str = None, edge_attr: List[str] = None, filters: dict = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (): Forwarded to `Interactions.__init__`.
            file_resources (): Forwarded to `Interactions.__init__`.
            source_col_name (): Forwarded to `Interactions.__init__`.
            target_col_name (): Forwarded to `Interactions.__init__`.
            source_index (): Accepted but not forwarded.
            target_index (): Accepted but not forwarded.
            edge_attr (): Forwarded to `Interactions.__init__`.
            filters (): Forwarded to `Interactions.__init__`.
            directed (): Forwarded to `Interactions.__init__`.
            relabel_nodes (): Forwarded to `Interactions.__init__`.
            blocksize (): Forwarded to `Interactions.__init__`.
            **kwargs (): Forwarded to `Interactions.__init__`.
        """
        super().__init__(path=path, file_resources=file_resources,
                         source_col_name=source_col_name, target_col_name=target_col_name,
                         edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, blocksize=blocksize, **kwargs)
|
|
544 |
|
|
|
545 |
|
|
|
546 |
class BioGRID(Interactions):
    """Loads the BioGRID database from https://thebiogrid.org .

    Default path: "https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/" .
    Default file_resources: {
        "BIOGRID-ALL-LATEST.tab2.zip": "BIOGRID-ALL-LATEST.tab2.zip",
    }
    """

    def __init__(self, path="https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/",
                 file_resources=None, source_col_name="Official Symbol Interactor A",
                 target_col_name="Official Symbol Interactor B",
                 edge_attr=['Score', 'Throughput', 'Experimental System', 'Experimental System Type'],
                 filters=None, directed=False, relabel_nodes=None, **kwargs):
        """
        Args:
            path (): Base URL or folder the default file is joined onto.
            file_resources (): Optional dict of {file name: path}; built from `path` when None.
            source_col_name (): Edge-list column holding the source node.
            target_col_name (): Edge-list column holding the target node.
            edge_attr (): Edge-list columns to attach to edges.
            filters (): Default None, example {"Organism Interactor A": 9606}.
            directed (): Whether to build a DiGraph instead of a Graph.
            relabel_nodes (): Optional {old: new} node renaming applied after loading.
            **kwargs (): Forwarded to `Interactions.__init__`.
        """
        # The default list object is shared by every call; copy it so that the list stored on
        # the instance (`self.edge_attr`) can be modified without altering the default.
        if isinstance(edge_attr, list):
            edge_attr = list(edge_attr)

        if file_resources is None:
            file_resources = {}
            file_resources["BIOGRID-ALL-LATEST.tab2.zip"] = os.path.join(path, "BIOGRID-ALL-LATEST.tab2.zip")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Read the "BIOGRID-ALL-LATEST.tab2" table and keep it on `self.edges`.

        Args:
            file_resources (): dict that must contain the key "BIOGRID-ALL-LATEST.tab2".
            blocksize (): if truthy, read with dask using this blocksize; otherwise read with pandas.

        Returns:
            The edge table ("-" is treated as a missing value).
        """
        args = dict(na_values=["-"], header=0, low_memory=True,
                    # usecols=['Official Symbol Interactor A', 'Official Symbol Interactor B',
                    #          'Organism Interactor A', 'Score', 'Throughput', 'Qualifications',
                    #          'Modification', 'Phenotypes', 'Source Database'],
                    dtype={'Score': 'float', 'Entrez Gene Interactor A': 'category',
                           'Entrez Gene Interactor B': 'category',
                           'BioGRID ID Interactor A': 'category', 'BioGRID ID Interactor B': 'category',
                           'Systematic Name Interactor A': 'category', 'Systematic Name Interactor B': 'category',
                           'Official Symbol Interactor A': 'category', 'Official Symbol Interactor B': 'category',
                           'Pubmed ID': 'str', 'Throughput': 'category', 'Experimental System Type': 'category',
                           'Experimental System': 'category', 'Modification': 'category', 'Source Database': 'category',
                           'Organism Interactor A': 'category', 'Organism Interactor B': 'category'})

        if blocksize:
            edges = dd.read_table(file_resources["BIOGRID-ALL-LATEST.tab2"], blocksize=blocksize, **args, )
        else:
            edges = pd.read_table(file_resources["BIOGRID-ALL-LATEST.tab2"], **args, )

        self.edges = edges

        return edges

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the graph from `self.edges` after applying `filters`.

        Args:
            file_resources (): unused; the table loaded by `load_dataframe` is used.
            source_col_name (): edge-list column holding the source node.
            target_col_name (): edge-list column holding the target node.
            edge_attr (): edge-list columns to attach to edges.
            directed (): build a DiGraph if True, else a Graph.
            filters (): row filters passed to `filter_rows`.
            blocksize (): unused here.

        Returns:
            A NetworkX DiGraph or Graph.
        """
        df = self.edges
        df = filter_rows(df, filters)
        # `from_pandas_edgelist` needs an in-memory pandas frame; materialise a dask table
        # (produced when `load_dataframe` ran with a blocksize) after filtering.
        if isinstance(df, dd.DataFrame):
            df = df.compute()
        network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                          edge_attr=edge_attr,
                                          create_using=nx.DiGraph() if directed else nx.Graph())
        return network
|
|
614 |
|
|
|
615 |
|
|
|
616 |
class MiRTarBase(Interactions):
    """miRNA-target interactions read from the ``miRTarBase_MTI.xlsx`` spreadsheet.

    Default path: "http://mirtarbase.mbc.nctu.edu.tw/cache/download/7.0/" .
    Default file_resources: {
        "miRTarBase_MTI.xlsx": "<path>/miRTarBase_MTI.xlsx",
    }
    """

    def __init__(self, path="http://mirtarbase.mbc.nctu.edu.tw/cache/download/7.0/", file_resources=None,
                 source_col_name="miRNA", target_col_name="Target Gene",
                 edge_attr=None,
                 filters=None,
                 directed=True,
                 relabel_nodes=None,
                 strip_mirna_name=False, **kwargs):
        """
        Args:
            path (): folder or URL joined with the spreadsheet file name when
                `file_resources` is None.
            file_resources (): mapping of resource name to location; built from
                `path` when None.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            edge_attr (): edge attribute columns, default ["Support Type"].
            filters (): default None, example {"Species (Target Gene)": "Homo sapiens"}
            directed (): whether the network is built as a directed graph.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            strip_mirna_name (): if True, miRNA names are lower-cased and anything
                from "-3p"/"-5p" onwards is removed when the network is built.
            **kwargs (): forwarded unchanged to the parent constructor.
        """
        self.strip_mirna_name = strip_mirna_name
        edge_attr = ["Support Type"] if edge_attr is None else edge_attr

        if file_resources is None:
            file_resources = {"miRTarBase_MTI.xlsx": os.path.join(path, "miRTarBase_MTI.xlsx")}

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters,
                         directed=directed, relabel_nodes=relabel_nodes, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Read the Excel sheet registered on ``self.file_resources`` and keep it as ``self.edges``."""
        table = pd.read_excel(self.file_resources["miRTarBase_MTI.xlsx"])
        self.edges = table
        return table

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the miRNA-target graph from ``self.data`` after row filtering and miRNA name clean-up."""
        df = filter_rows(self.data, filters)

        # Trailing '*' characters are always removed from the miRNA name.
        mirna_names = df['miRNA'].str.rstrip('*')
        if self.strip_mirna_name:
            mirna_names = mirna_names.str.lower().str.replace("-3p.*|-5p.*", "", regex=True)
        df['miRNA'] = mirna_names

        graph_container = nx.DiGraph() if directed else nx.Graph()
        return nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                       edge_attr=edge_attr, create_using=graph_container)
|
|
681 |
|
|
|
682 |
|
|
|
683 |
class LncBase(Interactions, Database):
    """Loads the LncBase database from http://carolina.imis.athena-innovation.gr/diana_tools/web/index.php?r=lncbasev2%2Findex .

    Default path: 'https://dianalab.e-ce.uth.gr/downloads/' .
    Default file_resources: {
        "LncBasev2_download.csv": "<path>/lncbase_v2_exp_data.tar.gz",
    }
    """

    def __init__(self, path='https://dianalab.e-ce.uth.gr/downloads/', file_resources=None, strip_mirna_name=False,
                 source_col_name="mirna", target_col_name="geneId",
                 edge_attr=None,
                 filters=None,
                 directed=True,
                 relabel_nodes=None, ):
        """
        Args:
            path (): folder or URL joined with the archive name when `file_resources` is None.
            file_resources (): mapping of resource name to location; built from `path` when None.
            strip_mirna_name (): if True, miRNA names are lower-cased and anything from
                "-3p"/"-5p" onwards is removed when the network is built.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            edge_attr (): edge attribute columns, default ["tissue", "positive_negative"].
            filters (): default None. Example: {"species": "Homo sapiens"}
            directed (): whether the network is built as a directed graph.
            relabel_nodes (): forwarded unchanged to the parent constructor.
        """
        self.strip_mirna_name = strip_mirna_name

        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        if file_resources is None:
            file_resources = {}
            file_resources["LncBasev2_download.csv"] = os.path.join(path, "lncbase_v2_exp_data.tar.gz")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes)

    def get_rename_dict(self, from_index="geneId", to_index="geneName"):
        """Return a dict mapping values of column `from_index` to values of column `to_index`.

        The table is re-read from ``self.file_resources["LncBasev2_download.csv"]``.
        Previously both arguments were ignored and the mapping was always
        geneId -> geneName; the defaults reproduce that behaviour.

        Args:
            from_index (str): column whose values become the dict keys.
            to_index (str): column whose values become the dict values.

        Returns:
            dict: `from_index` value -> `to_index` value (later rows win on duplicate keys).
        """
        lncbase_df = pd.read_table(self.file_resources["LncBasev2_download.csv"], low_memory=True)
        return pd.Series(lncbase_df[to_index].values, index=lncbase_df[from_index]).to_dict()

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Read the LncBase table and normalise the capitalisation of two species names."""
        df = pd.read_table(file_resources["LncBasev2_download.csv"], low_memory=True)
        df.replace({"species": {"Homo Sapiens": "Homo sapiens", "Mus Musculus": "Mus musculus"}}, inplace=True)
        return df

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the interaction graph from ``self.data`` after row filtering."""
        df = self.data
        df = filter_rows(df, filters)

        if self.strip_mirna_name:
            df['mirna'] = df['mirna'].str.lower()
            df['mirna'] = df['mirna'].str.replace("-3p.*|-5p.*", "", regex=True)

        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        lncBase_lncRNA_miRNA_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                               edge_attr=edge_attr,
                                                               create_using=nx.DiGraph() if directed else nx.Graph())
        return lncBase_lncRNA_miRNA_network
|
|
751 |
|
|
|
752 |
|
|
|
753 |
class TarBase(Interactions):
    """miRNA-gene interactions read from the ``tarbase_v8_data.tar.gz`` archive.

    When a 'speclist' resource is present, species names are additionally
    mapped to NCBI taxon ids in a ``species_id`` column.
    """

    def __init__(self, path='https://dianalab.e-ce.uth.gr/downloads', file_resources: Dict = None,
                 source_col_name: str = 'mirna', target_col_name: str = 'geneName',
                 edge_attr: List[str] = None, filters: Union[str, Dict[str, Union[str, List[str]]]] = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (): forwarded to the parent constructor.
            file_resources (): mapping of resource name to location; when None the
                TarBase v8 archive URL and the UniProt 'speclist' URL are used.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            edge_attr (): edge attribute columns.
            filters (): row filters applied before the network is built.
            directed (): whether the network is built as a directed graph.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            blocksize (): forwarded unchanged to the parent constructor.
            **kwargs (): forwarded unchanged to the parent constructor.
        """
        if file_resources is None:
            file_resources = {
                'tarbase_v8_data.tar.gz': 'https://dianalab.e-ce.uth.gr/downloads/tarbase_v8_data.tar.gz',
                'speclist': 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/speclist',
            }

        super().__init__(path, file_resources, source_col_name, target_col_name, edge_attr, filters, directed,
                         relabel_nodes, blocksize, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Read the TarBase table and, if a species list is available, add a ``species_id`` column."""
        edges = pd.read_table(file_resources['tarbase_v8_data.tar.gz'], compression='tar',
                              dtype={'tissue': 'category', 'method': 'category', 'positive_negative': 'category',
                                     'species': 'category',
                                     'direct_indirect': 'category', 'up_down': 'category', 'cell_line': 'category',
                                     })

        if 'speclist' in file_resources:
            species_df = UniProt.get_species_list(file_resources['speclist'])
            # Long format: one row per (taxon, name variant) so every name variant maps to its taxon id.
            species_df = species_df[['Official (scientific) name', 'Common name', 'Synonym']].melt(ignore_index=False)
            species_df = species_df.dropna().reset_index()
            species_name2id = species_df.set_index('value')['NCBI-taxon'].to_dict()
            edges['species_id'] = edges['species'].map(species_name2id)

        self.edges = edges
        return edges

    def load_network(self, file_resources: Dict, source_col_name: str, target_col_name: str, edge_attr: List[str],
                     directed: bool, filters: Dict[str, Any], blocksize=None):
        """Build the graph from ``self.data`` after filtering rows and cleaning ``geneName``.

        The regex patterns are raw strings (the previous non-raw ``'\\('`` spellings
        were invalid escape sequences) and ``na=False`` keeps the boolean masks
        usable when a gene name is missing.
        """
        df = self.data
        df = filter_rows(df, filters)

        # Remove parenthesis containing 3 letter species name
        df['geneName'] = df['geneName'].str.replace(r'(\(\w{3}\)){1}$', '', regex=True)

        # Drop "(N of M)" style markers.
        idx = df['geneName'].str.contains(r'\(', na=False)
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'].str.replace(r'(\(\d of \d\))', '', regex=True).str.strip()

        # Where a "(word)" group remains, keep the word inside the parentheses.
        idx = df['geneName'].str.contains(r"\(\w*\)", regex=True, na=False)
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'].str.extract(r'\((\w*)\)(\w*)')[0]

        # Anything still containing '(' is truncated at the first '('.
        idx = df['geneName'].str.contains(r'\(', na=False)
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'].str.split('(').str[0]

        g = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                    edge_attr=edge_attr,
                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return g
|
|
822 |
|
|
|
823 |
|
|
|
824 |
class RNAInter(Interactions):
    """Interactions read from the RNAInter ``Download_data_*`` tables."""

    def __init__(self, path='http://www.rnainter.org/raidMedia/download/', file_resources: Dict = None,
                 source_col_name: str = 'Interactor1.Symbol', target_col_name: str = 'Interactor2.Symbol',
                 edge_attr: Union[str, List[str]] = 'score',
                 filters: Union[str, Dict[str, Union[str, List[str]]]] = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (): forwarded to the parent constructor.
            file_resources (): mapping of resource name to location; defaults to the
                'Download_data_RR.tar.gz' and 'Download_data_RP.tar.gz' archives.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            edge_attr (): edge attribute column(s), default 'score'.
            filters (): row filters applied while loading the tables.
            directed (): whether the network is built as a directed graph.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            blocksize (): forwarded unchanged to the parent constructor.
            **kwargs (): forwarded unchanged to the parent constructor.
        """
        if file_resources is None:
            file_resources = {
                'Download_data_RR.tar.gz': 'Download_data_RR.tar.gz',
                'Download_data_RP.tar.gz': 'Download_data_RP.tar.gz',
            }

        super().__init__(path, file_resources, source_col_name, target_col_name, edge_attr, filters, directed,
                         relabel_nodes, blocksize, **kwargs)

    def load_dataframe(self, file_resources: Dict, blocksize: int = None) -> pd.DataFrame:
        """Load every ``Download_data*`` resource, filter it and combine the results.

        Previously each file overwrote the one read before it, so only the last
        resource was kept, and an empty resource list failed with an unbound
        local. All loaded tables are now concatenated.

        Args:
            file_resources: mapping of resource name to location.
            blocksize: when truthy the tables are read with dask instead of pandas.

        Returns:
            The combined edge table (also stored on ``self.edges``).

        Raises:
            ValueError: if no ``Download_data*`` resource could be loaded.
        """
        args = dict(dtype={'Category1': 'category', 'Category2': 'category',
                           'Species1': 'category', 'Species2': 'category', 'score': 'float',
                           'predict': 'category', 'weak': 'category', 'strong': 'category'})

        frames = []
        for fn in file_resources:
            if not fn.startswith('Download_data'):
                continue
            compression = 'tar' if fn.endswith('.tar.gz') else None
            if blocksize:
                # dask can only read from a path/URL string, so other resources are skipped.
                if not isinstance(file_resources[fn], str):
                    continue
                edges = dd.read_table(file_resources[fn], compression=compression, **args)
            else:
                edges = pd.read_table(file_resources[fn], compression=compression, **args)

            frames.append(filter_rows(edges, self.filters))

        if not frames:
            raise ValueError("RNAInter: no loadable 'Download_data*' entry found in file_resources")

        if len(frames) == 1:
            edges = frames[0]
        elif blocksize:
            edges = dd.concat(frames)
        else:
            # NOTE(review): categorical columns whose categories differ between
            # files come out of the concatenation as object dtype.
            edges = pd.concat(frames, ignore_index=True)

        self.edges = edges
        return edges

    def load_network(self, file_resources, source_col_name='Interactor1.Symbol', target_col_name='Interactor2.Symbol',
                     edge_attr='score', directed=True, filters=None, blocksize=None):
        """Build the graph from ``self.data``, re-filtering only when `filters` differs from ``self.filters``."""
        edges = self.data
        if filters != self.filters:
            edges = filter_rows(edges, filters)

        g = nx.from_pandas_edgelist(edges, source=source_col_name, target=target_col_name,
                                    edge_attr=edge_attr,
                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return g
|
|
883 |
|
|
|
884 |
|
|
|
885 |
class TargetScan(Interactions, Database):
    """Loads the TargetScan database from "http://www.targetscan.org/" .

    Default path: "http://www.targetscan.org/vert_72/vert_72_data_download/" .
    Default file_resources: {
        "miR_Family_Info.txt.zip": "<path>/miR_Family_Info.txt.zip",
        "Predicted_Targets_Info.default_predictions.txt": "<path>/Predicted_Targets_Info.default_predictions.txt",
    }
    """

    def __init__(self, path="http://www.targetscan.org/vert_72/vert_72_data_download/", file_resources=None,
                 source_col_name="MiRBase ID", target_col_name="Gene Symbol",
                 edge_attr=None, directed=True, relabel_nodes=None, species_id=None,
                 strip_mirna_name=False, **kwargs):
        """
        Args:
            path (): folder or URL joined with the file names when `file_resources` is None.
            file_resources (): mapping of resource name to location; built from `path` when None.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            edge_attr (): edge attribute columns; None keeps the historical default
                ["tissue", "positive_negative"] (no longer a shared mutable default).
            directed (): whether the network is built as a directed graph.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            species_id (): if set, both tables are restricted to rows with this 'Species ID'.
            strip_mirna_name (): if True, 'MiRBase ID' values are lower-cased and anything
                from "-3p"/"-5p" onwards is removed.
            **kwargs (): forwarded unchanged to the parent constructor.
        """
        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        self.strip_mirna_name = strip_mirna_name
        self.species_id = species_id
        if file_resources is None:
            file_resources = {}
            file_resources["miR_Family_Info.txt.zip"] = os.path.join(path, "miR_Family_Info.txt.zip")
            file_resources["Predicted_Targets_Info.default_predictions.txt"] = os.path.join(
                path, "Predicted_Targets_Info.default_predictions.txt")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name,
                         directed=directed, relabel_nodes=relabel_nodes, edge_attr=edge_attr, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the miRNA-target graph by joining the family table with the predicted-target table."""
        self.df = self.process_miR_family_info_table(file_resources, self.species_id)
        interactions_df = self.process_interactions_table(file_resources, self.df, self.species_id)
        print(self.name(), interactions_df.columns.tolist())

        mir_target_network = nx.from_pandas_edgelist(interactions_df,
                                                     source=source_col_name, target=target_col_name,
                                                     edge_attr=edge_attr,
                                                     create_using=nx.DiGraph() if directed else nx.Graph())
        return mir_target_network

    def process_miR_family_info_table(self, file_resources, species=None):
        """Read the miR family table, optionally restrict it to one species and keep the annotation columns.

        Args:
            file_resources: mapping containing the "miR_Family_Info.txt" resource.
            species: if truthy, only rows whose 'Species ID' equals this value are kept.

        Returns:
            DataFrame with (at most) the columns 'miR family', 'MiRBase ID', 'Seed+m8',
            'Mature sequence', 'Family Conservation?' and 'MiRBase Accession'.
        """
        miR_Family_Info_df = pd.read_table(file_resources["miR_Family_Info.txt"], delimiter='\t')

        if species:
            # .copy() so the column assignments below do not write into a slice of the raw table.
            miR_Family_Info_df = miR_Family_Info_df[miR_Family_Info_df['Species ID'] == species].copy()

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].str.lower()
            # regex=True is required: the pattern is a regular expression, not a literal.
            miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].str.replace(
                "-[35]p.*", "", regex=True)

        miR_Family_Info_df = miR_Family_Info_df.drop_duplicates()
        miR_Family_Info_df = miR_Family_Info_df.filter(items=['miR family', 'MiRBase ID', 'Seed+m8', 'Mature sequence',
                                                              'Family Conservation?', 'MiRBase Accession'],
                                                       axis="columns")
        miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].astype(str)
        return miR_Family_Info_df

    def process_interactions_table(self, file_resources, family_to_miR_df, species_id):
        """Join the predicted miR-family/target table with the family-to-miRNA table.

        Args:
            file_resources: mapping containing "Predicted_Targets_Info.default_predictions.txt".
            family_to_miR_df: table with 'miR family' and 'MiRBase ID' columns.
            species_id: if truthy, only predictions whose 'Species ID' equals this value are kept.

        Returns:
            DataFrame with the columns 'miR Family', 'Gene Symbol' and 'MiRBase ID'.
        """
        # Load data frame from file
        family_interactions_df = pd.read_table(file_resources["Predicted_Targets_Info.default_predictions.txt"],
                                               dtype={'Species ID': 'category'},
                                               delimiter='\t', low_memory=True)

        # Select only miRNA-target pairs of certain species_id
        if species_id:
            family_interactions_df = family_interactions_df[family_interactions_df["Species ID"] == species_id]

        family_interactions_df = family_interactions_df.filter(items=["miR Family", "Gene Symbol"], axis="columns")
        family_to_miR_df = family_to_miR_df.filter(items=['miR family', 'MiRBase ID'], axis="columns")
        family_to_miR_df = family_to_miR_df.rename(columns={'miR family': 'miR Family'})

        # Map miRBase ID names to miR Family with an index-on-index outer join.
        # (The previous `set_genes_index` call is not a pandas DataFrame method.)
        family_to_miR_df = family_to_miR_df.set_index("miR Family")
        family_interactions_df = family_interactions_df.set_index("miR Family")
        mir_interactions_df = family_interactions_df.join(family_to_miR_df, how='outer').reset_index()

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            mir_interactions_df['MiRBase ID'] = mir_interactions_df['MiRBase ID'].str.lower()
            mir_interactions_df['MiRBase ID'] = mir_interactions_df['MiRBase ID'].str.replace(
                "-[35]p.*", "", regex=True)

        return mir_interactions_df
|
|
979 |
|
|
|
980 |
|
|
|
981 |
class LncReg(Interactions):
    """Interactions read from a LncReg ``data.xlsx`` spreadsheet.

    Default file_resources: {
        "data.xlsx": "<path>/data.xlsx",
    }
    """

    def __init__(self, path, file_resources,
                 source_col_name='A_name_in_paper', target_col_name='B_name_in_paper',
                 source_index="transcript_name", target_index="gene_name",
                 edge_attr=None, filters=None, directed=True, relabel_nodes=None,
                 verbose=False):
        """
        Args:
            path (): folder joined with "data.xlsx" when `file_resources` is None.
            file_resources (): mapping of resource name to location; built from `path` when None.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            source_index (): accepted for compatibility; not used by this class.
            target_index (): accepted for compatibility; not used by this class.
            edge_attr (): edge attribute columns; None means
                ["relationship", "mechanism", "pmid"] (no longer a shared mutable default).
            filters (): forwarded unchanged to the parent constructor.
            directed (): whether the network is built as a directed graph.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            verbose (): forwarded unchanged to the parent constructor.
        """
        if edge_attr is None:
            edge_attr = ["relationship", "mechanism", "pmid"]
        if file_resources is None:
            file_resources = {}
            file_resources["data.xlsx"] = os.path.join(path, "data.xlsx")

        super().__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, verbose=verbose)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the graph from the human rows of the spreadsheet, normalising miRNA partner names.

        Only rows whose "species" is "Homo sapiens" are used; `filters` is not applied here.
        """
        df = pd.read_excel(self.file_resources["data.xlsx"])
        print(self.name(), df.columns.tolist())

        # .copy() so the assignment below writes into an independent frame, not a slice.
        df = df[df["species"] == "Homo sapiens"].copy()

        is_mirna = df["B_category"] == "miRNA"
        mirna_names = df.loc[is_mirna, "B_name_in_paper"]
        # The arm suffix pattern is a regular expression, so regex=True must be explicit.
        mirna_names = mirna_names.str.replace("-3p.*|-5p.*", "", regex=True)
        mirna_names = mirna_names.str.replace("MIR", "hsa-mir-", regex=False)
        mirna_names = mirna_names.str.replace("let-", "hsa-let-", regex=False)
        df.loc[is_mirna, "B_name_in_paper"] = mirna_names

        # `directed` is now honoured; it was previously ignored and a DiGraph always built.
        LncReg_lncRNA_RNA_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                            edge_attr=edge_attr,
                                                            create_using=nx.DiGraph() if directed else nx.Graph())
        return LncReg_lncRNA_RNA_network
|
|
1021 |
|
|
|
1022 |
|
|
|
1023 |
class lncRInter(Interactions):
    """Interactions read from a lncRInter ``human_interactions.txt`` table.

    Default file_resources: {
        "human_interactions.txt": "<path>/human_interactions.txt",
    }
    """

    def __init__(self, path, file_resources=None, source_col_name="lncrna",
                 target_col_name='Interacting partner',
                 edge_attr=None, filters=None,
                 directed=True, relabel_nodes=None, **kwargs):
        """
        Args:
            path (): folder joined with "human_interactions.txt" when `file_resources` is None.
            file_resources (): mapping of resource name to location; built from `path` when None.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            edge_attr (): edge attribute columns, default
                ["Interaction Class", "Interaction Mode", "Tissue", "Phenotype"].
            filters (): row filters applied before the network is built.
            directed (): whether the network is built as a directed graph.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            **kwargs (): forwarded unchanged to the parent constructor.
        """
        if edge_attr is None:
            edge_attr = ["Interaction Class", "Interaction Mode", "Tissue", "Phenotype"]
        if file_resources is None:
            file_resources = {}
            file_resources["human_interactions.txt"] = os.path.join(path, "human_interactions.txt")

        super().__init__(path, file_resources, source_col_name=source_col_name, target_col_name=target_col_name,
                         edge_attr=edge_attr, filters=filters, directed=directed, relabel_nodes=relabel_nodes, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the graph after filtering rows and normalising miRNA names in "Interacting partner".

        All writes go through ``.loc``; the previous chained ``df[col][mask] = ...``
        assignment is not guaranteed to modify the frame.
        """
        partner_col = "Interacting partner"

        lncRInter_df = pd.read_table(file_resources["human_interactions.txt"])
        print(self.name(), lncRInter_df.columns.tolist())

        lncRInter_df = filter_rows(lncRInter_df, filters)

        # Data cleaning: lower-case partners containing "MIR", then expand to hsa- prefixed names.
        has_mir = lncRInter_df[partner_col].str.contains("MIR")
        lncRInter_df.loc[has_mir, partner_col] = lncRInter_df.loc[has_mir, partner_col].str.lower()
        lncRInter_df[partner_col] = lncRInter_df[partner_col].str.replace("mirlet", "hsa-let-")
        lncRInter_df[partner_col] = lncRInter_df[partner_col].str.replace("mir", "hsa-mir-")

        # Names shaped like "...-<digits><letters><digits>" get a dash before the last character.
        # NOTE(review): "[mir|let]" is a character class (any one of m, i, r, |, l, e, t), not an
        # alternation; kept byte-identical so the set of matched rows does not change.
        needs_dash = lncRInter_df[partner_col].str.contains(r"[mir|let]\-[\d]+[a-z]+[\d]+")
        lncRInter_df.loc[needs_dash, partner_col] = lncRInter_df.loc[needs_dash, partner_col].apply(
            lambda x: x[:-1] + "-" + x[-1])

        lncRInter_network = nx.from_pandas_edgelist(lncRInter_df, source=source_col_name,
                                                    target=target_col_name,
                                                    edge_attr=edge_attr,
                                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return lncRInter_network
|
|
1070 |
|
|
|
1071 |
|
|
|
1072 |
class LncRNA2Target(Interactions):
    """lncRNA-target interactions read from the LncRNA2Target tables.

    Default path: "http://123.59.132.21/lncrna2target/data/" .
    Default file_resources: {
        "lncRNA_target_from_high_throughput_experiments.txt.rar": "<path>/lncrna_target.rar",
        "lncRNA_target_from_low_throughput_experiments.xlsx": "<path>/lncRNA_target_from_low_throughput_experiments.xlsx",
    }
    """

    def __init__(self, path="http://123.59.132.21/lncrna2target/data/", file_resources=None, edge_attr=None,
                 filters=None,
                 directed=True, relabel_nodes=None, version="high_throughput", **kwargs):
        """
        Args:
            path (): folder or URL joined with the file names when `file_resources` is None.
            file_resources (): mapping of resource name to location; built from `path` when None.
            edge_attr (): edge attribute columns.
            filters (): default None, example {"species_id": 9606, "Species": "Homo sapiens"}.
            directed (): whether the network is built as a directed graph.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            version (str): one of ["high_throughput", "low_throughput"].
                The high_throughput version of lncRNA2Target database is v2.0 and low_throughput is v1.0, according to the database's website.
                The species column in high_throughput is formatted in int (e.g. 9606) and in low_throughput is in str (e.g. "Homo sapiens")
            **kwargs (): forwarded unchanged to the parent constructor.
        """
        self.version = version
        if file_resources is None:
            file_resources = {}
            file_resources["lncRNA_target_from_high_throughput_experiments.txt.rar"] = \
                os.path.join(path, "lncrna_target.rar")
            file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"] = \
                os.path.join(path, "lncRNA_target_from_low_throughput_experiments.xlsx")

        # NOTE(review): for any other `version` the parent constructor is never called.
        if self.version == "high_throughput":
            super().__init__(path, file_resources, source_col_name="lncrna_symbol", target_col_name="gene_symbol",
                             edge_attr=edge_attr, filters=filters, directed=directed, relabel_nodes=relabel_nodes,
                             **kwargs)
        elif self.version == "low_throughput":
            super().__init__(path, file_resources, source_col_name="GENCODE_gene_name",
                             target_col_name="Target_official_symbol", edge_attr=edge_attr, filters=filters,
                             directed=directed, relabel_nodes=relabel_nodes, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Dispatch to the loader matching ``self.version``.

        `filters` is now forwarded to the loaders; it was previously dropped.

        Returns:
            The graph built by the selected loader, or None for an unknown version.
        """
        network = None
        if self.version == "high_throughput":
            network = self.load_network_high_throughput(file_resources, source_col_name, target_col_name, edge_attr,
                                                        directed, filters)
        elif self.version == "low_throughput":
            network = self.load_network_low_throughput(file_resources, source_col_name, target_col_name, edge_attr,
                                                       directed, filters)
        else:
            logger.warning("LncRNA2Target version argument must be one of 'high_throughput' or 'low_throughput'")

        return network

    def load_network_high_throughput(self, file_resources, source_col_name="lncrna_symbol",
                                     target_col_name="gene_symbol",
                                     edge_attr=None, directed=True, filters=None):
        """Read the high-throughput table, normalise symbols and build the graph.

        Also stores the cleaned table on ``self.data`` and ``self.edges``.
        """
        edges = pd.read_table(file_resources["lncRNA_target_from_high_throughput_experiments.txt"], sep="\t")
        edges = filter_rows(edges, filters)

        edges["lncrna_symbol"] = edges["lncrna_symbol"].str.upper()
        edges["lncrna_symbol"] = edges["lncrna_symbol"].str.replace("LINC", "")
        edges["gene_symbol"] = edges["gene_symbol"].str.upper()

        self.data = self.edges = edges
        lncrna2target_high_throughput_network = nx.from_pandas_edgelist(edges,
                                                                        source=source_col_name,
                                                                        target=target_col_name,
                                                                        edge_attr=edge_attr,
                                                                        create_using=nx.DiGraph() if directed else nx.Graph())
        return lncrna2target_high_throughput_network

    def load_network_low_throughput(self, file_resources, source_col_name="GENCODE_gene_name",
                                    target_col_name="Target_official_symbol",
                                    edge_attr=None, directed=True, filters=None):
        """Read the low-throughput spreadsheet, normalise symbols and build the graph.

        Also stores the cleaned table on ``self.data`` and ``self.edges``.
        """
        edges = pd.read_excel(file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"])
        edges = filter_rows(edges, filters)

        edges["Target_official_symbol"] = edges["Target_official_symbol"].str.replace("(?i)(mir)", "hsa-mir-",
                                                                                      regex=True)
        edges["Target_official_symbol"] = edges["Target_official_symbol"].str.replace("--", "-")
        # miRNA names are lower-cased, everything else upper-cased. The result is
        # now assigned back; it used to be computed and thrown away.
        edges["Target_official_symbol"] = edges["Target_official_symbol"].apply(
            lambda x: x.lower() if "mir" in x.lower() else x.upper())
        edges["GENCODE_gene_name"] = edges["GENCODE_gene_name"].str.upper()

        self.data = self.edges = edges
        lncrna2target_low_throughput_network = nx.from_pandas_edgelist(edges,
                                                                       source=source_col_name,
                                                                       target=target_col_name,
                                                                       edge_attr=edge_attr,
                                                                       create_using=nx.DiGraph() if directed else nx.Graph())
        return lncrna2target_low_throughput_network
|
|
1163 |
|
|
|
1164 |
|
|
|
1165 |
class lncRNome(Interactions, Database):
    """Loads lncRNA-miRNA binding interactions from the lncRNome database files.

    Default file_resources: {
        "miRNA_binding_sites.txt": "<path>/miRNA_binding_sites.txt",
        "general_information.txt": "<path>/general_information.txt",
    }
    """

    def __init__(self, path, file_resources, source_col_name='Gene Name', target_col_name='Binding miRNAs',
                 edge_attr=None, directed=True, relabel_nodes=None,
                 **kwargs):
        """
        Args:
            path (): folder joined with the file names when `file_resources` is None.
            file_resources (): mapping of resource name to location; built from `path` when None.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            edge_attr (): edge attribute columns; None means
                ["miRNA Interaction Site", "Transcript ID"] (no longer a shared mutable default).
            directed (): whether the network is built as a directed graph.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            **kwargs (): forwarded unchanged to the parent constructor.
        """
        if edge_attr is None:
            edge_attr = ["miRNA Interaction Site", "Transcript ID"]
        if file_resources is None:
            file_resources = {}
            file_resources["miRNA_binding_sites.txt"] = os.path.join(path, "miRNA_binding_sites.txt")
            file_resources["general_information.txt"] = os.path.join(path, "general_information.txt")

        super().__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name,
                         directed=directed, relabel_nodes=relabel_nodes, edge_attr=edge_attr, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the graph from the miRNA binding-site table.

        miRNA names are lower-cased and anything from "-3p"/"-5p" onwards is removed.
        `filters` is not applied here.
        """
        df = pd.read_table(self.file_resources["miRNA_binding_sites.txt"], header=0)
        print(self.name(), df.columns.tolist())

        df['Binding miRNAs'] = df['Binding miRNAs'].str.lower()
        df['Binding miRNAs'] = df['Binding miRNAs'].str.replace("-3p.*|-5p.*", "", regex=True)

        # `directed` is now honoured; it was previously ignored and a DiGraph always built.
        lncRNome_miRNA_binding_sites_network = nx.from_pandas_edgelist(df, source=source_col_name,
                                                                       target=target_col_name,
                                                                       edge_attr=edge_attr,
                                                                       create_using=nx.DiGraph() if directed else nx.Graph())

        return lncRNome_miRNA_binding_sites_network

    def load_dataframe(self, file_resources, blocksize=None):
        """Read the general-information table, keeping five annotation columns."""
        return pd.read_table(self.file_resources["general_information.txt"], header=0,
                             usecols=["Gene Name", "Transcript Name", "Transcript Type", "Location", "Strand"])
|
|
1206 |
|
|
|
1207 |
|
|
|
1208 |
class NPInter(Interactions):
    """Loads the NPInter database from http://bigdata.ibp.ac.cn/npinter4/ .

    Default path: "http://bigdata.ibp.ac.cn/npinter4/download/" .
    Default file_resources: {
        "interaction_NPInterv4.expr.txt.gz": "<path>/file/interaction_NPInterv4.expr.txt.gz",
    }
    """

    def __init__(self, path="http://bigdata.ibp.ac.cn/npinter4/download/", file_resources=None,
                 source_col_name='ncName', target_col_name='tarName',
                 edge_attr=None,
                 filters=None,
                 directed=True, relabel_nodes=None, verbose=False):
        """
        Args:
            path (): folder or URL joined with the file name when `file_resources` is None.
            file_resources (): mapping of resource name to location; built from `path` when None.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            edge_attr (): edge attribute columns; None means
                ["tarType", "tissueOrCell", "tag", 'class', "level"]
                (no longer a shared mutable default).
            filters (): row filters applied before the network is built.
            directed (): whether the network is built as a directed graph.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            verbose (): forwarded unchanged to the parent constructor.
        """
        if edge_attr is None:
            edge_attr = ["tarType", "tissueOrCell", "tag", 'class', "level"]
        if file_resources is None:
            file_resources = {}
            file_resources["interaction_NPInterv4.expr.txt.gz"] = \
                os.path.join(path, "file/interaction_NPInterv4.expr.txt.gz")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, verbose=verbose)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Read the NPInter table ("-" is treated as missing) and normalise node names.

        ``ncName`` is upper-cased, a leading "LNCRNA-" is removed, "MALAT-1" becomes
        "MALAT1", and leading "MIR-"/"MICRORNA-" become "hsa-mir-". ``tarName`` is upper-cased.
        """
        df = pd.read_table(file_resources["interaction_NPInterv4.expr.txt"], header=0, na_values=["-"])
        print(self.name(), df.columns.tolist())
        df["ncName"] = df["ncName"].str.upper()
        # Remove the literal prefix only. str.strip("LNCRNA-") treats its argument as a
        # set of characters and strips them from both ends (e.g. "NEAT1" -> "EAT1").
        df["ncName"] = df["ncName"].str.replace("^LNCRNA-", "", regex=True)
        df["ncName"] = df["ncName"].str.replace("MALAT-1", "MALAT1")
        df["ncName"] = df["ncName"].str.replace("^MIR-", "hsa-mir-", regex=True)
        df["ncName"] = df["ncName"].str.replace("^MICRORNA-", "hsa-mir-", regex=True)

        df["tarName"] = df["tarName"].str.upper()

        return df

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the graph from ``self.data`` after row filtering."""
        df = self.data
        df = filter_rows(df, filters)

        lncRNome_miRNA_binding_sites_network = nx.from_pandas_edgelist(df, source=source_col_name,
                                                                       target=target_col_name,
                                                                       edge_attr=edge_attr,
                                                                       create_using=nx.DiGraph() if directed else nx.Graph())

        return lncRNome_miRNA_binding_sites_network
|
|
1254 |
|
|
|
1255 |
|
|
|
1256 |
class StarBase(Interactions):
    """lncRNA-RNA interactions read from a starBase 3.0 CSV export.

    Default file_resources: {
        "starbase_3.0_lncrna_rna_interactions.csv": "<path>/starbase_3.0_lncrna_rna_interactions.csv",
    }
    """

    def __init__(self, path, file_resources, source_col_name="geneName", target_col_name="pairGeneName",
                 min_interactionNum=1, min_expNum=1,
                 edge_attr=None, directed=True, relabel_nodes=None, **kwargs):
        """
        Args:
            path (): folder joined with the CSV file name when `file_resources` is None.
            file_resources (): mapping of resource name to location; built from `path` when None.
            source_col_name (): column used as the edge source.
            target_col_name (): column used as the edge target.
            min_interactionNum (): rows with a smaller "interactionNum" are dropped.
            min_expNum (): rows with a smaller "expNum" are dropped.
            edge_attr (): edge attribute columns; None means ["interactionNum"].
            directed (): whether the network is built as a directed graph.
            relabel_nodes (): forwarded unchanged to the parent constructor.
            **kwargs (): forwarded unchanged to the parent constructor.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["starbase_3.0_lncrna_rna_interactions.csv"] = \
                os.path.join(path, "starbase_3.0_lncrna_rna_interactions.csv")
        self.min_interactionNum = min_interactionNum
        self.min_expNum = min_expNum
        super().__init__(path, file_resources, source_col_name=source_col_name, target_col_name=target_col_name,
                         directed=directed, relabel_nodes=relabel_nodes, edge_attr=edge_attr, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the graph from the CSV after normalising miRNA names and applying the count thresholds.

        The graph is stored on ``self.starBase_RNA_RNA_network`` and returned.
        `filters` is not applied here.
        """
        df = pd.read_csv(self.file_resources["starbase_3.0_lncrna_rna_interactions.csv"], header=0)

        is_mirna = df["pairGeneType"] == "miRNA"
        mirna_names = df.loc[is_mirna, "pairGeneName"].str.lower()
        # The arm suffix pattern is a regular expression, so regex=True must be explicit.
        df.loc[is_mirna, "pairGeneName"] = mirna_names.str.replace("-3p.*|-5p.*", "", regex=True)

        df = df[df["interactionNum"] >= self.min_interactionNum]
        df = df[df["expNum"] >= self.min_expNum]

        # Honour caller-supplied edge attributes and `directed`; both were previously ignored.
        if edge_attr is None:
            edge_attr = ["interactionNum"]
        self.starBase_RNA_RNA_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                                edge_attr=edge_attr,
                                                                create_using=nx.DiGraph() if directed else nx.Graph())
        return self.starBase_RNA_RNA_network