Switch to unified view

a b/openomics/database/interaction.py
1
import copy
2
import os
3
from abc import abstractmethod
4
from collections.abc import Iterable
5
from typing import List, Dict, Any, Union, Optional
6
7
import dask.dataframe as dd
8
import networkx as nx
9
import pandas as pd
10
import scipy.sparse as ssp
11
from Bio import SeqIO
12
from logzero import logger
13
from pandas.core.dtypes.common import is_numeric_dtype
14
15
from openomics.database.base import Database
16
from openomics.database.sequence import SequenceDatabase, UniProt
17
from openomics.transforms.df import filter_rows
18
19
# Public API of this module: the concrete interaction database classes.
__all__ = ['STRING', 'GeneMania', 'IntAct', 'BioGRID', 'MiRTarBase', 'LncBase', 'TargetScan', 'TarBase',
           'LncReg', 'LncRNA2Target', 'lncRNome', 'NPInter', 'RNAInter', 'StarBase']
21
22
class Interactions(Database):
    """Abstract base for interaction databases whose edges are exposed as a NetworkX graph.

    Subclasses implement `load_network()`. The constructor calls it once and stores the returned
    graph in `self.network`.
    """
    # Edge-list table. It is only declared here; subclasses assign it while loading their data.
    edges: Optional[Union[pd.DataFrame, dd.DataFrame]]

    def __init__(self, path, file_resources: Dict, source_col_name: str = None, target_col_name: str = None,
                 edge_attr: List[str] = None, filters: Union[str, Dict[str, Union[str, List[str]]]] = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Store the edge-list settings, initialize the underlying `Database`, then build `self.network` by calling
        `load_network()`.

        Args:
            path (str):
                The folder path containing the data files.
            file_resources (dict):
                A dictionary where keys are required filenames and values are file paths. Subclasses typically
                build a default dict when the caller passes None.
            source_col_name (str):
                Column name of the DataFrame to be used as the source node names.
            target_col_name (str):
                Column name of the DataFrame to be used as the target node names.
            edge_attr (list):
                A list of column names to be included as attributes for each edge (source-target pair).
            filters (dict):
                Optional. A dict whose keys match columns of the edge table and whose values are used to filter
                on that column.
            directed (bool): default True,
                Whether to create a directed or an undirected network.
            relabel_nodes (dict): default None,
                A dictionary to rename nodes in the network, where a node named <key> is renamed to <value>.
            blocksize ():
                Forwarded unchanged to the `Database` constructor and to `load_network()`.
            **kwargs:
                Forwarded unchanged to the `Database` constructor.
        """
        self.filters = filters
        self.source_col_name = source_col_name
        self.target_col_name = target_col_name
        self.directed = directed
        self.edge_attr = edge_attr

        super().__init__(path=path, file_resources=file_resources, blocksize=blocksize, **kwargs)

        # `self.file_resources` is read back from the instance rather than using the argument;
        # presumably the base constructor stores (and possibly rewrites) it -- verify in `Database`.
        self.network = self.load_network(file_resources=self.file_resources, source_col_name=source_col_name,
                                         target_col_name=target_col_name, edge_attr=edge_attr, directed=directed,
                                         filters=filters, blocksize=blocksize)

        if relabel_nodes is not None:
            self.network = nx.relabel_nodes(self.network, mapping=relabel_nodes)

        self.close()

    @classmethod
    def name(cls):
        """Return the class name, used as the database name in messages."""
        return cls.__name__

    @abstractmethod
    def load_network(self, file_resources: Dict, source_col_name: str, target_col_name: str,
                     edge_attr: Union[str, List[str]], directed: bool, filters: Dict[str, Any], blocksize=None) \
        -> nx.Graph:
        """
        Handles data processing from `file_resources` to a DataFrame which contains edgelist data, then constructs
        and returns a NetworkX Graph.

        Args:
            file_resources: a dict of file name and file path/object
            source_col_name (str): column name of the dataframe for source in the edge
            target_col_name (str): column name of the dataframe for target in the edge
            edge_attr (list): list of str for column data to include in each edge
            directed (bool): True to return a DiGraph(), else Graph()
            filters: A dict of {column name: column values} to filter the dataframe
            blocksize (): the value given to the constructor, passed through unchanged

        Returns:
            network: a NetworkX Graph or DiGraph
        """
        raise NotImplementedError

    def get_interactions(self, nodelist=None, data=False, inclusive=True, relabel_nodes: Dict[str, str] = None):
        """
        Fetch edges from the loaded network.

        Args:
            nodelist (list):
                A list of nodes to fetch edges from. If None, all edges are returned.
            data (bool): default False
                Whether to include edge attributes.
            inclusive (bool): default True
                If True, only return edges whose endpoints are both in `nodelist` (edges of the induced
                subgraph). If False, return every edge incident to a node in `nodelist`.
            relabel_nodes (dict): default None
                Optional mapping of old node name to new node name applied before the edges are fetched.
                NOTE: the relabelling uses `copy=False`, so it modifies `self.network` in place.

        Returns:
            edges: a NetworkX edge view

        Raises:
            Exception: if `self.network` has not been assigned yet.
        """
        if not hasattr(self, "network"):
            raise Exception(
                f"{self.name()} does not have network interaction data yet. "
                "Must run load_network() and assign self.network field first.")

        graph = self.network
        if relabel_nodes:
            graph = nx.relabel_nodes(graph, relabel_nodes, copy=False)

        if nodelist is None:
            return graph.edges(data=data)
        if inclusive:
            return graph.subgraph(nodelist).edges(data=data)
        return graph.edges(nbunch=nodelist, data=data)
118
119
120
class STRING(Interactions, SequenceDatabase):
    """Loads the STRING database from https://string-db.org/ .

    Default path: "https://stringdb-static.org/download/" .
    Default file_resources (per species, among others): {
        "{species_id}.protein.info.txt.gz": f"protein.info.{version}/{species_id}.protein.info.{version}.txt.gz",
        "{species_id}.protein.aliases.txt.gz": f"protein.aliases.{version}/{species_id}.protein.aliases.{version}.txt.gz",
        "{species_id}.protein.links.txt.gz": f"protein.links.{version}/{species_id}.protein.links.{version}.txt.gz",
        "{species_id}.protein.sequences.fa.gz": f"protein.sequences.{version}/{species_id}.protein.sequences.{version}.fa.gz"
    }

    Edge attributes for protein.actions.txt include ["mode", 'action', 'is_directional', 'a_is_acting', "score"]
    Edge attributes for protein.links.txt include ["combined_score"]
    """
    COLUMNS_RENAME_DICT = {
        "#string_protein_id": "string_protein_id",
        "protein_external_id": "protein_id",
        "preferred_name": "gene_name",
        '#ncbi_taxid': 'species_id',
        'string_protein_id_2': 'homologous_protein_id',
    }

    def __init__(self, path="https://stringdb-static.org/download/", file_resources=None,
                 species_id: Union[str, List[str]] = "9606", version="v11.0",
                 source_col_name="protein1", target_col_name="protein2",
                 edge_attr: Union[str, List[str]] = 'combined_score', directed=False,
                 relabel_nodes=None,
                 index_col='#string_protein_id',
                 keys=None,
                 alias_types={'Ensembl_UniProt', 'Ensembl_UniProt_AC'},
                 blocksize=None, **kwargs):
        """
        Build the default STRING file resources (or filter the provided ones by species) and load the database.

        Args:
            path (): Base URL or folder that the default file paths are joined onto.
            file_resources (): Optional dict of filename to path. If None, defaults are built from `species_id`
                and `version`. If given, only entries whose filename starts with one of the species ids are kept.
            species_id (): A species id string or a list of species ids.
                Provide a species_id string or a list of species_id's to download the species-specific STRING
                dataset, and integrate them. If species_id is None (or empty), then download the full-dataset
                version of STRING, which is very time-consuming.
            version (): STRING release string used in the default file paths.
            source_col_name (): Edge table column holding the source node.
            target_col_name (): Edge table column holding the target node.
            edge_attr (): Name of the edge score column. Must be a str (enforced by an assertion).
            directed (): Whether to build a directed graph.
            relabel_nodes (): Optional node renaming dict applied after the network is built.
            index_col (): Column of the protein info table used as the index; forwarded to the base class.
            keys (): Optional collection of index values to keep; forwarded to the base class.
            alias_types (): Set of `source` values of the aliases file to aggregate aliases for.
            blocksize (): If truthy, tables are read with Dask instead of Pandas.
        """
        self.version = version
        self.species_id = copy.copy(species_id)
        self.alias_types = alias_types
        assert isinstance(edge_attr, str)

        # Normalize to a list so a single id such as "9606" is never iterated character by character.
        if isinstance(species_id, str):
            species_list = [species_id] if species_id else []
        elif isinstance(species_id, Iterable):
            species_list = list(species_id)
        else:
            species_list = []

        if file_resources is None:
            file_resources = {}
            if species_list:
                # Every per-species table follows the same "<type>.<version>/<species>.<type>.<version>.txt.gz" layout.
                table_types = ("protein.info", "protein.links", "protein.links.detailed", "protein.homology",
                               "clusters.proteins", "protein.aliases", "enrichment.terms")
                for species in species_list:
                    for table in table_types:
                        file_resources[f"{species}.{table}.txt.gz"] = \
                            os.path.join(path, f"{table}.{version}/{species}.{table}.{version}.txt.gz")
                    file_resources[f"{species}.protein.sequences.fa.gz"] = \
                        os.path.join(path, f"protein.sequences.{version}/{species}.protein.sequences.{version}.fa.gz")
            else:
                file_resources["protein.info.txt.gz"] = os.path.join(path, f"protein.info.{version}.txt.gz")
                file_resources["protein.links.txt.gz"] = os.path.join(path, f"protein.links.{version}.txt.gz")
                file_resources["protein.sequences.fa.gz"] = os.path.join(path, f"protein.sequences.{version}.fa.gz")
        elif species_list:
            file_resources = {fn: fp for fn, fp in file_resources.items()
                              if any(fn.startswith(species) for species in species_list)}

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, directed=directed,
                         relabel_nodes=relabel_nodes, blocksize=blocksize, index_col=index_col, keys=keys,
                         col_rename=STRING.COLUMNS_RENAME_DICT, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """
        Load the protein info table(s) as the node table and join the per-species accessory annotations.

        Args:
            file_resources: dict of filename to path/object.
            blocksize: if truthy, read with Dask; otherwise read with Pandas.

        Returns:
            A Pandas or Dask DataFrame concatenating the info tables of all species.

        Raises:
            Exception: if no protein info file was found in `file_resources`.
        """
        # Protein sizes routinely exceed the int8 range, so a wider integer type is required.
        info_dtype = {'protein_size': 'int32'}
        na_values = ['annotation not available']

        dfs = []
        if blocksize:
            info_files = [fn for fn, path in file_resources.items()
                          if 'info.txt' in fn and isinstance(path, str)]
            for filename in info_files:
                compression = 'gzip' if filename.endswith(".gz") else None
                info_df = dd.read_table(file_resources[filename], na_values=na_values,
                                        low_memory=True, compression=compression,
                                        dtype=info_dtype,
                                        blocksize=None if isinstance(blocksize, bool) else blocksize)

                if self.keys is not None:
                    info_df = info_df.loc[info_df[self.index_col].isin(self.keys)]

                if self.index_col:
                    info_df = info_df.set_index(self.index_col, sorted=True)

                # Join other attributes to node_info. Accessory tables are always read with Pandas here.
                species_id = filename.split(".")[0]
                attrs = self.load_accessory_data(file_resources, species_id=species_id,
                                                 alias_types=self.alias_types, blocksize=False)
                if attrs is not None:
                    new_cols = attrs.columns.difference(info_df.columns)
                    info_df = info_df.join(attrs[new_cols], on=self.index_col)

                dfs.append(info_df)
        else:
            for filename in file_resources:
                if not filename.endswith("protein.info.txt"):
                    continue

                info_df = pd.read_table(file_resources[filename], na_values=na_values,
                                        dtype=info_dtype,
                                        index_col=self.index_col, low_memory=True)

                # When '#string_protein_id' was consumed as the index it is no longer a column,
                # so fall back to the index values.
                if '#string_protein_id' in info_df.columns:
                    protein_ids = info_df['#string_protein_id']
                else:
                    protein_ids = info_df.index.to_series()
                index_split = protein_ids.str.split(".", expand=True, n=1)
                info_df = info_df.assign(species_id=index_split[0], protein_embl_id=index_split[1])

                # Join other attributes to node_info
                species_id = filename.split(".")[0]
                attrs = self.load_accessory_data(file_resources, species_id=species_id,
                                                 alias_types=self.alias_types,
                                                 blocksize=blocksize)
                if attrs is not None:
                    new_cols = attrs.columns.difference(info_df.columns)
                    info_df = info_df.join(attrs[new_cols], on=self.index_col)
                dfs.append(info_df)

        if not dfs:
            raise Exception("Must provide at least one 'protein.info.txt' file.")

        if blocksize:
            protein_info: dd.DataFrame = dd.concat(dfs, axis=0, interleave_partitions=True)
        else:
            protein_info = pd.concat(dfs, axis=0)

        return protein_info

    def load_accessory_data(self, file_resources: Dict[str, str], species_id: str,
                            accessory_files=['protein.aliases', 'protein.homology', 'protein.enrichment',
                                             'clusters.proteins'],
                            alias_types={'Ensembl_UniProt', 'Ensembl_UniProt_AC'}, blocksize=False, ) \
        -> Union[pd.DataFrame, dd.DataFrame]:
        """
        Stack the annotations files for the provided `species_id`, such that rows in the annotations are filtered by
        `keys` (if not null), indexed by "#string_protein_id", and with attributes transformed to dataframe columns.

        Args:
            file_resources (): a dict of filename and filepath
            species_id (str): the species_id string which is used to select only files that have the same prefix.
            accessory_files (List[str]):
                A list of strings that specify which types of annotation files to integrate, i.e., only select files
                having a substring matching one of these.
                Default ['protein.aliases', 'protein.homology', 'protein.enrichment', 'clusters.proteins'].
            alias_types (): a set of string, default {'Ensembl_UniProt', 'Ensembl_UniProt_AC'}
                A set of `source` values in the `protein.aliases` annotation to aggregate `alias`'s for.
                Must be a subset of {'Ensembl_Source', 'Ensembl_gene', 'Ensembl_transcript', 'Ensembl_UniGene',
                    'Ensembl_RefSeq_short', 'Ensembl_RefSeq', 'Ensembl_OTTG', 'Ensembl_OTTP', 'Ensembl_UCSC',
                    'Ensembl_UniProt', 'Ensembl_UniProt_AC', 'Ensembl_EntrezGene', 'Ensembl_EMBL', 'Ensembl_protein_id'}
            blocksize (bool): If truthy, read with Dask. Recommended to use Pandas to avoid unnecessary overhead.

        Returns:
            dd.DataFrame or pd.DataFrame with one row per protein, or None if no annotation table was loaded.
        """
        allowed_prefixes = {'protein.aliases', 'protein.homology', 'protein.enrichment', 'clusters.proteins'}
        unsupported = set(accessory_files).difference(allowed_prefixes)
        if unsupported:
            logger.warning(f'{unsupported} files are not supported')

        select_files = [fn for fn in file_resources
                        if fn.startswith(species_id) and any(ftype in fn for ftype in accessory_files)]

        # Reader options shared by every annotation table.
        read_args = dict(
            low_memory=True,
            dtype={'cluster_id': 'category', '#ncbi_taxid': 'category', 'category': 'category',
                   'source': 'category'})

        dfs = []
        for filename in select_files:
            if blocksize:
                # Dask can only read from a path string, not from an already opened file object.
                if not isinstance(file_resources[filename], str):
                    continue
                compression = 'gzip' if filename.endswith(".gz") else None
                df = dd.read_table(file_resources[filename], compression=compression, **read_args)
            else:
                df = pd.read_table(file_resources[filename], **read_args)

            # Index by whichever protein id column this table provides
            for col in ['#string_protein_id', 'protein_id', '#string_protein_1']:
                if col in df.columns:
                    df = df.set_index(col, sorted=True) if blocksize else df.set_index(col)
                    break

            # Skip tables without a protein id column; otherwise align the index name with self.index_col
            if df.index.name is None:
                continue
            elif self.index_col and df.index.name != self.index_col:
                df.index = df.index.rename(self.index_col)
            if blocksize:
                assert df.known_divisions

            # Filter rows
            if self.keys is not None:
                df = df.loc[df.index.isin(self.keys)]

            # Groupby on index and perform appropriate transforms depending on the annotation type
            if 'protein.homology' in filename:
                # Drop self-matches before collecting the homologous protein ids
                df = df.loc[df.index != df['string_protein_id_2']]
                df = df.groupby(self.index_col)['string_protein_id_2'].unique().to_frame()
                # TODO ignored column of size of homologous regions

            elif 'clusters.protein' in filename:
                # NOTE(review): `.unique()` on a multi-column groupby selection may not exist in
                # pandas (it is a SeriesGroupBy method) -- confirm this branch runs as intended.
                df = df.groupby(self.index_col)[['cluster_id', '#ncbi_taxid']].unique()

            elif 'protein.enrichment' in filename:
                df = df.groupby(self.index_col)['term'].unique().to_frame()

            elif 'protein.aliases' in filename:
                df = df.loc[df['source'].isin(alias_types)]
                df['source'] = df['source'].cat.set_categories(alias_types)
                if blocksize:
                    # Set alias values to lists so pivot_table(..., aggfunc='sum') will concatenate them
                    df = df.assign(alias=df['alias'].map(lambda x: [x], meta=pd.Series([[""]])))
                    df = dd.pivot_table(df.reset_index(),
                                        index='#string_protein_id', columns='source', values='alias', aggfunc='sum')
                else:
                    df = df.reset_index().groupby([self.index_col, 'source'])['alias'].unique().unstack(level=1)

            if blocksize and not df.known_divisions:
                df.divisions = df.compute_current_divisions()

            if not len(df.index):
                continue

            dfs.append(df)

        if not dfs:
            return None

        return dd.concat(dfs, axis=1) if blocksize else pd.concat(dfs, axis=1)

    def load_network(self, file_resources, source_col_name='protein1', target_col_name='protein2',
                     edge_attr: Union[str, List[str]] = 'combined_score', directed=False, filters=None, blocksize=None):
        """
        Read every "links" file, keep edges whose endpoints are both in the node table index, and build the graph.

        Args:
            file_resources: dict of filename to path/object.
            source_col_name (str): edge table column holding the source protein.
            target_col_name (str): edge table column holding the target protein.
            edge_attr: score column name (or list of names) to attach to the edges; numeric score columns
                are divided by 1000.
            directed (bool): True to build a DiGraph, else a Graph.
            filters: filters passed to `filter_rows` on the edge table.
            blocksize: if truthy, read with Dask and assemble the graph from per-partition sparse matrices.

        Returns:
            A NetworkX graph, or None when no links file could be read.
        """
        keys = self.data.index.compute() if isinstance(self.data, dd.DataFrame) else self.data.index
        select_files = [fn for fn in file_resources if "links" in fn]

        # STRING scores are integers in 0..1000, which do not fit in uint8.
        score_cols = ['neighborhood', 'fusion', 'cooccurence', 'coexpression', 'experimental', 'database',
                      'textmining', 'combined_score']
        dtype = {'protein1': 'category', 'protein2': 'category'}
        dtype.update({col: 'uint16' for col in score_cols})
        read_args = dict(sep=" ", low_memory=True, dtype=dtype)

        # Load edges
        edges_dfs = []
        for filename in select_files:
            if blocksize:
                # Dask can only read from a path string, not from an already opened file object.
                if not isinstance(file_resources[filename], str):
                    continue
                compression = 'gzip' if filename.endswith(".gz") else None
                df: dd.DataFrame = dd.read_table(file_resources[filename], compression=compression, **read_args,
                                                 blocksize=None if isinstance(blocksize, bool) else blocksize)

                if compression:
                    logger.info(f"Repartitioning {filename} from {df.npartitions} "
                                f"partitions to {blocksize}-size partitions")
                    df = df.repartition(partition_size=blocksize)
            else:
                df = pd.read_table(file_resources[filename], **read_args)

            df = df.loc[df[source_col_name].isin(keys) & df[target_col_name].isin(keys)]
            edges_dfs.append(df)

        if not edges_dfs:
            return None

        # Concatenate multiple edgelists into dataframe
        edges_df = dd.concat(edges_dfs, axis=0) if blocksize else pd.concat(edges_dfs, axis=0)
        edges_df = edges_df.rename(columns=self.COLUMNS_RENAME_DICT)
        logger.info(f"{self.name()}-{self.species_id}: {edges_df.columns.tolist()}, {edges_df.shape}")

        # Convert edge_attr (edge weights) from 3 digit integer to float
        assignfunc = {}
        for col in (edge_attr if isinstance(edge_attr, list) else [edge_attr]):
            if col in edges_df.columns and is_numeric_dtype(edges_df[col]):
                assignfunc[col] = edges_df[col].astype('float16') / 1000
        if assignfunc:
            edges_df = edges_df.assign(**assignfunc)

        edges_df = filter_rows(edges_df, filters=filters)

        self.edges = edges_df
        # Set ordering for rows and columns
        node2idx = {node: i for i, node in enumerate(keys)}

        if isinstance(edges_df, dd.DataFrame):
            def edgelist2adj(df: pd.DataFrame) -> ssp.coo_matrix:
                # Presumably skips the one-row placeholder frame Dask uses to infer metadata -- TODO confirm.
                if df.shape[0] == 1 and df.iloc[0, 0] == 'foo':
                    return None

                df = df.assign(row=df[source_col_name].map(node2idx).astype('int'),
                               col=df[target_col_name].map(node2idx).astype('int'))
                df = df.dropna(subset=['row', 'col'])

                if df.shape[0] == 0:
                    return None

                coo_adj = ssp.coo_matrix((df[edge_attr], (df['row'], df['col'])),
                                         shape=(len(keys), len(keys)))
                coo_adj.eliminate_zeros()
                return coo_adj

            # Create a sparse adjacency matrix for each partition, then add them to combine
            adj = edges_df.reduction(chunk=edgelist2adj,
                                     aggregate=lambda x: x.dropna().sum() if not x.isna().all() else None,
                                     meta=pd.Series([ssp.coo_matrix])).compute()
            assert len(adj) == 1, f"len(adj) = {len(adj)}"

            # `from_scipy_sparse_matrix` was removed in networkx 3.0; prefer its replacement when available.
            from_sparse = getattr(nx, "from_scipy_sparse_array", None) or nx.from_scipy_sparse_matrix
            G = from_sparse(adj[0], create_using=nx.DiGraph() if directed else nx.Graph(),
                            edge_attribute='weight')
            idx2node = {i: node for i, node in enumerate(keys)}
            G = nx.relabel_nodes(G, mapping=idx2node, copy=True)
            del adj

        else:
            # Determine which edge attr to add
            if isinstance(edge_attr, (list, tuple)):
                # list(...) so that a tuple of attribute names can be concatenated as well
                cols = edges_df.columns.intersection(list(edge_attr) + [source_col_name, target_col_name])
                edges_df = edges_df[cols]
                use_attrs = True
            elif isinstance(edge_attr, str):
                cols = edges_df.columns.intersection([source_col_name, target_col_name, edge_attr])
                edges_df = edges_df[cols]
                use_attrs = edge_attr
            else:
                use_attrs = False
            G = nx.from_pandas_edgelist(edges_df, source=source_col_name, target=target_col_name,
                                        edge_attr=use_attrs, create_using=nx.DiGraph() if directed else nx.Graph())

        return G

    def get_sequences(self, index="protein_id", omic=None, agg=None):
        """
        Parse the protein FASTA file into a dict of key to sequence string.

        The result is cached in `self.seq_dict`; later calls return the cached dict regardless of `index`.

        Args:
            index (str): "protein_id" to key by the FASTA record name, or "protein_name" to map the record
                name through `self.protein_id2name`.
            omic: unused.
            agg: unused.

        Returns:
            dict of key to sequence string. When several records share a key, the last one wins and the
            number of collisions is logged.

        Raises:
            ValueError: if `index` is not one of the supported values.
        """
        if hasattr(self, "seq_dict"):
            return self.seq_dict

        if index not in ("protein_id", "protein_name"):
            raise ValueError(f"index must be 'protein_id' or 'protein_name', got {index!r}")

        self.seq_dict = {}
        collisions = 0
        for record in SeqIO.parse(self.file_resources["protein.sequences.fa"], "fasta"):
            gene_id = str(record.name)
            key = self.protein_id2name[gene_id] if index == "protein_name" else gene_id

            if key in self.seq_dict:
                collisions += 1

            self.seq_dict[key] = str(record.seq)

        logger.warning("Seq {} collisions: {}".format(index, collisions))
        return self.seq_dict
493
494
495
class GeneMania(Interactions):
    """Loads the GeneMania database from a local directory.

    Default path: local_directory .
    Default file_resources: {
        "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt": "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt",
        "identifier_mappings.txt": "identifier_mappings.txt",
    }
    """

    def __init__(self, path, file_resources=None, source_col_name="Gene_A", target_col_name="Gene_B",
                 edge_attr=None, filters=None, directed=True, relabel_nodes=None, **kwargs):
        """
        Args:
            path (): Folder containing the GeneMania files.
            file_resources (): Optional dict of filename to path; defaults to the two files listed on the class.
            source_col_name (): Edge table column holding the source gene.
            target_col_name (): Edge table column holding the target gene.
            edge_attr (): Edge attribute columns; defaults to ["Weight"].
            filters (): Stored by the base class; not applied by `load_network()`.
            directed (): Whether to build a directed graph.
            relabel_nodes (): Optional node renaming dict applied after the network is built.
            **kwargs (): Forwarded to the base class.
        """
        if edge_attr is None:
            edge_attr = ["Weight"]
        if file_resources is None:
            file_resources = {
                fn: os.path.join(path, fn)
                for fn in ("COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt", "identifier_mappings.txt")
            }

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """
        Read the combined network table, translate identifiers to gene names, and build the graph.

        Args:
            file_resources: dict of filename to path/object.
            source_col_name (str): column holding the source gene.
            target_col_name (str): column holding the target gene.
            edge_attr: column name(s) to attach to each edge.
            directed (bool): True to build a DiGraph, else a Graph.
            filters: unused here.
            blocksize: unused here.

        Returns:
            A NetworkX DiGraph or Graph.
        """
        interactions = pd.read_table(file_resources["COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt"], low_memory=True)
        identifier = pd.read_table(file_resources["identifier_mappings.txt"])

        # Rename ENSG ID's to gene names
        identifier = identifier[identifier["Source"] == "Gene Name"]
        id_mapping = pd.Series(identifier["Name"].values, index=identifier["Preferred_Name"]).to_dict()
        interactions.replace(id_mapping, inplace=True)

        # Honor `directed` (it was previously ignored and a DiGraph was always built).
        return nx.from_pandas_edgelist(interactions, source=source_col_name,
                                       target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=nx.DiGraph() if directed else nx.Graph())
535
536
537
class IntAct(Interactions):
    """Interaction database stub for IntAct; it only forwards its arguments to `Interactions`."""

    def __init__(self, path, file_resources: Dict, source_col_name: str = None, target_col_name: str = None,
                 source_index: str = None, target_index: str = None, edge_attr: List[str] = None, filters: dict = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (): Folder containing the data files.
            file_resources (): Dict of filename to path.
            source_col_name (): Edge table column holding the source node.
            target_col_name (): Edge table column holding the target node.
            source_index (): Accepted but not used.
            target_index (): Accepted but not used.
            edge_attr (): Edge attribute columns.
            filters (): Filters on the edge table columns.
            directed (): Whether to build a directed graph.
            relabel_nodes (): Optional node renaming dict applied after the network is built.
            blocksize (): Forwarded to the base class.
            **kwargs (): Forwarded to the base class.
        """
        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, blocksize=blocksize, **kwargs)
544
545
546
class BioGRID(Interactions):
    """Loads the BioGRID database from https://thebiogrid.org .

    Default path: "https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/" .
    Default file_resources: {
        "BIOGRID-ALL-LATEST.tab2.zip": "BIOGRID-ALL-LATEST.tab2.zip",
    }
    """

    def __init__(self, path="https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/",
                 file_resources=None, source_col_name="Official Symbol Interactor A",
                 target_col_name="Official Symbol Interactor B",
                 edge_attr=['Score', 'Throughput', 'Experimental System', 'Experimental System Type'],
                 filters=None, directed=False, relabel_nodes=None, **kwargs):
        """
        Args:
            path (): Base URL or folder that the default file path is joined onto.
            file_resources (): Optional dict of filename to path; defaults to the BioGRID "tab2" archive.
            source_col_name (): Edge table column holding the source node.
            target_col_name (): Edge table column holding the target node.
            edge_attr (): Edge table columns to attach to each edge.
            filters (): Default None, example {"Organism Interactor A": 9606}.
            directed (): Whether to build a directed graph.
            relabel_nodes (): Optional node renaming dict applied after the network is built.
            **kwargs (): Forwarded to the base class.
        """
        if file_resources is None:
            file_resources = {
                "BIOGRID-ALL-LATEST.tab2.zip": os.path.join(path, "BIOGRID-ALL-LATEST.tab2.zip"),
            }

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """
        Read the BioGRID "tab2" table and keep it as the edge table in `self.edges`.

        Args:
            file_resources: dict of filename to path/object; the "BIOGRID-ALL-LATEST.tab2" entry is read.
            blocksize: if truthy, read with Dask using this block size; otherwise read with Pandas.

        Returns:
            The edge table as a Pandas or Dask DataFrame.
        """
        # Identifier-like and low-cardinality columns are read as categoricals.
        category_cols = [
            'Entrez Gene Interactor A', 'Entrez Gene Interactor B',
            'BioGRID ID Interactor A', 'BioGRID ID Interactor B',
            'Systematic Name Interactor A', 'Systematic Name Interactor B',
            'Official Symbol Interactor A', 'Official Symbol Interactor B',
            'Throughput', 'Experimental System Type', 'Experimental System',
            'Modification', 'Source Database',
            'Organism Interactor A', 'Organism Interactor B',
        ]
        dtype = {'Score': 'float', 'Pubmed ID': 'str'}
        dtype.update({col: 'category' for col in category_cols})
        read_args = dict(na_values=["-"], header=0, low_memory=True, dtype=dtype)

        table = file_resources["BIOGRID-ALL-LATEST.tab2"]
        if blocksize:
            edges = dd.read_table(table, blocksize=blocksize, **read_args)
        else:
            edges = pd.read_table(table, **read_args)

        self.edges = edges

        return edges

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """
        Build the graph from the edge table stored in `self.edges` by `load_dataframe()`.

        Args:
            file_resources: unused here.
            source_col_name (str): column holding the source node.
            target_col_name (str): column holding the target node.
            edge_attr: column name(s) to attach to each edge.
            directed (bool): True to build a DiGraph, else a Graph.
            filters: filters passed to `filter_rows` on the edge table.
            blocksize: unused here.

        Returns:
            A NetworkX DiGraph or Graph.
        """
        filtered = filter_rows(self.edges, filters)
        graph_type = nx.DiGraph() if directed else nx.Graph()
        return nx.from_pandas_edgelist(filtered, source=source_col_name, target=target_col_name,
                                       edge_attr=edge_attr,
                                       create_using=graph_type)
614
615
616
class MiRTarBase(Interactions):
    """Loads miRNA-target interactions from the ``miRTarBase_MTI.xlsx`` spreadsheet.

    Default path: "http://mirtarbase.mbc.nctu.edu.tw/cache/download/7.0/" .
    Default file_resources: {
        "miRTarBase_MTI.xlsx": "miRTarBase_MTI.xlsx",
    }
    """

    def __init__(self, path="http://mirtarbase.mbc.nctu.edu.tw/cache/download/7.0/", file_resources=None,
                 source_col_name="miRNA", target_col_name="Target Gene",
                 edge_attr=None,
                 filters=None,
                 directed=True,
                 relabel_nodes=None,
                 strip_mirna_name=False, **kwargs):
        """
        Args:
            path (str): folder or base URL containing ``miRTarBase_MTI.xlsx``.
            file_resources (dict): maps resource names to file paths. When None,
                a single entry for ``miRTarBase_MTI.xlsx`` under ``path`` is created.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (list): edge attribute columns. Defaults to ["Support Type"].
            filters (dict): default None, example {"Species (Target Gene)": "Homo sapiens"}
            directed (bool): whether the network is built as a directed graph.
            relabel_nodes (dict): forwarded to the parent constructor.
            strip_mirna_name (bool): when true, ``load_network`` lower-cases the
                miRNA names and removes everything from "-3p"/"-5p" onwards.
            **kwargs: forwarded to the parent constructor.
        """
        # Must be set before the parent constructor runs, since load_network reads it.
        self.strip_mirna_name = strip_mirna_name

        attributes = ["Support Type"] if edge_attr is None else edge_attr
        resources = file_resources
        if resources is None:
            resources = {"miRTarBase_MTI.xlsx": os.path.join(path, "miRTarBase_MTI.xlsx")}

        super().__init__(path=path, file_resources=resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=attributes, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Read the miRTarBase spreadsheet, store it in ``self.edges`` and return it.

        The file is looked up in ``self.file_resources``; the ``file_resources``
        and ``blocksize`` arguments are not used by this implementation.
        """
        mti_table = pd.read_excel(self.file_resources["miRTarBase_MTI.xlsx"])
        self.edges = mti_table
        return mti_table

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the miRNA-target graph from ``self.data``.

        Rows are filtered with ``filter_rows``, trailing ``*`` characters are
        removed from the miRNA names and, if ``self.strip_mirna_name`` is set,
        the names are lower-cased and truncated at "-3p"/"-5p".

        Returns:
            A ``nx.DiGraph`` when ``directed`` is true, otherwise a ``nx.Graph``.
        """
        interactions = filter_rows(self.data, filters)

        # Drop the trailing '*' marker from the miRNA names.
        interactions['miRNA'] = interactions['miRNA'].str.rstrip('*')

        if self.strip_mirna_name:
            lowered = interactions['miRNA'].str.lower()
            interactions['miRNA'] = lowered.str.replace("-3p.*|-5p.*", "", regex=True)

        graph_container = nx.DiGraph() if directed else nx.Graph()
        return nx.from_pandas_edgelist(interactions, source=source_col_name, target=target_col_name,
                                       edge_attr=edge_attr, create_using=graph_container)
681
682
683
class LncBase(Interactions, Database):
    """Loads the LncBase database from http://carolina.imis.athena-innovation.gr/diana_tools/web/index.php?r=lncbasev2%2Findex .

    Default path: 'https://dianalab.e-ce.uth.gr/downloads/' .
    Default file_resources: {
        "LncBasev2_download.csv": "lncbase_v2_exp_data.tar.gz",
    }
    """

    def __init__(self, path='https://dianalab.e-ce.uth.gr/downloads/', file_resources=None, strip_mirna_name=False,
                 source_col_name="mirna", target_col_name="geneId",
                 edge_attr=None,
                 filters=None,
                 directed=True,
                 relabel_nodes=None, ):
        """
        Args:
            path (str): folder or base URL containing the LncBase download.
            file_resources (dict): maps resource names to file paths. When None,
                "LncBasev2_download.csv" is mapped to "lncbase_v2_exp_data.tar.gz"
                under ``path``.
            strip_mirna_name (bool): when true, ``load_network`` lower-cases the
                miRNA names and removes everything from "-3p"/"-5p" onwards.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (list): edge attribute columns. Defaults to
                ["tissue", "positive_negative"].
            filters (dict): default None. Example: {"species": "Homo sapiens"}
            directed (bool): whether the network is built as a directed graph.
            relabel_nodes (dict): forwarded to the parent constructor.
        """
        # Must be set before the parent constructor runs, since load_network reads it.
        self.strip_mirna_name = strip_mirna_name

        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        if file_resources is None:
            file_resources = {}
            file_resources["LncBasev2_download.csv"] = os.path.join(path, "lncbase_v2_exp_data.tar.gz")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes)

    def get_rename_dict(self, from_index="geneId", to_index="geneName"):
        """Return a dict mapping values of column ``from_index`` to values of ``to_index``.

        The table is re-read from ``self.file_resources["LncBasev2_download.csv"]``.
        Previously both arguments were ignored and the mapping was always
        geneId -> geneName; the defaults keep that behavior.

        Args:
            from_index (str): column whose values become the dict keys.
            to_index (str): column whose values become the dict values.

        Returns:
            dict: ``{from_index value: to_index value}``.
        """
        lncbase_df = pd.read_table(self.file_resources["LncBasev2_download.csv"], low_memory=True)
        return pd.Series(lncbase_df[to_index].values, index=lncbase_df[from_index]).to_dict()

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Read the LncBase table and normalise the capitalisation of two species names.

        ``blocksize`` is not used by this implementation.
        """
        df = pd.read_table(file_resources["LncBasev2_download.csv"], low_memory=True)
        # Use the "Genus species" capitalisation for the two species spelled differently in the file.
        df.replace({"species": {"Homo Sapiens": "Homo sapiens", "Mus Musculus": "Mus musculus"}}, inplace=True)
        return df

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the graph from ``self.data`` after applying ``filters``.

        If ``self.strip_mirna_name`` is set, the ``mirna`` column is lower-cased
        and truncated at "-3p"/"-5p". ``edge_attr`` falls back to
        ["tissue", "positive_negative"] when None.

        Returns:
            A ``nx.DiGraph`` when ``directed`` is true, otherwise a ``nx.Graph``.
        """
        df = self.data
        df = filter_rows(df, filters)

        if self.strip_mirna_name:
            df['mirna'] = df['mirna'].str.lower()
            df['mirna'] = df['mirna'].str.replace("-3p.*|-5p.*", "", regex=True)

        if edge_attr is None:
            edge_attr = ["tissue", "positive_negative"]
        lncBase_lncRNA_miRNA_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                               edge_attr=edge_attr,
                                                               create_using=nx.DiGraph() if directed else nx.Graph())
        return lncBase_lncRNA_miRNA_network
751
752
753
class TarBase(Interactions):
    """Loads miRNA-gene interactions from the TarBase v8 download.

    Default path: 'https://dianalab.e-ce.uth.gr/downloads' .
    Default file_resources: {
        'tarbase_v8_data.tar.gz': 'https://dianalab.e-ce.uth.gr/downloads/tarbase_v8_data.tar.gz',
        'speclist': 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/speclist',
    }
    """

    def __init__(self, path='https://dianalab.e-ce.uth.gr/downloads', file_resources: Dict = None,
                 source_col_name: str = 'mirna', target_col_name: str = 'geneName',
                 edge_attr: List[str] = None, filters: Union[str, Dict[str, Union[str, List[str]]]] = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (str): folder or base URL of the database; forwarded to the parent.
            file_resources (dict): maps resource names to file paths/URLs. When
                None, the TarBase v8 archive and the UniProt ``speclist`` are used.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (list): edge attribute columns.
            filters (str or dict): row filters forwarded to the parent.
            directed (bool): whether the network is built as a directed graph.
            relabel_nodes (dict): forwarded to the parent constructor.
            blocksize: forwarded to the parent constructor.
            **kwargs: forwarded to the parent constructor.
        """
        if file_resources is None:
            file_resources = {
                'tarbase_v8_data.tar.gz': 'https://dianalab.e-ce.uth.gr/downloads/tarbase_v8_data.tar.gz',
                'speclist': 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/speclist',
            }

        super().__init__(path, file_resources, source_col_name, target_col_name, edge_attr, filters, directed,
                         relabel_nodes, blocksize, **kwargs)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: int = None) -> pd.DataFrame:
        """Read the TarBase table and, when a ``speclist`` resource is given, add a ``species_id`` column.

        The result is stored in ``self.edges`` and returned. ``blocksize`` is
        not used by this implementation.
        """
        edges = pd.read_table(file_resources['tarbase_v8_data.tar.gz'], compression='tar',
                              dtype={'tissue': 'category', 'method': 'category', 'positive_negative': 'category',
                                     'species': 'category',
                                     'direct_indirect': 'category', 'up_down': 'category', 'cell_line': 'category',
                                     })

        if 'speclist' in file_resources:
            # Map every known species name (scientific, common or synonym) to its NCBI taxon id.
            species_df = UniProt.get_species_list(file_resources['speclist'])
            species_df = species_df[['Official (scientific) name', 'Common name', 'Synonym']].melt(ignore_index=False)
            species_df = species_df.dropna().reset_index()
            species_name2id = species_df.set_index('value')['NCBI-taxon'].to_dict()
            edges['species_id'] = edges['species'].map(species_name2id)

        self.edges = edges
        return edges

    def load_network(self, file_resources: Dict, source_col_name: str, target_col_name: str, edge_attr: List[str],
                     directed: bool, filters: Dict[str, Any], blocksize=None):
        """Build the graph from ``self.data`` after cleaning up the ``geneName`` column.

        The regex patterns are raw strings (the previous non-raw ``'\\('``
        literals were invalid escape sequences), and missing gene names are
        treated as non-matching (``na=False``) so the boolean masks never
        contain NaN.

        Returns:
            A ``nx.DiGraph`` when ``directed`` is true, otherwise a ``nx.Graph``.
        """
        df = self.data
        df = filter_rows(df, filters)

        # Remove parenthesis containing 3 letter species name
        df['geneName'] = df['geneName'].str.replace(r'(\(\w{3}\)){1}$', '', regex=True)

        # Remove "(N of M)" markers.
        idx = df['geneName'].str.contains(r'\(', na=False)
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'].str.replace(r'(\(\d of \d\))', '', regex=True).str.strip()

        # Where a "(word)" group is present, keep the word inside the parentheses.
        idx = df['geneName'].str.contains(r"\(\w*\)", regex=True, na=False)
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'].str.extract(r'\((\w*)\)(\w*)')[0]

        # For any remaining parenthesis, keep only the text before it.
        idx = df['geneName'].str.contains(r'\(', na=False)
        df.loc[idx, 'geneName'] = df.loc[idx, 'geneName'].str.split('(', expand=True)[0]

        g = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                    edge_attr=edge_attr,
                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return g
822
823
824
class RNAInter(Interactions):
    """Loads RNA interactions from the RNAInter ``Download_data_*`` tables.

    Default path: 'http://www.rnainter.org/raidMedia/download/' .
    Default file_resources: {
        'Download_data_RR.tar.gz': 'Download_data_RR.tar.gz',
        'Download_data_RP.tar.gz': 'Download_data_RP.tar.gz',
    }
    """

    def __init__(self, path='http://www.rnainter.org/raidMedia/download/', file_resources: Dict = None,
                 source_col_name: str = 'Interactor1.Symbol', target_col_name: str = 'Interactor2.Symbol',
                 edge_attr: Union[str, List[str]] = 'score',
                 filters: Union[str, Dict[str, Union[str, List[str]]]] = None,
                 directed: bool = True, relabel_nodes: dict = None, blocksize=None, **kwargs):
        """
        Args:
            path (str): folder or base URL of the database; forwarded to the parent.
            file_resources (dict): maps resource names to file paths. When None,
                the RR and RP archives are used.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (str or list): edge attribute column(s). Defaults to 'score'.
            filters (str or dict): row filters forwarded to the parent.
            directed (bool): whether the network is built as a directed graph.
            relabel_nodes (dict): forwarded to the parent constructor.
            blocksize: forwarded to the parent constructor.
            **kwargs: forwarded to the parent constructor.
        """
        if file_resources is None:
            file_resources = {
                'Download_data_RR.tar.gz': 'Download_data_RR.tar.gz',
                'Download_data_RP.tar.gz': 'Download_data_RP.tar.gz',
            }

        super().__init__(path, file_resources, source_col_name, target_col_name, edge_attr, filters, directed,
                         relabel_nodes, blocksize, **kwargs)

    def load_dataframe(self, file_resources: Dict, blocksize: int = None) -> pd.DataFrame:
        """Read every ``Download_data*`` resource and return them as one edge table.

        Previously each file overwrote the table read before it, so only the
        last resource survived; the tables are now concatenated. The result is
        filtered with ``self.filters``, stored in ``self.edges`` and returned.

        Args:
            file_resources (dict): maps resource names to file paths or handles.
            blocksize (int): when truthy, files are read with dask and
                resources that are not plain path strings are skipped.

        Raises:
            ValueError: if no ``Download_data*`` resource could be read
                (previously this surfaced as an ``UnboundLocalError``).
        """
        args = dict(dtype={'Category1': 'category', 'Category2': 'category',
                           'Species1': 'category', 'Species2': 'category', 'score': 'float',
                           'predict': 'category', 'weak': 'category', 'strong': 'category'})

        frames = []
        for fn, resource in file_resources.items():
            if not fn.startswith('Download_data'):
                continue
            compression = 'tar' if fn.endswith('.tar.gz') else None
            if blocksize:
                # dask is only given path strings, not already-opened resources.
                if not isinstance(resource, str):
                    continue
                frames.append(dd.read_table(resource, compression=compression, **args))
            else:
                frames.append(pd.read_table(resource, compression=compression, **args))

        if not frames:
            raise ValueError("RNAInter: no readable 'Download_data*' entry found in file_resources")

        if len(frames) == 1:
            edges = frames[0]
        elif blocksize:
            edges = dd.concat(frames)
        else:
            edges = pd.concat(frames, ignore_index=True)

        edges = filter_rows(edges, self.filters)

        self.edges = edges
        return edges

    def load_network(self, file_resources, source_col_name='Interactor1.Symbol', target_col_name='Interactor2.Symbol',
                     edge_attr='score', directed=True, filters=None, blocksize=None):
        """Build the graph from ``self.data``.

        ``filters`` is applied only when it differs from ``self.filters``,
        which ``load_dataframe`` has already applied.

        Returns:
            A ``nx.DiGraph`` when ``directed`` is true, otherwise a ``nx.Graph``.
        """
        edges = self.data
        if filters != self.filters:
            edges = filter_rows(edges, filters)

        g = nx.from_pandas_edgelist(edges, source=source_col_name, target=target_col_name,
                                    edge_attr=edge_attr,
                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return g
883
884
885
class TargetScan(Interactions, Database):
    """Loads the TargetScan database from "http://www.targetscan.org/" .

    Default path: "http://www.targetscan.org/vert_72/vert_72_data_download/" .
    Default file_resources: {
        "miR_Family_Info.txt.zip": "miR_Family_Info.txt.zip",
        "Predicted_Targets_Info.default_predictions.txt": "Predicted_Targets_Info.default_predictions.txt",
    }
    """

    def __init__(self, path="http://www.targetscan.org/vert_72/vert_72_data_download/", file_resources=None,
                 source_col_name="MiRBase ID", target_col_name="Gene Symbol",
                 edge_attr=["tissue", "positive_negative"], directed=True, relabel_nodes=None, species_id=None,
                 strip_mirna_name=False, **kwargs):
        """
        Args:
            path (str): folder or base URL containing the TargetScan downloads.
            file_resources (dict): maps resource names to file paths. When None,
                the miR family table and the default predictions under ``path`` are used.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (list): edge attribute columns.
            directed (bool): whether the network is built as a directed graph.
            relabel_nodes (dict): forwarded to the parent constructor.
            species_id: when set, both tables are restricted to rows whose
                'Species ID' equals this value.
            strip_mirna_name (bool): when true, 'MiRBase ID' values are
                lower-cased and truncated at "-3p"/"-5p".
            **kwargs: forwarded to the parent constructor.
        """
        # Both attributes are read by load_network, so set them before the parent constructor runs.
        self.strip_mirna_name = strip_mirna_name
        self.species_id = species_id
        if file_resources is None:
            file_resources = {}
            file_resources["miR_Family_Info.txt.zip"] = os.path.join(path, "miR_Family_Info.txt.zip")
            file_resources["Predicted_Targets_Info.default_predictions.txt"] = os.path.join(path,
                                                                                            "Predicted_Targets_Info.default_predictions.txt")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name,
                         directed=directed, relabel_nodes=relabel_nodes, edge_attr=edge_attr, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the miRNA-target graph from the family table and the predicted targets.

        The processed family table is kept in ``self.df``. ``filters`` and
        ``blocksize`` are not used by this implementation.

        Returns:
            A ``nx.DiGraph`` when ``directed`` is true, otherwise a ``nx.Graph``.
        """
        self.df = self.process_miR_family_info_table(file_resources, self.species_id)
        interactions_df = self.process_interactions_table(file_resources, self.df, self.species_id)
        print(self.name(), interactions_df.columns.tolist())

        mir_target_network = nx.from_pandas_edgelist(interactions_df,
                                                     source=source_col_name, target=target_col_name,
                                                     edge_attr=edge_attr,
                                                     create_using=nx.DiGraph() if directed else nx.Graph())
        return mir_target_network

    def process_miR_family_info_table(self, file_resources, species=None):
        """Read the miR family table and return its de-duplicated, column-filtered form.

        Args:
            file_resources (dict): must contain the key "miR_Family_Info.txt".
            species: when truthy, keep only rows whose 'Species ID' equals it.

        Returns:
            pd.DataFrame: the columns of interest that exist in the file, with
            'MiRBase ID' cast to str.
        """
        # NOTE(review): __init__ registers this resource as "miR_Family_Info.txt.zip";
        # presumably the base class exposes it without the archive suffix - confirm.
        miR_Family_Info_df = pd.read_table(file_resources["miR_Family_Info.txt"], delimiter='\t')

        if species:
            # Copy so the column assignments below do not act on a slice of the full table.
            miR_Family_Info_df = miR_Family_Info_df[miR_Family_Info_df['Species ID'] == species].copy()

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].str.lower()
            # regex=True is required: the pattern is a regular expression, and pandas >= 2.0
            # treats str.replace patterns as literals by default.
            miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].str.replace("-3p.*|-5p.*", "",
                                                                                            regex=True)

        miR_Family_Info_df.drop_duplicates(inplace=True)
        miR_Family_Info_df = miR_Family_Info_df.filter(items=['miR family', 'MiRBase ID', 'Seed+m8', 'Mature sequence',
                                                              'Family Conservation?', 'MiRBase Accession'],
                                                       axis="columns")
        miR_Family_Info_df['MiRBase ID'] = miR_Family_Info_df['MiRBase ID'].astype(str)
        return miR_Family_Info_df

    def process_interactions_table(self, file_resources, family_to_miR_df, species_id):
        """Join the predicted family-target table with the family-to-miRNA table.

        The two tables are outer-merged on "miR Family". This replaces calls
        to the non-existent ``DataFrame.set_genes_index`` that raised
        ``AttributeError``.

        Args:
            file_resources (dict): must contain the key
                "Predicted_Targets_Info.default_predictions.txt".
            family_to_miR_df (pd.DataFrame): table with 'miR family' and
                'MiRBase ID' columns; it is not modified.
            species_id: when truthy, keep only predictions whose 'Species ID'
                equals it.

        Returns:
            pd.DataFrame: columns "miR Family", "Gene Symbol" and "MiRBase ID".
        """
        # Load data frame from file
        family_interactions_df = pd.read_table(file_resources["Predicted_Targets_Info.default_predictions.txt"],
                                               dtype={'Species ID': 'category'},
                                               delimiter='\t', low_memory=True)

        # Select only miRNA-target pairs of certain species_id
        if species_id:
            family_interactions_df = family_interactions_df[family_interactions_df["Species ID"] == species_id]

        family_interactions_df = family_interactions_df.filter(items=["miR Family", "Gene Symbol"], axis="columns")
        family_to_miR_df = family_to_miR_df.filter(items=['miR family', 'MiRBase ID'], axis="columns")
        family_to_miR_df = family_to_miR_df.rename(columns={'miR family': 'miR Family'})

        # map miRBase ID names to miR Family
        mir_interactions_df = pd.merge(family_interactions_df, family_to_miR_df, how='outer', on="miR Family")

        # Standardize MiRBase ID to miRNA names obtained from RNA-seq hg19
        if self.strip_mirna_name:
            mir_interactions_df['MiRBase ID'] = mir_interactions_df['MiRBase ID'].str.lower()
            mir_interactions_df['MiRBase ID'] = mir_interactions_df['MiRBase ID'].str.replace("-3p.*|-5p.*", "",
                                                                                              regex=True)

        return mir_interactions_df
979
980
981
class LncReg(Interactions):
    """Loads lncRNA regulatory interactions from a LncReg ``data.xlsx`` spreadsheet.

    Default path: none (``path`` is required).
    Default file_resources: {
        "data.xlsx": "data.xlsx",
    }
    """
    def __init__(self, path, file_resources,
                 source_col_name='A_name_in_paper', target_col_name='B_name_in_paper',
                 source_index="transcript_name", target_index="gene_name",
                 edge_attr=["relationship", "mechanism", "pmid"], filters=None, directed=True, relabel_nodes=None,
                 verbose=False):
        """
        Args:
            path (str): folder containing ``data.xlsx``.
            file_resources (dict): maps resource names to file paths. When None,
                "data.xlsx" under ``path`` is used.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            source_index (str): accepted but not used by this class.
            target_index (str): accepted but not used by this class.
            edge_attr (list): edge attribute columns.
            filters: forwarded to the parent constructor.
            directed (bool): whether the network is built as a directed graph.
            relabel_nodes (dict): forwarded to the parent constructor.
            verbose (bool): forwarded to the parent constructor.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["data.xlsx"] = os.path.join(path, "data.xlsx")

        super().__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, verbose=verbose)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the graph from the "Homo sapiens" rows of ``data.xlsx``.

        For rows whose ``B_category`` is "miRNA", ``B_name_in_paper`` is
        truncated at "-3p"/"-5p" and "MIR" / "let-" are rewritten to
        "hsa-mir-" / "hsa-let-". ``filters`` and ``blocksize`` are not used.

        Returns:
            A ``nx.DiGraph`` when ``directed`` is true, otherwise a ``nx.Graph``
            (previously ``directed`` was ignored and a DiGraph was always built).
        """
        df = pd.read_excel(self.file_resources["data.xlsx"])
        print(self.name(), df.columns.tolist())

        # Copy so the assignment below does not act on a slice of the full table.
        df = df[df["species"] == "Homo sapiens"].copy()

        is_mirna = df["B_category"] == "miRNA"
        # The first pattern is a regular expression, so regex=True is required on pandas >= 2.0;
        # the other two are plain substrings.
        df.loc[is_mirna, "B_name_in_paper"] = (
            df.loc[is_mirna, "B_name_in_paper"]
            .str.replace("-3p.*|-5p.*", "", regex=True)
            .str.replace("MIR", "hsa-mir-", regex=False)
            .str.replace("let-", "hsa-let-", regex=False)
        )

        LncReg_lncRNA_RNA_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                            edge_attr=edge_attr,
                                                            create_using=nx.DiGraph() if directed else nx.Graph())
        return LncReg_lncRNA_RNA_network
1021
1022
1023
class lncRInter(Interactions):
    """Loads lncRNA interactions from a lncRInter ``human_interactions.txt`` table.

    Default path: none (``path`` is required).
    Default file_resources: {
        "human_interactions.txt": "human_interactions.txt",
    }
    """

    def __init__(self, path, file_resources=None, source_col_name="lncrna",
                 target_col_name='Interacting partner',
                 edge_attr=None, filters=None,
                 directed=True, relabel_nodes=None, **kwargs):
        """
        Args:
            path (str): folder containing ``human_interactions.txt``.
            file_resources (dict): maps resource names to file paths. When None,
                "human_interactions.txt" under ``path`` is used.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (list): edge attribute columns. Defaults to
                ["Interaction Class", "Interaction Mode", "Tissue", "Phenotype"].
            filters: row filters forwarded to the parent constructor.
            directed (bool): whether the network is built as a directed graph.
            relabel_nodes (dict): forwarded to the parent constructor.
            **kwargs: forwarded to the parent constructor.
        """
        if edge_attr is None:
            edge_attr = ["Interaction Class", "Interaction Mode", "Tissue", "Phenotype"]
        if file_resources is None:
            file_resources = {}
            file_resources["human_interactions.txt"] = os.path.join(path, "human_interactions.txt")

        super().__init__(path, file_resources, source_col_name=source_col_name, target_col_name=target_col_name,
                         edge_attr=edge_attr, filters=filters, directed=directed, relabel_nodes=relabel_nodes, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the graph from ``human_interactions.txt`` after normalising miRNA partner names.

        Partner names containing "MIR" are lower-cased, "mirlet" / "mir" are
        rewritten to "hsa-let-" / "hsa-mir-", and names matching
        ``(mir|let)-<digits><letters><digits>`` get a dash inserted before
        their last character. The cleaning avoids chained assignment and
        treats missing names as non-matching.

        Returns:
            A ``nx.DiGraph`` when ``directed`` is true, otherwise a ``nx.Graph``.
        """
        lncRInter_df = pd.read_table(file_resources["human_interactions.txt"])
        print(self.name(), lncRInter_df.columns.tolist())

        # Copy so the column assignment below never acts on a view of the unfiltered table.
        lncRInter_df = filter_rows(lncRInter_df, filters).copy()

        # Data cleaning
        partner = lncRInter_df["Interacting partner"]
        is_mirna = partner.str.contains("MIR", na=False)
        partner = partner.where(~is_mirna, partner.str.lower())
        # "mirlet" must be rewritten before "mir", otherwise it would become "hsa-mir-let".
        partner = partner.str.replace("mirlet", "hsa-let-", regex=False)
        partner = partner.str.replace("mir", "hsa-mir-", regex=False)

        # (?:mir|let) is an alternation; the former "[mir|let]" was a character class that
        # matched any single one of those letters.
        needs_dash = partner.str.contains(r"(?:mir|let)-\d+[a-z]+\d+", na=False)
        partner = partner.where(~needs_dash, partner.str[:-1] + "-" + partner.str[-1])

        lncRInter_df["Interacting partner"] = partner

        lncRInter_network = nx.from_pandas_edgelist(lncRInter_df, source=source_col_name,
                                                    target=target_col_name,
                                                    edge_attr=edge_attr,
                                                    create_using=nx.DiGraph() if directed else nx.Graph())
        return lncRInter_network
1070
1071
1072
class LncRNA2Target(Interactions):
    """Loads lncRNA-target interactions from the lncRNA2Target database.

    Default path: "http://123.59.132.21/lncrna2target/data/" .
    Default file_resources: {
        "lncRNA_target_from_high_throughput_experiments.txt.rar": "lncrna_target.rar",
        "lncRNA_target_from_low_throughput_experiments.xlsx": "lncRNA_target_from_low_throughput_experiments.xlsx",
    }
    """

    def __init__(self, path="http://123.59.132.21/lncrna2target/data/", file_resources=None, edge_attr=None,
                 filters=None,
                 directed=True, relabel_nodes=None, version="high_throughput", **kwargs):
        """
        Args:
            path (str): folder or base URL containing the data files.
            file_resources (dict): maps resource names to file paths. When None,
                both the high- and low-throughput files under ``path`` are used.
            edge_attr (list): edge attribute columns.
            filters (): default None, example {"species_id": 9606, "Species": "Homo sapiens"}.
                The species column in high_throughput is formatted in int (e.g. 9606) and in
                low_throughput is in str (e.g. "Homo sapiens").
            directed (bool): whether the network is built as a directed graph.
            relabel_nodes (dict): forwarded to the parent constructor.
            version (str): one of ["high_throughput", "low_throughput"].
                The high_throughput version of lncRNA2Target database is v2.0 and low_throughput is v1.0, according to the database's website.
            **kwargs: forwarded to the parent constructor.
        """
        # Read by load_network, so it must be set before the parent constructor runs.
        self.version = version
        if file_resources is None:
            file_resources = {}
            file_resources["lncRNA_target_from_high_throughput_experiments.txt.rar"] = \
                os.path.join(path, "lncrna_target.rar")
            file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"] = \
                os.path.join(path, "lncRNA_target_from_low_throughput_experiments.xlsx")

        if self.version == "high_throughput":
            source_col_name, target_col_name = "lncrna_symbol", "gene_symbol"
        elif self.version == "low_throughput":
            source_col_name, target_col_name = "GENCODE_gene_name", "Target_official_symbol"
        else:
            # As before, the parent constructor is not run for an unknown version;
            # the problem is now reported instead of passing silently.
            logger.warning("LncRNA2Target version argument must be one of 'high_throughput' or 'low_throughput'")
            return

        super().__init__(path, file_resources, source_col_name=source_col_name, target_col_name=target_col_name,
                         edge_attr=edge_attr, filters=filters, directed=directed, relabel_nodes=relabel_nodes,
                         **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Dispatch to the loader matching ``self.version``.

        ``filters`` is now forwarded to the loader; previously it was dropped,
        so no row filtering took place.

        Returns:
            The graph built by the version-specific loader, or None (after
            logging a warning) when ``self.version`` is not recognised.
        """
        network = None
        if self.version == "high_throughput":
            network = self.load_network_high_throughput(file_resources, source_col_name, target_col_name, edge_attr,
                                                        directed, filters=filters)
        elif self.version == "low_throughput":
            network = self.load_network_low_throughput(file_resources, source_col_name, target_col_name, edge_attr,
                                                       directed, filters=filters)
        else:
            logger.warning("LncRNA2Target version argument must be one of 'high_throughput' or 'low_throughput'")

        return network

    def load_network_high_throughput(self, file_resources, source_col_name="lncrna_symbol",
                                     target_col_name="gene_symbol",
                                     edge_attr=None, directed=True, filters=None):
        """Build the graph from the high-throughput table.

        ``lncrna_symbol`` is upper-cased with "LINC" removed, and
        ``gene_symbol`` is upper-cased. The cleaned table is stored in
        ``self.data`` and ``self.edges``.

        Returns:
            A ``nx.DiGraph`` when ``directed`` is true, otherwise a ``nx.Graph``.
        """
        # NOTE(review): __init__ registers this resource with a ".rar" suffix; presumably
        # the base class exposes it without the archive suffix - confirm.
        edges = pd.read_table(file_resources["lncRNA_target_from_high_throughput_experiments.txt"], sep="\t")
        edges = filter_rows(edges, filters)

        edges["lncrna_symbol"] = edges["lncrna_symbol"].str.upper()
        edges["lncrna_symbol"] = edges["lncrna_symbol"].str.replace("LINC", "", regex=False)
        edges["gene_symbol"] = edges["gene_symbol"].str.upper()

        self.data = self.edges = edges
        lncrna2target_high_throughput_network = nx.from_pandas_edgelist(edges,
                                                                        source=source_col_name,
                                                                        target=target_col_name,
                                                                        edge_attr=edge_attr,
                                                                        create_using=nx.DiGraph() if directed else nx.Graph())
        return lncrna2target_high_throughput_network

    def load_network_low_throughput(self, file_resources, source_col_name="GENCODE_gene_name",
                                    target_col_name="Target_official_symbol",
                                    edge_attr=None, directed=True, filters=None):
        """Build the graph from the low-throughput spreadsheet.

        In ``Target_official_symbol``, "mir" (any case) is rewritten to
        "hsa-mir-" and "--" collapsed to "-"; symbols containing "mir" are
        then lower-cased and all others upper-cased. That case normalisation
        was previously computed and thrown away. ``GENCODE_gene_name`` is
        upper-cased. The cleaned table is stored in ``self.data`` and
        ``self.edges``.

        Returns:
            A ``nx.DiGraph`` when ``directed`` is true, otherwise a ``nx.Graph``.
        """
        edges = pd.read_excel(file_resources["lncRNA_target_from_low_throughput_experiments.xlsx"])
        edges = filter_rows(edges, filters)

        edges["Target_official_symbol"] = edges["Target_official_symbol"].str.replace("(?i)(mir)", "hsa-mir-",
                                                                                      regex=True)
        edges["Target_official_symbol"] = edges["Target_official_symbol"].str.replace("--", "-", regex=False)
        # Assign the result back, and leave non-string values (e.g. NaN) untouched.
        edges["Target_official_symbol"] = edges["Target_official_symbol"].apply(
            lambda x: x if not isinstance(x, str) else (x.lower() if "mir" in x.lower() else x.upper()))
        edges["GENCODE_gene_name"] = edges["GENCODE_gene_name"].str.upper()

        self.data = self.edges = edges
        lncrna2target_low_throughput_network = nx.from_pandas_edgelist(edges,
                                                                       source=source_col_name,
                                                                       target=target_col_name,
                                                                       edge_attr=edge_attr,
                                                                       create_using=nx.DiGraph() if directed else nx.Graph())
        return lncrna2target_low_throughput_network
1163
1164
1165
class lncRNome(Interactions, Database):
    """Loads the lncRNome database from local ``miRNA_binding_sites.txt`` and ``general_information.txt`` files.

    Default path: none (``path`` is required).
    Default file_resources: {
        "miRNA_binding_sites.txt": "miRNA_binding_sites.txt",
        "general_information.txt": "general_information.txt",
    }
    """

    def __init__(self, path, file_resources, source_col_name='Gene Name', target_col_name='Binding miRNAs',
                 edge_attr=["miRNA Interaction Site", "Transcript ID"], directed=True, relabel_nodes=None,
                 **kwargs):
        """
        Args:
            path (str): folder containing the lncRNome files.
            file_resources (dict): maps resource names to file paths. When None,
                both files under ``path`` are used.
            source_col_name (str): column used as the edge source.
            target_col_name (str): column used as the edge target.
            edge_attr (list): edge attribute columns.
            directed (bool): whether the network is built as a directed graph.
            relabel_nodes (dict): forwarded to the parent constructor.
            **kwargs: forwarded to the parent constructor.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["miRNA_binding_sites.txt"] = os.path.join(path, "miRNA_binding_sites.txt")
            file_resources["general_information.txt"] = os.path.join(path, "general_information.txt")

        super().__init__(path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name,
                         directed=directed, relabel_nodes=relabel_nodes, edge_attr=edge_attr, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the lncRNA-miRNA graph from ``miRNA_binding_sites.txt``.

        'Binding miRNAs' values are lower-cased and truncated at "-3p"/"-5p".
        ``filters`` and ``blocksize`` are not used by this implementation.

        Returns:
            A ``nx.DiGraph`` when ``directed`` is true, otherwise a ``nx.Graph``
            (previously ``directed`` was ignored and a DiGraph was always built).
        """
        df = pd.read_table(self.file_resources["miRNA_binding_sites.txt"], header=0)
        print(self.name(), df.columns.tolist())

        df['Binding miRNAs'] = df['Binding miRNAs'].str.lower()
        df['Binding miRNAs'] = df['Binding miRNAs'].str.replace("-3p.*|-5p.*", "", regex=True)

        lncRNome_miRNA_binding_sites_network = nx.from_pandas_edgelist(
            df, source=source_col_name,
            target=target_col_name,
            edge_attr=edge_attr,
            create_using=nx.DiGraph() if directed else nx.Graph())

        return lncRNome_miRNA_binding_sites_network

    def load_dataframe(self, file_resources, blocksize=None):
        """Read selected columns of ``general_information.txt`` from ``self.file_resources``.

        The ``file_resources`` and ``blocksize`` arguments are not used by
        this implementation.
        """
        return pd.read_table(self.file_resources["general_information.txt"], header=0,
                             usecols=["Gene Name", "Transcript Name", "Transcript Type", "Location", "Strand"])
1206
1207
1208
class NPInter(Interactions):
    """Loads the NPInter database from http://bigdata.ibp.ac.cn/npinter4/ .

    Default path: "http://bigdata.ibp.ac.cn/npinter4/download/" .
    Default file_resources: {
        "interaction_NPInterv4.expr.txt.gz": "file/interaction_NPInterv4.expr.txt.gz",
    }
    """
    def __init__(self, path="http://bigdata.ibp.ac.cn/npinter4/download/", file_resources=None,
                 source_col_name='ncName', target_col_name='tarName',
                 edge_attr=["tarType", "tissueOrCell", "tag", 'class', "level"],
                 filters=None,
                 directed=True, relabel_nodes=None, verbose=False):
        """Fill in the default file resource and delegate to the parent constructor.

        Args:
            path: Base location of the data files. It is joined with the
                default relative file path when `file_resources` is None.
            file_resources: Mapping of file name to file path. When None, a
                single "interaction_NPInterv4.expr.txt.gz" entry is created.
            source_col_name: Column holding the source node of each edge.
            target_col_name: Column holding the target node of each edge.
            edge_attr: Column names stored as edge attributes.
            filters: Row filters, applied in `load_network` through `filter_rows`.
            directed: Whether `load_network` builds a directed graph.
            relabel_nodes: Forwarded to the parent constructor.
            verbose: Forwarded to the parent constructor.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["interaction_NPInterv4.expr.txt.gz"] = \
                os.path.join(path, "file/interaction_NPInterv4.expr.txt.gz")

        super().__init__(path=path, file_resources=file_resources, source_col_name=source_col_name,
                         target_col_name=target_col_name, edge_attr=edge_attr, filters=filters, directed=directed,
                         relabel_nodes=relabel_nodes, verbose=verbose)

    def load_dataframe(self, file_resources: Dict[str, str], blocksize: Optional[int] = None) -> pd.DataFrame:
        """Read the interaction table and normalise the RNA and target names.

        Args:
            file_resources: Mapping that must contain the key
                "interaction_NPInterv4.expr.txt".
            blocksize: Not used by this loader.

        Returns:
            The table with "-" parsed as missing, "ncName" upper-cased and
            cleaned of naming variants, and "tarName" upper-cased.
        """
        # NOTE(review): the constructor registers this resource under a key ending in ".gz"
        # while the lookup here omits it; presumably the base class decompresses the file
        # and re-keys the entry. Confirm against the base class.
        df = pd.read_table(file_resources["interaction_NPInterv4.expr.txt"], header=0, na_values=["-"])
        print(self.name(), df.columns.tolist())

        nc_names = df["ncName"].str.upper()
        # Remove only a leading "LNCRNA-" prefix. str.strip("LNCRNA-") would treat the
        # argument as a character set and also eat legitimate leading or trailing letters
        # (e.g. "NEAT1" -> "EAT1", "HOTAIR" -> "HOTAI").
        nc_names = nc_names.str.replace(r"^LNCRNA-", "", regex=True)
        nc_names = nc_names.str.replace("MALAT-1", "MALAT1", regex=False)
        nc_names = nc_names.str.replace(r"^MIR-", "hsa-mir-", regex=True)
        nc_names = nc_names.str.replace(r"^MICRORNA-", "hsa-mir-", regex=True)
        df["ncName"] = nc_names

        df["tarName"] = df["tarName"].str.upper()

        return df

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the interaction network from the already loaded table.

        Args:
            file_resources: Not used here; the rows come from `self.data`.
            source_col_name: Column holding the source node of each edge.
            target_col_name: Column holding the target node of each edge.
            edge_attr: Column name(s) stored as edge attributes.
            directed: If truthy a `nx.DiGraph` is built, otherwise a `nx.Graph`.
            filters: Passed to `filter_rows` to select rows before building edges.
            blocksize: Not used by this loader.

        Returns:
            The networkx graph built from the filtered edge list.
        """
        df = filter_rows(self.data, filters)

        graph_type = nx.DiGraph() if directed else nx.Graph()
        network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                          edge_attr=edge_attr, create_using=graph_type)
        return network
1254
1255
1256
class StarBase(Interactions):
    """Loads lncRNA-RNA interactions from a starBase 3.0 CSV file.

    Default path: none, `path` must be given by the caller.
    Default file_resources: {
        "starbase_3.0_lncrna_rna_interactions.csv": "starbase_3.0_lncrna_rna_interactions.csv",
    }
    """

    def __init__(self, path, file_resources, source_col_name="geneName", target_col_name="pairGeneName",
                 min_interactionNum=1, min_expNum=1,
                 edge_attr=None, directed=True, relabel_nodes=None, **kwargs):
        """Store the row thresholds and delegate to the parent constructor.

        Args:
            path: Folder containing the data file. It is joined with the
                default file name when `file_resources` is None.
            file_resources: Mapping of file name to file path. When None, a
                single "starbase_3.0_lncrna_rna_interactions.csv" entry is created.
            source_col_name: Column holding the source node of each edge.
            target_col_name: Column holding the target node of each edge.
            min_interactionNum: Rows with a smaller "interactionNum" are dropped.
            min_expNum: Rows with a smaller "expNum" are dropped.
            edge_attr: Forwarded to the parent constructor. `load_network`
                itself always stores "interactionNum" on the edges.
            directed: Whether `load_network` builds a directed graph.
            relabel_nodes: Forwarded to the parent constructor.
            **kwargs: Any other keyword arguments, forwarded unchanged.
        """
        if file_resources is None:
            file_resources = {}
            file_resources["starbase_3.0_lncrna_rna_interactions.csv"] = \
                os.path.join(path, "starbase_3.0_lncrna_rna_interactions.csv")
        # Set the thresholds before calling the parent constructor.
        # NOTE(review): presumably the parent triggers load_network(), which reads them; confirm.
        self.min_interactionNum = min_interactionNum
        self.min_expNum = min_expNum
        super().__init__(path, file_resources, source_col_name=source_col_name, target_col_name=target_col_name,
                         directed=directed, relabel_nodes=relabel_nodes, edge_attr=edge_attr, **kwargs)

    def load_network(self, file_resources, source_col_name, target_col_name, edge_attr, directed, filters,
                     blocksize=None):
        """Build the RNA-RNA network from the starBase CSV file.

        For rows whose "pairGeneType" is "miRNA", the partner name is
        lower-cased and everything from the first "-3p" or "-5p" onwards is
        removed. Rows below the `min_interactionNum` or `min_expNum`
        thresholds are dropped. The graph is also kept on
        `self.starBase_RNA_RNA_network`.

        Args:
            file_resources: Not used here; the path comes from `self.file_resources`.
            source_col_name: Column holding the source node of each edge.
            target_col_name: Column holding the target node of each edge.
            edge_attr: Not used here; "interactionNum" is always the edge attribute.
            directed: If truthy a `nx.DiGraph` is built, otherwise a `nx.Graph`.
            filters: Not used by this loader.
            blocksize: Not used by this loader.

        Returns:
            The networkx graph built from the filtered edge list.
        """
        df = pd.read_csv(self.file_resources["starbase_3.0_lncrna_rna_interactions.csv"], header=0)

        is_mirna = df["pairGeneType"] == "miRNA"
        mirna_names = df.loc[is_mirna, "pairGeneName"].str.lower()
        # regex=True is required: without it pandas 2.x matches the pattern literally
        # and the arm suffixes are never removed.
        df.loc[is_mirna, "pairGeneName"] = mirna_names.str.replace("-3p.*|-5p.*", "", regex=True)

        df = df[df["interactionNum"] >= self.min_interactionNum]
        df = df[df["expNum"] >= self.min_expNum]

        # Honour `directed` instead of always returning a DiGraph.
        graph_type = nx.DiGraph() if directed else nx.Graph()
        self.starBase_RNA_RNA_network = nx.from_pandas_edgelist(df, source=source_col_name, target=target_col_name,
                                                                edge_attr=["interactionNum"],
                                                                create_using=graph_type)
        return self.starBase_RNA_RNA_network