a b/deepdta-toy/testdatahelper.py
1
import csv, pickle, json, os, math, sys
2
from collections import OrderedDict
3
from bioservices import UniProt
4
import numpy as np
5
import pickle
6
7
8
# Names of the input files expected inside the data folder. They are appended
# directly to the folder path by the functions in this module.
PROT_FILE = "proteins.fasta"  # FASTA file parsed by read_proteins()
CHEM_FILE = "ligands.tab"     # tab-separated file (with header) parsed by read_chemicals()
AFF_FILE = "Y.tab"            # optional matrix loaded with np.loadtxt() in prepare_new_data()
11
12
def prepare_new_data(fpath, test=True):
    """Build the pickled label matrix and a shuffled fold file for a data folder.

    Reads the proteins and chemicals found under ``fpath``, loads the matrix
    stored in ``AFF_FILE`` (or falls back to an all-zero matrix of shape
    ``(len(chems), len(prots))`` when that file is missing), pickles it to
    ``fpath + "Y"`` and writes a randomly ordered list of label indices to
    ``fpath + "folds/test_fold.txt"`` or ``fpath + "folds/train_fold.txt"``.

    Args:
        fpath: Data folder path. File names are appended by plain string
            concatenation, so it must end with a path separator.
        test: When true, write ``test_fold.txt``; otherwise ``train_fold.txt``.
    """
    # Local import: the module's import block does not provide ``random``.
    import random

    prots = read_proteins(fpath)
    chems = read_chemicals(fpath)

    aff_path = fpath + AFF_FILE
    if os.path.exists(aff_path):
        Y = np.loadtxt(aff_path)
    else:
        Y = np.zeros((len(chems), len(prots)))

    with open(fpath + "Y", "wb") as handle:
        pickle.dump(Y, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Only the number of non-NaN entries is needed; counting them (instead of
    # unpacking np.where into row/column arrays) also works when Y is not 2-D.
    n_labels = int(np.count_nonzero(~np.isnan(Y)))

    # Unbiased shuffle backed by OS entropy. The previous
    # ``sorted(..., key=os.urandom)`` used keys of length k for index k, which
    # always placed index 0 first and generated O(n^2) random bytes.
    indic = list(range(n_labels))
    random.SystemRandom().shuffle(indic)

    folds_dir = fpath + "folds/"
    os.makedirs(folds_dir, exist_ok=True)

    fold_name = "test_fold.txt" if test else "train_fold.txt"
    with open(folds_dir + fold_name, "w") as handle:
        json.dump(indic, handle)
35
36
37
def read_chemicals(datafolder, filename=None):
    """Read a tab-separated ligand file and return ``{chem_id: smiles}``.

    The first line of the file is treated as a header and skipped. For every
    following non-blank row, the first tab-separated field is the key and the
    second field (stripped of surrounding whitespace) is the value. The
    resulting mapping is also written as JSON to ``datafolder + 'ligands.txt'``.

    Args:
        datafolder: Folder path; file names are appended by plain string
            concatenation, so it must end with a path separator.
        filename: Name of the ligand file inside ``datafolder``. Defaults to
            the module-level ``CHEM_FILE``.

    Returns:
        dict mapping chemical id to SMILES string.

    Raises:
        IndexError: if a non-blank row has no tab-separated second field.
    """
    filepath = datafolder + (CHEM_FILE if filename is None else filename)
    chemicals = {}
    counter = 0

    with open(filepath) as handle:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(handle, None)
        for row in handle:
            if not row.strip():
                # Blank (e.g. trailing) lines carry no record.
                continue
            fields = row.split('\t')
            chemicals[fields[0]] = fields[1].strip()
            counter += 1

    print("%d number(s) of chemical(s)" % counter)
    with open(datafolder + 'ligands.txt', 'w') as out:
        json.dump(chemicals, out)

    return chemicals
53
54
55
def read_proteins(datafolder, filename=None):
    """Parse a FASTA file and return ``{protein_id: sequence}``.

    Every line containing ``>`` starts a new record; the lines up to the next
    such line (or the end of the file) are concatenated, with newlines
    removed, to form the sequence. The resulting mapping is also written as
    JSON to ``datafolder + 'proteins.txt'``.

    The protein id is taken from the first whitespace-separated token of the
    header line. For UniProt-style headers (``>sp|ACC|NAME`` or
    ``>tr|ACC|NAME``) it is the full accession between the first two pipes;
    for any other header it is characters 4..9 of that token, as before.

    Args:
        datafolder: Folder path; file names are appended by plain string
            concatenation, so it must end with a path separator.
        filename: Name of the FASTA file inside ``datafolder``. Defaults to
            the module-level ``PROT_FILE``.

    Returns:
        dict mapping protein id to its sequence string.
    """
    filepath = datafolder + (PROT_FILE if filename is None else filename)
    with open(filepath) as handle:
        lines = handle.readlines()

    header_idx = [i for i, line in enumerate(lines) if ">" in line]
    # End sentinel one past the last line, so the final record keeps its last
    # sequence line; this also makes an empty file yield an empty result.
    header_idx.append(len(lines))

    proteins = {}
    counter = 0
    for start, end in zip(header_idx, header_idx[1:]):
        tag = lines[start].split()[0]
        if tag.startswith((">sp|", ">tr|")) and tag.count("|") >= 2:
            # Take the whole accession so 10-character accessions and
            # isoform suffixes are not truncated by a fixed-width slice.
            pid = tag.split("|")[1]
        else:
            pid = tag[4:10]

        seq = "".join(lines[start + 1:end]).replace("\n", "")
        proteins[pid] = seq
        counter += 1

    print("%d number(s) of protein(s)" % counter)
    with open(datafolder + 'proteins.txt', 'w') as out:
        json.dump(proteins, out)

    return proteins
85