|
a |
|
b/deepdta-toy/testdatahelper.py |
|
|
1 |
import csv, pickle, json, os, math, sys |
|
|
2 |
from collections import OrderedDict |
|
|
3 |
from bioservices import UniProt |
|
|
4 |
import numpy as np |
|
|
5 |
import pickle |
|
|
6 |
|
|
|
7 |
|
|
|
8 |
# File names looked up inside the data folder handed to the functions below.
# They are appended to the folder path by plain string concatenation.
PROT_FILE = "proteins.fasta"  # FASTA-style protein file parsed by read_proteins()
CHEM_FILE = "ligands.tab"     # tab-separated ligand file parsed by read_chemicals()
AFF_FILE = "Y.tab"            # optional affinity matrix loaded with np.loadtxt
|
|
11 |
|
|
|
12 |
def prepare_new_data(fpath, test=True):
    """Turn a raw data folder into the pickled matrix and fold file.

    Steps performed:

    1. Parse the protein and ligand files via ``read_proteins`` and
       ``read_chemicals``.
    2. Load the affinity matrix from ``fpath + AFF_FILE`` when that file
       exists; otherwise use an all-zero matrix of shape
       ``(number of ligands, number of proteins)``.
    3. Pickle the matrix to ``fpath + "Y"``.
    4. Write a randomly ordered list ``0 .. k-1`` (``k`` = number of non-NaN
       matrix entries) as JSON to ``folds/test_fold.txt`` or
       ``folds/train_fold.txt`` below ``fpath``.

    Args:
        fpath: Data folder prefix. File names are appended with plain string
            concatenation, so it has to end with a path separator.
        test: When true the fold is written to ``test_fold.txt``, otherwise
            to ``train_fold.txt``.

    Returns:
        None. All results are written to disk.
    """
    prots = read_proteins(fpath)
    chems = read_chemicals(fpath)

    aff_path = fpath + AFF_FILE
    if os.path.exists(aff_path):
        # ndmin=2 keeps a single-row / single-column file two-dimensional so
        # the (row, column) unpacking of np.where below cannot fail.
        Y = np.loadtxt(aff_path, ndmin=2)
    else:
        # No affinity file: fall back to an all-zero placeholder matrix.
        Y = np.zeros((len(chems), len(prots)))

    with open(fpath + "Y", "wb") as handle:
        pickle.dump(Y, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Only the number of labelled (non-NaN) entries is needed from here on.
    label_row_inds, _ = np.where(~np.isnan(Y))

    # Uniform shuffle of the entry positions. The previous
    # ``sorted(..., key=os.urandom)`` drew ``i`` random bytes for index ``i``
    # (quadratic work) and always placed index 0 first because its key was
    # the empty byte string.
    indic = np.random.permutation(len(label_row_inds)).tolist()

    folds_dir = fpath + "folds/"
    if not os.path.exists(folds_dir):
        os.makedirs(folds_dir)

    fold_name = "test_fold.txt" if test else "train_fold.txt"
    with open(folds_dir + fold_name, "w") as handle:
        json.dump(indic, handle)
|
|
35 |
|
|
|
36 |
|
|
|
37 |
def read_chemicals(datafolder):
    """Load the ligand table and mirror it to ``ligands.txt`` as JSON.

    The first line of ``datafolder + CHEM_FILE`` is skipped as a header.
    Every following non-blank line is split on tabs: column 0 is the ligand
    identifier and column 1 (whitespace-stripped) is stored as its SMILES
    string. A later row with the same identifier overrides an earlier one.

    Args:
        datafolder: Data folder prefix. File names are appended with plain
            string concatenation, so it has to end with a path separator.

    Returns:
        dict mapping ligand identifier to SMILES string.

    Raises:
        IndexError: If a non-blank data row has no second tab-separated
            column.
    """
    chemicals = {}
    counter = 0
    with open(datafolder + CHEM_FILE) as handle:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(handle, None)
        for row in handle:
            if not row.strip():
                # Blank lines (for instance at the end of the file) carry
                # no record.
                continue
            fields = row.split('\t')
            chemicals[fields[0]] = fields[1].strip()
            counter += 1

    # Counts rows read, which can exceed len(chemicals) when ids repeat.
    print("%d number(s) of chemical(s)" % counter)

    with open(datafolder + 'ligands.txt', 'w') as out:
        json.dump(chemicals, out)

    return chemicals
|
|
53 |
|
|
|
54 |
|
|
|
55 |
def read_proteins(datafolder):
    """Parse the FASTA-style protein file and mirror it to ``proteins.txt``.

    Every line of ``datafolder + PROT_FILE`` containing ``">"`` starts a new
    record. The record's sequence is the concatenation of all lines up to
    the next such line (or the end of the file) with newlines removed. A
    later record with the same identifier overrides an earlier one.

    Args:
        datafolder: Data folder prefix. File names are appended with plain
            string concatenation, so it has to end with a path separator.

    Returns:
        dict mapping protein identifier to sequence string.
    """
    with open(datafolder + PROT_FILE) as handle:
        lines = handle.readlines()

    header_rows = [i for i, line in enumerate(lines) if ">" in line]

    # End boundary of each record: the next header, or one past the final
    # line for the last record. Using the index of the final line itself (as
    # before) silently dropped the last sequence line of the file and failed
    # on an empty file.
    stops = header_rows[1:] + [len(lines)]

    proteins = {}
    counter = 0
    for start, stop in zip(header_rows, stops):
        info = lines[start].split()
        # Characters 4..9 of the first header token, e.g. "P12345" for a
        # header such as ">sp|P12345|...".
        # NOTE(review): identifiers longer than six characters would be
        # truncated here - confirm the expected header format.
        pid = info[0][4:10]
        proteins[pid] = "".join(lines[start + 1:stop]).replace("\n", "")
        counter += 1

    # Counts records read, which can exceed len(proteins) when ids repeat.
    print("%d number(s) of protein(s)" % counter)

    with open(datafolder + 'proteins.txt', 'w') as out:
        json.dump(proteins, out)

    return proteins
|
|
85 |
|