import csv, pickle, json, os, math, sys
from collections import OrderedDict
from bioservices import UniProt
import numpy as np
import pickle
# File names expected inside the dataset folder. Each is appended directly to
# the folder path string passed to the functions below (plain concatenation).
# FASTA file parsed by read_proteins().
PROT_FILE = "proteins.fasta"
# Tab-separated file (header row, then id<TAB>SMILES) parsed by read_chemicals().
CHEM_FILE = "ligands.tab"
# Optional matrix loaded with np.loadtxt() in prepare_new_data() when it exists.
AFF_FILE = "Y.tab"
def prepare_new_data(fpath, test=True):
    """Build the derived files for a new dataset folder.

    Steps performed:
      1. Parse proteins and chemicals via ``read_proteins`` / ``read_chemicals``
         (each also writes its own JSON dump into the folder).
      2. Load the matrix from ``fpath + AFF_FILE`` if that file exists,
         otherwise use an all-zero matrix of shape
         ``(len(chems), len(prots))``; pickle it to ``fpath + "Y"``.
      3. Write a randomly ordered list of the integers
         ``0 .. (number of non-NaN entries in Y) - 1`` as JSON to
         ``folds/test_fold.txt`` (``test=True``) or ``folds/train_fold.txt``.

    Args:
        fpath: Dataset folder path. It is used as a plain string prefix, so
            it must end with a path separator (e.g. ``"data/new/"``).
        test: Selects which fold file is written (see step 3).

    Returns:
        None. All results are written to disk.
    """
    prots = read_proteins(fpath)
    chems = read_chemicals(fpath)

    aff_path = fpath + AFF_FILE
    if os.path.exists(aff_path):
        # ndmin=2 keeps a single-row / single-column file two-dimensional;
        # without it loadtxt squeezes the result to 1-D.
        Y = np.loadtxt(aff_path, ndmin=2)
    else:
        Y = np.zeros((len(chems), len(prots)))

    with open(fpath + "Y", "wb") as out:
        pickle.dump(Y, out, protocol=pickle.HIGHEST_PROTOCOL)

    # Only the number of labelled (non-NaN) cells is needed for the fold file.
    n_labels = int(np.count_nonzero(~np.isnan(Y)))

    # Uniform shuffle of 0..n_labels-1. The previous sort keyed on
    # os.urandom(i) generated i random bytes per index (quadratic overall)
    # and always placed index 0 first. Uses NumPy's global RNG, so the order
    # is reproducible after np.random.seed(). tolist() yields plain ints,
    # which json can serialise.
    indic = np.random.permutation(n_labels).tolist()

    fold_dir = fpath + "folds/"
    if not os.path.exists(fold_dir):
        os.makedirs(fold_dir)

    fold_name = "test_fold.txt" if test else "train_fold.txt"
    with open(fold_dir + fold_name, "w") as out:
        json.dump(indic, out)
def read_chemicals(datafolder):
    """Read the tab-separated ligand file into a ``{chem_id: smiles}`` dict.

    The file ``datafolder + CHEM_FILE`` is expected to have one header line
    followed by rows whose first tab-separated field is the chemical id and
    whose second field is the SMILES string (surrounding whitespace is
    stripped from the SMILES). Blank lines are skipped. A later row with the
    same id overwrites an earlier one.

    Side effects: prints the number of rows read and writes the dict as JSON
    to ``datafolder + 'ligands.txt'``.

    Args:
        datafolder: Folder path used as a plain string prefix; it must end
            with a path separator.

    Returns:
        dict mapping chemical id to SMILES string.

    Raises:
        IndexError: if a non-blank row has no tab-separated second field.
    """
    filepath = datafolder + CHEM_FILE
    chemicals = {}
    counter = 0
    with open(filepath) as handle:
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(handle, None)
        for row in handle:
            if not row.strip():
                # Tolerate blank lines (commonly a trailing one).
                continue
            fields = row.split('\t')
            chemicals[fields[0]] = fields[1].strip()
            counter += 1
    print("%d number(s) of chemical(s)" % counter)
    with open(datafolder + 'ligands.txt', 'w') as out:
        json.dump(chemicals, out)
    return chemicals
def read_proteins(datafolder):
    """Parse the FASTA file into a ``{protein_id: sequence}`` dict.

    Every line of ``datafolder + PROT_FILE`` that contains ``">"`` starts a
    new record. The record's id is characters 4..9 of the first
    whitespace-delimited token on that line, and its sequence is all
    following lines up to the next header (or end of file) joined together
    with newlines removed. A later record with the same id overwrites an
    earlier one.

    NOTE(review): the fixed ``[4:10]`` slice presumably targets headers of
    the form ``>sp|P12345|...`` with 6-character accessions - confirm
    against the data before feeding other header layouts.

    Side effects: prints the number of records read and writes the dict as
    JSON to ``datafolder + 'proteins.txt'``.

    Args:
        datafolder: Folder path used as a plain string prefix; it must end
            with a path separator.

    Returns:
        dict mapping protein id to sequence string.
    """
    filename = datafolder + PROT_FILE
    with open(filename) as f:
        fa = f.readlines()

    # Line numbers of all header lines.
    idindex = [i for i, line in enumerate(fa) if ">" in line]
    # End sentinel is one past the last line. Using the last line's own
    # index would drop the final sequence line of the last record, and
    # would be undefined for an empty file.
    idindex.append(len(fa))

    proteins = {}
    counter = 0
    # Walk consecutive (header, next-header-or-end) boundaries.
    for idx, idx1 in zip(idindex, idindex[1:]):
        info = fa[idx].split()
        pid = info[0][4:10]
        proteins[pid] = "".join(fa[idx + 1:idx1]).replace("\n", "")
        counter += 1
    print("%d number(s) of protein(s)" % counter)
    with open(datafolder + 'proteins.txt', 'w') as out:
        json.dump(proteins, out)
    return proteins