--- a +++ b/deepdta-toy/testdatahelper.py @@ -0,0 +1,85 @@ +import csv, pickle, json, os, math, sys +from collections import OrderedDict +from bioservices import UniProt +import numpy as np +import pickle + + +PROT_FILE = "proteins.fasta" +CHEM_FILE = "ligands.tab" +AFF_FILE = "Y.tab" + +def prepare_new_data(fpath, test=True): + + prots = read_proteins(fpath) + chems = read_chemicals(fpath) + Y = np.zeros((len(chems), len(prots))) + + if os.path.exists(fpath + AFF_FILE): + Y = np.loadtxt(fpath + AFF_FILE) + + pickle.dump(Y, open(fpath + "Y","wb"), protocol=pickle.HIGHEST_PROTOCOL) + label_row_inds, label_col_inds = np.where(np.isnan(Y)==False) + + #json.dump(linepos, open(FLAGS.test_path + "csv_pos_match.txt","w")) + indic = set(range(len(label_row_inds))) + indic = sorted(indic, key=os.urandom) + + + if not os.path.exists(fpath + "folds/"): + os.makedirs(fpath + "folds/") + if test: + json.dump(indic, open(fpath + "folds/test_fold.txt","w")) + else: + json.dump(indic, open(fpath + "folds/train_fold.txt","w")) + + +def read_chemicals(datafolder): + counter =0 + filepath = datafolder + CHEM_FILE + chemicals = {} + with open(filepath) as file: + next(file) + for row in file: + chem_id = row.split('\t')[0] + smiles = (row.split('\t')[1]).strip() + chemicals[chem_id] = smiles + counter +=1 + + print("%d number(s) of chemical(s)" % counter) + json.dump(chemicals, open(datafolder + 'ligands.txt', 'w')) + + return chemicals + + +def read_proteins(datafolder): + proteins = {} + counter =0 + fa="" + filename = datafolder + PROT_FILE + with open(filename) as f: + fa = f.readlines() + + idindex=[] + for i, line in enumerate(fa): + if ">" in line: + idindex.append(i) + idindex.append(i) + + for i, idx in enumerate(idindex): + + if i < len(idindex)-1: + idx1 = idindex[i+1] + info = fa[idx].split() + + pid = info[0][4:10] + seq = "".join(fa[idx+1:idx1]) + seq = seq.replace("\n","") + proteins[pid] = seq + counter +=1 + + print("%d number(s) of protein(s)" % counter) + json.dump(proteins, open(datafolder + 'proteins.txt', 'w')) + + return proteins +