DeepDTA / Git / [796dd7] /deepdta-toy/testdatahelper.py

Models:
Amanda-D/
DeepDTA
Downloads: 1
[796dd7]: / deepdta-toy / testdatahelper.py
History
Download this file
86 lines (63 with data), 2.3 kB

import csv, pickle, json, os, math, sys
from collections import OrderedDict
from bioservices import UniProt
import numpy as np
import pickle


# Names of the input files; each is appended to the data-folder path that is
# passed to the functions below.
PROT_FILE = "proteins.fasta"  # FASTA file parsed by read_proteins()
CHEM_FILE = "ligands.tab"  # tab-separated table parsed by read_chemicals()
AFF_FILE = "Y.tab"  # optional matrix, loaded with np.loadtxt() when present

def prepare_new_data(fpath, test=True):
    """Write the label-matrix pickle and a shuffled fold file for a data folder.

    Steps:
      1. Parse the proteins and ligands under ``fpath`` with
         ``read_proteins`` / ``read_chemicals`` (both also write their own
         JSON side files).
      2. Load the matrix from ``AFF_FILE`` if that file exists; otherwise use
         an all-zero matrix of shape ``(n_ligands, n_proteins)``.
      3. Pickle the matrix to ``<fpath>/Y``.
      4. Count the non-NaN entries ``n`` and write the integers ``0..n-1`` in
         random order, as JSON, to ``<fpath>/folds/test_fold.txt`` (when
         ``test`` is true) or ``<fpath>/folds/train_fold.txt``.

    Args:
        fpath: Path of the data folder. A trailing separator is accepted but
            not required.
        test: Selects which fold file is written (see step 4).

    Returns:
        None. All results are written to disk.
    """
    # Imported locally so the module's import block does not need to change.
    import random

    prots = read_proteins(fpath)
    chems = read_chemicals(fpath)

    aff_path = os.path.join(fpath, AFF_FILE)
    if os.path.exists(aff_path):
        Y = np.loadtxt(aff_path)
    else:
        # No matrix supplied: fall back to zeros, one row per ligand and one
        # column per protein.
        Y = np.zeros((len(chems), len(prots)))

    with open(os.path.join(fpath, "Y"), "wb") as y_file:
        pickle.dump(Y, y_file, protocol=pickle.HIGHEST_PROTOCOL)

    # Only the number of non-NaN entries is needed. Counting them directly
    # (instead of unpacking np.where into row/column arrays) also works when
    # np.loadtxt returns a 1-D array for a single-row or single-column file.
    n_labels = int(np.count_nonzero(~np.isnan(Y)))

    #json.dump(linepos, open(FLAGS.test_path  + "csv_pos_match.txt","w"))
    # Uniform shuffle driven by OS entropy. The previous
    # sorted(..., key=os.urandom) generated i random bytes for index i
    # (quadratic work overall) and always placed index 0 first, because
    # os.urandom(0) is the empty byte string.
    indic = list(range(n_labels))
    random.SystemRandom().shuffle(indic)

    folds_dir = os.path.join(fpath, "folds")
    if not os.path.exists(folds_dir):
        os.makedirs(folds_dir)

    fold_name = "test_fold.txt" if test else "train_fold.txt"
    with open(os.path.join(folds_dir, fold_name), "w") as fold_file:
        json.dump(indic, fold_file)


def read_chemicals(datafolder):
    """Parse the ligand table and mirror it to ``ligands.txt`` as JSON.

    The table is the file ``CHEM_FILE`` inside ``datafolder``. Its first line
    is skipped as a header. On every following line the first tab-separated
    field is used as the chemical id and the second (stripped of surrounding
    whitespace) as its SMILES string; further fields are ignored.
    Whitespace-only lines are skipped.

    Args:
        datafolder: Path of the data folder. A trailing separator is accepted
            but not required.

    Returns:
        dict mapping chemical id to SMILES string. If an id occurs more than
        once, the last occurrence wins.

    Raises:
        IndexError: If a non-blank data line contains no tab character.
    """
    chemicals = {}
    counter = 0

    with open(os.path.join(datafolder, CHEM_FILE)) as table:
        # Skip the header line; the default keeps an empty file from raising
        # StopIteration.
        next(table, None)
        for row in table:
            if not row.strip():
                # e.g. a trailing blank line at the end of the file
                continue
            fields = row.split('\t')
            chemicals[fields[0]] = fields[1].strip()
            # Counts parsed rows, so repeated ids are counted each time.
            counter += 1

    print("%d number(s) of chemical(s)" % counter)

    with open(os.path.join(datafolder, 'ligands.txt'), 'w') as out_file:
        json.dump(chemicals, out_file)

    return chemicals


def read_proteins(datafolder):
    """Parse the FASTA file and mirror it to ``proteins.txt`` as JSON.

    The input is the file ``PROT_FILE`` inside ``datafolder``. A line whose
    first non-whitespace character is ``>`` starts a new record; all lines up
    to the next header (or the end of the file) are concatenated, with
    newlines removed, to form that record's sequence. Lines before the first
    header are ignored.

    Args:
        datafolder: Path of the data folder. A trailing separator is accepted
            but not required.

    Returns:
        dict mapping protein id to sequence string. If an id occurs more than
        once, the last occurrence wins. An empty input file yields ``{}``.
    """
    with open(os.path.join(datafolder, PROT_FILE)) as fasta:
        lines = fasta.readlines()

    # Positions of the header lines, followed by len(lines) as an end
    # sentinel so the last record extends to the end of the file. (Using the
    # index of the last line as sentinel would drop that line from the last
    # sequence.)
    bounds = [pos for pos, line in enumerate(lines)
              if line.lstrip().startswith(">")]
    bounds.append(len(lines))

    proteins = {}
    counter = 0
    for start, end in zip(bounds, bounds[1:]):
        info = lines[start].split()

        # NOTE(review): the id is characters 4..9 of the first header token,
        # which presumably targets headers shaped like ">sp|P12345|..." with a
        # 6-character accession -- confirm. Longer accessions are truncated by
        # this slice and may collide.
        pid = info[0][4:10]
        seq = "".join(lines[start + 1:end]).replace("\n", "")
        proteins[pid] = seq
        # Counts parsed records, so repeated ids are counted each time.
        counter += 1

    print("%d number(s) of protein(s)" % counter)

    with open(os.path.join(datafolder, 'proteins.txt'), 'w') as out_file:
        json.dump(proteins, out_file)

    return proteins