DL-mo / Git / [0b32b6] /python-scripts/data/cancer_d2d/data

Models:
AlyssaS/
DL-mo
Downloads: 1
[0b32b6]: / python-scripts / data / cancer_d2d / data_process.py
History
Download this file
126 lines (93 with data), 3.8 kB

import numpy as np
from numpy.lib.shape_base import split

def process(datapath, omics_name):
    """Placeholder with no implementation yet.

    Both arguments are accepted and ignored; the function always returns
    ``None``.
    """
    return None




def clean_sample_ids(ids, separator):
    """Return a copy of ``ids`` with ``separator`` and double quotes removed.

    Args:
        ids: array of sample-id strings.
        separator: substring to strip from every id ('.' for the omics
            headers, '-' for the label file).

    Returns:
        A new string array of the same shape with every occurrence of
        ``separator`` and of '"' deleted.
    """
    without_separator = np.char.replace(ids, separator, '')
    return np.char.replace(without_separator, '"', '')


def load_omics(path):
    """Load one omics text file as a samples-by-columns string table.

    The file is read as strings, its first column (row names) is dropped,
    and the result is transposed. Column 0 of the returned table is
    therefore the file's first row, which the script treats as sample
    names; '.' and '"' are stripped from it.

    Args:
        path: path of a whitespace-delimited text file readable by
            ``np.loadtxt``.

    Returns:
        A 2-D string array with one row per original column.
    """
    raw = np.loadtxt(path, str)
    # Copy so that cleaning the id column never writes back into ``raw``.
    table = raw[:, 1:].T.copy()
    table[:, 0] = clean_sample_ids(table[:, 0], '.')
    return table


def prepare_labels(labels, sample_names, subtypes):
    """Clean, filter and integer-encode a label table.

    Steps, in order:
      1. strip '-' and '"' from the ids in column 0;
      2. keep only rows whose id appears in ``sample_names``;
      3. drop every row that contains the string 'NA' in any column;
      4. replace each cell equal to ``subtypes[i]`` with the string ``str(i)``.

    Args:
        labels: 2-D string array; column 0 holds sample ids.
        sample_names: array of already-cleaned sample ids to keep.
        subtypes: sequence of subtype names; the position in the sequence
            is the code written into the table.

    Returns:
        A new 2-D string array; the input array is not modified.
    """
    labels = np.array(labels, dtype=str)  # work on a copy
    labels[:, 0] = clean_sample_ids(labels[:, 0], '-')

    labels = labels[np.isin(labels[:, 0], sample_names)]

    # A per-row mask is required here: np.where() on a 2-D array yields both
    # row and column indices, and handing that tuple to np.delete() would
    # also delete the rows whose index equals a matching column index.
    has_na = (labels == 'NA').any(axis=1)
    labels = labels[~has_na]

    for code, subtype_name in enumerate(subtypes):
        labels[labels == subtype_name] = str(code)
    return labels


def main(datapath='./gbm',
         omics_names=('log_exp', 'log_mirna', 'methy'),
         subtype=('Proneural', 'Classical', 'Mesenchymal', 'Neural',
                  'HER2-enriched')):
    """Filter the omics tables and labels of one dataset to shared samples.

    Reads ``{datapath}/{name}_omics.txt`` for every name in ``omics_names``
    and the label file ``{datapath}/{datapath}.csv``, then writes
    ``{datapath}/after_{name}.txt`` for each omics table and
    ``{datapath}/after_labels.txt``.

    Args:
        datapath: dataset directory, e.g. './gbm' or './breast'.
        omics_names: base names of the omics files to process.
        subtype: subtype names, encoded by their position. For the breast
            dataset the original script listed
            ['Luminal A','Luminal B','Basal-like','Normal-like','HER2-enriched'].
    """
    omics_tables = [
        load_omics('{path}/{name}_omics.txt'.format(path=datapath, name=name))
        for name in omics_names
    ]
    for table in omics_tables:
        print(table.shape)

    # Sample names come from the first omics table only.
    names = omics_tables[0][:, 0]

    # UTF-8-sig strips a leading BOM (\ufeff) from the file. The path
    # template repeats ``datapath``, so './gbm' resolves to './gbm/./gbm.csv'.
    labels = np.loadtxt('{path}/{path}.csv'.format(path=datapath), str,
                        delimiter=',', encoding='UTF-8-sig')
    print('*'*20)
    labels = prepare_labels(labels, names, subtype)
    print('*'*20)

    # NOTE(review): rows are filtered but never reordered, so omics rows and
    # label rows line up by index only if the input files already share the
    # same sample order - confirm before pairing them positionally.
    for name, table in zip(omics_names, omics_tables):
        kept = table[np.isin(table[:, 0], labels[:, 0])]
        np.savetxt('{path}/after_{name}.txt'.format(path=datapath, name=name),
                   kept, fmt="%s")
    np.savetxt('{path}/after_labels.txt'.format(path=datapath), labels,
               fmt="%s")


if __name__ == '__main__':
    # datapath='./breast'
    main(datapath='./gbm')