[0b32b6]: / python-scripts / data / cancer_d2d / data_process.py

Download this file

126 lines (93 with data), 3.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import numpy as np
from numpy.lib.shape_base import split
def process(datapath, omics_name):
    """Placeholder that accepts a dataset path and an omics name.

    The function is not implemented: it ignores both arguments and
    returns ``None``.
    """
    return None
def _strip_chars(values, chars):
    """Return ``values`` as a list of str with every character in ``chars`` removed."""
    removal = str.maketrans('', '', chars)
    return [str(value).translate(removal) for value in values]


def _load_omics(filename):
    """Load one omics text table and return it with one row per sample.

    The file is read as strings.  Its first column (row names) is dropped
    and the table is transposed, so the former header row ends up in
    column 0 as the sample id of each row.  Dots and double quotes are
    removed from those ids.
    """
    table = np.loadtxt(filename, str, ndmin=2)
    # Drop the row-name column, then flip to samples-by-features.
    table = np.transpose(np.delete(table, 0, axis=1))
    table[:, 0] = _strip_chars(table[:, 0], '."')
    return table


def main(datapath='./gbm',
         subtype=('Proneural', 'Classical', 'Mesenchymal', 'Neural',
                  'HER2-enriched')):
    """Filter three omics tables and a label table down to shared samples.

    Reads ``log_exp_omics.txt``, ``log_mirna_omics.txt`` and
    ``methy_omics.txt`` plus ``<dirname>.csv`` from ``datapath`` and writes
    ``after_log_exp.txt``, ``after_log_mirna.txt``, ``after_methy.txt`` and
    ``after_labels.txt`` back into the same directory.

    Args:
        datapath: Directory holding the input files.  The label CSV must be
            named after the directory (``./gbm`` -> ``./gbm/gbm.csv``).
        subtype: Subtype names; every cell of the label table equal to
            ``subtype[i]`` is replaced by the string ``str(i)``.  The
            original script listed ``'Luminal A', 'Luminal B', 'Basal-like',
            'Normal-like', 'HER2-enriched'`` as the alternative for the
            ``./breast`` directory.
    """
    # Local import keeps this block self-contained; the file's import
    # header does not bring ``os`` into scope.
    import os

    omics_names = ('log_exp', 'log_mirna', 'methy')
    tables = [
        _load_omics(os.path.join(datapath, name + '_omics.txt'))
        for name in omics_names
    ]
    # Sample ids are taken from the first (expression) table only.
    names = tables[0][:, 0]
    for table in tables:
        print(table.shape)

    # 'UTF-8-sig' strips a leading byte-order mark (\ufeff) from the CSV.
    dataset = os.path.basename(os.path.normpath(datapath))
    labels = np.loadtxt(os.path.join(datapath, dataset + '.csv'), str,
                        delimiter=',', encoding='UTF-8-sig', ndmin=2)
    # Label ids lose hyphens and quotes so they can be compared with the
    # cleaned omics sample ids.
    labels[:, 0] = _strip_chars(labels[:, 0], '-"')

    print('*'*20)
    # Keep only label rows whose id occurs among the omics sample ids.
    labels = labels[np.isin(labels[:, 0], names)]
    print('*'*20)

    # Drop every row containing an 'NA' cell.  A row mask is required here:
    # feeding the (rows, cols) tuple of a 2-D np.where to np.delete would
    # also delete the rows whose index equals a matching *column* index.
    labels = labels[~(labels == 'NA').any(axis=1)]

    # Encode subtype names as their position in ``subtype``.
    for code, subtype_name in enumerate(subtype):
        labels[labels == subtype_name] = str(code)

    # Keep only omics rows whose sample id still has a label.
    # NOTE(review): rows keep their original file order; nothing here checks
    # that the omics rows and the label rows end up in the same order.
    tables = [table[np.isin(table[:, 0], labels[:, 0])] for table in tables]

    for name, table in zip(omics_names, tables):
        np.savetxt(os.path.join(datapath, 'after_' + name + '.txt'),
                   table, fmt="%s")
    np.savetxt(os.path.join(datapath, 'after_labels.txt'), labels, fmt="%s")


if __name__ == '__main__':
    main()