--- a
+++ b/large_net/test1.py
@@ -0,0 +1,376 @@
+import json
+import csv
+import numpy as np
+import os
+import pandas as pd
+import random
+import tensorflow as tf
+import matplotlib.pyplot as plt
+
+import keras
+from keras.models import Sequential, Model
+from keras.utils.generic_utils import get_custom_objects
+from keras import backend as K
+from keras.layers import Input, Conv1D, Reshape, \
+    GlobalAveragePooling2D, Dense, BatchNormalization, Activation, AveragePooling1D, \
+    GlobalMaxPooling2D, Flatten, MaxPool1D, Conv2D, MaxPool2D, SeparableConv2D, Conv3D, Add, Dropout
+from keras.utils.vis_utils import plot_model
+# from keras.applications.mobilenet import DepthwiseConv2D
+from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, \
+    LearningRateScheduler, TensorBoard
+
+from keras.layers.advanced_activations import LeakyReLU, PReLU
+from functions import quan_detector, most_repeared_promoter, dataset
+from sklearn.metrics import confusion_matrix
+import argparse
+from sklearn import metrics
+from sklearn.metrics import roc_auc_score
+
+np.random.seed(41)
+tf.set_random_seed(41)
+random.seed(41)
+
+
+labels_file = './labes.csv'
+labels_df = pd.read_csv(labels_file, index_col=0)
+ids_csv = labels_df.FID.tolist()
+
+files_num_chr7 = [1907, 2, 1908, 2780, 2428, 3, 1173, 1291]
+files_num_chr17 = [2264, 865, 66, 69, 1931, 71, 1932, 70]
+files_num_chr9 = [502, 1420, 1503, 1504, 1505, 1506, 1507, 1508]
+files_num_chr22 = [65, 66, 0, 80, 15, 16, 59, 5]
+
+def fn(d_dir):
+    file_list = os.listdir(d_dir)
+    n = [i.split('.')[0] for i in file_list]
+    num = [int(i.split('_')[1]) for i in n if len(i) > 3]
+    return num
+
+
+num_7 = len(files_num_chr7)
+num_9 = len(files_num_chr9)
+num_17 = len(files_num_chr17)
+num_22 = len(files_num_chr22)
+
+files_num = files_num_chr7 + files_num_chr9 + files_num_chr17 + files_num_chr22
+
+# one row per individual, one 64-column block per promoter file
+all_HL_prom = np.zeros((11908, 64*len(files_num)))
+for idx in range(len(files_num)):
+    promoter_num = files_num[idx]
+    if idx < num_7:
+        promoter_file = './chr7_hl_prom/chr22_'+str(promoter_num)+'.json'
+    elif idx < num_7+num_9 and idx >= num_7:
+        promoter_file = './chr9_hl_prom/chr9_'+str(promoter_num)+'.json'
+    elif idx < num_7+num_9+num_17 and idx >= num_7+num_9:
+        promoter_file = './chr17_hl_prom/chr22_'+str(promoter_num)+'.json'
+    else:
+        promoter_file = './chr22_hl_prom/chr22_'+str(promoter_num)+'.json'
+
+    # read files
+    with open(promoter_file) as json_data:
+        ind_var = json.load(json_data)
+    var_num = []
+    for i in ids_csv:
+        id_name = str(i)
+        temp = ind_var[id_name]
+        var_seq = [int(t) for t in temp]
+        var_num.append(var_seq)
+    all_HL_prom[:, idx*64:(idx+1)*64] = np.array(var_num)
+
+
+print(all_HL_prom.shape)
+n = len(files_num)
+
+labels_df['vars'] = all_HL_prom.tolist()
+lab_num = {1: [1, 0],   # control
+           2: [0, 1]}   # ALS
+lab_num_batch = {'c1': [1, 0, 0, 0],   # batch one-hot encoding from the FID prefix
+                 'c3': [0, 1, 0, 0],
+                 'c5': [0, 0, 1, 0],
+                 'c44': [0, 0, 0, 1]}
+
+pheno_new = []
+pheno_batch = []
+for i in labels_df.Pheno.tolist():
+    pheno_new.append(lab_num[i])
+
+# for i in labels_df.Sex.tolist():
+#     pheno_new.append(lab_num[i])
+for i in labels_df.FID.tolist():
+    l = i.split('-')[0]
+    pheno_batch.append(lab_num_batch[l])
+
+d = {"Pheno": pheno_new, "Vars": labels_df.vars}
+dataset_ = pd.DataFrame(d)
+dataset_X = np.array(dataset_.Vars.tolist())
+dataset_Y = np.array(dataset_.Pheno.tolist())
+
+
+N, M = dataset_X.shape
+print(N, M)
+# network accuracy
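+
+# train_id.txt / test_id.txt are assumed to hold one integer row index per line,
+# selecting samples from dataset_X / dataset_Y. A hedged sketch of how such index
+# files could be regenerated (hypothetical, not part of this script):
+#   from sklearn.model_selection import train_test_split
+#   tr, te = train_test_split(np.arange(N), test_size=0.1, random_state=41)
+#   np.savetxt("train_id.txt", tr, fmt="%d")
+#   np.savetxt("test_id.txt", te, fmt="%d")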
+t_idx = [int(line.strip()) for line in open("train_id.txt", 'r')]
+te_idx = [int(line.strip()) for line in open("test_id.txt", 'r')]
+x_tv = dataset_X[t_idx]
+y_tv = dataset_Y[t_idx]
+x_test = dataset_X[te_idx]
+y_test = dataset_Y[te_idx]
+
+
+x_train, y_train, x_val, y_val, tr_idx, val_idx = dataset(x_tv, y_tv, test_ratio=0.05)
+
+
+# print x_test.shape
+num_classes = y_test.shape[-1]
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_val = x_val.astype('float32')
+
+x_train = x_train.astype('float32').reshape((len(x_train), M, 1))
+x_test = x_test.astype('float32').reshape((len(x_test), M, 1))
+x_val = x_val.astype('float32').reshape((len(x_val), M, 1))
+
+print(np.sum(y_test, axis=0))
+print(np.sum(y_val, axis=0))
+print(np.sum(y_train, axis=0))
+conv_kwargs = {'padding': 'same',
+               'data_format': 'channels_first'}
+
+mp_kwargs = {'padding': 'same',
+             'data_format': 'channels_first'}
+
+
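+# Note on the input handling in architecture_X: the strided Conv1D
+# (kernel_size=64, strides=64) maps each 64-value promoter block to a 256-dim
+# feature vector, giving an (M/64, 256) tensor; Reshape then turns this into
+# (M/64, 16, 16), i.e. one 16x16 map per promoter, with promoters treated as
+# channels by the 'channels_first' Conv2D stack that follows.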
+def architecture_X(input_shape, num_classes):
+    act = 'relu'
+
+    x = Input(shape=(input_shape, 1))
+    input_Conv = Conv1D(filters=256, kernel_size=64, strides=64)(x)
+    input_Conv = Conv1D(filters=256, kernel_size=1, strides=1)(input_Conv)
+    input_BN = BatchNormalization(epsilon=1e-05)(input_Conv)
+    # input_Conv = Conv1D(filters=256, kernel_size=1, strides=1)(input_Conv)  # unused: output never connected
+    input_act = Activation(act)(input_BN)
+
+
+    reshape_h = Reshape((int(input_shape/64), 16, 16))(input_act)
+    # DepthwiseConv2D(32, strides=(2, 2), **conv_kwargs)(x)
+
+    # add traditional conv
+    conv1 = Conv2D(64, (3, 3), activation=act, **conv_kwargs)(reshape_h)
+    conv1 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv1)
+    conv1 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv1)
+    mp1 = MaxPool2D(pool_size=(2, 2), **mp_kwargs)(conv1)
+
+    Sconv1 = SeparableConv2D(128, (2, 2), activation=act, **conv_kwargs)(mp1)
+    # Sconv1 = SeparableConv2D(32, (2, 2), activation=act, **conv_kwargs)(Sconv1)
+    # Sconv1 = SeparableConv2D(32, (2, 2), activation=act, **conv_kwargs)(Sconv1)
+    Smp1 = MaxPool2D(pool_size=(2, 2), **mp_kwargs)(Sconv1)
+
+    Smp1 = Conv2D(256, (1, 1), activation=act, **conv_kwargs)(Smp1)
+
+    Sconv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(Smp1)
+    Sconv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(Sconv2)
+    Sconv2 = SeparableConv2D(64, (2, 2), activation=act, **conv_kwargs)(Sconv2)
+    Smp2 = Sconv2  # MaxPool2D(pool_size=(2, 2), **mp_kwargs)(Sconv2)
+
+
+    conv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(Smp1)
+    conv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv2)
+    conv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv2)
+    mp2 = conv2  # MaxPool2D(pool_size=(2, 2), **mp_kwargs)(conv2)
+
+    sum_add = keras.layers.Concatenate(axis=-1)([mp2, Smp2])
+
+    conv3 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(sum_add)
+    conv3 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv3)
+    conv3 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv3)
+    conv3 = MaxPool2D(pool_size=(1, 2), **mp_kwargs)(conv3)
+
+    conv3 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv3)
+    conv3 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv3)
+    conv3 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv3)
+
+#    new = keras.layers.dot([conv2, conv3], axes=-2)
+    conv4 = keras.layers.Add()([Sconv2, conv2, conv3])
+
+    conv4 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv4)
+    conv4 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv4)
+    conv4 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv4)
+#    conv4_ = Conv2D(64, (1, 1), activation=act, **conv_kwargs)(conv4)
+
+    conv5 = keras.layers.Add()([conv4, conv3])
+    conv5 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv5)
+    conv5 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv5)
+    conv5 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv5)
+#    conv5 = keras.layers.Add()([conv4, conv5])
+
+    mp3 = GlobalAveragePooling2D(data_format='channels_first')(conv5)
+
+    flatten = mp3  # Flatten()(mp3)
+
+    d1 = Dense(64*4, activation='linear')(flatten)
+    d1_act = Activation(act)(d1)
+
+    d2 = Dense(16, activation='linear')(d1_act)
+    flatten = Activation(act)(d2)
+
+    # added
+    # flatten = BatchNormalization(epsilon=1e-05)(flatten)
+    pred = Dense(num_classes, activation='softmax')(flatten)
+    # Compile model
+    model = Model(inputs=[x], outputs=[pred])
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=keras.optimizers.Adagrad(lr=0.002, decay=0.0001),  # 0.0013
+                  metrics=['accuracy'])
+    return model
+
+
+def scheduler(epoch):
+    if epoch < 50:
+        return 0.1
+    if epoch < 122:
+        return 0.01
+    return 0.003  # 0.001
+
+
+cnn = architecture_X(M, num_classes)
+reduce_lr = ReduceLROnPlateau(monitor='val_acc',
+                              factor=0.7,
+                              patience=50,
+                              min_lr=0.00001, verbose=1)
+earlystop = EarlyStopping(monitor='val_acc', patience=25, verbose=0, mode='auto')
+
+# print cnn.summary()
+history = cnn.fit(x_train, y_train,
+                  batch_size=16*2,  # 640
+                  epochs=30,  # 300
+                  verbose=1, callbacks=[reduce_lr],
+                  validation_data=(x_val, y_val))
+
+print("=" * 5)
+print(np.sum(y_test, axis=0))
+print(cnn.evaluate(x_test, y_test))
+y_pred = cnn.predict(x_test)
+y_pred_ = np.argmax(y_pred, axis=1)
+y_test_num = np.argmax(y_test, axis=1)
+tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred_).ravel()
+
+acc = (tp + tn) * 1. / (tp + fp + tn + fn)
+
+# import scikitplot as skplt
+# skplt.metrics.plot_roc_curve(y_test_num, y_pred)
+
+print('='*10)
+print('='*5, ' ', 'Our network')
+print('='*10)
+
+print('test: ', acc)
+
+ps = tp*1./(tp+fp)
+rc = tp*1./(tp+fn)
+print('Accuracy:', (tp+tn)*1./(tp+tn+fp+fn))
+print("Precision: ", ps)
+print("Recall:", rc)
+print("F1: ", 2*(ps*rc)/(ps+rc))
+
+# ROC score must be the probability of the positive (ALS) class, not the max probability
+fp_rate, tp_rate, thresholds = metrics.roc_curve(y_test_num, y_pred[:, 1])
+print("auc: ", metrics.auc(fp_rate, tp_rate))
+
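+# The accuracy/precision/recall/F1 block below is repeated for every baseline;
+# a possible helper (hypothetical sketch, not used by this script):
+#   def report_metrics(y_true, y_pred):
+#       tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
+#       ps, rc = tp*1./(tp+fp), tp*1./(tp+fn)
+#       print('Accuracy:', (tp+tn)*1./(tp+tn+fp+fn))
+#       print('Precision:', ps, 'Recall:', rc, 'F1:', 2*(ps*rc)/(ps+rc))
+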
+print('='*10)
+print('='*5, ' ', 'LR')
+print('='*10)
+from sklearn.linear_model import LogisticRegression
+
+X = x_train.reshape((len(x_train), M))
+y = y_train.argmax(axis=1).astype('float32')
+logisticRegr = LogisticRegression(C=10, max_iter=10)
+logisticRegr.fit(X, y)
+y_pred = logisticRegr.predict(x_test.reshape((len(x_test), M)))
+
+y_test_num = np.argmax(y_test, axis=1)
+tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()
+
+acc = (tp + tn) * 1. / (tp + fp + tn + fn)
+ps = tp*1./(tp+fp)
+rc = tp*1./(tp+fn)
+print('Accuracy:', (tp+tn)*1./(tp+tn+fp+fn))
+print("Precision: ", ps)
+print("Recall:", rc)
+print("F1: ", 2*(ps*rc)/(ps+rc))
+
+
+print('='*10)
+print('='*5, ' ', 'SVM')
+print('='*10)
+from sklearn import svm
+X = x_train.reshape((len(x_train), M))
+y = y_train.argmax(axis=1).astype('float32')
+clf = svm.SVC(gamma=0.001)
+clf.fit(X, y)
+y_pred = clf.predict(x_test.reshape((len(x_test), M)))
+
+y_test_num = np.argmax(y_test, axis=1)
+tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()
+
+acc = (tp + tn) * 1. / (tp + fp + tn + fn)
+ps = tp*1./(tp+fp)
+rc = tp*1./(tp+fn)
+print('Accuracy:', (tp+tn)*1./(tp+tn+fp+fn))
+print("Precision: ", ps)
+print("Recall:", rc)
+print("F1: ", 2*(ps*rc)/(ps+rc))
+# print("auc: ", roc_auc_score(y_test_num, clf.predict_proba(x_test.reshape((len(x_test), M)))))
+
+
+print('='*10)
+print('='*5, ' ', 'RF')
+print('='*10)
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+X = x_train.reshape((len(x_train), M))
+y = y_train.argmax(axis=1).astype('float32')
+# Random forest with 1000 trees
+rf = RandomForestClassifier(max_depth=5, n_estimators=1000, max_features=100)
+# Train the model on training data
+rf.fit(X, y)
+
+y_pred = rf.predict(x_test.reshape((len(x_test), M)))
+
+y_test_num = np.argmax(y_test, axis=1)
+tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()
+
+acc = (tp + tn) * 1. / (tp + fp + tn + fn)
+
+ps = tp*1./(tp+fp)
+rc = tp*1./(tp+fn)
+print('Accuracy:', (tp+tn)*1./(tp+tn+fp+fn))
+print("Precision: ", ps)
+print("Recall:", rc)
+print("F1: ", 2*(ps*rc)/(ps+rc))
+
+print('='*10)
+print('='*5, ' ', 'AdaBoost')
+print('='*10)
+from sklearn.tree import DecisionTreeClassifier
+X = x_train.reshape((len(x_train), M))
+y = y_train.argmax(axis=1).astype('float32')
+# AdaBoost over depth-3 decision trees, 1000 estimators
+bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
+                         algorithm="SAMME.R",
+                         n_estimators=1000)
+# Train the model on training data
+bdt.fit(X, y)
+
+y_pred = bdt.predict(x_test.reshape((len(x_test), M)))
+
+y_test_num = np.argmax(y_test, axis=1)
+tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()
+
+acc = (tp + tn) * 1. / (tp + fp + tn + fn)
+
+ps = tp*1./(tp+fp)
+rc = tp*1./(tp+fn)
+print('Accuracy:', (tp+tn)*1./(tp+tn+fp+fn))
+print("Precision: ", ps)
+print("Recall:", rc)
+print("F1: ", 2*(ps*rc)/(ps+rc))
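+
+# A possible extension (hedged sketch, not part of the original script): ROC-AUC
+# can also be reported for the sklearn baselines by scoring with class
+# probabilities, e.g. for the random forest:
+#   rf_scores = rf.predict_proba(x_test.reshape((len(x_test), M)))[:, 1]
+#   print("RF auc:", roc_auc_score(y_test_num, rf_scores))
+# (for SVC, either construct it with probability=True or use
+#  clf.decision_function(...) as the score.)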