In [11]:
## import data
import numpy as np
import pandas as pd

# Read the full table, then keep only the rows whose 'SRS.prepost.cat' value is present.
data = pd.read_csv('pheno_allgenic.csv')
data_prepost = data.loc[data['SRS.prepost.cat'].notna()]

# Predictors are the columns at positions 3..29; the target is the column at position 1.
# An explicit position array (rather than a slice) keeps out-of-range positions an error.
x_prepost = data_prepost.iloc[:, np.arange(3, 30)]
y_prepost = data_prepost.iloc[:, 1]

## Linear SVM 

In [45]:
## RFE CV for feature selection
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold  
from sklearn.svm import LinearSVC

# Labels as a NumPy array so the fold index arrays from split() can index them directly.
y_prepost_array = y_prepost.to_numpy()
# inner_cv drives RFECV's internal scoring; outer_cv supplies the training subsets.
# Both are shuffled with a fixed seed so the splits are repeatable.
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Maps 'round <k>' (k is the 1-based outer fold number) to that fold's RFECV ranking_ array.
feature_dict = {}

# NOTE(review): x_prepost_trans is not defined in the cells visible here — presumably a
# transformed array version of x_prepost created in another cell; confirm before Run All.
for i, (train, test) in enumerate(outer_cv.split(x_prepost_trans, y_prepost_array), start=1):
    estimator = LinearSVC(random_state=0, class_weight='balanced')
    # Feature elimination is fitted on the outer-training rows only; `test` is not used here.
    selector = RFECV(estimator, step=1, cv=inner_cv, scoring='roc_auc').fit(
        x_prepost_trans[train], y_prepost_array[train]
    )
    # Selected features carry rank 1 in ranking_.
    feature_dict['round %i' % i] = selector.ranking_

























In [46]:
## Summarise how many outer-CV rounds selected each feature
# A feature counts as selected in a round when its RFECV ranking is 1. The count runs over
# every round stored in feature_dict, so it stays correct if the number of outer folds
# changes (previously rounds 1..10 were hard-coded).
feature_sum_dict = {
    column: sum(1 for ranking in feature_dict.values() if ranking[position] == 1)
    for position, column in enumerate(x_prepost.columns)
}
# Most frequently selected first; sorted() is stable, so ties keep their column order.
feature_sum_sort_dict = dict(
    sorted(feature_sum_dict.items(), key=lambda item: item[1], reverse=True)
)
feature_sum_sort_dict

{'comorbid.anxiety': 5,
 'comorbid.depression': 5,
 'synapgene_ratio_comb': 5,
 'SRS.pre.p': 3,
 'comorbid.adhd': 3,
 'treatment.kontakt': 3,
 'cnv_size': 3,
 'cgi.pre': 2,
 'comorbid.other': 2,
 'cnvgene_num': 2,
 'damage_num': 2,
 'ndd_num': 2,
 'ddcgas.pre': 1,
 'abas.f.total.pre': 1,
 'ados.tot': 1,
 'wisc.fsiq': 1,
 'age.pre': 1,
 'clnsig_recode': 1,
 'X0.5asd': 1,
 'X1.0adhd': 1,
 'lof_cln': 1,
 'synapgene_asd0.01_common': 1,
 'Gender': 0,
 'treatment.pharma': 0,
 'treatment.councel': 0,
 'treatment.cbt': 0,
 'BIN3_all_recode': 0}

In [47]:
## use selected features to train the model
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_curve, auc  
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# Labels as a NumPy array so the fold index arrays from split() can index them directly.
y_prepost_array = y_prepost.to_numpy()
# Nested CV: inner folds tune C, outer folds evaluate; fixed seeds keep the splits repeatable.
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Each dict below maps 'feature select time > <i>' to a list with one value per outer fold.
fpr_train_dict = {}
tpr_train_dict = {}
fpr_test_dict = {}
tpr_test_dict = {}
auc_train_dict = {}
auc_test_dict = {}
accuracy_train_dict = {}
accuracy_test_dict = {}

c_dict = {}  # C picked by the inner grid search in each outer fold
feature_totalselect_dict = {}  # names of the features used at each threshold

svm = LinearSVC(random_state=0, class_weight='balanced')
tuned_parameters = [{'C': [0.001, 0.01, 0.1, 1, 10]}]


def _operating_point(y_true, y_hat):
    """Return (false-positive rate, true-positive rate) of hard predictions.

    Label 1 is treated as the positive class. Computing the rates directly avoids
    reading index 1 of a roc_curve() result, which is not the classifier's operating
    point when every prediction falls in the negative class.
    """
    positive = y_true == 1
    tpr = (y_hat[positive] == 1).mean()
    fpr = (y_hat[~positive] == 1).mean()
    return fpr, tpr


#linear SVM
for i in range(0, 5):
    key = 'feature select time > %i' % i

    # Keep the features that were selected in more than i outer rounds.
    feature_select_dict = {a: u for a, u in feature_sum_sort_dict.items() if u > i}
    feature_list = [list(x_prepost.columns).index(b) for b in feature_select_dict]
    x_prepost_sel = x_prepost.iloc[:, feature_list]

    c_dict[key] = []
    fpr_train_dict[key] = []
    tpr_train_dict[key] = []
    fpr_test_dict[key] = []
    tpr_test_dict[key] = []
    auc_train_dict[key] = []
    auc_test_dict[key] = []
    accuracy_train_dict[key] = []
    accuracy_test_dict[key] = []
    feature_totalselect_dict[key] = list(feature_select_dict.keys())

    for train, test in outer_cv.split(x_prepost_sel, y_prepost_array):
        # x_prepost_sel is a DataFrame: plain [] with an integer array would look up
        # column labels, so the fold rows have to be taken positionally with .iloc.
        x_train, x_test = x_prepost_sel.iloc[train], x_prepost_sel.iloc[test]
        y_train, y_test = y_prepost_array[train], y_prepost_array[test]

        # Inner CV picks C on the outer-training rows only.
        clf = GridSearchCV(estimator=svm, param_grid=tuned_parameters, scoring='roc_auc', cv=inner_cv)
        clf.fit(x_train, y_train)
        clf_outer = LinearSVC(penalty='l2', random_state=0, class_weight='balanced', C=clf.best_params_['C'])
        clf_outer.fit(x_train, y_train)

        predicted, y_pred = clf_outer.predict(x_train), clf_outer.predict(x_test)
        # NOTE(review): the ROC curve/AUC below is built from hard class predictions, not
        # decision_function scores, so it describes a single operating point — confirm
        # this is the intended metric (the inner search scores with 'roc_auc').
        fpr_train, tpr_train, thresholds_train = roc_curve(y_train, predicted)
        fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_pred)
        auc_train, auc_test = auc(fpr_train, tpr_train), auc(fpr_test, tpr_test)
        accuracy_train, accuracy_test = accuracy_score(y_train, predicted), accuracy_score(y_test, y_pred)

        point_fpr_train, point_tpr_train = _operating_point(y_train, predicted)
        point_fpr_test, point_tpr_test = _operating_point(y_test, y_pred)

        c_dict[key].append(clf.best_params_['C'])

        fpr_train_dict[key].append(point_fpr_train)
        tpr_train_dict[key].append(point_tpr_train)
        fpr_test_dict[key].append(point_fpr_test)
        tpr_test_dict[key].append(point_tpr_test)
        auc_train_dict[key].append(auc_train)
        auc_test_dict[key].append(auc_test)
        accuracy_train_dict[key].append(accuracy_train)
        accuracy_test_dict[key].append(accuracy_test)

















In [48]:
## Average every metric over the outer-CV folds, per feature-selection threshold
def _fold_mean(values):
    """Return the mean of the per-fold values, or NaN when there are none.

    Dividing by the actual number of folds (instead of a fixed 10) keeps the
    averages correct if the outer CV is run with a different n_splits.
    """
    values = list(values)
    if not values:
        return float('nan')
    return sum(values) / len(values)


# The keys of auc_test_dict drive all summaries, as every result dict shares the same keys.
auc_avg_test_dict = {key: _fold_mean(auc_test_dict[key]) for key in auc_test_dict}
auc_avg_train_dict = {key: _fold_mean(auc_train_dict[key]) for key in auc_test_dict}
accuracy_avg_test_dict = {key: _fold_mean(accuracy_test_dict[key]) for key in auc_test_dict}
accuracy_avg_train_dict = {key: _fold_mean(accuracy_train_dict[key]) for key in auc_test_dict}
# Sensitivity is the stored true-positive rate; specificity is 1 - false-positive rate.
sensi_avg_test_dict = {key: _fold_mean(tpr_test_dict[key]) for key in auc_test_dict}
sensi_avg_train_dict = {key: _fold_mean(tpr_train_dict[key]) for key in auc_test_dict}
speci_avg_test_dict = {key: _fold_mean(1 - rate for rate in fpr_test_dict[key]) for key in auc_test_dict}
speci_avg_train_dict = {key: _fold_mean(1 - rate for rate in fpr_train_dict[key]) for key in auc_test_dict}

print(auc_avg_test_dict)
print(accuracy_avg_test_dict)
print(sensi_avg_test_dict)
print(speci_avg_test_dict)

{'feature select time > 0': 0.522638888888889, 'feature select time > 1': 0.6322222222222221, 'feature select time > 2': 0.5931944444444444, 'feature select time > 3': 0.5345833333333334, 'feature select time > 4': 0.5345833333333334}
{'feature select time > 0': 0.5201754385964913, 'feature select time > 1': 0.6342105263157894, 'feature select time > 2': 0.5964912280701754, 'feature select time > 3': 0.5383040935672514, 'feature select time > 4': 0.5383040935672514}
{'feature select time > 0': 0.5652777777777778, 'feature select time > 1': 0.6, 'feature select time > 2': 0.5430555555555556, 'feature select time > 3': 0.49027777777777776, 'feature select time > 4': 0.49027777777777776}
{'feature select time > 0': 0.4800000000000001, 'feature select time > 1': 0.6644444444444445, 'feature select time > 2': 0.6433333333333333, 'feature select time > 3': 0.5788888888888888, 'feature select time > 4': 0.5788888888888888}


In [51]:
# Training-set averages, one dict per line: AUC, accuracy, sensitivity, specificity.
print(
    auc_avg_train_dict,
    accuracy_avg_train_dict,
    sensi_avg_train_dict,
    speci_avg_train_dict,
    sep='\n',
)

{'feature select time > 0': 0.6356412510828668, 'feature select time > 1': 0.652589603185891, 'feature select time > 2': 0.6133601517952961, 'feature select time > 3': 0.5493436049443374, 'feature select time > 4': 0.5493436049443374}
{'feature select time > 0': 0.6343848018249216, 'feature select time > 1': 0.6535286569717706, 'feature select time > 2': 0.616463501568292, 'feature select time > 3': 0.551960364984317, 'feature select time > 4': 0.551960364984317}
{'feature select time > 0': 0.6590506329113923, 'feature select time > 1': 0.6350949367088606, 'feature select time > 2': 0.5555379746835443, 'feature select time > 3': 0.5012025316455696, 'feature select time > 4': 0.5012025316455696}
{'feature select time > 0': 0.6122318692543413, 'feature select time > 1': 0.6700842696629213, 'feature select time > 2': 0.671182328907048, 'feature select time > 3': 0.5974846782431052, 'feature select time > 4': 0.5974846782431052}
