Diff of /rbfSVM.ipynb [000000] .. [b3f871]

--- a
+++ b/rbfSVM.ipynb
@@ -0,0 +1,252 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## import data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## import data\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "data = pd.read_csv('pheno_allgenic.csv')\n",
+    "data_prepost = data[pd.notnull(data['SRS.prepost.cat'])]\n",
+    "\n",
+    "x_prepost = data_prepost.iloc[:, list(range(3, 30))]\n",
+    "y_prepost = data_prepost.iloc[:, 1]   \n"
+   ]
+  },
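+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## quick sanity check (an added sketch, not part of the original pipeline): outcome class balance\n",
+    "## and missingness in the predictors; assumes the import cell above has been run\n",
+    "print(data_prepost.shape)\n",
+    "print(y_prepost.value_counts())\n",
+    "print(x_prepost.isna().sum().sort_values(ascending=False).head(10))"
+   ]
+  },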
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Support vector machine with rbf kernel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## try rbf SVM model for training. For feature selection, use sequential forward selection method to select most important feature set in each inner CV and calculate the total number of each features after 10-time of outer CV.\n",
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from sklearn.model_selection import StratifiedKFold \n",
+    "from sklearn.svm import SVC\n",
+    "from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n",
+    "    \n",
+    "y_prepost_array = y_prepost.to_numpy()\n",
+    "inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)\n",
+    "outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)\n",
+    "\n",
+    "feature_list = []\n",
+    "for train, test in outer_cv.split(x_prepost, y_prepost_array):\n",
+    "    #rbf SVC\n",
+    "    clf_outer = SVC(class_weight='balanced', kernel='rbf', random_state=0)  \n",
+    "    ##stepwise forward selection\n",
+    "    sfs = SFS(clf_outer, k_features=(1, 27), forward=True, floating=True, verbose=0, scoring='roc_auc',  cv=inner_cv)   \n",
+    "    sfs = sfs.fit(x_prepost[train], y_prepost_array[train])\n",
+    "    feature_list.append(list(sfs.k_feature_idx_))\n"
+   ]
+  },
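+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## optional check (an added sketch, not part of the original pipeline): inspect the SFS fit of the\n",
+    "## last outer fold. k_feature_names_, k_score_ and get_metric_dict() are standard mlxtend SFS\n",
+    "## attributes; this assumes the selection cell above has been run\n",
+    "print(sfs.k_feature_names_)   ## features kept in the last outer fold\n",
+    "print(sfs.k_score_)           ## inner-CV ROC AUC of that feature subset\n",
+    "pd.DataFrame.from_dict(sfs.get_metric_dict()).T   ## inner-CV scores per subset size"
+   ]
+  },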
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'synapgene_ratio_comb': 8,\n",
+       " 'synapgene_asd0.01_common': 7,\n",
+       " 'treatment.kontakt': 7,\n",
+       " 'treatment.cbt': 5,\n",
+       " 'ndd_num': 5,\n",
+       " 'lof_cln': 5,\n",
+       " 'ados.tot': 4,\n",
+       " 'comorbid.anxiety': 4,\n",
+       " 'comorbid.other': 4,\n",
+       " 'cnv_size': 4,\n",
+       " 'BIN3_all_recode': 4,\n",
+       " 'age.pre': 3,\n",
+       " 'SRS.pre.p': 3,\n",
+       " 'cnvgene_num': 3,\n",
+       " 'X1.0adhd': 2,\n",
+       " 'damage_num': 2,\n",
+       " 'abas.f.total.pre': 2,\n",
+       " 'treatment.councel': 2,\n",
+       " 'wisc.fsiq': 1,\n",
+       " 'comorbid.depression': 1,\n",
+       " 'X0.5asd': 1,\n",
+       " 'ddcgas.pre': 1,\n",
+       " 'treatment.pharma': 1}"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "## summary the total select times for each feature after 10-time outer CV as importance measurement\n",
+    "feature_sum_dict = {}\n",
+    "feature_sum_sort_dict = {}\n",
+    "for i in feature_list:\n",
+    "    for j in i:\n",
+    "        if x_prepost.columns[j] not in feature_sum_dict:\n",
+    "            feature_sum_dict[x_prepost.columns[j]] = 1\n",
+    "        else:\n",
+    "            feature_sum_dict[x_prepost.columns[j]] += 1\n",
+    "\n",
+    "## sort features in feature dict based on selected times\n",
+    "feature_sum_sort_dict = {k: v for k, v in sorted(feature_sum_dict.items(), key=lambda item: item[1], reverse=True)}\n",
+    "feature_sum_sort_dict"
+   ]
+  },
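+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## optional visualisation (an added sketch, not in the original analysis): bar plot of how often each\n",
+    "## feature was selected across the 10 outer folds; assumes feature_sum_sort_dict from the cell above\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "plt.figure(figsize=(8, 5))\n",
+    "plt.bar(range(len(feature_sum_sort_dict)), list(feature_sum_sort_dict.values()))\n",
+    "plt.xticks(range(len(feature_sum_sort_dict)), list(feature_sum_sort_dict.keys()), rotation=90)\n",
+    "plt.ylabel('times selected (out of 10 outer folds)')\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },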
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## choose the most selected features in SVM for prediction. Use gridsearchCV to optimize parameters\n",
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.model_selection import StratifiedKFold\n",
+    "from sklearn.metrics import accuracy_score, roc_curve, auc  \n",
+    "\n",
+    "y_prepost_array = y_prepost.to_numpy()\n",
+    "inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)\n",
+    "outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)\n",
+    "\n",
+    "fpr_train_dict = {}\n",
+    "tpr_train_dict = {}\n",
+    "fpr_test_dict = {}\n",
+    "tpr_test_dict = {}\n",
+    "auc_train_dict = {}\n",
+    "auc_test_dict = {}\n",
+    "accuracy_train_dict = {}\n",
+    "accuracy_test_dict = {}\n",
+    "\n",
+    "c_dict = {}\n",
+    "gamma_dict = {}\n",
+    "feature_totalselect_dict = {}\n",
+    "\n",
+    "#rbf SVC\n",
+    "tuned_parameters = [{'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1, 10]}] \n",
+    "svm = SVC(kernel='rbf', class_weight='balanced')\n",
+    "for i in range(2,8):   \n",
+    "    feature_list = []\n",
+    "    feature_select_dict = {}\n",
+    "    feature_select_dict={a:u for a, u in feature_sum_sort_dict.items() if u > i}\n",
+    "    for b in feature_select_dict:\n",
+    "        feature_list.append(list(x_prepost.columns).index(b)) \n",
+    "    x_prepost_sel = x_prepost.iloc[:, feature_list]\n",
+    "    \n",
+    "    c_dict['feature select time > %i'%i]=[]\n",
+    "    gamma_dict['feature select time > %i'%i]=[]\n",
+    "        \n",
+    "    fpr_train_dict['feature select time > %i'%i]=[]  ## fpr: sensitivity\n",
+    "    tpr_train_dict['feature select time > %i'%i]=[]  ## 1-tpr: specificity\n",
+    "    fpr_test_dict['feature select time > %i'%i]=[]\n",
+    "    tpr_test_dict['feature select time > %i'%i]=[]\n",
+    "    auc_train_dict['feature select time > %i'%i]=[]\n",
+    "    auc_test_dict['feature select time > %i'%i]=[]\n",
+    "    accuracy_train_dict['feature select time > %i'%i]=[]\n",
+    "    accuracy_test_dict['feature select time > %i'%i]=[]\n",
+    "    feature_totalselect_dict['feature select time > %i'%i] = list(feature_select_dict.keys())\n",
+    "        \n",
+    "    for train, test in outer_cv.split(x_prepost_sel, y_prepost_array):\n",
+    "        clf = GridSearchCV(estimator=svm, param_grid=tuned_parameters, scoring='roc_auc', cv=inner_cv)\n",
+    "        clf.fit(x_prepost_sel[train], y_prepost_array[train])\n",
+    "        #rbf\n",
+    "        clf_outer = SVC(class_weight='balanced', C=clf.best_params_['C'], kernel='rbf', gamma=clf.best_params_['gamma'], random_state=0)  \n",
+    "        \n",
+    "        clf_outer.fit(x_prepost_sel[train], y_prepost_array[train])\n",
+    "        predicted, y_pred = clf_outer.predict(x_prepost_sel[train]), clf_outer.predict(x_prepost_sel[test])\n",
+    "\n",
+    "        fpr_train, tpr_train, thresholds_train = roc_curve(y_prepost_array[train], predicted)\n",
+    "        fpr_test, tpr_test, thresholds_test = roc_curve(y_prepost_array[test], y_pred)\n",
+    "        auc_train, auc_test = auc(fpr_train, tpr_train), auc(fpr_test, tpr_test)\n",
+    "        accuracy_train, accuracy_test = accuracy_score(y_prepost_array[train], predicted), accuracy_score(y_prepost_array[test], y_pred)\n",
+    "\n",
+    "        c_dict['feature select time > %i'%i].append(clf.best_params_['C'])\n",
+    "        gamma_dict['feature select time > %i'%i].append(clf.best_params_['gamma'])\n",
+    "        \n",
+    "        fpr_train_dict['feature select time > %i'%i].append(fpr_train[1])\n",
+    "        tpr_train_dict['feature select time > %i'%i].append(tpr_train[1])\n",
+    "        fpr_test_dict['feature select time > %i'%i].append(fpr_test[1])\n",
+    "        tpr_test_dict['feature select time > %i'%i].append(tpr_test[1])\n",
+    "        auc_train_dict['feature select time > %i'%i].append(auc_train)\n",
+    "        auc_test_dict['feature select time > %i'%i].append(auc_test)\n",
+    "        accuracy_train_dict['feature select time > %i'%i].append(accuracy_train)\n",
+    "        accuracy_test_dict['feature select time > %i'%i].append(accuracy_test)"
+   ]
+  },
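+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## optional summary (an added sketch, not part of the original output): number of features and the\n",
+    "## tuned C / gamma values per feature-count threshold; assumes the cell above has been run\n",
+    "pd.DataFrame({\n",
+    "    'n_features': {k: len(v) for k, v in feature_totalselect_dict.items()},\n",
+    "    'C_per_fold': c_dict,\n",
+    "    'gamma_per_fold': gamma_dict,\n",
+    "})"
+   ]
+  },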
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'feature select time > 2': 0.5294444444444444, 'feature select time > 3': 0.6040277777777778, 'feature select time > 4': 0.5488888888888889, 'feature select time > 5': 0.584861111111111, 'feature select time > 6': 0.584861111111111, 'feature select time > 7': 0.5298611111111111}\n"
+     ]
+    }
+   ],
+   "source": [
+    "## calculate average auc train and test value in outer CV\n",
+    "auc_avg_test_dict = {}\n",
+    "auc_avg_train_dict = {}\n",
+    "for key in auc_test_dict:\n",
+    "    auc_test, auc_train = 0.0, 0.0\n",
+    "    for time in auc_test_dict[key]:\n",
+    "        auc_test += time\n",
+    "    for time in auc_train_dict[key]:\n",
+    "        auc_train += time\n",
+    "    auc_avg_test, auc_avg_train = auc_test/10, auc_train/10\n",
+    "    auc_avg_test_dict[key]=auc_avg_test\n",
+    "    auc_avg_train_dict[key]=auc_avg_train\n",
+    "print(auc_avg_test_dict)\n"
+   ]
+  },
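+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## optional comparison (an added sketch, not part of the original output): mean train vs test AUC per\n",
+    "## threshold, as a rough check on overfitting; assumes the averaging cell above has been run\n",
+    "pd.DataFrame({'mean_train_auc': auc_avg_train_dict, 'mean_test_auc': auc_avg_test_dict})"
+   ]
+  },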
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}