{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] } ], "source": [ "%pylab inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "# `sys` is needed for the sys.path tweak below and is not imported anywhere else in\n", "# this notebook, so import it explicitly instead of depending on whatever names\n", "# `%pylab inline` happens to put into the interactive namespace.\n", "import sys\n", "import pandas as pd\n", "from tqdm import tqdm_notebook\n", "import seaborn as sns\n", "sns.set()\n", "sns.set_style('whitegrid')\n", "from functools import reduce\n", "plt.rcParams['figure.dpi'] = 120\n", "import h5py\n", "from glob import glob\n", "import re\n", "\n", "# Make modules under ../scripts importable (figure_template is imported right after).\n", "if '../scripts' not in sys.path:\n", "    sys.path.append('../scripts')\n", "from importlib import reload\n", "import figure_template\n", "# force reload of the module\n", "reload(figure_template)\n", "from figure_template import display_dataframe, embed_pdf_figure, embed_pdf_pages, std_plot" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "dataset = 'exorbase'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Summarize feature stability\n", "\n", "### Pi-score\n", "\n", "$$\\pi = \\vert \\mathrm{log}_2 FC\\vert \\cdot (-\\mathrm{log}_{10} p_{\\mathrm{adj}})$$\n", "\n", "### Differential expression" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count_methodfeaturecountsfeaturecounts_lncrnafeaturecounts_mrna
compare_group
Normal-CRC0.8627530.2841930.493112
Normal-HCC0.7935980.3461360.726043
Normal-PAAD0.2843000.1045220.361159
\n", "
" ], "text/plain": [ "count_method featurecounts featurecounts_lncrna featurecounts_mrna\n", "compare_group \n", "Normal-CRC 0.862753 0.284193 0.493112\n", "Normal-HCC 0.793598 0.346136 0.726043\n", "Normal-PAAD 0.284300 0.104522 0.361159" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Feature stability of differential-expression selection: load the summary file,\n", "# keep one parameter setting, then average per (count_method, compare_group).\n", "summary = pd.read_table(\n", "    f'../output/{dataset}/summary/cross_validation_diffexp/feature_stability.txt'\n", ")\n", "summary_subset = summary.query(\n", "    '(fold_change_direction == \"any\") and (n_features == \"10\") and (diffexp_method == \"deseq2\")'\n", ")\n", "# unstack(level=0) moves count_method into the columns; rows are compare_group.\n", "summary_table = (\n", "    summary_subset\n", "    .groupby(['count_method', 'compare_group'], as_index=True)['feature_stability']\n", "    .mean()\n", "    .unstack(level=0)\n", ")\n", "display_dataframe(summary_table, filename='summarize_feature_selection', format='excel')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['featurecounts', 'featurecounts_mrna', 'featurecounts_lncrna'],\n", " dtype=object)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary['count_method'].unique()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Bar plot of differential-expression feature stability by feature type, one bar\n", "# colour per comparison group. Reads `summary_subset` from the table cell above.\n", "fig, ax = plt.subplots(figsize=(7, 5))\n", "# x and y are passed by keyword: newer seaborn releases no longer accept them\n", "# positionally, while keywords work on old and new versions alike.\n", "sns.barplot(\n", "    x='count_method', y='feature_stability', hue='compare_group',\n", "    order=['featurecounts', 'featurecounts_mrna', 'featurecounts_lncrna'],\n", "    data=summary_subset,\n", "    errwidth=1.2, capsize=0.05,\n", "    ax=ax,\n", ")\n", "# The hue variable here is compare_group (not the classifier), so title the legend accordingly.\n", "ax.legend(title='Compare group', bbox_to_anchor=(1.04, 0.5), loc='center left', borderaxespad=0)\n", "ax.set_ylim(0, 1)\n", "ax.set_ylabel('Feature stability')\n", "ax.set_xlabel('Feature type')\n", "ax.set_title('Differential expression')\n", "fig.tight_layout()\n", "embed_pdf_figure(title='Differential expression feature stability')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Machine learning" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count_methodfeaturecountsfeaturecounts_lncrnafeaturecounts_mrna
compare_groupclassifier
Normal-CRClinear_svm0.2402230.3375620.230315
logistic_regression0.2024430.2993950.208803
random_forest0.1082370.3647650.091755
Normal-HCClinear_svm0.2380150.3495150.300904
logistic_regression0.2331080.3393760.314727
random_forest0.0837860.3599020.118420
Normal-PAADlinear_svm0.2005620.3893310.153183
logistic_regression0.2073490.3481140.166106
random_forest0.0708660.1459040.111386
\n", "
" ], "text/plain": [ "count_method featurecounts featurecounts_lncrna \\\n", "compare_group classifier \n", "Normal-CRC linear_svm 0.240223 0.337562 \n", " logistic_regression 0.202443 0.299395 \n", " random_forest 0.108237 0.364765 \n", "Normal-HCC linear_svm 0.238015 0.349515 \n", " logistic_regression 0.233108 0.339376 \n", " random_forest 0.083786 0.359902 \n", "Normal-PAAD linear_svm 0.200562 0.389331 \n", " logistic_regression 0.207349 0.348114 \n", " random_forest 0.070866 0.145904 \n", "\n", "count_method featurecounts_mrna \n", "compare_group classifier \n", "Normal-CRC linear_svm 0.230315 \n", " logistic_regression 0.208803 \n", " random_forest 0.091755 \n", "Normal-HCC linear_svm 0.300904 \n", " logistic_regression 0.314727 \n", " random_forest 0.118420 \n", "Normal-PAAD linear_svm 0.153183 \n", " logistic_regression 0.166106 \n", " random_forest 0.111386 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Feature stability of machine-learning selection: load the summary file, keep one\n", "# parameter setting, then average per (count_method, compare_group, classifier).\n", "summary = pd.read_table(\n", "    f'../output/{dataset}/summary/cross_validation/feature_stability.txt'\n", ")\n", "summary_subset = summary.query(\n", "    '(fold_change_direction == \"any\") '\n", "    ' and (n_features == \"10\") '\n", "    ' and (clustering_score_name == \"uca_score\")'\n", ")\n", "# unstack(level=0) moves count_method into the columns; rows are (compare_group, classifier).\n", "summary_table = (\n", "    summary_subset\n", "    .groupby(['count_method', 'compare_group', 'classifier'], as_index=True)['feature_stability']\n", "    .mean()\n", "    .unstack(level=0)\n", ")\n", "display_dataframe(summary_table, filename='summarize_feature_selection')" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# One figure per comparison group: feature stability of each classifier by feature type.\n", "for compare_group in summary['compare_group'].unique():\n", "    # Same parameter setting as the table cell above, restricted to this group.\n", "    summary_subset = summary.query(\n", "        '(fold_change_direction == \"any\") '\n", "        'and (n_features == \"10\") '\n", "        'and (clustering_score_name == \"uca_score\") '\n", "        'and (compare_group == \"{}\")'.format(compare_group)\n", "    )\n", "    fig, ax = plt.subplots(figsize=(8, 4))\n", "    # x and y are passed by keyword: newer seaborn releases no longer accept them\n", "    # positionally, while keywords work on old and new versions alike.\n", "    sns.barplot(\n", "        x='count_method', y='feature_stability', hue='classifier',\n", "        order=['featurecounts', 'featurecounts_mrna', 'featurecounts_lncrna'],\n", "        data=summary_subset,\n", "        errwidth=1.2, capsize=0.05,\n", "        ax=ax,\n", "    )\n", "    # Put the legend outside the axes, to the right.\n", "    ax.legend(title='Classifier', bbox_to_anchor=(1.04, 0.5), loc='center left', borderaxespad=0)\n", "    ax.set_ylim(0, 1)\n", "    ax.set_ylabel('Feature stability')\n", "    ax.set_xlabel('Feature type')\n", "    ax.set_title('{}'.format(compare_group))\n", "    fig.tight_layout()\n", "    embed_pdf_figure(title='Feature stability of machine learning ({})'.format(compare_group))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Metrics on test set\n", "\n", "### Differential expression" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count_methodfeaturecountsfeaturecounts_lncrnafeaturecounts_mrna
compare_groupclassifier
Normal-CRClinear_svm0.7485710.8285710.797143
logistic_regression0.9685710.9285710.805714
random_forest0.9792860.8635710.797143
Normal-HCClinear_svm0.9192860.8757140.891429
logistic_regression0.9428570.8750000.931429
random_forest0.9692860.8389290.944286
Normal-PAADlinear_svm0.7542860.6295240.875238
logistic_regression0.7466670.6514290.815714
random_forest0.6733330.5861900.754286
\n", "
" ], "text/plain": [ "count_method featurecounts featurecounts_lncrna \\\n", "compare_group classifier \n", "Normal-CRC linear_svm 0.748571 0.828571 \n", " logistic_regression 0.968571 0.928571 \n", " random_forest 0.979286 0.863571 \n", "Normal-HCC linear_svm 0.919286 0.875714 \n", " logistic_regression 0.942857 0.875000 \n", " random_forest 0.969286 0.838929 \n", "Normal-PAAD linear_svm 0.754286 0.629524 \n", " logistic_regression 0.746667 0.651429 \n", " random_forest 0.673333 0.586190 \n", "\n", "count_method featurecounts_mrna \n", "compare_group classifier \n", "Normal-CRC linear_svm 0.797143 \n", " logistic_regression 0.805714 \n", " random_forest 0.797143 \n", "Normal-HCC linear_svm 0.891429 \n", " logistic_regression 0.931429 \n", " random_forest 0.944286 \n", "Normal-PAAD linear_svm 0.875238 \n", " logistic_regression 0.815714 \n", " random_forest 0.754286 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Test-set metrics for differential-expression selection: load the metrics file, keep\n", "# one parameter setting, then average roc_auc per (count_method, compare_group, classifier).\n", "summary = pd.read_table(\n", "    f'../output/{dataset}/summary/cross_validation_diffexp/metrics.test.txt'\n", ")\n", "summary_subset = summary.query(\n", "    '(fold_change_direction == \"any\") and (n_features == \"10\") and (diffexp_method == \"deseq2\")'\n", ")\n", "# unstack(level=0) moves count_method into the columns; rows are (compare_group, classifier).\n", "summary_table = (\n", "    summary_subset\n", "    .groupby(['count_method', 'compare_group', 'classifier'], as_index=True)['roc_auc']\n", "    .mean()\n", "    .unstack(level=0)\n", ")\n", "display_dataframe(summary_table, filename='summarize_feature_selection')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# One figure per comparison group: test-set AUROC of each classifier by feature type.\n", "for compare_group in summary['compare_group'].unique():\n", "    # Use exactly the filter of the table cell above so plot and table describe the\n", "    # same rows: n_features is compared as the string \"10\" (the form that selects\n", "    # rows in the table cell; the integer 10 was used here before), and only the\n", "    # deseq2 results are kept instead of pooling every diffexp_method.\n", "    summary_subset = summary.query(\n", "        '(fold_change_direction == \"any\") '\n", "        'and (n_features == \"10\") '\n", "        'and (diffexp_method == \"deseq2\") '\n", "        'and (compare_group == \"{}\")'.format(compare_group)\n", "    )\n", "    fig, ax = plt.subplots(figsize=(8, 4))\n", "    # x and y are passed by keyword: newer seaborn releases no longer accept them\n", "    # positionally, while keywords work on old and new versions alike.\n", "    sns.barplot(\n", "        x='count_method', y='roc_auc', hue='classifier',\n", "        order=['featurecounts', 'featurecounts_mrna', 'featurecounts_lncrna'],\n", "        data=summary_subset,\n", "        errwidth=1.2, capsize=0.05,\n", "        ax=ax,\n", "    )\n", "    # Put the legend outside the axes, to the right.\n", "    ax.legend(title='Classifier', bbox_to_anchor=(1.04, 0.5), loc='center left', borderaxespad=0)\n", "    ax.set_ylim(0, 1)\n", "    ax.set_ylabel('AUROC')\n", "    ax.set_xlabel('Feature type')\n", "    ax.set_title('{}'.format(compare_group))\n", "    fig.tight_layout()\n", "    embed_pdf_figure(title='AUROC of differential expression ({})'.format(compare_group))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Machine learning" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count_methodfeaturecountsfeaturecounts_lncrnafeaturecounts_mrna
compare_groupclassifier
Normal-CRClinear_svm0.9642860.7857140.848571
logistic_regression0.9314290.8028570.857143
random_forest0.9907140.9814290.917857
Normal-HCClinear_svm0.9357140.8778570.935714
logistic_regression0.9557140.8485710.953571
random_forest0.9742860.9828570.956071
Normal-PAADlinear_svm0.8380950.8514290.821905
logistic_regression0.8647620.7733330.806667
random_forest0.9366670.8495240.906667
\n", "
" ], "text/plain": [ "count_method featurecounts featurecounts_lncrna \\\n", "compare_group classifier \n", "Normal-CRC linear_svm 0.964286 0.785714 \n", " logistic_regression 0.931429 0.802857 \n", " random_forest 0.990714 0.981429 \n", "Normal-HCC linear_svm 0.935714 0.877857 \n", " logistic_regression 0.955714 0.848571 \n", " random_forest 0.974286 0.982857 \n", "Normal-PAAD linear_svm 0.838095 0.851429 \n", " logistic_regression 0.864762 0.773333 \n", " random_forest 0.936667 0.849524 \n", "\n", "count_method featurecounts_mrna \n", "compare_group classifier \n", "Normal-CRC linear_svm 0.848571 \n", " logistic_regression 0.857143 \n", " random_forest 0.917857 \n", "Normal-HCC linear_svm 0.935714 \n", " logistic_regression 0.953571 \n", " random_forest 0.956071 \n", "Normal-PAAD linear_svm 0.821905 \n", " logistic_regression 0.806667 \n", " random_forest 0.906667 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Test-set metrics for machine-learning selection: load the metrics file, keep one\n", "# parameter setting, then average roc_auc per (count_method, compare_group, classifier).\n", "summary = pd.read_table(\n", "    f'../output/{dataset}/summary/cross_validation/metrics.test.txt'\n", ")\n", "# unstack(level=0) moves count_method into the columns; rows are (compare_group, classifier).\n", "summary_table = (\n", "    summary\n", "    .query('(fold_change_direction == \"any\") and (n_features == \"10\")')\n", "    .groupby(['count_method', 'compare_group', 'classifier'], as_index=True)['roc_auc']\n", "    .mean()\n", "    .unstack(level=0)\n", ")\n", "display_dataframe(summary_table, filename='summarize_feature_selection')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# One figure per comparison group: test-set AUROC of each classifier by feature type.\n", "for compare_group in summary['compare_group'].unique():\n", "    # n_features is compared as the string \"10\", the form that selects rows in the\n", "    # table cell above (the integer 10 was used here before).\n", "    # NOTE(review): the table cell above does not filter on clustering_score_name,\n", "    # so it averages over more rows than this plot - confirm which is intended.\n", "    summary_subset = summary.query(\n", "        '(fold_change_direction == \"any\") '\n", "        'and (n_features == \"10\") '\n", "        'and (clustering_score_name == \"uca_score\") '\n", "        'and (compare_group == \"{}\")'.format(compare_group)\n", "    )\n", "    fig, ax = plt.subplots(figsize=(8, 4))\n", "    # x and y are passed by keyword: newer seaborn releases no longer accept them\n", "    # positionally, while keywords work on old and new versions alike.\n", "    sns.barplot(\n", "        x='count_method', y='roc_auc', hue='classifier',\n", "        order=['featurecounts', 'featurecounts_mrna', 'featurecounts_lncrna'],\n", "        data=summary_subset,\n", "        errwidth=1.2, capsize=0.05,\n", "        ax=ax,\n", "    )\n", "    # Put the legend outside the axes, to the right.\n", "    ax.legend(title='Classifier', bbox_to_anchor=(1.04, 0.5), loc='center left', borderaxespad=0)\n", "    ax.set_ylim(0, 1)\n", "    ax.set_ylabel('AUROC')\n", "    ax.set_xlabel('Feature type')\n", "    ax.set_title('{}'.format(compare_group))\n", "    fig.tight_layout()\n", "    embed_pdf_figure(title='AUROC of machine learning ({})'.format(compare_group))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 2 }