{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"from tqdm import tqdm_notebook\n",
"import seaborn as sns\n",
"sns.set()\n",
"sns.set_style('whitegrid')\n",
"from functools import reduce\n",
"plt.rcParams['figure.dpi'] = 120\n",
"import h5py\n",
"from glob import glob\n",
"import re\n",
"\n",
"if '../scripts' not in sys.path:\n",
" sys.path.append('../scripts')\n",
"from importlib import reload\n",
"import figure_template\n",
"# force reload of the module\n",
"reload(figure_template)\n",
"from figure_template import display_dataframe, embed_pdf_figure, embed_pdf_pages, std_plot"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"dataset = 'exorbase'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summarize feature stability\n",
"\n",
"### Pi-score\n",
"\n",
"$$\\pi = \\vert \\mathrm{log}_2 FC\\vert \\cdot (-\\mathrm{log}_{10} p_{\\mathrm{adj}})$$\n",
"\n",
"### Differential expression"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" count_method | \n",
" featurecounts | \n",
" featurecounts_lncrna | \n",
" featurecounts_mrna | \n",
"
\n",
" \n",
" compare_group | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Normal-CRC | \n",
" 0.862753 | \n",
" 0.284193 | \n",
" 0.493112 | \n",
"
\n",
" \n",
" Normal-HCC | \n",
" 0.793598 | \n",
" 0.346136 | \n",
" 0.726043 | \n",
"
\n",
" \n",
" Normal-PAAD | \n",
" 0.284300 | \n",
" 0.104522 | \n",
" 0.361159 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"count_method featurecounts featurecounts_lncrna featurecounts_mrna\n",
"compare_group \n",
"Normal-CRC 0.862753 0.284193 0.493112\n",
"Normal-HCC 0.793598 0.346136 0.726043\n",
"Normal-PAAD 0.284300 0.104522 0.361159"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"summary = pd.read_table('../output/{}/summary/cross_validation_diffexp/feature_stability.txt'.format(dataset))\n",
"summary_subset = summary.query('(fold_change_direction == \"any\") and (n_features == \"10\") and (diffexp_method == \"deseq2\")')\n",
"summary_table = summary_subset\\\n",
" .groupby(['count_method', 'compare_group'], as_index=True)['feature_stability'].mean()\\\n",
" .unstack(level=0)\n",
"display_dataframe(\n",
" summary_table,\n",
" filename='summarize_feature_selection',\n",
" format='excel'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['featurecounts', 'featurecounts_mrna', 'featurecounts_lncrna'],\n",
" dtype=object)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summary['count_method'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig, ax = plt.subplots(figsize=(7, 5))\n",
"sns.barplot('count_method', 'feature_stability', hue='compare_group',\n",
" order=['featurecounts', 'featurecounts_mrna', 'featurecounts_lncrna'],\n",
" data=summary_subset,\n",
" errwidth=1.2, capsize=0.05,\n",
" ax=ax,\n",
" )\n",
"ax.legend(title='Classifier', bbox_to_anchor=(1.04,0.5), loc=\"center left\", borderaxespad=0)\n",
"ax.set_ylim(0, 1)\n",
"ax.set_ylabel('Feature stability')\n",
"ax.set_xlabel('Feature type')\n",
"ax.set_title('Differential expression')\n",
"#std_plot(ax, xlabel='Feature type', ylabel='Feature stability')\n",
"fig.tight_layout()\n",
"embed_pdf_figure(title='Differential expression feature stability')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Machine learning"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count_method | \n",
" featurecounts | \n",
" featurecounts_lncrna | \n",
" featurecounts_mrna | \n",
"
\n",
" \n",
" compare_group | \n",
" classifier | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Normal-CRC | \n",
" linear_svm | \n",
" 0.240223 | \n",
" 0.337562 | \n",
" 0.230315 | \n",
"
\n",
" \n",
" logistic_regression | \n",
" 0.202443 | \n",
" 0.299395 | \n",
" 0.208803 | \n",
"
\n",
" \n",
" random_forest | \n",
" 0.108237 | \n",
" 0.364765 | \n",
" 0.091755 | \n",
"
\n",
" \n",
" Normal-HCC | \n",
" linear_svm | \n",
" 0.238015 | \n",
" 0.349515 | \n",
" 0.300904 | \n",
"
\n",
" \n",
" logistic_regression | \n",
" 0.233108 | \n",
" 0.339376 | \n",
" 0.314727 | \n",
"
\n",
" \n",
" random_forest | \n",
" 0.083786 | \n",
" 0.359902 | \n",
" 0.118420 | \n",
"
\n",
" \n",
" Normal-PAAD | \n",
" linear_svm | \n",
" 0.200562 | \n",
" 0.389331 | \n",
" 0.153183 | \n",
"
\n",
" \n",
" logistic_regression | \n",
" 0.207349 | \n",
" 0.348114 | \n",
" 0.166106 | \n",
"
\n",
" \n",
" random_forest | \n",
" 0.070866 | \n",
" 0.145904 | \n",
" 0.111386 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"count_method featurecounts featurecounts_lncrna \\\n",
"compare_group classifier \n",
"Normal-CRC linear_svm 0.240223 0.337562 \n",
" logistic_regression 0.202443 0.299395 \n",
" random_forest 0.108237 0.364765 \n",
"Normal-HCC linear_svm 0.238015 0.349515 \n",
" logistic_regression 0.233108 0.339376 \n",
" random_forest 0.083786 0.359902 \n",
"Normal-PAAD linear_svm 0.200562 0.389331 \n",
" logistic_regression 0.207349 0.348114 \n",
" random_forest 0.070866 0.145904 \n",
"\n",
"count_method featurecounts_mrna \n",
"compare_group classifier \n",
"Normal-CRC linear_svm 0.230315 \n",
" logistic_regression 0.208803 \n",
" random_forest 0.091755 \n",
"Normal-HCC linear_svm 0.300904 \n",
" logistic_regression 0.314727 \n",
" random_forest 0.118420 \n",
"Normal-PAAD linear_svm 0.153183 \n",
" logistic_regression 0.166106 \n",
" random_forest 0.111386 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"summary = pd.read_table('../output/{}/summary/cross_validation/feature_stability.txt'.format(dataset))\n",
"summary_subset = summary.query('(fold_change_direction == \"any\") \\\n",
" and (n_features == \"10\") \\\n",
" and (clustering_score_name == \"uca_score\")')\n",
"summary_table = summary_subset\\\n",
" .groupby(['count_method', 'compare_group', 'classifier'], as_index=True)['feature_stability'].mean()\\\n",
" .unstack(level=0)\n",
"display_dataframe(\n",
" summary_table,\n",
" filename='summarize_feature_selection'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for compare_group in summary['compare_group'].unique():\n",
" summary_subset = summary.query('(fold_change_direction == \"any\") \\\n",
" and (n_features == \"10\") \\\n",
" and (clustering_score_name == \"uca_score\") \\\n",
" and (compare_group == \"{}\")'.format(compare_group))\n",
" fig, ax = plt.subplots(figsize=(8, 4))\n",
" sns.barplot('count_method', 'feature_stability', hue='classifier',\n",
" order=['featurecounts', 'featurecounts_mrna', 'featurecounts_lncrna'],\n",
" data=summary_subset,\n",
" errwidth=1.2, capsize=0.05,\n",
" ax=ax,\n",
" )\n",
" ax.legend(title='Classifier', bbox_to_anchor=(1.04,0.5), loc=\"center left\", borderaxespad=0)\n",
" ax.set_ylim(0, 1)\n",
" ax.set_ylabel('Feature stability')\n",
" ax.set_xlabel('Feature type')\n",
" ax.set_title('{}'.format(compare_group))\n",
" #std_plot(ax, xlabel='Feature type', ylabel='Feature stability')\n",
" fig.tight_layout()\n",
" embed_pdf_figure(title='Feature stability of machine learning ({})'.format(compare_group))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Metrics on test set\n",
"\n",
"### Differential expression"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count_method | \n",
" featurecounts | \n",
" featurecounts_lncrna | \n",
" featurecounts_mrna | \n",
"
\n",
" \n",
" compare_group | \n",
" classifier | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Normal-CRC | \n",
" linear_svm | \n",
" 0.748571 | \n",
" 0.828571 | \n",
" 0.797143 | \n",
"
\n",
" \n",
" logistic_regression | \n",
" 0.968571 | \n",
" 0.928571 | \n",
" 0.805714 | \n",
"
\n",
" \n",
" random_forest | \n",
" 0.979286 | \n",
" 0.863571 | \n",
" 0.797143 | \n",
"
\n",
" \n",
" Normal-HCC | \n",
" linear_svm | \n",
" 0.919286 | \n",
" 0.875714 | \n",
" 0.891429 | \n",
"
\n",
" \n",
" logistic_regression | \n",
" 0.942857 | \n",
" 0.875000 | \n",
" 0.931429 | \n",
"
\n",
" \n",
" random_forest | \n",
" 0.969286 | \n",
" 0.838929 | \n",
" 0.944286 | \n",
"
\n",
" \n",
" Normal-PAAD | \n",
" linear_svm | \n",
" 0.754286 | \n",
" 0.629524 | \n",
" 0.875238 | \n",
"
\n",
" \n",
" logistic_regression | \n",
" 0.746667 | \n",
" 0.651429 | \n",
" 0.815714 | \n",
"
\n",
" \n",
" random_forest | \n",
" 0.673333 | \n",
" 0.586190 | \n",
" 0.754286 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"count_method featurecounts featurecounts_lncrna \\\n",
"compare_group classifier \n",
"Normal-CRC linear_svm 0.748571 0.828571 \n",
" logistic_regression 0.968571 0.928571 \n",
" random_forest 0.979286 0.863571 \n",
"Normal-HCC linear_svm 0.919286 0.875714 \n",
" logistic_regression 0.942857 0.875000 \n",
" random_forest 0.969286 0.838929 \n",
"Normal-PAAD linear_svm 0.754286 0.629524 \n",
" logistic_regression 0.746667 0.651429 \n",
" random_forest 0.673333 0.586190 \n",
"\n",
"count_method featurecounts_mrna \n",
"compare_group classifier \n",
"Normal-CRC linear_svm 0.797143 \n",
" logistic_regression 0.805714 \n",
" random_forest 0.797143 \n",
"Normal-HCC linear_svm 0.891429 \n",
" logistic_regression 0.931429 \n",
" random_forest 0.944286 \n",
"Normal-PAAD linear_svm 0.875238 \n",
" logistic_regression 0.815714 \n",
" random_forest 0.754286 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"summary = pd.read_table('../output/{}/summary/cross_validation_diffexp/metrics.test.txt'.format(dataset))\n",
"summary_subset = summary.query('(fold_change_direction == \"any\") and (n_features == \"10\") and (diffexp_method == \"deseq2\")')\n",
"summary_table = summary_subset \\\n",
" .groupby(['count_method', 'compare_group', 'classifier'], as_index=True)['roc_auc'].mean()\\\n",
" .unstack(level=0)\n",
"display_dataframe(\n",
" summary_table,\n",
" filename='summarize_feature_selection'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for compare_group in summary['compare_group'].unique():\n",
" summary_subset = summary.query('(fold_change_direction == \"any\") \\\n",
" and (n_features == 10) \\\n",
" and (compare_group == \"{}\")'.format(compare_group))\n",
" fig, ax = plt.subplots(figsize=(8, 4))\n",
" sns.barplot('count_method', 'roc_auc', hue='classifier',\n",
" order=['featurecounts', 'featurecounts_mrna', 'featurecounts_lncrna'],\n",
" data=summary_subset,\n",
" errwidth=1.2, capsize=0.05,\n",
" ax=ax)\n",
" ax.legend(title='Classifier', bbox_to_anchor=(1.04,0.5), loc=\"center left\", borderaxespad=0)\n",
" ax.set_ylim(0, 1)\n",
" ax.set_ylabel('AUROC')\n",
" ax.set_xlabel('Feature type')\n",
" ax.set_title('{}'.format(compare_group))\n",
" #std_plot(ax, xlabel='Feature type', ylabel='Feature stability')\n",
" fig.tight_layout()\n",
" embed_pdf_figure(title='AUROC of differential expression ({})'.format(compare_group))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Machine learning"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count_method | \n",
" featurecounts | \n",
" featurecounts_lncrna | \n",
" featurecounts_mrna | \n",
"
\n",
" \n",
" compare_group | \n",
" classifier | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Normal-CRC | \n",
" linear_svm | \n",
" 0.964286 | \n",
" 0.785714 | \n",
" 0.848571 | \n",
"
\n",
" \n",
" logistic_regression | \n",
" 0.931429 | \n",
" 0.802857 | \n",
" 0.857143 | \n",
"
\n",
" \n",
" random_forest | \n",
" 0.990714 | \n",
" 0.981429 | \n",
" 0.917857 | \n",
"
\n",
" \n",
" Normal-HCC | \n",
" linear_svm | \n",
" 0.935714 | \n",
" 0.877857 | \n",
" 0.935714 | \n",
"
\n",
" \n",
" logistic_regression | \n",
" 0.955714 | \n",
" 0.848571 | \n",
" 0.953571 | \n",
"
\n",
" \n",
" random_forest | \n",
" 0.974286 | \n",
" 0.982857 | \n",
" 0.956071 | \n",
"
\n",
" \n",
" Normal-PAAD | \n",
" linear_svm | \n",
" 0.838095 | \n",
" 0.851429 | \n",
" 0.821905 | \n",
"
\n",
" \n",
" logistic_regression | \n",
" 0.864762 | \n",
" 0.773333 | \n",
" 0.806667 | \n",
"
\n",
" \n",
" random_forest | \n",
" 0.936667 | \n",
" 0.849524 | \n",
" 0.906667 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"count_method featurecounts featurecounts_lncrna \\\n",
"compare_group classifier \n",
"Normal-CRC linear_svm 0.964286 0.785714 \n",
" logistic_regression 0.931429 0.802857 \n",
" random_forest 0.990714 0.981429 \n",
"Normal-HCC linear_svm 0.935714 0.877857 \n",
" logistic_regression 0.955714 0.848571 \n",
" random_forest 0.974286 0.982857 \n",
"Normal-PAAD linear_svm 0.838095 0.851429 \n",
" logistic_regression 0.864762 0.773333 \n",
" random_forest 0.936667 0.849524 \n",
"\n",
"count_method featurecounts_mrna \n",
"compare_group classifier \n",
"Normal-CRC linear_svm 0.848571 \n",
" logistic_regression 0.857143 \n",
" random_forest 0.917857 \n",
"Normal-HCC linear_svm 0.935714 \n",
" logistic_regression 0.953571 \n",
" random_forest 0.956071 \n",
"Normal-PAAD linear_svm 0.821905 \n",
" logistic_regression 0.806667 \n",
" random_forest 0.906667 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"summary = pd.read_table('../output/{}/summary/cross_validation/metrics.test.txt'.format(dataset))\n",
"summary_table = summary.query('(fold_change_direction == \"any\") and (n_features == \"10\")')\\\n",
" .groupby(['count_method', 'compare_group', 'classifier'], as_index=True)['roc_auc'].mean()\\\n",
" .unstack(level=0)\n",
"display_dataframe(\n",
" summary_table,\n",
" filename='summarize_feature_selection'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for compare_group in summary['compare_group'].unique():\n",
" summary_subset = summary.query('(fold_change_direction == \"any\") \\\n",
" and (n_features == 10) \\\n",
" and (clustering_score_name == \"uca_score\") \\\n",
" and (compare_group == \"{}\")'.format(compare_group))\n",
" fig, ax = plt.subplots(figsize=(8, 4))\n",
" sns.barplot('count_method', 'roc_auc', hue='classifier',\n",
" order=['featurecounts', 'featurecounts_mrna', 'featurecounts_lncrna'],\n",
" data=summary_subset,\n",
" errwidth=1.2, capsize=0.05,\n",
" ax=ax)\n",
" ax.legend(title='Classifier', bbox_to_anchor=(1.04,0.5), loc=\"center left\", borderaxespad=0)\n",
" ax.set_ylim(0, 1)\n",
" ax.set_ylabel('AUROC')\n",
" ax.set_xlabel('Feature type')\n",
" ax.set_title('{}'.format(compare_group))\n",
" #std_plot(ax, xlabel='Feature type', ylabel='Feature stability')\n",
" fig.tight_layout()\n",
" embed_pdf_figure(title='AUROC of machine learning ({})'.format(compare_group))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 2
}