127 lines (126 with data), 3.1 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"# 10-fold cross-validation, repeated 100 times."
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
"# Build the processed protein table: one row per sheet column (IDs that are\n",
"# joined against patient IDs below), one column per protein present in both\n",
"# workbooks, preceded by the outcome and train/test labels.\n",
"train_dfx = pd.read_excel('C1_train.xlsx', sheet_name='Proteins', index_col='Proteins')\n",
"test_dfx = pd.read_excel('C2_test.xlsx', sheet_name='Proteins', index_col='Proteins')\n",
"\n",
"# Subject metadata indexed by patient ID; only its 'severe' column is used here.\n",
"dfy = pd.read_excel('C0_subject_info.xlsx', index_col='Patient ID')\n",
"\n",
"# Restrict both tables to the proteins they share; sorting fixes the column order.\n",
"comm_proteins = sorted(set(train_dfx.index) & set(test_dfx.index))\n",
"\n",
"# Missing values are replaced with 0.\n",
"train_x = train_dfx.loc[comm_proteins].fillna(0)\n",
"test_x = test_dfx.loc[comm_proteins].fillna(0)\n",
"\n",
"# Skip the first data column (presumably an annotation rather than a sample -\n",
"# TODO confirm) and transpose so that the remaining columns become rows.\n",
"testX = test_x[test_x.columns[1:]].T\n",
"trainX = train_x[train_x.columns[1:]].T\n",
"\n",
"# pd.concat is used instead of DataFrame.append, which was deprecated in\n",
"# pandas 1.4 and removed in 2.0; rows are stacked train first, then test.\n",
"dfx = pd.concat([trainX, testX])\n",
"dfx = dfx.join(dfy['severe'])\n",
"\n",
"# Label each row 'Train' if its ID is one of the train sheet's columns, else 'Test'.\n",
"is_train = dfx.index.isin(train_dfx.columns[1:])\n",
"dfx['Train_Test'] = np.where(is_train, 'Train', 'Test')\n",
"\n",
"# Move the two label columns (currently last) to the front. Reordering by\n",
"# position avoids an index join, which would multiply rows whenever an index\n",
"# value repeats.\n",
"n_features = dfx.shape[1] - 2\n",
"dfx = dfx.iloc[:, [n_features, n_features + 1] + list(range(n_features))]\n",
"dfx = dfx.rename(columns={'severe': 'Severe'})\n",
"\n",
"dfx.to_excel('./proteins_processed.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"# Read the 'Metabolites' sheet of each workbook, indexed by its 'Metabolites' column.\n",
"metabolite_sheet = dict(sheet_name='Metabolites', index_col='Metabolites')\n",
"train_dfx = pd.read_excel('C1_train.xlsx', **metabolite_sheet)\n",
"test_dfx = pd.read_excel('C2_test.xlsx', **metabolite_sheet)\n",
"\n",
"# Drop the first data column of each table, then transpose so columns become rows.\n",
"train_dfx = train_dfx.loc[:, train_dfx.columns[1:]].T\n",
"test_dfx = test_dfx.loc[:, test_dfx.columns[1:]].T"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"# Stack the rows of both tables (train first, then test). pd.concat is used\n",
"# instead of DataFrame.append, which was deprecated in pandas 1.4 and removed\n",
"# in 2.0.\n",
"dfx = pd.concat([train_dfx, test_dfx])\n",
"\n",
"# Fill each missing value with the minimum observed in its column; a column\n",
"# with no observed values at all stays NaN.\n",
"dfx = dfx.fillna(dfx.min())"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [],
"source": [
"# Write dfx, including its index, to an Excel workbook in the current directory.\n",
"metabolites_path = './Metabolites_processed.xlsx'\n",
"dfx.to_excel(metabolites_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}