[d90d15]: / preprocessing_scr / annotations.ipynb

Download this file

3882 lines (3881 with data), 532.9 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "import pandas as pd \n",
    "import matplotlib.pyplot as plt \n",
    "import numpy as np \n",
    "from scipy.stats import zscore\n",
    "import seaborn as sns\n",
    "import sys,os\n",
    "import gzip\n",
    "import ftplib\n",
    "import re\n",
    "#pd.options.mode.chained_assignment = None  # default='warn'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clinical trial cohorts profiled with microarrays\n",
    "https://support.bioconductor.org/p/36041/ - no way to access phenoData without loading the whole dataset. \n",
    "Therefore we download _series_matrix.txt.gz and parse its header. \n",
    "An alternative would be to call getGEO() followed by phenoData(), but this also means downloading the whole dataset. \n",
    "\n",
    "# PDX \n",
    "RECIST Response Categories\n",
    "\n",
    "# TCGA\n",
    "RECIST Response Categories\n",
    "\n",
    "# GDSC \n",
    "- binary \n",
    "- continuous \n",
    "\n",
    "# ToDo:\n",
    " - python wrapper for getGEO function\n",
    " - automatic download of supplementary files \n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The nine drugs of interest, one name per line so that changes diff cleanly.\n",
    "nine_drugs = [\n",
    "    'Docetaxel',\n",
    "    'Cisplatin',\n",
    "    'Erlotinib',\n",
    "    'Bortezomib',\n",
    "    '5-Fluorouracil',\n",
    "    'Tamoxifen',\n",
    "    'Cetuximab',\n",
    "    'Paclitaxel',\n",
    "    'Gemcitabine',\n",
    "]\n",
    "\n",
    "# Drugs grouped under the EGFRi label (going by the variable name: EGFR inhibitors).\n",
    "EGFRi_drugs = [\n",
    "    'Cetuximab',\n",
    "    'Panitumumab',\n",
    "    'Erlotinib',\n",
    "    'Pelitinib',\n",
    "    'Gefitinib',\n",
    "    'Lapatinib',\n",
    "    'Afatinib',\n",
    "    'ZD-6474',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Working directories. The defaults are the original author's local paths; set the\n",
    "# ANNOTATIONS_ROOT_DIR / ANNOTATIONS_TMP_DIR environment variables to run the notebook elsewhere.\n",
    "# root_dir: the annotation tables are written under root_dir/preprocessed/annotations/.\n",
    "# tmp_dir:  scratch folder that receives the downloaded GEO series-matrix archives.\n",
    "root_dir = os.environ.get(\"ANNOTATIONS_ROOT_DIR\", \"/home/olya/SFU/Hossein/v2/\")\n",
    "tmp_dir = os.environ.get(\"ANNOTATIONS_TMP_DIR\", \"/home/olya/SFU/Hossein/arrays/annotations/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output folder for the annotation tables and the scratch folder for downloads.\n",
    "# os.makedirs also creates missing parents, so root_dir/preprocessed/ is covered by the first entry.\n",
    "# tmp_dir is created as well because the downloaded series-matrix files are written into it.\n",
    "for folder in [os.path.join(root_dir, \"preprocessed\", \"annotations\"), tmp_dir]:\n",
    "    if not os.path.isdir(folder):\n",
    "        os.makedirs(folder)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clinical trial cohorts\n",
    "### GSE6434 -  pre-treatment expression of breast tumours from 24 patients with assessed tumour response to neoadjuvant docetaxel\n",
    "consists of two subsets GSE349 and GSE350 comprising resistant (14) and sensitive (10) patients respectively."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Docetaxel R: 14 S: 10\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>drug</th>\n",
       "      <th>response</th>\n",
       "      <th>response_original</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>GSM4901</th>\n",
       "      <td>Docetaxel</td>\n",
       "      <td>R</td>\n",
       "      <td>residual tumor of 25% or greater remaining volume</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM4902</th>\n",
       "      <td>Docetaxel</td>\n",
       "      <td>R</td>\n",
       "      <td>residual tumor of 25% or greater remaining volume</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM4903</th>\n",
       "      <td>Docetaxel</td>\n",
       "      <td>S</td>\n",
       "      <td>less than 25% residual tumor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM4904</th>\n",
       "      <td>Docetaxel</td>\n",
       "      <td>R</td>\n",
       "      <td>residual tumor of 25% or greater remaining volume</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM4905</th>\n",
       "      <td>Docetaxel</td>\n",
       "      <td>R</td>\n",
       "      <td>residual tumor of 25% or greater remaining volume</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  drug response  \\\n",
       "sample_name                       \n",
       "GSM4901      Docetaxel        R   \n",
       "GSM4902      Docetaxel        R   \n",
       "GSM4903      Docetaxel        S   \n",
       "GSM4904      Docetaxel        R   \n",
       "GSM4905      Docetaxel        R   \n",
       "\n",
       "                                             response_original  \n",
       "sample_name                                                     \n",
       "GSM4901      residual tumor of 25% or greater remaining volume  \n",
       "GSM4902      residual tumor of 25% or greater remaining volume  \n",
       "GSM4903                           less than 25% residual tumor  \n",
       "GSM4904      residual tumor of 25% or greater remaining volume  \n",
       "GSM4905      residual tumor of 25% or greater remaining volume  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "# GSM accessions of the two sub-series of GSE6434: GSE349 (resistant) and GSE350 (sensitive).\n",
    "resistans_GSE349 = [\"GSM4901\",\"GSM4902\",\"GSM4904\",\"GSM4905\",\"GSM4906\",\"GSM4909\",\"GSM4910\",\n",
    "                    \"GSM4911\",\"GSM4912\",\"GSM4913\",\"GSM4916\",\"GSM4918\",\"GSM4922\",\"GSM4924\"]\n",
    "sensitive_GSE350 = [\"GSM4903\",\"GSM4907\",\"GSM4908\",\"GSM4914\",\"GSM4915\",\"GSM4917\",\"GSM4919\",\n",
    "                    \"GSM4920\",\"GSM4921\",\"GSM4923\"]\n",
    "\n",
    "drug = \"Docetaxel\"\n",
    "# sample accession -> annotation record (binary label, drug, original wording of the response)\n",
    "responses_dict = {}\n",
    "for s in resistans_GSE349:\n",
    "    responses_dict[s] = {\"response\":\"R\",\"drug\":drug,\"response_original\":\"residual tumor of 25% or greater remaining volume\"}\n",
    "for s in sensitive_GSE350:\n",
    "    responses_dict[s] = {\"response\":\"S\",\"drug\":drug,\"response_original\":\"less than 25% residual tumor\"}\n",
    "# One row per sample. The column order is fixed explicitly: left implicit it follows dict ordering,\n",
    "# which differs between Python/pandas versions and would change the layout of the written file.\n",
    "df = pd.DataFrame.from_dict(responses_dict).T[[\"drug\",\"response\",\"response_original\"]]\n",
    "df.index.name = \"sample_name\"\n",
    "# the sample accessions are the index, so sorting by sample name is a plain index sort\n",
    "df = df.sort_index()\n",
    "df.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GSE6434_response.\"+drug+\".tsv\",sep = \"\\t\")\n",
    "print(drug,\"R:\",df[df[\"response\"]==\"R\"].shape[0],\"S:\",df[df[\"response\"]==\"S\"].shape[0])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GSE18864 - Pretreatment tumor samples from the clinical trial of  cisplatin monotherapy in triple negative breast cancer \n",
    "- download matrix file from ftp: \n",
    "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE18nnn/GSE18864/matrix/GSE18864_series_matrix.txt.gz\n",
    "- parse header\n",
    "- convert Miller-Payne response into binary S/R :\n",
    "    - 5 >= Miller-Payne response > 1 → “Sensitive”;\n",
    "    - Miller-Payne response == 1 →  “Resistant”;\n",
    "Miller-Payne response scale is explained here: https://www.researchgate.net/publication/263296704_Correlation_of_clinico-pathologic_and_radiologic_parameters_of_response_to_neoadjuvant_chemotherapy_in_breast_cancer \n",
    "- 51 samples without response excluded "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_GEO_matrix(fname,ftppath,destination=None,ftp='ftp.ncbi.nlm.nih.gov'):\n",
    "    \"\"\"Download a single file from an FTP server (NCBI by default) using anonymous login.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    fname : str\n",
    "        Name of the file inside `ftppath`, e.g. \"GSE18864_series_matrix.txt.gz\".\n",
    "    ftppath : str\n",
    "        Directory on the server that contains `fname`.\n",
    "    destination : str, optional\n",
    "        Existing local folder to save the file into. Defaults to the current working\n",
    "        directory, resolved when the function is called (not when it is defined).\n",
    "    ftp : str\n",
    "        Host name of the FTP server.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        Local path of the downloaded file.\n",
    "\n",
    "    Any ftplib or I/O error is propagated; the local file and the connection are\n",
    "    closed in every case.\n",
    "    \"\"\"\n",
    "    if destination is None:\n",
    "        destination = os.getcwd()\n",
    "    local_path = os.path.join(destination, fname)\n",
    "    connection = ftplib.FTP(ftp)\n",
    "    try:\n",
    "        connection.login()\n",
    "        connection.cwd(ftppath)\n",
    "        with open(local_path, 'wb') as file_handle:\n",
    "            connection.retrbinary('RETR '+fname, file_handle.write)\n",
    "        connection.quit()\n",
    "    finally:\n",
    "        # close() is a no-op after a successful quit() and releases the socket after a failure\n",
    "        connection.close()\n",
    "    return local_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def millerPayne2RECIST(response):\n",
    "    \"\"\"Convert a Miller-Payne grade into a binary response label.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    response : object\n",
    "        Miller-Payne grade, as a number or as text (e.g. \"3\").\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    \"R\" for grades <= 1, \"S\" for grades 2 to 5, and None for grades above 5 (the\n",
    "    grade is then printed to stderr). A value that cannot be converted to an integer\n",
    "    (e.g. \"n/a\" or None) is returned unchanged so that the caller can filter it out.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        grade = int(response)\n",
    "    except (TypeError, ValueError, OverflowError):\n",
    "        # only conversion failures are tolerated; KeyboardInterrupt and friends propagate\n",
    "        return response\n",
    "    if grade <= 1:\n",
    "        return \"R\"\n",
    "    if grade <= 5:\n",
    "        return \"S\"\n",
    "    # outside the 1-5 Miller-Payne scale: report it and leave the label undefined\n",
    "    print(grade,file=sys.stderr)\n",
    "    return None\n",
    "\n",
    "def read_matrix(fname,index = \"GSM\"):\n",
    "    \"\"\"Parse the sample annotations from the header of a gzipped GEO series-matrix file.\n",
    "\n",
    "    Only lines starting with \"!\" are used. The table has one row per sample and these columns:\n",
    "    \"title\" (!Sample_title), \"GSM\" (!Sample_geo_accession), \"source\" (!Sample_source_name_ch1)\n",
    "    and one column per !Sample_characteristics_ch1 line. A characteristics column is named after\n",
    "    the text before \": \" (or \"= \") in the first sample's entry, and that prefix is removed\n",
    "    from every entry of the line. Characteristics lines whose first entry contains neither\n",
    "    \":\" nor \"=\" are skipped.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    fname : str\n",
    "        Path to a *_series_matrix.txt.gz file.\n",
    "    index : str\n",
    "        Column to use as the index of the result (the GSM accession by default).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "    \"\"\"\n",
    "    columns = {}\n",
    "    with gzip.open(fname) as infile:\n",
    "        # iterate lazily: the file also holds the whole expression matrix, which is not needed\n",
    "        for line in infile:\n",
    "            # gzip yields bytes on Python 3; on Python 2 bytes is str and nothing is changed\n",
    "            if not isinstance(line, str):\n",
    "                line = line.decode(\"utf-8\")\n",
    "            if not line.startswith(\"!\"):\n",
    "                continue\n",
    "            fields = [x.strip() for x in line.rstrip().replace('\"','').split(\"\\t\")]\n",
    "            tag, values = fields[0], fields[1:]\n",
    "            if tag == '!Sample_title':\n",
    "                columns[\"title\"] = values\n",
    "            elif tag == '!Sample_geo_accession':\n",
    "                columns[\"GSM\"] = values\n",
    "            elif tag == '!Sample_source_name_ch1':\n",
    "                columns[\"source\"] = values\n",
    "            elif tag == '!Sample_characteristics_ch1':\n",
    "                if not values:\n",
    "                    continue\n",
    "                if \":\" in values[0]:\n",
    "                    sep = \": \"\n",
    "                elif \"=\" in values[0]:\n",
    "                    sep = \"= \"\n",
    "                else:\n",
    "                    # no \"field: value\" pair to name the column after; previously this either\n",
    "                    # raised NameError or silently reused the separator of an earlier line\n",
    "                    continue\n",
    "                field = values[0].split(sep)[0]\n",
    "                columns[field] = [x.replace(field+sep,\"\").strip() for x in values]\n",
    "    table = pd.DataFrame.from_dict(columns)\n",
    "    return table.set_index(index,drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cisplatin R: 8 S: 16\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>drug</th>\n",
       "      <th>response</th>\n",
       "      <th>miller-payne response</th>\n",
       "      <th>grade</th>\n",
       "      <th>brca genotype</th>\n",
       "      <th>p53 status</th>\n",
       "      <th>er/pr/her2 status</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>GSM467523</th>\n",
       "      <td>Cisplatin</td>\n",
       "      <td>S</td>\n",
       "      <td>3</td>\n",
       "      <td>III</td>\n",
       "      <td>WT</td>\n",
       "      <td>unknown</td>\n",
       "      <td>neg/neg/neg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM467524</th>\n",
       "      <td>Cisplatin</td>\n",
       "      <td>S</td>\n",
       "      <td>4</td>\n",
       "      <td>III</td>\n",
       "      <td>WT</td>\n",
       "      <td>MSM</td>\n",
       "      <td>neg/neg/neg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM467525</th>\n",
       "      <td>Cisplatin</td>\n",
       "      <td>S</td>\n",
       "      <td>5</td>\n",
       "      <td>III</td>\n",
       "      <td>WT</td>\n",
       "      <td>NSM</td>\n",
       "      <td>neg/neg/neg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM467526</th>\n",
       "      <td>Cisplatin</td>\n",
       "      <td>R</td>\n",
       "      <td>1</td>\n",
       "      <td>III</td>\n",
       "      <td>WT</td>\n",
       "      <td>MSM</td>\n",
       "      <td>neg/neg/neg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM467527</th>\n",
       "      <td>Cisplatin</td>\n",
       "      <td>S</td>\n",
       "      <td>5</td>\n",
       "      <td>III</td>\n",
       "      <td>BRCA1 germline mutation</td>\n",
       "      <td>MSM</td>\n",
       "      <td>neg/neg/neg</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  drug response miller-payne response grade  \\\n",
       "sample_name                                                   \n",
       "GSM467523    Cisplatin        S                     3   III   \n",
       "GSM467524    Cisplatin        S                     4   III   \n",
       "GSM467525    Cisplatin        S                     5   III   \n",
       "GSM467526    Cisplatin        R                     1   III   \n",
       "GSM467527    Cisplatin        S                     5   III   \n",
       "\n",
       "                       brca genotype p53 status er/pr/her2 status  \n",
       "sample_name                                                        \n",
       "GSM467523                         WT    unknown       neg/neg/neg  \n",
       "GSM467524                         WT        MSM       neg/neg/neg  \n",
       "GSM467525                         WT        NSM       neg/neg/neg  \n",
       "GSM467526                         WT        MSM       neg/neg/neg  \n",
       "GSM467527    BRCA1 germline mutation        MSM       neg/neg/neg  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Download the series matrix, parse the sample annotations from its header, then delete the archive.\n",
    "fpath = download_GEO_matrix(\"GSE18864_series_matrix.txt.gz\",'/geo/series/GSE18nnn/GSE18864/matrix/'\n",
    "                    ,destination=tmp_dir)\n",
    "df = read_matrix(fpath)\n",
    "os.remove(fpath)\n",
    "df[\"response\"] = df[\"miller-payne response\"].apply(millerPayne2RECIST)\n",
    "# Keep only samples with a valid binary label. This drops the \"n/a\" entries and also any value\n",
    "# millerPayne2RECIST could not map to R/S (unparseable text is passed through, out-of-range grades\n",
    "# give None). copy() detaches the result from the unfiltered table before a column is added.\n",
    "df = df[df[\"response\"].isin([\"R\",\"S\"])].copy()\n",
    "df[\"drug\"] = \"Cisplatin\"\n",
    "df.index.name = \"sample_name\"\n",
    "# the sample accessions are the index, so sorting by sample name is a plain index sort\n",
    "df = df.sort_index()\n",
    "df = df[[\"drug\",\"response\"]+[\"miller-payne response\",\"grade\",\"brca genotype\",\"p53 status\",\"er/pr/her2 status\"]]\n",
    "df.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GSE18864_response.\"+\"Cisplatin\"+\".tsv\",sep = \"\\t\")\n",
    "print(\"Cisplatin\",\"R:\",df[df[\"response\"]==\"R\"].shape[0],\"S:\",df[df[\"response\"]==\"S\"].shape[0])\n",
    "df.head(5) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GSE25065 - response and survival following neoadjuvant taxane-anthracycline chemotherapy in HER2-negative invasive breast cancer.\n",
    "- two taxanes studied: Taxol == Paclitaxel and Taxotere == Docetaxel\n",
    "- download matrix file from ftp: \n",
    "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE25nnn/GSE25065/matrix/GSE25065_series_matrix.txt.gz\n",
    "- parse header\n",
    "- rename drugs and response groups as following:\n",
    "\n",
    "#### Response \"pathologic_response_pcr_rd\" and \"pathologic_response_rcb_class\"\n",
    "- *pathologic_response_pcr_rd*\n",
    "    - pCR = pathologic complete response \n",
    "    - RD = residual disease\n",
    "- *pathologic_response_rcb_class*,RCB =  residual cancer burden: \n",
    "    - RCB-I is minimal RD\n",
    "    - RCB-II is moderate RD\n",
    "    - RCB-III extensive RD \n",
    "    - pCR is always RCB-0/I.\n",
    "\n",
    "In the paper,  pCR+RCB-I group was compared with RCB-II/III and pCR+RCB-I/II with RCB-III\n",
    "\n",
    "I suggest :\n",
    "* R = RCB-III\n",
    "* S = pCR, RCB-I/II\n",
    "* NA - 63 patients RD without RCB score (excluded)\n",
    "\n",
    "#### Drugs ('type_taxane')\n",
    "Paclitaxel == Taxol; Docetaxel == Taxotere"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Brand name as it appears in the 'type_taxane' annotation -> generic drug name.\n",
    "drug_dict = {\"Taxol\":\"Paclitaxel\",\"Taxotere\":\"Docetaxel\"}\n",
    "def RCB2response(row):\n",
    "    \"\"\"Derive the binary response label of one sample from its pathologic response annotations.\n",
    "\n",
    "    `row` must provide \"pathologic_response_pcr_rd\" and, unless that value is \"pCR\",\n",
    "    \"pathologic_response_rcb_class\".\n",
    "\n",
    "    Returns \"S\" for pCR and for RCB-0/I or RCB-II, \"R\" for RCB-III, and the string \"NA\"\n",
    "    when there is no pCR and the RCB class is none of these.\n",
    "    \"\"\"\n",
    "    if row[\"pathologic_response_pcr_rd\"] == \"pCR\":\n",
    "        return \"S\"\n",
    "    # no complete response: the label is decided by the residual cancer burden class\n",
    "    rcb_class = row[\"pathologic_response_rcb_class\"]\n",
    "    if rcb_class in (\"RCB-0/I\", \"RCB-II\"):\n",
    "        return \"S\"\n",
    "    if rcb_class == \"RCB-III\":\n",
    "        return \"R\"\n",
    "    return \"NA\"\n",
    "# Download the GSE25065 series matrix into tmp_dir, parse its header into a table, then delete the archive.\n",
    "fpath = download_GEO_matrix(\"GSE25065_series_matrix.txt.gz\",\n",
    "                            '/geo/series/GSE25nnn/GSE25065/matrix/',destination=tmp_dir)\n",
    "df = read_matrix(fpath)\n",
    "os.remove(fpath)\n",
    "df.index.name = \"sample_name\"\n",
    "# Annotation column names collected for later column selection.\n",
    "cols = [u'age_years', u'chemosensitivity_prediction', u'clinical_ajcc_stage',\n",
    "       u'clinical_nodal_status', u'clinical_t_stage', u'dlda30_prediction',\n",
    "       u'drfs_1_event_0_censored', u'drfs_even_time_years', u'er_status_ihc',\n",
    "       u'erbb2_status', u'esr1_status', u'ggi_class', u'grade', u'her2_status',\n",
    "       u'pam50_class', u'pr_status_ihc',u'rcb_0_i_prediction', u'sample id', \n",
    "       u'set_class', u'source', u'tissue', u'title']\n",
    "# Not listed in cols: 'type_taxane','pathologic_response_pcr_rd','pathologic_response_rcb_class'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pathologic_response_pcr_rd</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>NA</th>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RD</th>\n",
       "      <td>140</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pCR</th>\n",
       "      <td>42</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            title\n",
       "pathologic_response_pcr_rd       \n",
       "NA                             16\n",
       "RD                            140\n",
       "pCR                            42"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of samples (non-missing titles) in each pCR/RD category.\n",
    "df.groupby(\"pathologic_response_pcr_rd\")[[\"title\"]].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pathologic_response_rcb_class</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>NA</th>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RCB-0/I</th>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RCB-II</th>\n",
       "      <td>53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RCB-III</th>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               title\n",
       "pathologic_response_rcb_class       \n",
       "NA                                82\n",
       "RCB-0/I                           32\n",
       "RCB-II                            53\n",
       "RCB-III                           31"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of samples (non-missing titles) in each RCB class.\n",
    "df.groupby(\"pathologic_response_rcb_class\")[[\"title\"]].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "samples with NA response 63\n",
      "Paclitaxel R: 26 S: 58\n",
      "Docetaxel R: 5 S: 46\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/olya/miniconda2/lib/python2.7/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  # This is added back by InteractiveShellApp.init_path()\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>drug</th>\n",
       "      <th>response</th>\n",
       "      <th>pathologic_response_pcr_rd</th>\n",
       "      <th>pathologic_response_rcb_class</th>\n",
       "      <th>type_taxane</th>\n",
       "      <th>age_years</th>\n",
       "      <th>chemosensitivity_prediction</th>\n",
       "      <th>clinical_ajcc_stage</th>\n",
       "      <th>clinical_nodal_status</th>\n",
       "      <th>clinical_t_stage</th>\n",
       "      <th>...</th>\n",
       "      <th>grade</th>\n",
       "      <th>her2_status</th>\n",
       "      <th>pam50_class</th>\n",
       "      <th>pr_status_ihc</th>\n",
       "      <th>rcb_0_i_prediction</th>\n",
       "      <th>sample id</th>\n",
       "      <th>set_class</th>\n",
       "      <th>source</th>\n",
       "      <th>tissue</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>GSM615632</th>\n",
       "      <td>Docetaxel</td>\n",
       "      <td>S</td>\n",
       "      <td>pCR</td>\n",
       "      <td>NA</td>\n",
       "      <td>Taxotere</td>\n",
       "      <td>41.9</td>\n",
       "      <td>Rx Insensitive</td>\n",
       "      <td>IIB</td>\n",
       "      <td>N1</td>\n",
       "      <td>T2</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>P</td>\n",
       "      <td>Basal</td>\n",
       "      <td>P</td>\n",
       "      <td>RCB-0/I</td>\n",
       "      <td>5</td>\n",
       "      <td>SET-Low</td>\n",
       "      <td>USO</td>\n",
       "      <td>breast cancer tumor</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM615634</th>\n",
       "      <td>Docetaxel</td>\n",
       "      <td>S</td>\n",
       "      <td>pCR</td>\n",
       "      <td>NA</td>\n",
       "      <td>Taxotere</td>\n",
       "      <td>47.1</td>\n",
       "      <td>Rx Insensitive</td>\n",
       "      <td>IIA</td>\n",
       "      <td>N0</td>\n",
       "      <td>T2</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>N</td>\n",
       "      <td>LumB</td>\n",
       "      <td>P</td>\n",
       "      <td>RCB-0/I</td>\n",
       "      <td>13</td>\n",
       "      <td>SET-Low</td>\n",
       "      <td>USO</td>\n",
       "      <td>breast cancer tumor</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM615635</th>\n",
       "      <td>Docetaxel</td>\n",
       "      <td>S</td>\n",
       "      <td>pCR</td>\n",
       "      <td>NA</td>\n",
       "      <td>Taxotere</td>\n",
       "      <td>43.8</td>\n",
       "      <td>Rx Insensitive</td>\n",
       "      <td>IIIC</td>\n",
       "      <td>N3</td>\n",
       "      <td>T3</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>N</td>\n",
       "      <td>LumB</td>\n",
       "      <td>P</td>\n",
       "      <td>RCB-II/III</td>\n",
       "      <td>18</td>\n",
       "      <td>SET-Low</td>\n",
       "      <td>USO</td>\n",
       "      <td>breast cancer tumor</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM615636</th>\n",
       "      <td>Docetaxel</td>\n",
       "      <td>S</td>\n",
       "      <td>pCR</td>\n",
       "      <td>NA</td>\n",
       "      <td>Taxotere</td>\n",
       "      <td>50.8</td>\n",
       "      <td>Rx Sensitive</td>\n",
       "      <td>IIIA</td>\n",
       "      <td>N1</td>\n",
       "      <td>T3</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>N</td>\n",
       "      <td>LumB</td>\n",
       "      <td>P</td>\n",
       "      <td>RCB-0/I</td>\n",
       "      <td>20</td>\n",
       "      <td>SET-Low</td>\n",
       "      <td>USO</td>\n",
       "      <td>breast cancer tumor</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM615637</th>\n",
       "      <td>Docetaxel</td>\n",
       "      <td>S</td>\n",
       "      <td>pCR</td>\n",
       "      <td>NA</td>\n",
       "      <td>Taxotere</td>\n",
       "      <td>34.1</td>\n",
       "      <td>Rx Insensitive</td>\n",
       "      <td>IIB</td>\n",
       "      <td>N0</td>\n",
       "      <td>T3</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>N</td>\n",
       "      <td>Basal</td>\n",
       "      <td>N</td>\n",
       "      <td>RCB-0/I</td>\n",
       "      <td>23</td>\n",
       "      <td>SET-Low</td>\n",
       "      <td>USO</td>\n",
       "      <td>breast cancer tumor</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 27 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  drug response pathologic_response_pcr_rd  \\\n",
       "sample_name                                                  \n",
       "GSM615632    Docetaxel        S                        pCR   \n",
       "GSM615634    Docetaxel        S                        pCR   \n",
       "GSM615635    Docetaxel        S                        pCR   \n",
       "GSM615636    Docetaxel        S                        pCR   \n",
       "GSM615637    Docetaxel        S                        pCR   \n",
       "\n",
       "            pathologic_response_rcb_class type_taxane age_years  \\\n",
       "sample_name                                                       \n",
       "GSM615632                              NA    Taxotere      41.9   \n",
       "GSM615634                              NA    Taxotere      47.1   \n",
       "GSM615635                              NA    Taxotere      43.8   \n",
       "GSM615636                              NA    Taxotere      50.8   \n",
       "GSM615637                              NA    Taxotere      34.1   \n",
       "\n",
       "            chemosensitivity_prediction clinical_ajcc_stage  \\\n",
       "sample_name                                                   \n",
       "GSM615632                Rx Insensitive                 IIB   \n",
       "GSM615634                Rx Insensitive                 IIA   \n",
       "GSM615635                Rx Insensitive                IIIC   \n",
       "GSM615636                  Rx Sensitive                IIIA   \n",
       "GSM615637                Rx Insensitive                 IIB   \n",
       "\n",
       "            clinical_nodal_status clinical_t_stage  ...  grade her2_status  \\\n",
       "sample_name                                         ...                      \n",
       "GSM615632                      N1               T2  ...      3           P   \n",
       "GSM615634                      N0               T2  ...      2           N   \n",
       "GSM615635                      N3               T3  ...      3           N   \n",
       "GSM615636                      N1               T3  ...      3           N   \n",
       "GSM615637                      N0               T3  ...      3           N   \n",
       "\n",
       "            pam50_class pr_status_ihc rcb_0_i_prediction sample id set_class  \\\n",
       "sample_name                                                                    \n",
       "GSM615632         Basal             P            RCB-0/I         5   SET-Low   \n",
       "GSM615634          LumB             P            RCB-0/I        13   SET-Low   \n",
       "GSM615635          LumB             P         RCB-II/III        18   SET-Low   \n",
       "GSM615636          LumB             P            RCB-0/I        20   SET-Low   \n",
       "GSM615637         Basal             N            RCB-0/I        23   SET-Low   \n",
       "\n",
       "            source               tissue title  \n",
       "sample_name                                    \n",
       "GSM615632      USO  breast cancer tumor     5  \n",
       "GSM615634      USO  breast cancer tumor    13  \n",
       "GSM615635      USO  breast cancer tumor    18  \n",
       "GSM615636      USO  breast cancer tumor    20  \n",
       "GSM615637      USO  breast cancer tumor    23  \n",
       "\n",
       "[5 rows x 27 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"response\"] = df.apply(lambda row : RCB2response(row),axis =1 )\n",
    "# drop NA response\n",
    "s = df[df[\"response\"] == \"NA\"].index.values\n",
    "print(\"samples with NA response\", df[df[\"response\"] == \"NA\"].shape[0])\n",
    "df = df[df[\"response\"] != \"NA\"]\n",
    "df[\"drug\"] = df[\"type_taxane\"].apply(lambda x : drug_dict[x])\n",
    "df = df[[\"drug\",\"response\",'pathologic_response_pcr_rd','pathologic_response_rcb_class','type_taxane'] +\n",
    "       cols]\n",
    "for drug in list(set(df[\"drug\"].values)):\n",
    "    d = df[df[\"drug\"]==drug]\n",
    "    d.sort_values(by=\"sample_name\",inplace=True)\n",
    "    d.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GSE25065_response.\"+drug+\".tsv\",sep = \"\\t\")\n",
    "    print(drug,\"R:\",d[d[\"response\"]==\"R\"].shape[0],\"S:\",d[d[\"response\"]==\"S\"].shape[0])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GSE33072 - resistance to erlotinib (25) and PI3K pathway inhibitors (sorafenib, 37) in non-small cell lung cancer\n",
    "According to the paper, the main outcome measure is disease control rate (DCR) at 8 weeks.\n",
    "DC was assessed by radiologists and defined as a CR, PR or SD according to the RECIST.\n",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4211116/#SD1\n",
    "#### Sorafenib\n",
    " - month-to-progression - 'pfsm (month):ch1'\n",
    " - whether DC or not '8_week_disease control_1yes_0no:ch1' and '8-week disease control (1=yes, 0=no):ch1'\n",
    " -  sensitive if DC, resistant otherwise\n",
    "\n",
    "\n",
    "#### Erlotinib\n",
    " - whether DC or not - not available neither in GEO phenoData nor in publications\n",
    " - month-to-progression - 'progression-free survival time (months):ch1'\n",
    " \n",
    "Geeleher 2014 et al. calcluate Pearson's $r$ between months-to-progression and predicted responses. \n",
    "\n",
    "DC at eight weeks was unavailable for erlotinib-treated patients; therefore we used months-to-progression (PFSM) to define resistant and sensitive patients. Since eight weeks is approximately 1.86 months, we assigned patients with months-to-progression < 1.86 as resistant and months-to-progression >= 1.86 as sensitive. This is an uncertain assignment.\n",
    "Erlotinib-treated patients had DC annotation and were assigned to sensitive if DC, and resistant otherwise. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/olya/miniconda2/lib/python2.7/site-packages/pandas/core/indexing.py:362: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  self.obj[key] = _infer_fill_value(value)\n",
      "/home/olya/miniconda2/lib/python2.7/site-packages/pandas/core/indexing.py:543: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  self.obj[item] = s\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sorafenib R: 16 S: 23\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>drug</th>\n",
       "      <th>response</th>\n",
       "      <th>pfsm (month):ch1</th>\n",
       "      <th>treatment:ch1</th>\n",
       "      <th>8_week_disease control_1yes_0no:ch1</th>\n",
       "      <th>8-week disease control (1=yes, 0=no):ch1</th>\n",
       "      <th>pfsc (1=progressed; 0=not progressed):ch1</th>\n",
       "      <th>title</th>\n",
       "      <th>geo_accession</th>\n",
       "      <th>status</th>\n",
       "      <th>...</th>\n",
       "      <th>prior_tx_for_mets:ch1</th>\n",
       "      <th>progression-free survival status:ch1</th>\n",
       "      <th>progression-free survival time (months):ch1</th>\n",
       "      <th>race:ch1</th>\n",
       "      <th>randomization_date:ch1</th>\n",
       "      <th>smoking_status:ch1</th>\n",
       "      <th>stage_at_diagnosis:ch1</th>\n",
       "      <th>transition/transversion:ch1</th>\n",
       "      <th>type kras aa change:ch1</th>\n",
       "      <th>type of kras mut:ch1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>GSM677317</th>\n",
       "      <td>Sorafenib</td>\n",
       "      <td>R</td>\n",
       "      <td>1.6756</td>\n",
       "      <td>sorafenib</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>LM118</td>\n",
       "      <td>GSM677317</td>\n",
       "      <td>Public on Jun 01 2012</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>White</td>\n",
       "      <td>2008-02-06</td>\n",
       "      <td>Former</td>\n",
       "      <td>IV</td>\n",
       "      <td>Transversion</td>\n",
       "      <td>VAL</td>\n",
       "      <td>GGT12GTT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM677322</th>\n",
       "      <td>Sorafenib</td>\n",
       "      <td>S</td>\n",
       "      <td>9.1663</td>\n",
       "      <td>sorafenib</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>LM227</td>\n",
       "      <td>GSM677322</td>\n",
       "      <td>Public on Jun 01 2012</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>White</td>\n",
       "      <td>2008-05-29</td>\n",
       "      <td>Former</td>\n",
       "      <td>IIIB</td>\n",
       "      <td>Transition</td>\n",
       "      <td>ASP</td>\n",
       "      <td>GGT12GAT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM677333</th>\n",
       "      <td>Sorafenib</td>\n",
       "      <td>S</td>\n",
       "      <td>2.7598</td>\n",
       "      <td>sorafenib</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>LM450</td>\n",
       "      <td>GSM677333</td>\n",
       "      <td>Public on Jun 01 2012</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>White</td>\n",
       "      <td>2009-05-13</td>\n",
       "      <td>Former</td>\n",
       "      <td>IV</td>\n",
       "      <td>Transversion</td>\n",
       "      <td>CYS</td>\n",
       "      <td>GGT12TGT</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 89 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  drug response pfsm (month):ch1 treatment:ch1  \\\n",
       "sample_name                                                      \n",
       "GSM677317    Sorafenib        R           1.6756     sorafenib   \n",
       "GSM677322    Sorafenib        S           9.1663     sorafenib   \n",
       "GSM677333    Sorafenib        S           2.7598     sorafenib   \n",
       "\n",
       "            8_week_disease control_1yes_0no:ch1  \\\n",
       "sample_name                                       \n",
       "GSM677317                                   NaN   \n",
       "GSM677322                                   NaN   \n",
       "GSM677333                                   NaN   \n",
       "\n",
       "            8-week disease control (1=yes, 0=no):ch1  \\\n",
       "sample_name                                            \n",
       "GSM677317                                          0   \n",
       "GSM677322                                          1   \n",
       "GSM677333                                          1   \n",
       "\n",
       "            pfsc (1=progressed; 0=not progressed):ch1  title geo_accession  \\\n",
       "sample_name                                                                  \n",
       "GSM677317                                           1  LM118     GSM677317   \n",
       "GSM677322                                           1  LM227     GSM677322   \n",
       "GSM677333                                           1  LM450     GSM677333   \n",
       "\n",
       "                            status         ...          prior_tx_for_mets:ch1  \\\n",
       "sample_name                                ...                                  \n",
       "GSM677317    Public on Jun 01 2012         ...                              3   \n",
       "GSM677322    Public on Jun 01 2012         ...                              2   \n",
       "GSM677333    Public on Jun 01 2012         ...                              1   \n",
       "\n",
       "            progression-free survival status:ch1  \\\n",
       "sample_name                                        \n",
       "GSM677317                                    NaN   \n",
       "GSM677322                                    NaN   \n",
       "GSM677333                                    NaN   \n",
       "\n",
       "            progression-free survival time (months):ch1 race:ch1  \\\n",
       "sample_name                                                        \n",
       "GSM677317                                           NaN    White   \n",
       "GSM677322                                           NaN    White   \n",
       "GSM677333                                           NaN    White   \n",
       "\n",
       "            randomization_date:ch1 smoking_status:ch1 stage_at_diagnosis:ch1  \\\n",
       "sample_name                                                                    \n",
       "GSM677317               2008-02-06             Former                     IV   \n",
       "GSM677322               2008-05-29             Former                   IIIB   \n",
       "GSM677333               2009-05-13             Former                     IV   \n",
       "\n",
       "            transition/transversion:ch1 type kras aa change:ch1  \\\n",
       "sample_name                                                       \n",
       "GSM677317                  Transversion                     VAL   \n",
       "GSM677322                    Transition                     ASP   \n",
       "GSM677333                  Transversion                     CYS   \n",
       "\n",
       "            type of kras mut:ch1  \n",
       "sample_name                       \n",
       "GSM677317               GGT12GTT  \n",
       "GSM677322               GGT12GAT  \n",
       "GSM677333               GGT12TGT  \n",
       "\n",
       "[3 rows x 89 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#fpath = download_GEO_matrix(\"GSE33072_series_matrix.txt.gz\",\n",
    "#                            '/geo/series/GSE33nnn/GSE33072/matrix/',destination=tmp_dir)\n",
    "#df = read_matrix(fpath)\n",
    "# annotation in GEO is messed up, therefore read phenoData with R GEOquery\n",
    "df = pd.read_csv(\"GSE33072_annotation.tsv\",sep=\"\\t\",index_col=0)\n",
    "df = df.dropna(subset=[\"treatment:ch1\"])\n",
    "df_e  = df[df[\"treatment:ch1\"] == \"erlotinib\"]\n",
    "df_s  = df[df[\"treatment:ch1\"] == \"sorafenib\"]\n",
    "#print(\"Erlotinib:\",df_e.shape[0],\"Sorafenib:\",df_s.shape[0])\n",
    "df_s.loc[:,\"drug\"] = \"Sorafenib\"\n",
    "df_s.loc[:,\"response\"] = \"R\"\n",
    "df_s.loc[df_s[\"8-week disease control (1=yes, 0=no):ch1\"] == 1.0, \"response\"] = \"S\"\n",
    "cols_order = [\"drug\",\"response\",'pfsm (month):ch1',\"treatment:ch1\",'8_week_disease control_1yes_0no:ch1',\n",
    "      '8-week disease control (1=yes, 0=no):ch1',\n",
    "      'pfsc (1=progressed; 0=not progressed):ch1']\n",
    "df_s = df_s[list(cols_order)+list(df_s.columns.values)]\n",
    "df_s = df_s.T.drop_duplicates().T\n",
    "df_s = df_s.dropna(how=\"all\",axis=1)\n",
    "df_s.index.name = \"sample_name\"\n",
    "df_s.sort_values(by=\"sample_name\",inplace=True)\n",
    "print(\"Sorafenib\",\"R:\",df_s.loc[df_s[\"response\"]==\"R\",:].shape[0],\"S:\",\n",
    "      df_s.loc[df_s[\"response\"]==\"S\",:].shape[0])\n",
    "df_s.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GSE33072_response.\"+\"Sorafenib\"+\".tsv\",sep = \"\\t\")\n",
    "df_s.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Erlotinib R: 12 S: 13\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>drug</th>\n",
       "      <th>response</th>\n",
       "      <th>progression-free survival time (months):ch1</th>\n",
       "      <th>treatment:ch1</th>\n",
       "      <th>progression-free survival status:ch1</th>\n",
       "      <th>title</th>\n",
       "      <th>geo_accession</th>\n",
       "      <th>status</th>\n",
       "      <th>submission_date</th>\n",
       "      <th>last_update_date</th>\n",
       "      <th>...</th>\n",
       "      <th>egfr index:ch1</th>\n",
       "      <th>egfr mutation:ch1</th>\n",
       "      <th>glyc_replaced_by_c_d_v_a:ch1</th>\n",
       "      <th>kras mutation:ch1</th>\n",
       "      <th>kras_mut_codon:ch1</th>\n",
       "      <th>kras_mut_iw:ch1</th>\n",
       "      <th>kras_mut_type:ch1</th>\n",
       "      <th>randomization date:ch1</th>\n",
       "      <th>transition_transversion:ch1</th>\n",
       "      <th>type_kras_aa_change:ch1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>GSM677318</th>\n",
       "      <td>Erlotinib</td>\n",
       "      <td>S</td>\n",
       "      <td>3.2526</td>\n",
       "      <td>erlotinib</td>\n",
       "      <td>1</td>\n",
       "      <td>LM124</td>\n",
       "      <td>GSM677318</td>\n",
       "      <td>Public on Jun 01 2012</td>\n",
       "      <td>Feb 17 2011</td>\n",
       "      <td>Jun 01 2012</td>\n",
       "      <td>...</td>\n",
       "      <td>0.21</td>\n",
       "      <td>WT</td>\n",
       "      <td>C</td>\n",
       "      <td>Mutant</td>\n",
       "      <td>12</td>\n",
       "      <td>Yes</td>\n",
       "      <td>GGT12TGT</td>\n",
       "      <td>9/27/2007</td>\n",
       "      <td>Transversion</td>\n",
       "      <td>CYS</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM677321</th>\n",
       "      <td>Erlotinib</td>\n",
       "      <td>S</td>\n",
       "      <td>2.037</td>\n",
       "      <td>erlotinib</td>\n",
       "      <td>1</td>\n",
       "      <td>LM218</td>\n",
       "      <td>GSM677321</td>\n",
       "      <td>Public on Jun 01 2012</td>\n",
       "      <td>Feb 17 2011</td>\n",
       "      <td>Jun 01 2012</td>\n",
       "      <td>...</td>\n",
       "      <td>1.05</td>\n",
       "      <td>WT</td>\n",
       "      <td>V</td>\n",
       "      <td>Mutant</td>\n",
       "      <td>12</td>\n",
       "      <td>Yes</td>\n",
       "      <td>GGT12GTT</td>\n",
       "      <td>4/17/2008</td>\n",
       "      <td>Transversion</td>\n",
       "      <td>VAL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM677326</th>\n",
       "      <td>Erlotinib</td>\n",
       "      <td>S</td>\n",
       "      <td>2.0698</td>\n",
       "      <td>erlotinib</td>\n",
       "      <td>1</td>\n",
       "      <td>LM237</td>\n",
       "      <td>GSM677326</td>\n",
       "      <td>Public on Jun 01 2012</td>\n",
       "      <td>Feb 17 2011</td>\n",
       "      <td>Jun 01 2012</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.26</td>\n",
       "      <td>WT</td>\n",
       "      <td>V</td>\n",
       "      <td>Mutant</td>\n",
       "      <td>12</td>\n",
       "      <td>Yes</td>\n",
       "      <td>GGT12GTT</td>\n",
       "      <td>7/24/2008</td>\n",
       "      <td>Transversion</td>\n",
       "      <td>VAL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 67 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  drug response progression-free survival time (months):ch1  \\\n",
       "sample_name                                                                   \n",
       "GSM677318    Erlotinib        S                                      3.2526   \n",
       "GSM677321    Erlotinib        S                                       2.037   \n",
       "GSM677326    Erlotinib        S                                      2.0698   \n",
       "\n",
       "            treatment:ch1 progression-free survival status:ch1  title  \\\n",
       "sample_name                                                             \n",
       "GSM677318       erlotinib                                    1  LM124   \n",
       "GSM677321       erlotinib                                    1  LM218   \n",
       "GSM677326       erlotinib                                    1  LM237   \n",
       "\n",
       "            geo_accession                 status submission_date  \\\n",
       "sample_name                                                        \n",
       "GSM677318       GSM677318  Public on Jun 01 2012     Feb 17 2011   \n",
       "GSM677321       GSM677321  Public on Jun 01 2012     Feb 17 2011   \n",
       "GSM677326       GSM677326  Public on Jun 01 2012     Feb 17 2011   \n",
       "\n",
       "            last_update_date           ...           egfr index:ch1  \\\n",
       "sample_name                            ...                            \n",
       "GSM677318        Jun 01 2012           ...                     0.21   \n",
       "GSM677321        Jun 01 2012           ...                     1.05   \n",
       "GSM677326        Jun 01 2012           ...                    -0.26   \n",
       "\n",
       "            egfr mutation:ch1 glyc_replaced_by_c_d_v_a:ch1 kras mutation:ch1  \\\n",
       "sample_name                                                                    \n",
       "GSM677318                  WT                            C            Mutant   \n",
       "GSM677321                  WT                            V            Mutant   \n",
       "GSM677326                  WT                            V            Mutant   \n",
       "\n",
       "            kras_mut_codon:ch1 kras_mut_iw:ch1 kras_mut_type:ch1  \\\n",
       "sample_name                                                        \n",
       "GSM677318                   12             Yes          GGT12TGT   \n",
       "GSM677321                   12             Yes          GGT12GTT   \n",
       "GSM677326                   12             Yes          GGT12GTT   \n",
       "\n",
       "            randomization date:ch1 transition_transversion:ch1  \\\n",
       "sample_name                                                      \n",
       "GSM677318                9/27/2007                Transversion   \n",
       "GSM677321                4/17/2008                Transversion   \n",
       "GSM677326                7/24/2008                Transversion   \n",
       "\n",
       "            type_kras_aa_change:ch1  \n",
       "sample_name                          \n",
       "GSM677318                       CYS  \n",
       "GSM677321                       VAL  \n",
       "GSM677326                       VAL  \n",
       "\n",
       "[3 rows x 67 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_e.loc[:,\"drug\"] = \"Erlotinib\"\n",
    "df_e.loc[:,\"response\"] = \"S\"\n",
    "df_e.loc[df_e['progression-free survival time (months):ch1']< 1.86, \"response\"] = \"R\"\n",
    "df_e = df_e.dropna(how=\"all\",axis=1)\n",
    "cols_order = [\"drug\",\"response\",'progression-free survival time (months):ch1',\"treatment:ch1\",\"treatment:ch1\",\n",
    "       'progression-free survival status:ch1']\n",
    "df_e = df_e[list(cols_order)+list(df_e.columns.values)]\n",
    "df_e = df_e.T.drop_duplicates().T\n",
    "print(\"Erlotinib\",\"R:\",df_e.loc[df_e[\"response\"]==\"R\",:].shape[0],\"S:\",\n",
    "      df_e.loc[df_e[\"response\"]==\"S\",:].shape[0])\n",
    "df_e.index.name = \"sample_name\"\n",
    "df_e.sort_values(by=\"sample_name\",inplace=True)\n",
    "df_e.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GSE33072_response.\"+\"Erlotinib\"+\".tsv\",sep = \"\\t\")\n",
    "df_e.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GSE9782 - response and survival with bortezomib compared to dexamethasone in patients with multiple myeloma\n",
    " - Two arrays HG-U133A(GPL96) and HG-U133B(GPL97) for each patient\n",
    " - \"characteristics_ch1.1\" - Dex - Dexamethasone (76), PS341 - Bortezomib (188) \n",
    " - Response:\n",
    "     - \"characteristics_ch1.7\" - PGx_Response : complete response (CR), partial response (PR), minimal response (MR), no change (NC), or PD (progressive disease)\n",
    "\n",
    "     [from 10.1182/blood-2006-09-044974]\n",
    "     - \"characteristics_ch1.8\" - PGx_Responder : R - responder, NR - non-responder\n",
    "Samples with PGx_Response = IE and PGx_Responder = IE were excluded, because this group was not explined in the text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HG-U133A: Dexamethasone -  70 \tBortezomib -  169\n",
      "HG-U133B: Dexamethasone -  70 \tBortezomib -  169\n"
     ]
    }
   ],
   "source": [
    "df_a = pd.read_csv(\"GSE9782-GPL96_annotation.tsv\",sep=\"\\t\",index_col=0)\n",
    "df_a.index.name = \"sample_name\"\n",
    "df_a.sort_values(by=\"sample_name\",inplace=True)\n",
    "df_a = df_a.loc[df_a[\"characteristics_ch1.8\"] != \"PGx_Responder = IE\",]\n",
    "df_a.loc[:,\"response_detailed\"] = df_a[\"characteristics_ch1.7\"].apply(lambda x: x.replace(\"PGx_Response = \",\"\"))\n",
    "df_a.loc[df_a[\"characteristics_ch1.8\"]==\"PGx_Responder = NR\",\"response\"] = \"R\"\n",
    "df_a.loc[df_a[\"characteristics_ch1.8\"]==\"PGx_Responder = R\",\"response\"] = \"S\"\n",
    "df_a_dex = df_a[df_a[\"characteristics_ch1.1\"] == \"treatment = Dex\"]\n",
    "df_a_dex .loc[:,\"drug\"] = \"Dexamethasone\"\n",
    "df_a_bort = df_a[df_a[\"characteristics_ch1.1\"] == \"treatment = PS341\"]\n",
    "df_a_bort.loc[:,\"drug\"] = \"Bortezomib\"\n",
    "print(\"HG-U133A:\",\"Dexamethasone - \",df_a_dex.shape[0],\"\\tBortezomib - \",df_a_bort.shape[0])\n",
    "\n",
    "df_b = pd.read_csv(\"GSE9782-GPL97_annotation.tsv\",sep=\"\\t\",index_col=0)\n",
    "df_b.index.name = \"sample_name\"\n",
    "df_b.sort_values(by=\"sample_name\",inplace=True)\n",
    "df_b = df_b.loc[df_b[\"characteristics_ch1.8\"] != \"PGx_Responder = IE\",]\n",
    "df_b.loc[:,\"response_detailed\"] = df_b[\"characteristics_ch1.7\"].apply(lambda x: x.replace(\"PGx_Response = \",\"\"))\n",
    "df_b.loc[df_b[\"characteristics_ch1.8\"]==\"PGx_Responder = NR\",\"response\"] = \"R\"\n",
    "df_b.loc[df_b[\"characteristics_ch1.8\"]==\"PGx_Responder = R\",\"response\"] = \"S\"\n",
    "df_b_dex = df_b[df_b[\"characteristics_ch1.1\"] == \"treatment = Dex\"]\n",
    "df_b_dex .loc[:,\"drug\"] = \"Dexamethasone\"\n",
    "df_b_bort = df_b[df_b[\"characteristics_ch1.1\"] == \"treatment = PS341\"]\n",
    "df_b_bort.loc[:,\"drug\"] = \"Bortezomib\"\n",
    "print(\"HG-U133B:\",\"Dexamethasone - \",df_b_dex.shape[0],\"\\tBortezomib - \",df_b_bort.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bortezomib - GPL96 R: 84 S: 85\n",
      "Bortezomib - GPL97 R: 84 S: 85\n",
      "Dexamethasone - GPL96 R: 42 S: 28\n",
      "Dexamethasone - GPL97 R: 42 S: 28\n"
     ]
    }
   ],
   "source": [
    "cols_order = [\"drug\",\"response\",\"response_detailed\",'title',u'characteristics_ch1.1',u'characteristics_ch1.7',u'characteristics_ch1.8'] \n",
    "df_a_bort = df_a_bort[cols_order+list(df_a_bort.columns.values)].T.drop_duplicates().T\n",
    "df_a_bort.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GSE9782-GPL96_response.\"+\"Bortezomib\"+\".tsv\",sep = \"\\t\")\n",
    "print(\"Bortezomib - GPL96\",\"R:\",df_a_bort[df_a_bort[\"response\"]==\"R\"].shape[0],\n",
    "      \"S:\",df_a_bort[df_a_bort[\"response\"]==\"S\"].shape[0])\n",
    "\n",
    "df_b_bort = df_b_bort[cols_order+list(df_b_bort.columns.values)].T.drop_duplicates().T\n",
    "df_b_bort.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GSE9782-GPL97_response.\"+\"Bortezomib\"+\".tsv\",sep = \"\\t\")\n",
    "print(\"Bortezomib - GPL97\",\"R:\",df_b_bort[df_b_bort[\"response\"]==\"R\"].shape[0],\n",
    "      \"S:\",df_b_bort[df_b_bort[\"response\"]==\"S\"].shape[0])\n",
    "\n",
    "df_a_dex = df_a_dex[cols_order+list(df_a_dex.columns.values)].T.drop_duplicates().T\n",
    "df_a_dex.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GSE9782-GPL96_response.\"+\"Dexamethasone\"+\".tsv\",sep = \"\\t\")\n",
    "print(\"Dexamethasone - GPL96\",\"R:\",df_a_dex[df_a_dex[\"response\"]==\"R\"].shape[0],\n",
    "      \"S:\",df_a_dex[df_a_dex[\"response\"]==\"S\"].shape[0])\n",
    "\n",
    "df_b_dex = df_b_dex[cols_order+list(df_b_dex.columns.values)].T.drop_duplicates().T\n",
    "df_b_dex.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GSE9782-GPL97_response.\"+\"Dexamethasone\"+\".tsv\",sep = \"\\t\")\n",
    "print(\"Dexamethasone - GPL97\",\"R:\",df_b_dex[df_b_dex[\"response\"]==\"R\"].shape[0],\n",
    "      \"S:\",df_b_dex[df_b_dex[\"response\"]==\"S\"].shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PDX\n",
    "Supplementary file nm.3954-S2.xlsx from https://www.nature.com/articles/nm.3954, tab \"PCT curve metrics\"\n",
    "\n",
    "- all combinational treatemnts were excluded\n",
    "- records containing '-->' or '-->-->' signs in ResponseCategory were excluded; these records correspond non-stable response, e.g. PR --> PD means SD-->-->PD means\n",
    "- we focus on 5 drugs: 'Cetuximab', 'Paclitaxel', 'Gemcitabine', '5-Fluorouracil', 'Erlotinib';\n",
    " 'Tamoxifen' has no \"S\" xenografts\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(4758, 11)\n",
      "Combo drugs responses dropped: 1279\n",
      "(3479, 11)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Model</th>\n",
       "      <th>Treatment</th>\n",
       "      <th>Treatment target</th>\n",
       "      <th>Treatment type</th>\n",
       "      <th>BestResponse</th>\n",
       "      <th>Day_BestResponse</th>\n",
       "      <th>BestAvgResponse</th>\n",
       "      <th>Day_BestAvgResponse</th>\n",
       "      <th>TimeToDouble</th>\n",
       "      <th>Day_Last</th>\n",
       "      <th>ResponseCategory</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>X-007</td>\n",
       "      <td>BGJ398</td>\n",
       "      <td>FGFR</td>\n",
       "      <td>single</td>\n",
       "      <td>396.5</td>\n",
       "      <td>11</td>\n",
       "      <td>220.475000</td>\n",
       "      <td>11</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>11</td>\n",
       "      <td>PD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>X-007</td>\n",
       "      <td>BKM120</td>\n",
       "      <td>PIK3CA,PIK3CB,PIK3CG,PIK3CD,panPI3K</td>\n",
       "      <td>single</td>\n",
       "      <td>189.1</td>\n",
       "      <td>14</td>\n",
       "      <td>77.050000</td>\n",
       "      <td>11</td>\n",
       "      <td>6.207547</td>\n",
       "      <td>14</td>\n",
       "      <td>PD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>X-007</td>\n",
       "      <td>BYL719</td>\n",
       "      <td>PIK3CA</td>\n",
       "      <td>single</td>\n",
       "      <td>303.7</td>\n",
       "      <td>11</td>\n",
       "      <td>196.175000</td>\n",
       "      <td>11</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>11</td>\n",
       "      <td>PD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>X-007</td>\n",
       "      <td>CLR457</td>\n",
       "      <td>PIK3CA,PIK3CB,PIK3CG,PIK3CD,panPI3K</td>\n",
       "      <td>single</td>\n",
       "      <td>25.0</td>\n",
       "      <td>16</td>\n",
       "      <td>26.533333</td>\n",
       "      <td>16</td>\n",
       "      <td>36.835000</td>\n",
       "      <td>37</td>\n",
       "      <td>SD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>X-007</td>\n",
       "      <td>HDM201</td>\n",
       "      <td>MDM2</td>\n",
       "      <td>single</td>\n",
       "      <td>330.8</td>\n",
       "      <td>11</td>\n",
       "      <td>182.750000</td>\n",
       "      <td>11</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>11</td>\n",
       "      <td>PD</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Model Treatment                     Treatment target Treatment type  \\\n",
       "0  X-007    BGJ398                                 FGFR         single   \n",
       "1  X-007    BKM120  PIK3CA,PIK3CB,PIK3CG,PIK3CD,panPI3K         single   \n",
       "2  X-007    BYL719                               PIK3CA         single   \n",
       "5  X-007    CLR457  PIK3CA,PIK3CB,PIK3CG,PIK3CD,panPI3K         single   \n",
       "6  X-007    HDM201                                 MDM2         single   \n",
       "\n",
       "   BestResponse  Day_BestResponse  BestAvgResponse  Day_BestAvgResponse  \\\n",
       "0         396.5                11       220.475000                   11   \n",
       "1         189.1                14        77.050000                   11   \n",
       "2         303.7                11       196.175000                   11   \n",
       "5          25.0                16        26.533333                   16   \n",
       "6         330.8                11       182.750000                   11   \n",
       "\n",
       "   TimeToDouble  Day_Last ResponseCategory  \n",
       "0      4.000000        11               PD  \n",
       "1      6.207547        14               PD  \n",
       "2      4.000000        11               PD  \n",
       "5     36.835000        37               SD  \n",
       "6      4.000000        11               PD  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# TODO: download the annotation file automatically instead of relying on a local copy.\n",
    "# Load the \"PCT curve metrics\" sheet, drop exact duplicate rows and keep single-agent records only.\n",
    "df = pd.read_excel(\"/home/olya/SFU/Hossein/PDX/nm.3954-S2.xlsx\", sheet_name=\"PCT curve metrics\")\n",
    "print(df.shape)\n",
    "df = df.drop_duplicates()\n",
    "print(\"Combo drugs responses dropped:\", df[df[\"Treatment type\"] == \"combo\"].shape[0])\n",
    "df = df[df[\"Treatment type\"] == \"single\"]\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#df[[\"Model\",\"Treatment\"]].groupby(\"Treatment\").size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of records per ResponseCategory value.\n",
    "df.groupby(\"ResponseCategory\").size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dict = {\"5FU\":\"5-Fluorouracil\",\"erlotinib\":\"Erlotinib\",\"cetuximab\":\"Cetuximab\",\n",
    "            \"gemcitabine-50mpk\":\"Gemcitabine\",\"paclitaxel\":\"Paclitaxel\"}\n",
    "response_dict = {\"CR\":\"S\",\"PR\":\"S\",\"SD\":\"R\",\"PD\":\"R\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the records whose Treatment label has an entry in drug_dict.\n",
    "df = df[df[\"Treatment\"].isin(list(drug_dict))]\n",
    "print(\"Records for drugs\",drug_dict.values(),df.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of records per Treatment label.\n",
    "df.groupby(\"Treatment\").size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop records whose ResponseCategory contains the literal marker \"-->\".\n",
    "df = df[~df[\"ResponseCategory\"].str.contains(\"-->\", regex=False)]\n",
    "df.groupby(\"ResponseCategory\").size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Add the output drug name and the binary response label.\n",
    "# assign() builds a new frame, so nothing is written into a filtered slice\n",
    "# (avoids SettingWithCopyWarning). Plain dict lookups keep the KeyError for an\n",
    "# unexpected Treatment or ResponseCategory value.\n",
    "df = df.assign(\n",
    "    drug=[drug_dict[t] for t in df[\"Treatment\"]],\n",
    "    response=[response_dict[c] for c in df[\"ResponseCategory\"]],\n",
    ")\n",
    "df = df[[\"Model\",\"drug\",\"response\",\"ResponseCategory\",\"Treatment\",\"Treatment target\",\n",
    "   \"Treatment type\",\"BestResponse\",\"Day_BestResponse\",\"BestAvgResponse\",\"Day_BestAvgResponse\",\"TimeToDouble\",\"Day_Last\"]]\n",
    "\n",
    "# Write one table per drug, indexed by the Model column (renamed to sample_name) and sorted by it.\n",
    "for drug in drug_dict.values():\n",
    "    # set_index/sort_index return new objects; the per-drug slice is never modified in place.\n",
    "    d = df.loc[df[\"drug\"]==drug,:].set_index(\"Model\",drop=True)\n",
    "    d.index.name = \"sample_name\"\n",
    "    d = d.sort_index()\n",
    "    d.to_csv(root_dir+\"/preprocessed/annotations/\"+\"PDX_response.\"+drug+\".tsv\",sep = \"\\t\")\n",
    "    print(drug,\"R:\",d[d[\"response\"]==\"R\"].shape[0],\n",
    "      \"S:\",d[d[\"response\"]==\"S\"].shape[0])\n",
    "d.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TCGA \n",
    "\n",
    "Ding et al. 2016, Supplementary tables , tab \"Table S2\"\n",
    "\n",
    "\"bcr_patient_barcode\" matches the first 12 characters of the sample barcode. One patient in TCGA may have more than one tumor sample and even one or several normal samples. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "response_dict = {'Clinical Progressive Disease':\"R\",'Complete Response':\"S\",\n",
    "                 'Partial Response':\"S\",'Stable Disease':\"R\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2569, 16)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>1</th>\n",
       "      <th>bcr_patient_barcode</th>\n",
       "      <th>cohort</th>\n",
       "      <th>drug</th>\n",
       "      <th>response</th>\n",
       "      <th>measure_of_response</th>\n",
       "      <th>days_to_drug_therapy_start</th>\n",
       "      <th>days_to_drug_therapy_end</th>\n",
       "      <th>DrugBank ID</th>\n",
       "      <th>days_to_initial_pathologic_diagnosis</th>\n",
       "      <th>method_of_sample_procurement</th>\n",
       "      <th>days_to_sample_procurement</th>\n",
       "      <th>days_to_new_tumor_event_after_initial_treatment</th>\n",
       "      <th>additional_pharmaceutical_therapy</th>\n",
       "      <th>new_tumor_event_additional_surgery_procedure</th>\n",
       "      <th>history_of_neoadjuvant_treatment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>TCGA-OR-A5JM</td>\n",
       "      <td>ACC</td>\n",
       "      <td>Sunitinib</td>\n",
       "      <td>R</td>\n",
       "      <td>Clinical Progressive Disease</td>\n",
       "      <td>378</td>\n",
       "      <td>439</td>\n",
       "      <td>DB01268</td>\n",
       "      <td>0</td>\n",
       "      <td>Surgical Resection</td>\n",
       "      <td>1</td>\n",
       "      <td>72</td>\n",
       "      <td>YES</td>\n",
       "      <td>NO</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TCGA-OR-A5JM</td>\n",
       "      <td>ACC</td>\n",
       "      <td>Ketoconazole</td>\n",
       "      <td>R</td>\n",
       "      <td>Clinical Progressive Disease</td>\n",
       "      <td>378</td>\n",
       "      <td>439</td>\n",
       "      <td>DB01026</td>\n",
       "      <td>0</td>\n",
       "      <td>Surgical Resection</td>\n",
       "      <td>1</td>\n",
       "      <td>72</td>\n",
       "      <td>YES</td>\n",
       "      <td>NO</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TCGA-OU-A5PI</td>\n",
       "      <td>ACC</td>\n",
       "      <td>Etoposide</td>\n",
       "      <td>R</td>\n",
       "      <td>Stable Disease</td>\n",
       "      <td>69</td>\n",
       "      <td>239</td>\n",
       "      <td>DB00773</td>\n",
       "      <td>0</td>\n",
       "      <td>Surgical Resection</td>\n",
       "      <td>0</td>\n",
       "      <td>351</td>\n",
       "      <td>YES</td>\n",
       "      <td>YES</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>TCGA-OU-A5PI</td>\n",
       "      <td>ACC</td>\n",
       "      <td>Doxorubicin</td>\n",
       "      <td>R</td>\n",
       "      <td>Stable Disease</td>\n",
       "      <td>69</td>\n",
       "      <td>239</td>\n",
       "      <td>DB00997</td>\n",
       "      <td>0</td>\n",
       "      <td>Surgical Resection</td>\n",
       "      <td>0</td>\n",
       "      <td>351</td>\n",
       "      <td>YES</td>\n",
       "      <td>YES</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>TCGA-OU-A5PI</td>\n",
       "      <td>ACC</td>\n",
       "      <td>Cisplatin</td>\n",
       "      <td>R</td>\n",
       "      <td>Stable Disease</td>\n",
       "      <td>55</td>\n",
       "      <td>239</td>\n",
       "      <td>DB00515</td>\n",
       "      <td>0</td>\n",
       "      <td>Surgical Resection</td>\n",
       "      <td>0</td>\n",
       "      <td>351</td>\n",
       "      <td>YES</td>\n",
       "      <td>YES</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "1 bcr_patient_barcode cohort          drug response  \\\n",
       "3        TCGA-OR-A5JM    ACC     Sunitinib        R   \n",
       "4        TCGA-OR-A5JM    ACC  Ketoconazole        R   \n",
       "5        TCGA-OU-A5PI    ACC     Etoposide        R   \n",
       "6        TCGA-OU-A5PI    ACC   Doxorubicin        R   \n",
       "7        TCGA-OU-A5PI    ACC     Cisplatin        R   \n",
       "\n",
       "1           measure_of_response days_to_drug_therapy_start  \\\n",
       "3  Clinical Progressive Disease                        378   \n",
       "4  Clinical Progressive Disease                        378   \n",
       "5                Stable Disease                         69   \n",
       "6                Stable Disease                         69   \n",
       "7                Stable Disease                         55   \n",
       "\n",
       "1 days_to_drug_therapy_end DrugBank ID days_to_initial_pathologic_diagnosis  \\\n",
       "3                      439     DB01268                                    0   \n",
       "4                      439     DB01026                                    0   \n",
       "5                      239     DB00773                                    0   \n",
       "6                      239     DB00997                                    0   \n",
       "7                      239     DB00515                                    0   \n",
       "\n",
       "1 method_of_sample_procurement days_to_sample_procurement  \\\n",
       "3           Surgical Resection                          1   \n",
       "4           Surgical Resection                          1   \n",
       "5           Surgical Resection                          0   \n",
       "6           Surgical Resection                          0   \n",
       "7           Surgical Resection                          0   \n",
       "\n",
       "1 days_to_new_tumor_event_after_initial_treatment  \\\n",
       "3                                              72   \n",
       "4                                              72   \n",
       "5                                             351   \n",
       "6                                             351   \n",
       "7                                             351   \n",
       "\n",
       "1 additional_pharmaceutical_therapy  \\\n",
       "3                               YES   \n",
       "4                               YES   \n",
       "5                               YES   \n",
       "6                               YES   \n",
       "7                               YES   \n",
       "\n",
       "1 new_tumor_event_additional_surgery_procedure  \\\n",
       "3                                           NO   \n",
       "4                                           NO   \n",
       "5                                          YES   \n",
       "6                                          YES   \n",
       "7                                          YES   \n",
       "\n",
       "1 history_of_neoadjuvant_treatment  \n",
       "3                              Yes  \n",
       "4                              Yes  \n",
       "5                               No  \n",
       "6                               No  \n",
       "7                               No  "
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# TODO: download the annotation file automatically instead of relying on a local copy.\n",
    "df = pd.read_excel(\n",
    "    \"/home/olya/SFU/Hossein/TCGA/annotation_Ding2016/bioinfo16_supplementary_tables.xlsx\",\n",
    "    sheet_name=\"Table S2\",\n",
    ")\n",
    "df = df.drop_duplicates()\n",
    "# Row 1 holds the column names; rows 0 and 2 are discarded together with it.\n",
    "cols = df.loc[1,:]\n",
    "df = df.drop([0,1,2])\n",
    "df.columns = cols\n",
    "# cohort = text inside the first pair of parentheses of the \"Cancer\" value.\n",
    "df[\"cohort\"] = df[\"Cancer\"].map(lambda x: re.search(r'\\((.*?)\\)',x).group(1))\n",
    "# Dict lookup raises KeyError for a measure_of_response value missing from response_dict.\n",
    "df[\"response\"] = df[\"measure_of_response\"].map(lambda x: response_dict[x])\n",
    "print(df.shape)\n",
    "# Index labels of records that share barcode, therapy start day and therapy end day with another record.\n",
    "dup_indices = df.index[\n",
    "    df[[\"bcr_patient_barcode\",\"days_to_drug_therapy_start\",\"days_to_drug_therapy_end\"]].duplicated(keep=False)\n",
    "].values\n",
    "# Keep and order the columns used downstream; \"drug_name\" is exposed as \"drug\".\n",
    "df = df[[\"bcr_patient_barcode\",\"cohort\",\"drug_name\",\"response\",\"measure_of_response\",\n",
    "         \"days_to_drug_therapy_start\",\"days_to_drug_therapy_end\",\"DrugBank ID\",\n",
    "         \"days_to_initial_pathologic_diagnosis\",\"method_of_sample_procurement\",\n",
    "         \"days_to_sample_procurement\",\"days_to_new_tumor_event_after_initial_treatment\",\n",
    "         \"additional_pharmaceutical_therapy\",\"new_tumor_event_additional_surgery_procedure\",\n",
    "         \"history_of_neoadjuvant_treatment\"]].rename(columns={\"drug_name\":\"drug\"})\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>1</th>\n",
       "      <th>bcr_patient_barcode</th>\n",
       "      <th>cohort</th>\n",
       "      <th>drug</th>\n",
       "      <th>response</th>\n",
       "      <th>measure_of_response</th>\n",
       "      <th>days_to_drug_therapy_start</th>\n",
       "      <th>days_to_drug_therapy_end</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>TCGA-OU-A5PI</td>\n",
       "      <td>ACC</td>\n",
       "      <td>Carboplatin</td>\n",
       "      <td>R</td>\n",
       "      <td>Stable Disease</td>\n",
       "      <td>725</td>\n",
       "      <td>817</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "1  bcr_patient_barcode cohort         drug response measure_of_response  \\\n",
       "12        TCGA-OU-A5PI    ACC  Carboplatin        R      Stable Disease   \n",
       "\n",
       "1  days_to_drug_therapy_start days_to_drug_therapy_end  \n",
       "12                        725                      817  "
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Demo input: walk over the (barcode, sub-frame) pairs of a small slice of df without doing anything;\n",
    "# after the loop `group` still holds the last pair, which is passed to exclude_combos at the end of this cell.\n",
    "for group in df.iloc[2:10,0:7].groupby(\"bcr_patient_barcode\"):\n",
    "    pass\n",
    "def exclude_combos(df_group):\n",
    "    if df_group.shape[0] == 1:\n",
    "        return df_group\n",
    "    d = df_group.T.to_dict()\n",
    "    keys_to_remove = set()\n",
    "    for key in d.keys():\n",
    "        start = d[key][\"days_to_drug_therapy_start\"]\n",
    "        end = d[key][\"days_to_drug_therapy_end\"]\n",
    "        #print(key,start,end)\n",
    "        for key2 in d.keys():\n",
    "            if key2 != key:\n",
    "                start2 = d[key2][\"days_to_drug_therapy_start\"]\n",
    "                end2 = d[key2][\"days_to_drug_therapy_end\"]\n",
    "                if not (end < start2) and not (end2 < start):\n",
    "                    # if not non-overlapping time intervals\n",
    "                    keys_to_remove.add(key)\n",
    "                    keys_to_remove.add(key2)\n",
    "    #print(list(keys_to_remove))\n",
    "    return df_group.loc[~df_group.index.isin(keys_to_remove),:]\n",
    "exclude_combos(group[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(785, 15)\n",
      "Records with combo drugs excluded: 1784\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>1</th>\n",
       "      <th>bcr_patient_barcode</th>\n",
       "      <th>cohort</th>\n",
       "      <th>drug</th>\n",
       "      <th>response</th>\n",
       "      <th>measure_of_response</th>\n",
       "      <th>days_to_drug_therapy_start</th>\n",
       "      <th>days_to_drug_therapy_end</th>\n",
       "      <th>DrugBank ID</th>\n",
       "      <th>days_to_initial_pathologic_diagnosis</th>\n",
       "      <th>method_of_sample_procurement</th>\n",
       "      <th>days_to_sample_procurement</th>\n",
       "      <th>days_to_new_tumor_event_after_initial_treatment</th>\n",
       "      <th>additional_pharmaceutical_therapy</th>\n",
       "      <th>new_tumor_event_additional_surgery_procedure</th>\n",
       "      <th>history_of_neoadjuvant_treatment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1355</th>\n",
       "      <td>TCGA-05-4402</td>\n",
       "      <td>LUAD</td>\n",
       "      <td>Erlotinib</td>\n",
       "      <td>S</td>\n",
       "      <td>Complete Response</td>\n",
       "      <td>122</td>\n",
       "      <td>122</td>\n",
       "      <td>DB00530</td>\n",
       "      <td>0</td>\n",
       "      <td>Other Method (please specify)</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1360</th>\n",
       "      <td>TCGA-05-5425</td>\n",
       "      <td>LUAD</td>\n",
       "      <td>Gefitinib</td>\n",
       "      <td>R</td>\n",
       "      <td>Clinical Progressive Disease</td>\n",
       "      <td>608</td>\n",
       "      <td>669</td>\n",
       "      <td>DB00317</td>\n",
       "      <td>0</td>\n",
       "      <td>Other Method (please specify)</td>\n",
       "      <td>31</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>883</th>\n",
       "      <td>TCGA-06-1806</td>\n",
       "      <td>GBM</td>\n",
       "      <td>veliparib</td>\n",
       "      <td>R</td>\n",
       "      <td>Clinical Progressive Disease</td>\n",
       "      <td>81</td>\n",
       "      <td>256</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>Subtotal Resection</td>\n",
       "      <td>0</td>\n",
       "      <td>256</td>\n",
       "      <td>YES</td>\n",
       "      <td>[Not Available]</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>884</th>\n",
       "      <td>TCGA-06-1806</td>\n",
       "      <td>GBM</td>\n",
       "      <td>Cabozantinib</td>\n",
       "      <td>R</td>\n",
       "      <td>Clinical Progressive Disease</td>\n",
       "      <td>293</td>\n",
       "      <td>455</td>\n",
       "      <td>DB08875</td>\n",
       "      <td>0</td>\n",
       "      <td>Subtotal Resection</td>\n",
       "      <td>0</td>\n",
       "      <td>256</td>\n",
       "      <td>YES</td>\n",
       "      <td>[Not Available]</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>885</th>\n",
       "      <td>TCGA-06-A5U0</td>\n",
       "      <td>GBM</td>\n",
       "      <td>Temozolomide</td>\n",
       "      <td>R</td>\n",
       "      <td>Clinical Progressive Disease</td>\n",
       "      <td>31</td>\n",
       "      <td>74</td>\n",
       "      <td>DB00853</td>\n",
       "      <td>0</td>\n",
       "      <td>Subtotal Resection</td>\n",
       "      <td>0</td>\n",
       "      <td>100</td>\n",
       "      <td>YES</td>\n",
       "      <td>[Not Available]</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "1    bcr_patient_barcode cohort          drug response  \\\n",
       "1355        TCGA-05-4402   LUAD     Erlotinib        S   \n",
       "1360        TCGA-05-5425   LUAD     Gefitinib        R   \n",
       "883         TCGA-06-1806    GBM     veliparib        R   \n",
       "884         TCGA-06-1806    GBM  Cabozantinib        R   \n",
       "885         TCGA-06-A5U0    GBM  Temozolomide        R   \n",
       "\n",
       "1              measure_of_response days_to_drug_therapy_start  \\\n",
       "1355             Complete Response                        122   \n",
       "1360  Clinical Progressive Disease                        608   \n",
       "883   Clinical Progressive Disease                         81   \n",
       "884   Clinical Progressive Disease                        293   \n",
       "885   Clinical Progressive Disease                         31   \n",
       "\n",
       "1    days_to_drug_therapy_end DrugBank ID  \\\n",
       "1355                      122     DB00530   \n",
       "1360                      669     DB00317   \n",
       "883                       256         NaN   \n",
       "884                       455     DB08875   \n",
       "885                        74     DB00853   \n",
       "\n",
       "1    days_to_initial_pathologic_diagnosis   method_of_sample_procurement  \\\n",
       "1355                                    0  Other Method (please specify)   \n",
       "1360                                    0  Other Method (please specify)   \n",
       "883                                     0             Subtotal Resection   \n",
       "884                                     0             Subtotal Resection   \n",
       "885                                     0             Subtotal Resection   \n",
       "\n",
       "1    days_to_sample_procurement  \\\n",
       "1355                          0   \n",
       "1360                         31   \n",
       "883                           0   \n",
       "884                           0   \n",
       "885                           0   \n",
       "\n",
       "1    days_to_new_tumor_event_after_initial_treatment  \\\n",
       "1355                                             NaN   \n",
       "1360                                             NaN   \n",
       "883                                              256   \n",
       "884                                              256   \n",
       "885                                              100   \n",
       "\n",
       "1    additional_pharmaceutical_therapy  \\\n",
       "1355                               NaN   \n",
       "1360                               NaN   \n",
       "883                                YES   \n",
       "884                                YES   \n",
       "885                                YES   \n",
       "\n",
       "1    new_tumor_event_additional_surgery_procedure  \\\n",
       "1355                                          NaN   \n",
       "1360                                          NaN   \n",
       "883                               [Not Available]   \n",
       "884                               [Not Available]   \n",
       "885                               [Not Available]   \n",
       "\n",
       "1    history_of_neoadjuvant_treatment  \n",
       "1355                               No  \n",
       "1360                               No  \n",
       "883                                No  \n",
       "884                                No  \n",
       "885                                No  "
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run exclude_combos on the records of every patient and stack what is left.\n",
    "df_single = pd.concat(\n",
    "    [exclude_combos(group[1]) for group in df.groupby(\"bcr_patient_barcode\")]\n",
    ")\n",
    "print(df_single.shape)\n",
    "print(\"Records with combo drugs excluded:\",df.shape[0] - df_single.shape[0])\n",
    "df_single.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cisplatin 113 R: 19 S: 94\n",
      "Cetuximab 10 R: 4 S: 6\n",
      "Gemcitabine 68 R: 43 S: 25\n",
      "Bortezomib 0 R: 0 S: 0\n",
      "Tamoxifen 15 R: 3 S: 12\n",
      "ZD-6474 0 R: 0 S: 0\n",
      "Gefitinib 2 R: 2 S: 0\n",
      "5-Fluorouracil 0 R: 0 S: 0\n",
      "Afatinib 0 R: 0 S: 0\n",
      "Pelitinib 0 R: 0 S: 0\n",
      "Panitumumab 0 R: 0 S: 0\n",
      "Paclitaxel 49 R: 15 S: 34\n",
      "Docetaxel 21 R: 12 S: 9\n",
      "Lapatinib 0 R: 0 S: 0\n",
      "Erlotinib 6 R: 4 S: 2\n"
     ]
    }
   ],
   "source": [
    "# Drugs to export: nine_drugs plus the drugs in EGFRi_drugs, without duplicates.\n",
    "# NOTE: set iteration order is arbitrary, so the order of the printed lines can vary between runs.\n",
    "drugs = list(set(nine_drugs + EGFRi_drugs))\n",
    "\n",
    "for drug in drugs:\n",
    "    d = df_single[df_single[\"drug\"] == drug ]\n",
    "    print(drug, d.shape[0],\"R:\",d[d[\"response\"] ==\"R\"].shape[0],\"S:\",d[d[\"response\"] ==\"S\"].shape[0] )\n",
    "    # A file is written only for drugs with at least one record.\n",
    "    if d.shape[0] > 0 :\n",
    "        # set_index returns a new frame, so the filtered slice of df_single is not\n",
    "        # modified in place (avoids SettingWithCopyWarning).\n",
    "        d = d.set_index(\"bcr_patient_barcode\",drop=True)\n",
    "        d.to_csv(root_dir+\"/preprocessed/annotations/\"+\"TCGA_response.\"+drug+\".tsv\",sep = \"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>1</th>\n",
       "      <th>cohort</th>\n",
       "      <th>drug</th>\n",
       "      <th>response</th>\n",
       "      <th>measure_of_response</th>\n",
       "      <th>days_to_drug_therapy_start</th>\n",
       "      <th>days_to_drug_therapy_end</th>\n",
       "      <th>DrugBank ID</th>\n",
       "      <th>days_to_initial_pathologic_diagnosis</th>\n",
       "      <th>method_of_sample_procurement</th>\n",
       "      <th>days_to_sample_procurement</th>\n",
       "      <th>days_to_new_tumor_event_after_initial_treatment</th>\n",
       "      <th>additional_pharmaceutical_therapy</th>\n",
       "      <th>new_tumor_event_additional_surgery_procedure</th>\n",
       "      <th>history_of_neoadjuvant_treatment</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bcr_patient_barcode</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>TCGA-05-4402</th>\n",
       "      <td>LUAD</td>\n",
       "      <td>Erlotinib</td>\n",
       "      <td>S</td>\n",
       "      <td>Complete Response</td>\n",
       "      <td>122</td>\n",
       "      <td>122</td>\n",
       "      <td>DB00530</td>\n",
       "      <td>0</td>\n",
       "      <td>Other Method (please specify)</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TCGA-53-7624</th>\n",
       "      <td>LUAD</td>\n",
       "      <td>Erlotinib</td>\n",
       "      <td>R</td>\n",
       "      <td>Clinical Progressive Disease</td>\n",
       "      <td>880</td>\n",
       "      <td>922</td>\n",
       "      <td>DB00530</td>\n",
       "      <td>0</td>\n",
       "      <td>Other Method (please specify)</td>\n",
       "      <td>40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TCGA-64-5778</th>\n",
       "      <td>LUAD</td>\n",
       "      <td>Erlotinib</td>\n",
       "      <td>R</td>\n",
       "      <td>Clinical Progressive Disease</td>\n",
       "      <td>1174</td>\n",
       "      <td>[Not Available]</td>\n",
       "      <td>DB00530</td>\n",
       "      <td>0</td>\n",
       "      <td>Tumor Resection</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "1                   cohort       drug response           measure_of_response  \\\n",
       "bcr_patient_barcode                                                            \n",
       "TCGA-05-4402          LUAD  Erlotinib        S             Complete Response   \n",
       "TCGA-53-7624          LUAD  Erlotinib        R  Clinical Progressive Disease   \n",
       "TCGA-64-5778          LUAD  Erlotinib        R  Clinical Progressive Disease   \n",
       "\n",
       "1                   days_to_drug_therapy_start days_to_drug_therapy_end  \\\n",
       "bcr_patient_barcode                                                       \n",
       "TCGA-05-4402                               122                      122   \n",
       "TCGA-53-7624                               880                      922   \n",
       "TCGA-64-5778                              1174          [Not Available]   \n",
       "\n",
       "1                   DrugBank ID days_to_initial_pathologic_diagnosis  \\\n",
       "bcr_patient_barcode                                                    \n",
       "TCGA-05-4402            DB00530                                    0   \n",
       "TCGA-53-7624            DB00530                                    0   \n",
       "TCGA-64-5778            DB00530                                    0   \n",
       "\n",
       "1                     method_of_sample_procurement days_to_sample_procurement  \\\n",
       "bcr_patient_barcode                                                             \n",
       "TCGA-05-4402         Other Method (please specify)                          0   \n",
       "TCGA-53-7624         Other Method (please specify)                         40   \n",
       "TCGA-64-5778                       Tumor Resection                          0   \n",
       "\n",
       "1                   days_to_new_tumor_event_after_initial_treatment  \\\n",
       "bcr_patient_barcode                                                   \n",
       "TCGA-05-4402                                                    NaN   \n",
       "TCGA-53-7624                                                    NaN   \n",
       "TCGA-64-5778                                                    NaN   \n",
       "\n",
       "1                   additional_pharmaceutical_therapy  \\\n",
       "bcr_patient_barcode                                     \n",
       "TCGA-05-4402                                      NaN   \n",
       "TCGA-53-7624                                      NaN   \n",
       "TCGA-64-5778                                      NaN   \n",
       "\n",
       "1                   new_tumor_event_additional_surgery_procedure  \\\n",
       "bcr_patient_barcode                                                \n",
       "TCGA-05-4402                                                 NaN   \n",
       "TCGA-53-7624                                                 NaN   \n",
       "TCGA-64-5778                                                 NaN   \n",
       "\n",
       "1                   history_of_neoadjuvant_treatment  \n",
       "bcr_patient_barcode                                   \n",
       "TCGA-05-4402                                      No  \n",
       "TCGA-53-7624                                      No  \n",
       "TCGA-64-5778                                      No  "
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `d` is the per-drug TCGA table left over from the preceding cell (presumably its last loop iteration)\n",
    "d.head(n=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GDSC\n",
    "\n",
    "###  Continuous response - log(IC50) values \n",
    "\n",
    "* Supplementary files from  \"A landscape of pharmacogenomic interactions in cancer\" by Iorio F et al. Cell. 2016:\n",
    "TableS4A.xlsx from https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources//Data/suppData/TableS4A.xlsx , tab 'TableS4A-IC50s'\n",
    "\n",
    "* Also, log(IC50) values are available here: ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/current_release/v17.3_fitted_dose_response.xlsx\n",
    "(reported as ln(IC50); these values seem to be only slightly different)\n",
    "\n",
    "###  Binary response \n",
    "\n",
    "*  Supplementary files from  Iorio F et al. 2016\n",
    "https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources///Data/suppData/TableS5C.xlsx\n",
    "\n",
    "Cell line names were replaced with corresponding COSMIC ids from \n",
    "https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources//Data/suppData/TableS1E.xlsx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GDSC - binarized response "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lookup table: cell line name -> COSMIC id, taken from TableS1E.xlsx.\n",
    "# Keep columns 1 and 2 of the parsed sheet, dropping its first two rows and its last row.\n",
    "COSMIC_ids = pd.read_excel(tmp_dir+\"TableS1E.xlsx\").iloc[2:-1,[1,2]]\n",
    "COSMIC_ids.columns = [\"name\",'COSMIC']\n",
    "# 1002 pairs, all IDs are unique\n",
    "#print(COSMIC_ids.shape[0],len(set(COSMIC_ids[\"name\"])),len(set(COSMIC_ids[\"COSMIC\"])))\n",
    "COSMIC_ids = COSMIC_ids.set_index(\"name\",drop=True)\n",
    "names2COSMIC = dict(COSMIC_ids[\"COSMIC\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Cisplatin</th>\n",
       "      <th>Cetuximab</th>\n",
       "      <th>Gemcitabine</th>\n",
       "      <th>Bortezomib</th>\n",
       "      <th>Tamoxifen</th>\n",
       "      <th>Gefitinib</th>\n",
       "      <th>5-Fluorouracil</th>\n",
       "      <th>Afatinib</th>\n",
       "      <th>Paclitaxel</th>\n",
       "      <th>Docetaxel</th>\n",
       "      <th>Lapatinib</th>\n",
       "      <th>Erlotinib</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cell_line</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>683665</th>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>S</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>683667</th>\n",
       "      <td>R</td>\n",
       "      <td>NaN</td>\n",
       "      <td>R</td>\n",
       "      <td>NaN</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>NaN</td>\n",
       "      <td>R</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684052</th>\n",
       "      <td>NaN</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>NaN</td>\n",
       "      <td>R</td>\n",
       "      <td>NaN</td>\n",
       "      <td>R</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684055</th>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>NaN</td>\n",
       "      <td>R</td>\n",
       "      <td>NaN</td>\n",
       "      <td>R</td>\n",
       "      <td>NaN</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684057</th>\n",
       "      <td>S</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Cisplatin Cetuximab Gemcitabine Bortezomib Tamoxifen Gefitinib  \\\n",
       "cell_line                                                                  \n",
       "683665            R         R           R          R         R         R   \n",
       "683667            R       NaN           R        NaN         R         R   \n",
       "684052          NaN         R           R        NaN         R       NaN   \n",
       "684055          NaN         S           R          R         R       NaN   \n",
       "684057            S         R           R          R         R         R   \n",
       "\n",
       "          5-Fluorouracil Afatinib Paclitaxel Docetaxel Lapatinib Erlotinib  \n",
       "cell_line                                                                   \n",
       "683665                 S        R          R         R         R         R  \n",
       "683667                 R        R        NaN         R       NaN       NaN  \n",
       "684052                 R      NaN        NaN       NaN       NaN       NaN  \n",
       "684055                 R      NaN          R       NaN         R         R  \n",
       "684057                 R        R          R         R         R         R  "
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Binarized drug response table read from TableS5C.xlsx.\n",
    "df = pd.read_excel(tmp_dir+\"TableS5C.xlsx\")\n",
    "# drop the rows labelled 0-2 and the first spreadsheet column\n",
    "df = df.drop([0,1,2]).iloc[:,1:]\n",
    "df = df.set_index(\"TableS5C - Binarized Drug IC50s, refers to figure 5\",drop=True)\n",
    "# the \"Screened Compounds:\" row holds the compound names -> use it as the header\n",
    "df.columns = df.loc[\"Screened Compounds:\",:].values\n",
    "df = df.iloc[1:,:]\n",
    "df.index.name = \"cell_line\"\n",
    "\n",
    "# the first remaining row is kept separately as the per-drug logIC50 threshold\n",
    "IC50_thr = df.iloc[0,:]\n",
    "IC50_thr.name = \"logIC50_threshold\"\n",
    "df =  df.iloc[1:,:]\n",
    "\n",
    "# replace cell line names with COSMIC ids where a mapping exists\n",
    "df = df.rename(names2COSMIC,axis=\"index\")\n",
    "drugs = set(nine_drugs+EGFRi_drugs).intersection(set(df.columns.values))\n",
    "# .loc needs a list here: recent pandas rejects set indexers, and sorting\n",
    "# makes the column order of the written file reproducible between runs\n",
    "df = df.loc[:,sorted(drugs)]\n",
    "df = df.sort_values(by=\"cell_line\")\n",
    "df.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GDSC_response.\"+\"all_drugs\"+\".tsv\",sep = \"\\t\")\n",
    "df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Cisplatin</th>\n",
       "      <th>Cetuximab</th>\n",
       "      <th>Gemcitabine</th>\n",
       "      <th>Bortezomib</th>\n",
       "      <th>Tamoxifen</th>\n",
       "      <th>Gefitinib</th>\n",
       "      <th>5-Fluorouracil</th>\n",
       "      <th>Afatinib</th>\n",
       "      <th>Paclitaxel</th>\n",
       "      <th>Docetaxel</th>\n",
       "      <th>Lapatinib</th>\n",
       "      <th>Erlotinib</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cell_line</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>683665</th>\n",
       "      <td>2.80727</td>\n",
       "      <td>6.29445</td>\n",
       "      <td>-4.40897</td>\n",
       "      <td>-3.81791</td>\n",
       "      <td>2.96832</td>\n",
       "      <td>1.46485</td>\n",
       "      <td>0.145949</td>\n",
       "      <td>1.49002</td>\n",
       "      <td>-3.64729</td>\n",
       "      <td>-4.91873</td>\n",
       "      <td>2.68418</td>\n",
       "      <td>2.43659</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>683667</th>\n",
       "      <td>1.75756</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.399711</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.70926</td>\n",
       "      <td>1.17482</td>\n",
       "      <td>3.7722</td>\n",
       "      <td>1.86838</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-6.34303</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684052</th>\n",
       "      <td>NaN</td>\n",
       "      <td>6.38732</td>\n",
       "      <td>-3.70724</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.57455</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.708</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684055</th>\n",
       "      <td>NaN</td>\n",
       "      <td>4.95212</td>\n",
       "      <td>-2.99645</td>\n",
       "      <td>-3.84107</td>\n",
       "      <td>3.6898</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.74045</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.214086</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.22649</td>\n",
       "      <td>3.34283</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684057</th>\n",
       "      <td>1.13197</td>\n",
       "      <td>6.39356</td>\n",
       "      <td>-2.41002</td>\n",
       "      <td>-4.39987</td>\n",
       "      <td>3.80699</td>\n",
       "      <td>2.15203</td>\n",
       "      <td>1.93716</td>\n",
       "      <td>0.463011</td>\n",
       "      <td>0.0960912</td>\n",
       "      <td>-6.73713</td>\n",
       "      <td>3.57179</td>\n",
       "      <td>3.57179</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Cisplatin Cetuximab Gemcitabine Bortezomib Tamoxifen Gefitinib  \\\n",
       "cell_line                                                                  \n",
       "683665      2.80727   6.29445    -4.40897   -3.81791   2.96832   1.46485   \n",
       "683667      1.75756       NaN   -0.399711        NaN   3.70926   1.17482   \n",
       "684052          NaN   6.38732    -3.70724        NaN   3.57455       NaN   \n",
       "684055          NaN   4.95212    -2.99645   -3.84107    3.6898       NaN   \n",
       "684057      1.13197   6.39356    -2.41002   -4.39987   3.80699   2.15203   \n",
       "\n",
       "          5-Fluorouracil  Afatinib Paclitaxel Docetaxel Lapatinib Erlotinib  \n",
       "cell_line                                                                    \n",
       "683665          0.145949   1.49002   -3.64729  -4.91873   2.68418   2.43659  \n",
       "683667            3.7722   1.86838        NaN  -6.34303       NaN       NaN  \n",
       "684052             4.708       NaN        NaN       NaN       NaN       NaN  \n",
       "684055           3.74045       NaN  -0.214086       NaN   3.22649   3.34283  \n",
       "684057           1.93716  0.463011  0.0960912  -6.73713   3.57179   3.57179  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# log(IC50) table from sheet 'TableS4A-IC50s': rows are indexed by cell_line, columns are compounds.\n",
    "ic50_sheet = pd.read_excel(tmp_dir+\"TableS4A.xlsx\",'TableS4A-IC50s')\n",
    "ic50_body = ic50_sheet.iloc[3:,:].drop(['TableS4A - Whole set of log(IC50s) across all the screened compounds and cell lines, related to Figure 4'],axis=1)\n",
    "# first remaining row -> column names; first remaining column -> row labels\n",
    "ic50_body.columns = ic50_body.iloc[0,:].values\n",
    "ic50_body = ic50_body.iloc[1:,:]\n",
    "df_ic50 = ic50_body.iloc[:,1:]\n",
    "df_ic50.index = pd.Index(ic50_body.iloc[:,0].values, name=\"cell_line\")\n",
    "df_ic50 = df_ic50.sort_values(by=\"cell_line\")\n",
    "df_ic50.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GDSC_response.\"+\"logIC50.all_drugs\"+\".tsv\",sep = \"\\t\")\n",
    "df_ic50[list(drugs)].head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'5-Fluorouracil': 1.1236,\n",
       " u'Afatinib': -0.22156,\n",
       " u'Bortezomib': -7.6275,\n",
       " u'Cetuximab': 5.144,\n",
       " u'Cisplatin': 1.3801,\n",
       " u'Docetaxel': -6.897,\n",
       " u'Erlotinib': 1.5671,\n",
       " u'Gefitinib': -0.05346,\n",
       " u'Gemcitabine': -5.9903,\n",
       " u'Lapatinib': 1.6257,\n",
       " u'Paclitaxel': -5.6772,\n",
       " u'Tamoxifen': 2.7296}"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-drug logIC50 thresholds as a plain dict, restricted to the selected drugs.\n",
    "# Built key by key so the cell can be re-run: the lookup works whether IC50_thr\n",
    "# is still the original Series or already the dict produced by a previous run.\n",
    "IC50_thr = {drug: IC50_thr[drug] for drug in drugs}\n",
    "IC50_thr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/olya/miniconda2/lib/python2.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  import sys\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cisplatin total: 850 R: 771 S: 79\n",
      "Cetuximab total: 873 R: 749 S: 124\n",
      "Gemcitabine total: 870 R: 815 S: 55\n",
      "Bortezomib total: 402 R: 371 S: 31\n",
      "Tamoxifen total: 928 R: 820 S: 108\n",
      "Gefitinib total: 846 R: 727 S: 119\n",
      "5-Fluorouracil total: 916 R: 822 S: 94\n",
      "Afatinib total: 849 R: 696 S: 153\n",
      "Paclitaxel total: 402 R: 376 S: 26\n",
      "Docetaxel total: 850 R: 784 S: 66\n",
      "Lapatinib total: 398 R: 337 S: 61\n",
      "Erlotinib total: 372 R: 308 S: 64\n"
     ]
    }
   ],
   "source": [
    "# One table per drug: binary response joined with logIC50 on the cell line index.\n",
    "# Each non-empty table is written to its own TSV; all tables are collected in df_long.\n",
    "df_long = []\n",
    "for drug in drugs:\n",
    "    # dropna() returns new frames, so no slice is modified in place\n",
    "    # (the in-place variant raised SettingWithCopyWarning)\n",
    "    d1 = df_ic50.loc[:,[drug]].dropna()\n",
    "    d1.columns = [\"logIC50\"]\n",
    "    d2 = df.loc[:,[drug]].dropna()\n",
    "    d2.columns = [\"response\"]\n",
    "    d = pd.concat([d2,d1],axis=1)\n",
    "    d[\"drug\"] = drug\n",
    "    d.index.name = \"sample_name\"\n",
    "    df_long.append(d)\n",
    "    if d.shape[0] >0 :\n",
    "        d.to_csv(root_dir+\"/preprocessed/annotations/\"+\"GDSC_response.\"+drug+\".tsv\",sep = \"\\t\")\n",
    "    # R/S counts are recomputed from the logIC50 threshold, not read from the response column\n",
    "    print(drug,\"total:\",d.shape[0],\"R:\",d.loc[d[\"logIC50\"]>IC50_thr[drug],:].shape[0],\n",
    "          \"S:\",d.loc[d[\"logIC50\"]<=IC50_thr[drug],:].shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>response</th>\n",
       "      <th>logIC50</th>\n",
       "      <th>drug</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>683665</th>\n",
       "      <td>R</td>\n",
       "      <td>2.807269</td>\n",
       "      <td>Cisplatin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>683667</th>\n",
       "      <td>R</td>\n",
       "      <td>1.757559</td>\n",
       "      <td>Cisplatin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684057</th>\n",
       "      <td>S</td>\n",
       "      <td>1.131967</td>\n",
       "      <td>Cisplatin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684059</th>\n",
       "      <td>S</td>\n",
       "      <td>0.877124</td>\n",
       "      <td>Cisplatin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684062</th>\n",
       "      <td>S</td>\n",
       "      <td>1.342990</td>\n",
       "      <td>Cisplatin</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            response   logIC50       drug\n",
       "sample_name                              \n",
       "683665             R  2.807269  Cisplatin\n",
       "683667             R  1.757559  Cisplatin\n",
       "684057             S  1.131967  Cisplatin\n",
       "684059             S  0.877124  Cisplatin\n",
       "684062             S  1.342990  Cisplatin"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the per-drug tables into one long table.\n",
    "# The guard keeps the cell re-runnable once df_long is already a DataFrame.\n",
    "if isinstance(df_long, list):\n",
    "    df_long = pd.concat(df_long)\n",
    "# builtin float replaces the np.float alias (removed from NumPy); plain column\n",
    "# assignment is used so the column dtype really becomes float\n",
    "df_long[\"logIC50\"] = df_long[\"logIC50\"].astype(float)\n",
    "df_long.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### log(IC50) in R and S groups "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7fa08a2c4cd0>"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 1440x504 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Split violins of logIC50 per drug, one half per response group.\n",
    "# Note: the violin outline is a smoothed density estimate and therefore imprecise.\n",
    "fig, ax = plt.subplots(figsize=(20,7))\n",
    "sns.violinplot(x=\"drug\", y=\"logIC50\", hue=\"response\",data=df_long,split=True,ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7fa08abc8b90>"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 1440x504 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Every cell line drawn as a point: logIC50 per drug, coloured by response group.\n",
    "fig, ax = plt.subplots(figsize=(20,7))\n",
    "sns.swarmplot(x=\"drug\", y=\"logIC50\", hue=\"response\",data=df_long,ax=ax)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CCLE \n",
    "\n",
    "#### Problems : \n",
    "\"GSM886946\":\"COLO-699\" and \"GSM887546\":\"RPMI 6666\" are absent from the cell line annotation file (CCLE_sample_info_file_2012-10-18.txt)\n",
    "\n",
    "drug response data:\n",
    "wget https://data.broadinstitute.org/ccle_legacy_data/pharmacological_profiling/CCLE_NP24.2009_Drug_data_2015.02.24.csv\n",
    "\n",
    "cell line annotation:\n",
    "wget https://data.broadinstitute.org/ccle_legacy_data/cell_line_annotations/CCLE_sample_info_file_2012-10-18.txt\n",
    "\n",
    "expression annotation\n",
    "wget  ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE36nnn/GSE36133/matrix/GSE36133_series_matrix.txt.gz\n",
    "\n",
    "\n",
    "#### Drugs \n",
    "\"Paclitaxel\",\"Erlotinib\"\n",
    "\n",
    "Targeted EGFRi : 'Erlotinib', 'Lapatinib', 'ZD-6474'\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>histology</th>\n",
       "      <th>histology subtype1</th>\n",
       "      <th>primary site</th>\n",
       "      <th>source</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>GSM886835</th>\n",
       "      <td>glioma</td>\n",
       "      <td>astrocytoma</td>\n",
       "      <td>central_nervous_system</td>\n",
       "      <td>ECACC</td>\n",
       "      <td>1321N1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM886836</th>\n",
       "      <td>osteosarcoma</td>\n",
       "      <td></td>\n",
       "      <td>bone</td>\n",
       "      <td>ATCC</td>\n",
       "      <td>143B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM886837</th>\n",
       "      <td>carcinoma</td>\n",
       "      <td></td>\n",
       "      <td>prostate</td>\n",
       "      <td>ATCC</td>\n",
       "      <td>22Rv1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM886838</th>\n",
       "      <td>carcinoma</td>\n",
       "      <td>adenocarcinoma</td>\n",
       "      <td>stomach</td>\n",
       "      <td>DSMZ</td>\n",
       "      <td>23132/87</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM886839</th>\n",
       "      <td>glioma</td>\n",
       "      <td>astrocytoma_Grade_IV</td>\n",
       "      <td>central_nervous_system</td>\n",
       "      <td>DSMZ</td>\n",
       "      <td>42-MG-BA</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              histology    histology subtype1            primary site source  \\\n",
       "GSM                                                                            \n",
       "GSM886835        glioma           astrocytoma  central_nervous_system  ECACC   \n",
       "GSM886836  osteosarcoma                                          bone   ATCC   \n",
       "GSM886837     carcinoma                                      prostate   ATCC   \n",
       "GSM886838     carcinoma        adenocarcinoma                 stomach   DSMZ   \n",
       "GSM886839        glioma  astrocytoma_Grade_IV  central_nervous_system   DSMZ   \n",
       "\n",
       "              title  \n",
       "GSM                  \n",
       "GSM886835    1321N1  \n",
       "GSM886836      143B  \n",
       "GSM886837     22Rv1  \n",
       "GSM886838  23132/87  \n",
       "GSM886839  42-MG-BA  "
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# download_GEO_matrix() and read_matrix() are helpers defined outside this cell;\n",
    "# presumably the first stores the series matrix under tmp_dir and returns its path - TODO confirm\n",
    "matrix_file = \"GSE36133_series_matrix.txt.gz\"\n",
    "matrix_remote_dir = '/geo/series/GSE36nnn/GSE36133/matrix/'\n",
    "fpath = download_GEO_matrix(matrix_file, matrix_remote_dir, destination=tmp_dir)\n",
    "df = read_matrix(fpath)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Dictionary \"Cell line primary name\" -> \"CCLE name\" from the CCLE sample info file.\n",
    "anno = pd.read_csv(root_dir+ \"/preprocessed/annotations/CCLE_sample_info_file_2012-10-18.txt\",sep = \"\\t\")\n",
    "anno = anno.loc[:,[\"CCLE name\",\"Cell line primary name\"]].set_index(\"Cell line primary name\",drop=True)\n",
    "CCLE_names_dict = anno[\"CCLE name\"].to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1046, 915, 917, 1046, {'COLO-699', 'RPMI 6666'})"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapped_CCLE_names = set(df[\"title\"].values).intersection(set(CCLE_names_dict.keys()))\n",
    "not_mapped_CCLE_names = set(df[\"title\"].values).difference(set(CCLE_names_dict.keys()))\n",
    "not_mapped_GSM = set(df[\"title\"].values).difference(set(CCLE_names_dict.keys()))\n",
    "len(CCLE_names_dict.keys()), len(mapped_CCLE_names), len(set(df[\"title\"].values)), len(set(CCLE_names_dict.keys())),not_mapped_CCLE_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>histology</th>\n",
       "      <th>histology subtype1</th>\n",
       "      <th>primary site</th>\n",
       "      <th>source</th>\n",
       "      <th>title</th>\n",
       "      <th>CCLE_name</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>GSM886835</th>\n",
       "      <td>glioma</td>\n",
       "      <td>astrocytoma</td>\n",
       "      <td>central_nervous_system</td>\n",
       "      <td>ECACC</td>\n",
       "      <td>1321N1</td>\n",
       "      <td>1321N1_CENTRAL_NERVOUS_SYSTEM</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM886836</th>\n",
       "      <td>osteosarcoma</td>\n",
       "      <td></td>\n",
       "      <td>bone</td>\n",
       "      <td>ATCC</td>\n",
       "      <td>143B</td>\n",
       "      <td>143B_BONE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM886837</th>\n",
       "      <td>carcinoma</td>\n",
       "      <td></td>\n",
       "      <td>prostate</td>\n",
       "      <td>ATCC</td>\n",
       "      <td>22Rv1</td>\n",
       "      <td>22RV1_PROSTATE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM886838</th>\n",
       "      <td>carcinoma</td>\n",
       "      <td>adenocarcinoma</td>\n",
       "      <td>stomach</td>\n",
       "      <td>DSMZ</td>\n",
       "      <td>23132/87</td>\n",
       "      <td>2313287_STOMACH</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GSM886839</th>\n",
       "      <td>glioma</td>\n",
       "      <td>astrocytoma_Grade_IV</td>\n",
       "      <td>central_nervous_system</td>\n",
       "      <td>DSMZ</td>\n",
       "      <td>42-MG-BA</td>\n",
       "      <td>42MGBA_CENTRAL_NERVOUS_SYSTEM</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              histology    histology subtype1            primary site source  \\\n",
       "GSM                                                                            \n",
       "GSM886835        glioma           astrocytoma  central_nervous_system  ECACC   \n",
       "GSM886836  osteosarcoma                                          bone   ATCC   \n",
       "GSM886837     carcinoma                                      prostate   ATCC   \n",
       "GSM886838     carcinoma        adenocarcinoma                 stomach   DSMZ   \n",
       "GSM886839        glioma  astrocytoma_Grade_IV  central_nervous_system   DSMZ   \n",
       "\n",
       "              title                      CCLE_name  \n",
       "GSM                                                 \n",
       "GSM886835    1321N1  1321N1_CENTRAL_NERVOUS_SYSTEM  \n",
       "GSM886836      143B                      143B_BONE  \n",
       "GSM886837     22Rv1                 22RV1_PROSTATE  \n",
       "GSM886838  23132/87                2313287_STOMACH  \n",
       "GSM886839  42-MG-BA  42MGBA_CENTRAL_NERVOUS_SYSTEM  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df.loc[df[\"title\"].isin(mapped_CCLE_names ),:]\n",
    "df[\"CCLE_name\"] = df[\"title\"].apply(lambda x: CCLE_names_dict[x]) #df[\"title\"] +\"_\" + df[\"primary site\"].apply(str.upper)\n",
    "df.to_csv(root_dir+ \"/preprocessed/annotations/CCLE_expessions.annotations.tsv\",sep = \"\\t\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1321N1_CENTRAL_NERVOUS_SYSTEM</th>\n",
       "      <th>143B_BONE</th>\n",
       "      <th>22RV1_PROSTATE</th>\n",
       "      <th>2313287_STOMACH</th>\n",
       "      <th>42MGBA_CENTRAL_NERVOUS_SYSTEM</th>\n",
       "      <th>5637_URINARY_TRACT</th>\n",
       "      <th>639V_URINARY_TRACT</th>\n",
       "      <th>647V_URINARY_TRACT</th>\n",
       "      <th>697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE</th>\n",
       "      <th>769P_KIDNEY</th>\n",
       "      <th>...</th>\n",
       "      <th>YAPC_PANCREAS</th>\n",
       "      <th>YD10B_UPPER_AERODIGESTIVE_TRACT</th>\n",
       "      <th>YD15_SALIVARY_GLAND</th>\n",
       "      <th>YD38_UPPER_AERODIGESTIVE_TRACT</th>\n",
       "      <th>YD8_UPPER_AERODIGESTIVE_TRACT</th>\n",
       "      <th>YH13_CENTRAL_NERVOUS_SYSTEM</th>\n",
       "      <th>YKG1_CENTRAL_NERVOUS_SYSTEM</th>\n",
       "      <th>YMB1_BREAST</th>\n",
       "      <th>ZR751_BREAST</th>\n",
       "      <th>ZR7530_BREAST</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.899963</td>\n",
       "      <td>5.495630</td>\n",
       "      <td>4.915154</td>\n",
       "      <td>4.386444</td>\n",
       "      <td>8.018313</td>\n",
       "      <td>4.440586</td>\n",
       "      <td>7.108326</td>\n",
       "      <td>4.974334</td>\n",
       "      <td>7.542390</td>\n",
       "      <td>4.769888</td>\n",
       "      <td>...</td>\n",
       "      <td>4.750698</td>\n",
       "      <td>4.899243</td>\n",
       "      <td>4.071518</td>\n",
       "      <td>4.630944</td>\n",
       "      <td>4.617563</td>\n",
       "      <td>7.214694</td>\n",
       "      <td>5.955572</td>\n",
       "      <td>8.455509</td>\n",
       "      <td>7.726960</td>\n",
       "      <td>6.890627</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.151092</td>\n",
       "      <td>4.867485</td>\n",
       "      <td>4.683452</td>\n",
       "      <td>3.770537</td>\n",
       "      <td>3.992411</td>\n",
       "      <td>3.668579</td>\n",
       "      <td>3.643959</td>\n",
       "      <td>3.799457</td>\n",
       "      <td>3.985218</td>\n",
       "      <td>3.708740</td>\n",
       "      <td>...</td>\n",
       "      <td>3.532178</td>\n",
       "      <td>4.184472</td>\n",
       "      <td>3.778447</td>\n",
       "      <td>3.898891</td>\n",
       "      <td>4.298302</td>\n",
       "      <td>4.583785</td>\n",
       "      <td>3.854023</td>\n",
       "      <td>4.372195</td>\n",
       "      <td>3.680397</td>\n",
       "      <td>4.555957</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>7.937206</td>\n",
       "      <td>7.653808</td>\n",
       "      <td>10.551317</td>\n",
       "      <td>9.345134</td>\n",
       "      <td>8.158181</td>\n",
       "      <td>7.965484</td>\n",
       "      <td>7.525776</td>\n",
       "      <td>8.514274</td>\n",
       "      <td>8.230191</td>\n",
       "      <td>8.323116</td>\n",
       "      <td>...</td>\n",
       "      <td>7.935024</td>\n",
       "      <td>7.825549</td>\n",
       "      <td>8.277187</td>\n",
       "      <td>8.173469</td>\n",
       "      <td>7.360998</td>\n",
       "      <td>7.899128</td>\n",
       "      <td>7.668647</td>\n",
       "      <td>9.483785</td>\n",
       "      <td>11.846274</td>\n",
       "      <td>9.143848</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>4.153359</td>\n",
       "      <td>4.368691</td>\n",
       "      <td>4.320320</td>\n",
       "      <td>4.784488</td>\n",
       "      <td>4.429411</td>\n",
       "      <td>4.048702</td>\n",
       "      <td>4.131885</td>\n",
       "      <td>4.547273</td>\n",
       "      <td>4.425889</td>\n",
       "      <td>4.545985</td>\n",
       "      <td>...</td>\n",
       "      <td>4.516905</td>\n",
       "      <td>4.596316</td>\n",
       "      <td>4.187752</td>\n",
       "      <td>4.478969</td>\n",
       "      <td>4.309859</td>\n",
       "      <td>4.176227</td>\n",
       "      <td>4.356414</td>\n",
       "      <td>5.568482</td>\n",
       "      <td>5.570161</td>\n",
       "      <td>4.547440</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>5.064883</td>\n",
       "      <td>4.265396</td>\n",
       "      <td>4.796432</td>\n",
       "      <td>7.778202</td>\n",
       "      <td>4.424837</td>\n",
       "      <td>5.518264</td>\n",
       "      <td>4.346809</td>\n",
       "      <td>4.667569</td>\n",
       "      <td>4.214931</td>\n",
       "      <td>5.945018</td>\n",
       "      <td>...</td>\n",
       "      <td>6.669781</td>\n",
       "      <td>4.458947</td>\n",
       "      <td>4.778975</td>\n",
       "      <td>6.045139</td>\n",
       "      <td>5.961929</td>\n",
       "      <td>4.457785</td>\n",
       "      <td>4.843793</td>\n",
       "      <td>5.031419</td>\n",
       "      <td>7.130748</td>\n",
       "      <td>5.497614</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 915 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    1321N1_CENTRAL_NERVOUS_SYSTEM  143B_BONE  22RV1_PROSTATE  2313287_STOMACH  \\\n",
       "1                        5.899963   5.495630        4.915154         4.386444   \n",
       "2                        4.151092   4.867485        4.683452         3.770537   \n",
       "9                        7.937206   7.653808       10.551317         9.345134   \n",
       "10                       4.153359   4.368691        4.320320         4.784488   \n",
       "12                       5.064883   4.265396        4.796432         7.778202   \n",
       "\n",
       "    42MGBA_CENTRAL_NERVOUS_SYSTEM  5637_URINARY_TRACT  639V_URINARY_TRACT  \\\n",
       "1                        8.018313            4.440586            7.108326   \n",
       "2                        3.992411            3.668579            3.643959   \n",
       "9                        8.158181            7.965484            7.525776   \n",
       "10                       4.429411            4.048702            4.131885   \n",
       "12                       4.424837            5.518264            4.346809   \n",
       "\n",
       "    647V_URINARY_TRACT  697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE  769P_KIDNEY  \\\n",
       "1             4.974334                                7.542390     4.769888   \n",
       "2             3.799457                                3.985218     3.708740   \n",
       "9             8.514274                                8.230191     8.323116   \n",
       "10            4.547273                                4.425889     4.545985   \n",
       "12            4.667569                                4.214931     5.945018   \n",
       "\n",
       "        ...        YAPC_PANCREAS  YD10B_UPPER_AERODIGESTIVE_TRACT  \\\n",
       "1       ...             4.750698                         4.899243   \n",
       "2       ...             3.532178                         4.184472   \n",
       "9       ...             7.935024                         7.825549   \n",
       "10      ...             4.516905                         4.596316   \n",
       "12      ...             6.669781                         4.458947   \n",
       "\n",
       "    YD15_SALIVARY_GLAND  YD38_UPPER_AERODIGESTIVE_TRACT  \\\n",
       "1              4.071518                        4.630944   \n",
       "2              3.778447                        3.898891   \n",
       "9              8.277187                        8.173469   \n",
       "10             4.187752                        4.478969   \n",
       "12             4.778975                        6.045139   \n",
       "\n",
       "    YD8_UPPER_AERODIGESTIVE_TRACT  YH13_CENTRAL_NERVOUS_SYSTEM  \\\n",
       "1                        4.617563                     7.214694   \n",
       "2                        4.298302                     4.583785   \n",
       "9                        7.360998                     7.899128   \n",
       "10                       4.309859                     4.176227   \n",
       "12                       5.961929                     4.457785   \n",
       "\n",
       "    YKG1_CENTRAL_NERVOUS_SYSTEM  YMB1_BREAST  ZR751_BREAST  ZR7530_BREAST  \n",
       "1                      5.955572     8.455509      7.726960       6.890627  \n",
       "2                      3.854023     4.372195      3.680397       4.555957  \n",
       "9                      7.668647     9.483785     11.846274       9.143848  \n",
       "10                     4.356414     5.568482      5.570161       4.547440  \n",
       "12                     4.843793     5.031419      7.130748       5.497614  \n",
       "\n",
       "[5 rows x 915 columns]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ccle_exprs = pd.read_csv(root_dir+\"/preprocessed/exprs/GSE36133.BrainArray.RMAlog2Average.ENTREZID.Expr.tsv\", sep =\"\\t\")\n",
    "samples_annotated = set(ccle_exprs.columns.values).intersection(set(df.index.values))\n",
    "#df.loc[samples_annotated ,\"CCLE_name\"].to_dict()\n",
    "ccle_exprs = ccle_exprs.loc[:,samples_annotated]\n",
    "ccle_exprs.rename(df.loc[samples_annotated ,\"CCLE_name\"].to_dict(),axis=\"columns\",inplace=True)\n",
    "ccle_exprs.sort_index(inplace = True)\n",
    "ccle_exprs.sort_index(inplace = True,axis=1)\n",
    "#ccle_exprs.to_csv(root_dir+\"/preprocessed/exprs/GSE36133.BrainArray.RMAlog2Average.ENTREZID.Expr_renamed.tsv\",sep =\"\\t\")\n",
    "ccle_exprs.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "response = pd.read_csv(root_dir+\"/preprocessed/annotations/\"+\"CCLE_NP24.2009_Drug_data_2015.02.24.csv\",sep=\",\")\n",
    "set(response[\"Compound\"].values)\n",
    "response_9drugs = response.loc[response[\"Compound\"].isin([\"Paclitaxel\",\"Erlotinib\"]),:]\n",
    "response_EGFR = response.loc[response[\"Compound\"].isin(['Erlotinib', 'Lapatinib', 'ZD-6474']),:]\n",
    "response_9drugs.set_index(\"CCLE Cell Line Name\",inplace=True)\n",
    "response_9drugs.index.name = \"CCLE_name\"\n",
    "response_9drugs.to_csv(root_dir+\"/preprocessed/annotations/\"+\"CCLE.responses.Paclitaxel_Erlotinib.tsv\", sep = \"\\t\")\n",
    "response_EGFR.set_index(\"CCLE Cell Line Name\",inplace=True)\n",
    "response_EGFR.index.name = \"CCLE_name\"\n",
    "response_EGFR.to_csv(root_dir+\"/preprocessed/annotations/\"+\"CCLE.responses.EGFRi.tsv\", sep = \"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}