[d90d15]: / preprocessing_scr / CNA.ipynb

Download this file

7241 lines (7240 with data), 306.0 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "import pandas as pd\n",
    "import os,sys\n",
    "import pybedtools as pbt\n",
    "from StringIO import StringIO\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import time\n",
    "from mapper import expand, parse_mapping_table, apply_mappers\n",
    "%matplotlib inline\n",
    "\n",
    "\n",
    "chr_dict = dict(zip(range(1,22),map(str,range(1,22))))\n",
    "chr_dict.update({22: 'X', 23: \"Y\"})\n",
    "\n",
    "root_dir = \"/home/olya/SFU/Hossein/v2/\"\n",
    "gene_coords_file = root_dir + \"ref_GRCh37.p5_top_level.gff3.bed\" # must contain chromosome, start, end and Entrez Gene ID for hg19"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TCGA \n",
    "\n",
    "Assume that segmentation files from GDAC : http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/*/*snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt are dowmnoaded\n",
    "\n",
    "1) Filtering segments:\n",
    " - segments containing less than 5 probes removed\n",
    " - keep only segments with segment mean below -0.23 or above 0.2. This means that one copy gains and losses are detectable when their CCF (canncer cell fraction) is 0.3 or higher. \n",
    " \n",
    "TODO: remove segements overlapping with germline CNA forund in normals (add this as the first step)\n",
    "2). For each samples aggregte to gene-level:\n",
    " - rename chromosomes 22 and 23 to X and Y\n",
    " - overpal segemntation file with Entrez gene coordinates for hg19\n",
    " - if a gene overlaps by multiple segments, keep the one with most extreme values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_marker_thr = 5\n",
    "# to detect 1 copy gains or losses presenting at CCF >= 0.3\n",
    "pos_seg_mean_thr = 0.20\n",
    "neg_seg_mean_thr = -0.23 \n",
    "\n",
    "preprocessed_dir = root_dir+\"preprocessed/CNA/\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "### functions for TCGA and CCLE #################################\n",
    "def filter_lowconf_segments(df,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr ):\n",
    "    # filter low-confidence segments with too few probes\n",
    "    df = df[df[\"Num_Probes\"] >= num_marker_thr ]\n",
    "    # filter low-confidence segments with Segment_Mean too close to zero:\n",
    "    df = df[ (df[\"Segment_Mean\"] >= pos_seg_mean_thr) | (df[\"Segment_Mean\"] <= neg_seg_mean_thr)]\n",
    "    return df\n",
    "\n",
    "def sample_type(barcode):\n",
    "    if barcode[13:16] in [\"10A\",\"10B\",\"11A\",\"11B\",\"10C\",\"11C\"]:\n",
    "        return \"Normal\"\n",
    "    else:\n",
    "        return \"Tumor\"\n",
    "\n",
    "def find_matching_normal(tumor_barcode,barcodes_list):\n",
    "    patient_id = tumor_barcode[:12]\n",
    "    normal_barcodes = []\n",
    "    for barcode in barcodes_list:\n",
    "        if barcode.startswith(patient_id) and sample_type(barcode) == \"Normal\":\n",
    "            normal_barcodes.append(barcode)\n",
    "    return normal_barcodes\n",
    "\n",
    "def cnv2bed(seg):\n",
    "    #cnv_bed = seg[[\"Chromosome\",\"Start\",\"End\",\"Segment_Mean\"]]\n",
    "    #cnv_bed.columns = [\"chrom\",\"start\",\"stop\",\"Segment_Mean\"]\n",
    "    cnv_bed = seg.rename({\"Chromosome\":\"chrom\",\"Start\":\"start\",\n",
    "                          \"End\":\"stop\"},axis=\"columns\")\n",
    "    cnv_bed = cnv_bed.loc[:,[\"chrom\",\"start\",\"stop\",\"Segment_Mean\",\"Sample\",\"Num_Probes\"]]\n",
    "    return  pbt.BedTool.from_dataframe(cnv_bed)\n",
    "\n",
    "def bed2cnv(cnv_bed):\n",
    "    cnv_bed = str(cnv_bed)\n",
    "    if len(cnv_bed) > 0:\n",
    "        seg = pd.read_csv(StringIO(cnv_bed),sep = \"\\t\",header=None)\n",
    "        seg.columns = [\"Chromosome\",\"Start\",\"End\",\"Segment_Mean\",\"Sample\",\"Num_Probes\"]\n",
    "        seg = seg.loc[:,[\"Sample\",\"Chromosome\",\"Start\",\"End\",\"Num_Probes\",\"Segment_Mean\"]]\n",
    "    else:\n",
    "        seg = pd.DataFrame(columns=[\"Sample\",\"Chromosome\",\"Start\",\"End\",\"Num_Probes\",\"Segment_Mean\"])\n",
    "    return seg\n",
    "def remove_ovelapping_segments(tumor, normal,sample_name):\n",
    "    tumor_bed = cnv2bed(tumor)\n",
    "    normal_bed = cnv2bed(normal)\n",
    "    tumor_wo_germline = tumor_bed.subtract(normal_bed,r=True,f=0.8,A=True)\n",
    "    tumor_wo_germline  = bed2cnv(tumor_wo_germline)\n",
    "    n_segs_removed = tumor.shape[0] - tumor_wo_germline.shape[0]\n",
    "    if n_segs_removed*1.0/tumor.shape[0] > 0.5 and n_segs_removed>5 :\n",
    "        print(n_segs_removed,\"of\",tumor.shape[0],\"segments removed in\",sample_name,\"due to overlap with normal\",file = sys.stderr)\n",
    "    return tumor_wo_germline\n",
    "\n",
    "def cnv2genelevel(cnv_bed,gene_intervals_bed,sample_name,verbose = True,sorted_index = \"\"):\n",
    "    \n",
    "    # intersect \n",
    "    cnv2gene = str(gene_intervals_bed.intersect(cnv_bed,wb = True,wa=True))\n",
    "    if len(cnv2gene)==0: # if no intersection, return all zeroes\n",
    "        print(sample_name,\"has no genes with altered CN\",file = sys.stderr)\n",
    "        return pd.DataFrame(columns=[sample])\n",
    "    cnv2gene = pd.read_csv(StringIO(cnv2gene),sep = \"\\t\",header=None)\n",
    "    cnv2gene = cnv2gene[[3,7]].copy()\n",
    "    cnv2gene.columns = [\"gene\",\"Segment_Mean\"] \n",
    "    \n",
    "    # find genes overlapping with more than one segment:\n",
    "    # take the most exterme segement_mean value\n",
    "    \n",
    "    dups = cnv2gene.loc[cnv2gene.duplicated(subset=[\"gene\"],keep=False),]\n",
    "    if dups.shape[0] > 0:\n",
    "        cnv2gene = cnv2gene.drop_duplicates(subset=[\"gene\"],keep=False)\n",
    "        dups[\"abs_seg_mean\"] = abs(dups[\"Segment_Mean\"])\n",
    "        if verbose:\n",
    "            print(sample_name,\"contain \",len(set(dups[\"gene\"].values)),\"genes overalpped with more than one segment\",file=sys.stderr)\n",
    "            #print(dups.head(10),file=sys.stderr)\n",
    "        dups = dups.groupby(['gene'], group_keys=False).apply(lambda row: row.loc[row['abs_seg_mean'].idxmax()])\n",
    "        cnv2gene = pd.concat([cnv2gene,dups],sort=False)\n",
    "\n",
    "    cnv2gene = cnv2gene[[\"gene\",\"Segment_Mean\"]]\n",
    "    cnv2gene.set_index(\"gene\",inplace=True,drop=True)\n",
    "    cnv2gene.rename(int,axis=0,inplace=True)\n",
    "    # add copy-neutral genes with 0s\n",
    "    \n",
    "    cnv2gene = cnv2gene.loc[sorted_index,:]\n",
    "    cnv2gene.columns = [sample_name]\n",
    "    return cnv2gene\n",
    "\n",
    "\n",
    "### functions for GDSC and PDX #################################\n",
    "\n",
    "def CN2log2R(col, median_ploidy=2 ):\n",
    "    # this is fr GDSC only\n",
    "    lRs = []\n",
    "    genes = col.index.values\n",
    "    for code in col.values:\n",
    "        if not code == \"-1,-1,-,-\":\n",
    "            [max_cn,min_cn,zygosity,disruption] = code.split(\",\")\n",
    "            if int(max_cn) == 0:\n",
    "                lRs.append(-4.32) # CN=0 with 95% purity\n",
    "            else:\n",
    "                max_lR = np.log2(float(max_cn)/median_ploidy)\n",
    "                if not disruption == \"D\":\n",
    "                    lRs.append(max_lR)\n",
    "                else:\n",
    "                    if int(min_cn) == 0:\n",
    "                        min_lR = -4.32\n",
    "                    else:\n",
    "                        min_lR = np.log2(float(min_cn)/median_ploidy)\n",
    "                    if abs(min_lR) > abs(max_lR):\n",
    "                        lRs.append(min_lR)\n",
    "                    else:\n",
    "                        lRs.append(max_lR)\n",
    "                \n",
    "        else:\n",
    "            lRs.append(np.NaN)\n",
    "    return pd.Series(dict(zip(genes, lRs)))\n",
    "\n",
    "def define_avg_ploidy(col):\n",
    "    n,pl = 0,0\n",
    "    CN_non_disrupted = []\n",
    "    for code in col.values:\n",
    "        if not code == \"-1,-1,-,-\":\n",
    "            [max_cn,min_cn,zygosity,disruption] = code.split(\",\")\n",
    "            n+=1\n",
    "            cn = (int(max_cn)+int(min_cn))*0.5\n",
    "            pl += cn\n",
    "            if not disruption == \"D\":\n",
    "                CN_non_disrupted.append((cn))\n",
    "    return pd.Series({\"avg_pl\":pl/n , \"median_pl\":np.median(CN_non_disrupted)})\n",
    "\n",
    "def clean_logR(logR_value, pos_seg_mean_thr, neg_seg_mean_thr):\n",
    "    if logR_value >= pos_seg_mean_thr:\n",
    "        return logR_value \n",
    "    elif logR_value <= neg_seg_mean_thr:\n",
    "            return logR_value \n",
    "    else:\n",
    "        return 0\n",
    "    \n",
    "def handle_dups(df,corr_thr = 0.75):\n",
    "    '''Detect dupliated row IDs. Merge 2 or more rows with the same ID, \n",
    "    if averaged correlation in all pairvise comparision is >= corr_thhr;\\n\n",
    "    otherwise drop all duplicates.  Keeps abs. max value (negative preferred).'''\n",
    "    dups = df.index\n",
    "    dups = list(set(dups[dups.duplicated()]))\n",
    "    if len(dups)==0:\n",
    "        print(\"No duplicated row IDs. Do nothing.\")\n",
    "        return df\n",
    "    print(len(dups), \"duplicated IDs in\",df.loc[dups,:].shape[0],\"rows found.\")\n",
    "    dups_merge = [] # if corr > corr_thr\n",
    "    dups_remove = [] # corr < \n",
    "    for dup in dups:\n",
    "        r = df.loc[dup,:].T.corr()\n",
    "        n_dups = df.loc[dup,:].shape[0]\n",
    "        r_avg = []\n",
    "        for i in range(0,n_dups):\n",
    "            for j in range(i+1,n_dups):\n",
    "                r_avg.append(r.iloc[i,j])\n",
    "        if np.average(r_avg) < corr_thr :\n",
    "            #print(dup,r_avg, n_dups)\n",
    "            dups_remove.append(dup)\n",
    "        else:\n",
    "            dups_merge.append(dup)\n",
    "    \n",
    "    # remove not similar duplicates\n",
    "    df_size = df.shape[0]\n",
    "    df = df.loc[~df.index.isin(dups_remove),:]\n",
    "    print(\"duplicate rows removed due to low correlation of duplicated profiles\",df_size -df.shape[0] )\n",
    "    df_size = df.shape[0]\n",
    "    \n",
    "    # merge simialr duplicates\n",
    "    d1 = df.loc[~df.index.isin(dups_merge),:]\n",
    "    d2 = df.loc[dups_merge,:]\n",
    "    d2 = d2.groupby(d2.index).agg(lambda x: -max(-x.max(),-x.min(),key= abs))\n",
    "    df = pd.concat([d1,d2])\n",
    "    df.sort_index(inplace=True)\n",
    "    print(\"Merged \",df_size-df.shape[0]+len(dups_merge),\"duplicated rows into\",len(dups_merge),\"rows\")\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### next few tabs demonstrate necessity of removing low-confidence and germline segments:  \n",
    "\n",
    "(e.g. fragment 11:126596926-127130276 presents in both tumor and normal\n",
    "therefore, it is germline; see chr11:126596926-12713027 in UCSC browser - it covers part of KIRELL3)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "segemtns in tumor 204 segemtns in normal 121\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sample</th>\n",
       "      <th>Chromosome</th>\n",
       "      <th>Start</th>\n",
       "      <th>End</th>\n",
       "      <th>Num_Probes</th>\n",
       "      <th>Segment_Mean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>57803</th>\n",
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
       "      <td>11</td>\n",
       "      <td>456120</td>\n",
       "      <td>8896255</td>\n",
       "      <td>4489.0</td>\n",
       "      <td>-0.0113</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57804</th>\n",
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
       "      <td>11</td>\n",
       "      <td>8899400</td>\n",
       "      <td>8899668</td>\n",
       "      <td>3.0</td>\n",
       "      <td>-1.3344</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57805</th>\n",
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
       "      <td>11</td>\n",
       "      <td>8900394</td>\n",
       "      <td>126596817</td>\n",
       "      <td>67487.0</td>\n",
       "      <td>0.0010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57806</th>\n",
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
       "      <td>11</td>\n",
       "      <td>126596926</td>\n",
       "      <td>127130276</td>\n",
       "      <td>453.0</td>\n",
       "      <td>-1.0306</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57807</th>\n",
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
       "      <td>11</td>\n",
       "      <td>127132920</td>\n",
       "      <td>128342803</td>\n",
       "      <td>864.0</td>\n",
       "      <td>-0.0031</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57808</th>\n",
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
       "      <td>11</td>\n",
       "      <td>128342819</td>\n",
       "      <td>128350888</td>\n",
       "      <td>44.0</td>\n",
       "      <td>0.2824</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57809</th>\n",
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
       "      <td>11</td>\n",
       "      <td>128353007</td>\n",
       "      <td>134142530</td>\n",
       "      <td>3708.0</td>\n",
       "      <td>0.0082</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             Sample  Chromosome      Start        End  \\\n",
       "57803  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11     456120    8896255   \n",
       "57804  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11    8899400    8899668   \n",
       "57805  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11    8900394  126596817   \n",
       "57806  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11  126596926  127130276   \n",
       "57807  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11  127132920  128342803   \n",
       "57808  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11  128342819  128350888   \n",
       "57809  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11  128353007  134142530   \n",
       "\n",
       "       Num_Probes  Segment_Mean  \n",
       "57803      4489.0       -0.0113  \n",
       "57804         3.0       -1.3344  \n",
       "57805     67487.0        0.0010  \n",
       "57806       453.0       -1.0306  \n",
       "57807       864.0       -0.0031  \n",
       "57808        44.0        0.2824  \n",
       "57809      3708.0        0.0082  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#file_path = \"../../TCGA/CNA/data/gdac.broadinstitute.org_CESC.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015082100.0.0/CESC.snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt\"\n",
    "file_path = \"../../TCGA/CNA/data__2016_01_28/gdac.broadinstitute.org_CESC.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2016012800.0.0/CESC.snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt\"\n",
    "df = pd.read_csv(file_path, sep = \"\\t\")\n",
    "tumor_barcode = \"TCGA-ZJ-AAXJ-01A-11D-A42N-01\"\n",
    "t = df.loc[df[\"Sample\"]==tumor_barcode,:]\n",
    "t_shape = t.shape[0]\n",
    "n = find_matching_normal(tumor_barcode,list(set(df[\"Sample\"].values)))\n",
    "n = df.loc[df[\"Sample\"]==n[0],:]\n",
    "print(\"segemtns in tumor\",t.shape[0],\"segemtns in normal\",n.shape[0])\n",
    "\n",
    "n.loc[n['Chromosome']==11,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sample</th>\n",
       "      <th>Chromosome</th>\n",
       "      <th>Start</th>\n",
       "      <th>End</th>\n",
       "      <th>Num_Probes</th>\n",
       "      <th>Segment_Mean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>57960</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>456120</td>\n",
       "      <td>64200041</td>\n",
       "      <td>34710.0</td>\n",
       "      <td>0.0054</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57961</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>64208988</td>\n",
       "      <td>64319750</td>\n",
       "      <td>61.0</td>\n",
       "      <td>-0.6748</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57962</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>64325209</td>\n",
       "      <td>126596817</td>\n",
       "      <td>37207.0</td>\n",
       "      <td>0.0571</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57963</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>126596926</td>\n",
       "      <td>127130276</td>\n",
       "      <td>454.0</td>\n",
       "      <td>-1.0760</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57964</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>127132920</td>\n",
       "      <td>132080656</td>\n",
       "      <td>3591.0</td>\n",
       "      <td>0.0449</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57965</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>132080885</td>\n",
       "      <td>132099465</td>\n",
       "      <td>15.0</td>\n",
       "      <td>-0.6123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57966</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>132099856</td>\n",
       "      <td>134142530</td>\n",
       "      <td>1010.0</td>\n",
       "      <td>0.0483</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             Sample  Chromosome      Start        End  \\\n",
       "57960  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11     456120   64200041   \n",
       "57961  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11   64208988   64319750   \n",
       "57962  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11   64325209  126596817   \n",
       "57963  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  126596926  127130276   \n",
       "57964  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  127132920  132080656   \n",
       "57965  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  132080885  132099465   \n",
       "57966  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  132099856  134142530   \n",
       "\n",
       "       Num_Probes  Segment_Mean  \n",
       "57960     34710.0        0.0054  \n",
       "57961        61.0       -0.6748  \n",
       "57962     37207.0        0.0571  \n",
       "57963       454.0       -1.0760  \n",
       "57964      3591.0        0.0449  \n",
       "57965        15.0       -0.6123  \n",
       "57966      1010.0        0.0483  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t.loc[t[\"Chromosome\"] ==11,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "segemtns in normal after dropping low.conf.: 38\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sample</th>\n",
       "      <th>Chromosome</th>\n",
       "      <th>Start</th>\n",
       "      <th>End</th>\n",
       "      <th>Num_Probes</th>\n",
       "      <th>Segment_Mean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>57804</th>\n",
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
       "      <td>11</td>\n",
       "      <td>8899400</td>\n",
       "      <td>8899668</td>\n",
       "      <td>3.0</td>\n",
       "      <td>-1.3344</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57806</th>\n",
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
       "      <td>11</td>\n",
       "      <td>126596926</td>\n",
       "      <td>127130276</td>\n",
       "      <td>453.0</td>\n",
       "      <td>-1.0306</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             Sample  Chromosome      Start        End  \\\n",
       "57804  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11    8899400    8899668   \n",
       "57806  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11  126596926  127130276   \n",
       "\n",
       "       Num_Probes  Segment_Mean  \n",
       "57804         3.0       -1.3344  \n",
       "57806       453.0       -1.0306  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n = filter_lowconf_segments(n,0,0.46, -0.68 )\n",
    "print(\"segemtns in normal after dropping low.conf.:\",n.shape[0])\n",
    "n.loc[n['Chromosome']==11,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "segemtns in tumor after removing germlines: 194\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sample</th>\n",
       "      <th>Chromosome</th>\n",
       "      <th>Start</th>\n",
       "      <th>End</th>\n",
       "      <th>Num_Probes</th>\n",
       "      <th>Segment_Mean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>456120</td>\n",
       "      <td>64200041</td>\n",
       "      <td>34710.0</td>\n",
       "      <td>0.0054</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>64208988</td>\n",
       "      <td>64319750</td>\n",
       "      <td>61.0</td>\n",
       "      <td>-0.6748</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>64325209</td>\n",
       "      <td>126596817</td>\n",
       "      <td>37207.0</td>\n",
       "      <td>0.0571</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>127132920</td>\n",
       "      <td>132080656</td>\n",
       "      <td>3591.0</td>\n",
       "      <td>0.0449</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>132080885</td>\n",
       "      <td>132099465</td>\n",
       "      <td>15.0</td>\n",
       "      <td>-0.6123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>132099856</td>\n",
       "      <td>134142530</td>\n",
       "      <td>1010.0</td>\n",
       "      <td>0.0483</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           Sample  Chromosome      Start        End  \\\n",
       "96   TCGA-ZJ-AAXJ-01A-11D-A42N-01          11     456120   64200041   \n",
       "97   TCGA-ZJ-AAXJ-01A-11D-A42N-01          11   64208988   64319750   \n",
       "98   TCGA-ZJ-AAXJ-01A-11D-A42N-01          11   64325209  126596817   \n",
       "99   TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  127132920  132080656   \n",
       "100  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  132080885  132099465   \n",
       "101  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  132099856  134142530   \n",
       "\n",
       "     Num_Probes  Segment_Mean  \n",
       "96      34710.0        0.0054  \n",
       "97         61.0       -0.6748  \n",
       "98      37207.0        0.0571  \n",
       "99       3591.0        0.0449  \n",
       "100        15.0       -0.6123  \n",
       "101      1010.0        0.0483  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "t = remove_ovelapping_segments(t, n,tumor_barcode)\n",
    "print(\"segemtns in tumor after removing germlines:\",t.shape[0])\n",
    "t.loc[t[\"Chromosome\"] ==11,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "segemtns in tumor after dropping low.conf.: 101\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sample</th>\n",
       "      <th>Chromosome</th>\n",
       "      <th>Start</th>\n",
       "      <th>End</th>\n",
       "      <th>Num_Probes</th>\n",
       "      <th>Segment_Mean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>64208988</td>\n",
       "      <td>64319750</td>\n",
       "      <td>61.0</td>\n",
       "      <td>-0.6748</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
       "      <td>11</td>\n",
       "      <td>132080885</td>\n",
       "      <td>132099465</td>\n",
       "      <td>15.0</td>\n",
       "      <td>-0.6123</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           Sample  Chromosome      Start        End  \\\n",
       "97   TCGA-ZJ-AAXJ-01A-11D-A42N-01          11   64208988   64319750   \n",
       "100  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  132080885  132099465   \n",
       "\n",
       "     Num_Probes  Segment_Mean  \n",
       "97         61.0       -0.6748  \n",
       "100        15.0       -0.6123  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t = filter_lowconf_segments(t,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n",
    "print(\"segemtns in tumor after dropping low.conf.:\",t.shape[0])\n",
    "t.loc[t[\"Chromosome\"] ==11,:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TCGA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HNSC samples: 1089 CNA events per sample on avg.: 101.275482094\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 1089 tumors: 530 normals: 559\n",
      "\ttumors without matched normal 28\n",
      "\ttumors with at least one sCNA 497\n",
      "\ttumors without any somatic CNA 5\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "HNSC samples: 525 Segments per sample on avg.: 60.6876190476\n",
      "ESCA samples: 373 CNA events per sample on avg.: 163.010723861\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 373 tumors: 185 normals: 188\n",
      "\ttumors without matched normal 3\n",
      "\ttumors with at least one sCNA 181\n",
      "\ttumors without any somatic CNA 1\n",
      "total samples: 248 tumors: 125 normals: 123\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "ESCA samples: 184 Segments per sample on avg.: 141.836956522\n",
      "THYM samples: 248 CNA events per sample on avg.: 62.7862903226\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 5\n",
      "\ttumors with at least one sCNA 95\n",
      "\ttumors without any somatic CNA 25\n",
      "total samples: 132 tumors: 66 normals: 66\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "THYM samples: 100 Segments per sample on avg.: 9.41\n",
      "KICH samples: 132 CNA events per sample on avg.: 77.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 0\n",
      "\ttumors with at least one sCNA 65\n",
      "\ttumors without any somatic CNA 1\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "KICH samples: 65 Segments per sample on avg.: 51.4923076923\n",
      "LUSC samples: 1032 CNA events per sample on avg.: 130.682170543\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 1032 tumors: 501 normals: 531\n",
      "\ttumors without matched normal 23\n",
      "\ttumors with at least one sCNA 476\n",
      "\ttumors without any somatic CNA 2\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "LUSC samples: 499 Segments per sample on avg.: 94.6533066132\n",
      "BLCA samples: 797 CNA events per sample on avg.: 130.927227102\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 797 tumors: 414 normals: 383\n",
      "\ttumors without matched normal 46\n",
      "\ttumors with at least one sCNA 366\n",
      "\ttumors without any somatic CNA 2\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "BLCA samples: 412 Segments per sample on avg.: 94.8859223301\n",
      "GBM samples: 1104 CNA events per sample on avg.: 133.018115942\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 1104 tumors: 590 normals: 514\n",
      "\ttumors without matched normal 78\n",
      "\ttumors with at least one sCNA 511\n",
      "\ttumors without any somatic CNA 1\n",
      "total samples: 85 tumors: 36 normals: 49\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "GBM samples: 589 Segments per sample on avg.: 70.2139219015\n",
      "CHOL samples: 85 CNA events per sample on avg.: 89.0588235294\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 0\n",
      "\ttumors with at least one sCNA 36\n",
      "\ttumors without any somatic CNA 0\n",
      "total samples: 111 tumors: 56 normals: 55\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "CHOL samples: 36 Segments per sample on avg.: 56.6944444444\n",
      "UCS samples: 111 CNA events per sample on avg.: 173.855855856\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 2\n",
      "\ttumors with at least one sCNA 54\n",
      "\ttumors without any somatic CNA 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "UCS samples: 56 Segments per sample on avg.: 179.125\n",
      "LGG samples: 1015 CNA events per sample on avg.: 78.6118226601\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 1015 tumors: 530 normals: 485\n",
      "\ttumors without matched normal 33\n",
      "\ttumors with at least one sCNA 494\n",
      "\ttumors without any somatic CNA 3\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "LGG samples: 527 Segments per sample on avg.: 29.1157495256\n",
      "THCA samples: 1013 CNA events per sample on avg.: 54.4096742349\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 1013 tumors: 506 normals: 507\n",
      "\ttumors without matched normal 15\n",
      "\ttumors with at least one sCNA 367\n",
      "\ttumors without any somatic CNA 124\n",
      "total samples: 365 tumors: 185 normals: 180\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "THCA samples: 382 Segments per sample on avg.: 3.8219895288\n",
      "PAAD samples: 365 CNA events per sample on avg.: 95.3643835616\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 10\n",
      "\ttumors with at least one sCNA 161\n",
      "\ttumors without any somatic CNA 14\n",
      "total samples: 1059 tumors: 529 normals: 530\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "PAAD samples: 171 Segments per sample on avg.: 32.4093567251\n",
      "KIRC samples: 1059 CNA events per sample on avg.: 80.298394712\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 22\n",
      "\ttumors with at least one sCNA 501\n",
      "\ttumors without any somatic CNA 6\n",
      "total samples: 160 tumors: 80 normals: 80\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "KIRC samples: 523 Segments per sample on avg.: 20.5009560229\n",
      "UVM samples: 160 CNA events per sample on avg.: 81.08125\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 0\n",
      "\ttumors with at least one sCNA 80\n",
      "\ttumors without any somatic CNA 0\n",
      "total samples: 586 tumors: 297 normals: 289\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "UVM samples: 80 Segments per sample on avg.: 38.425\n",
      "CESC samples: 586 CNA events per sample on avg.: 101.450511945\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 16\n",
      "\ttumors with at least one sCNA 280\n",
      "\ttumors without any somatic CNA 1\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "CESC samples: 296 Segments per sample on avg.: 58.1351351351\n",
      "LUAD samples: 1095 CNA events per sample on avg.: 105.78630137\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 1095 tumors: 518 normals: 577\n",
      "\ttumors without matched normal 19\n",
      "\ttumors with at least one sCNA 494\n",
      "\ttumors without any somatic CNA 5\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "LUAD samples: 513 Segments per sample on avg.: 70.469785575\n",
      "STAD samples: 904 CNA events per sample on avg.: 130.961283186\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 904 tumors: 442 normals: 462\n",
      "\ttumors without matched normal 26\n",
      "\ttumors with at least one sCNA 410\n",
      "\ttumors without any somatic CNA 6\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "STAD samples: 436 Segments per sample on avg.: 96.4220183486\n",
      "UCEC samples: 1089 CNA events per sample on avg.: 116.707070707\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 1089 tumors: 540 normals: 549\n",
      "\ttumors without matched normal 23\n",
      "\ttumors with at least one sCNA 504\n",
      "\ttumors without any somatic CNA 13\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "UCEC samples: 527 Segments per sample on avg.: 78.89943074\n",
      "SKCM samples: 937 CNA events per sample on avg.: 115.351120598\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 937 tumors: 472 normals: 465\n",
      "\ttumors without matched normal 7\n",
      "\ttumors with at least one sCNA 463\n",
      "\ttumors without any somatic CNA 2\n",
      "total samples: 172 tumors: 87 normals: 85\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "SKCM samples: 470 Segments per sample on avg.: 82.9957446809\n",
      "MESO samples: 172 CNA events per sample on avg.: 106.598837209\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 2\n",
      "\ttumors with at least one sCNA 82\n",
      "\ttumors without any somatic CNA 3\n",
      "total samples: 346 tumors: 168 normals: 178\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "MESO samples: 84 Segments per sample on avg.: 60.8333333333\n",
      "PCPG samples: 346 CNA events per sample on avg.: 90.3352601156\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 6\n",
      "\ttumors with at least one sCNA 159\n",
      "\ttumors without any somatic CNA 3\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "PCPG samples: 165 Segments per sample on avg.: 43.5878787879\n",
      "STES samples: 1277 CNA events per sample on avg.: 140.322631167\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 1277 tumors: 627 normals: 650\n",
      "\ttumors without matched normal 29\n",
      "\ttumors with at least one sCNA 591\n",
      "\ttumors without any somatic CNA 7\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "STES samples: 620 Segments per sample on avg.: 109.9\n",
      "SARC samples: 513 CNA events per sample on avg.: 208.068226121\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 513 tumors: 263 normals: 250\n",
      "\ttumors without matched normal 17\n",
      "\ttumors with at least one sCNA 245\n",
      "\ttumors without any somatic CNA 1\n",
      "total samples: 380 tumors: 191 normals: 189\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "SARC samples: 262 Segments per sample on avg.: 187.057251908\n",
      "LAML samples: 380 CNA events per sample on avg.: 74.5368421053\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 3\n",
      "\ttumors with at least one sCNA 167\n",
      "\ttumors without any somatic CNA 21\n",
      "total samples: 590 tumors: 288 normals: 302\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "LAML samples: 170 Segments per sample on avg.: 7.18823529412\n",
      "KIRP samples: 590 CNA events per sample on avg.: 79.5152542373\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 15\n",
      "\ttumors with at least one sCNA 271\n",
      "\ttumors without any somatic CNA 2\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "KIRP samples: 286 Segments per sample on avg.: 21.8846153846\n",
      "LIHC samples: 760 CNA events per sample on avg.: 122.8\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 760 tumors: 373 normals: 387\n",
      "\ttumors without matched normal 21\n",
      "\ttumors with at least one sCNA 348\n",
      "\ttumors without any somatic CNA 4\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "LIHC samples: 369 Segments per sample on avg.: 81.1327913279\n",
      "OV samples: 1168 CNA events per sample on avg.: 224.04109589\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 1168 tumors: 597 normals: 571\n",
      "\ttumors without matched normal 26\n",
      "\ttumors with at least one sCNA 571\n",
      "\ttumors without any somatic CNA 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "OV samples: 597 Segments per sample on avg.: 207.924623116\n",
      "TGCT samples: 304 CNA events per sample on avg.: 83.8125\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 304 tumors: 156 normals: 148\n",
      "\ttumors without matched normal 2\n",
      "\ttumors with at least one sCNA 154\n",
      "\ttumors without any somatic CNA 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "TGCT samples: 156 Segments per sample on avg.: 37.7820512821\n",
      "COAD samples: 918 CNA events per sample on avg.: 98.6209150327\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 918 tumors: 453 normals: 465\n",
      "\ttumors without matched normal 44\n",
      "\ttumors with at least one sCNA 406\n",
      "\ttumors without any somatic CNA 3\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "COAD samples: 450 Segments per sample on avg.: 48.4755555556\n",
      "BRCA samples: 2199 CNA events per sample on avg.: 129.35788995\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 2199 tumors: 1088 normals: 1111\n",
      "\ttumors without matched normal 35\n",
      "\ttumors with at least one sCNA 1046\n",
      "\ttumors without any somatic CNA 7\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "BRCA samples: 1081 Segments per sample on avg.: 102.808510638\n",
      "PRAD samples: 1023 CNA events per sample on avg.: 114.706744868\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 1023 tumors: 493 normals: 530\n",
      "\ttumors without matched normal 17\n",
      "\ttumors with at least one sCNA 458\n",
      "\ttumors without any somatic CNA 18\n",
      "total samples: 96 tumors: 52 normals: 44\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "PRAD samples: 475 Segments per sample on avg.: 60.3831578947\n",
      "DLBC samples: 96 CNA events per sample on avg.: 97.3229166667\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 10\n",
      "\ttumors with at least one sCNA 40\n",
      "\ttumors without any somatic CNA 2\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "DLBC samples: 50 Segments per sample on avg.: 44.44\n",
      "READ samples: 316 CNA events per sample on avg.: 113.180379747\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 316 tumors: 166 normals: 150\n",
      "\ttumors without matched normal 23\n",
      "\ttumors with at least one sCNA 141\n",
      "\ttumors without any somatic CNA 2\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "READ samples: 164 Segments per sample on avg.: 70.012195122\n",
      "ACC samples: 180 CNA events per sample on avg.: 116.955555556\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "total samples: 180 tumors: 90 normals: 90\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "after filtering\n",
      "ACC samples: 89 Segments per sample on avg.: 107.449438202\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\ttumors without matched normal 2\n",
      "\ttumors with at least one sCNA 87\n",
      "\ttumors without any somatic CNA 1\n"
     ]
    }
   ],
   "source": [
    "\n",
    "data_dir = \"../../TCGA/CNA/data__2016_01_28//\"\n",
    "\n",
    "dfs = {}\n",
    "dfs_normals = {}\n",
    "tumors_without_CNA = {}\n",
    "for f in os.listdir(data_dir):\n",
    "    if f.endswith(\"tar.gz\"):\n",
    "        fp = f.replace(\".tar.gz\",\"\")\n",
    "        cohort = fp.split(\".\")[2].replace(\"org_\",\"\")\n",
    "        file_path = fp+\"/\"+cohort+\".\"+fp.split(\".\")[3].replace(\"Merge_\",\"\")+\".seg.txt\"\n",
    "        df = pd.read_csv(data_dir+file_path, sep = \"\\t\")\n",
    "        \n",
    "        df[\"Chromosome\"] = df[\"Chromosome\"].map(chr_dict)\n",
    "        print(cohort,\"samples:\",len(set(df[\"Sample\"].values)),\n",
    "              \"CNA events per sample on avg.:\",float(df.shape[0])/len(set(df[\"Sample\"].values)))\n",
    "        \n",
    "        #### remove segments overlapping with segemnts in normals by 80% or more reciprocally ####\n",
    "        df[\"type\"] = df[\"Sample\"].apply(sample_type)\n",
    "        df_normals = df.loc[df[\"type\"]== \"Normal\",[\"Sample\",\"Chromosome\",\"Start\",\"End\",\"Num_Probes\",\"Segment_Mean\"]]\n",
    "        df_tumors = df.loc[df[\"type\"]== \"Tumor\",[\"Sample\",\"Chromosome\",\"Start\",\"End\",\"Num_Probes\",\"Segment_Mean\"]]\n",
    "        normal_samples = list(set(df_normals[\"Sample\"].values))\n",
    "        tumor_samples = list(set(df_tumors[\"Sample\"].values))\n",
    "        print(\"total samples:\", len(set(df[\"Sample\"].values)),\n",
    "              \"tumors:\",len(tumor_samples),\"normals:\",len(normal_samples),file= sys.stderr)\n",
    "        \n",
    "        tumors_without_somatic_CNA = []\n",
    "        tumors_germline_removed = []\n",
    "        tumors_without_matching_normal = []\n",
    "        filtered_normals = []\n",
    "        for tumor_sample in tumor_samples:\n",
    "            #print(sample, find_matching_normal(sample,list(set(d[\"Sample\"]))))\n",
    "            tumor = df_tumors.loc[df_tumors [\"Sample\"]== tumor_sample,:]\n",
    "            matching_normals = find_matching_normal(tumor_sample,normal_samples)\n",
    "            if len(matching_normals) >0:\n",
    "                n_segs = tumor.shape[0]\n",
    "                for normal_sample in matching_normals:\n",
    "                    normal  =  df_normals.loc[df_normals[\"Sample\"]== normal_sample,:]\n",
    "                    # thresholds for +1 and -1 copy in 75% of normal cell;\n",
    "                    # this is to retain segments appeared due to slight tumor contamination\n",
    "                    normal =  filter_lowconf_segments(normal,0,0.46, -0.68 )\n",
    "                    filtered_normals.append(normal)\n",
    "                    tumor = remove_ovelapping_segments(tumor, normal,tumor_sample)\n",
    "                #if n_segs > tumor.shape[0]:\n",
    "                #    print(n_segs - tumor.shape[0],\"segments removed in sample\",tumor_sample,\n",
    "                #    tumor.shape[0],\"remained\",file= sys.stderr)\n",
    "                tumor = filter_lowconf_segments(tumor,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n",
    "                if tumor.shape[0] == 0:\n",
    "                       tumors_without_somatic_CNA.append(tumor_sample)\n",
    "                else:\n",
    "                    tumors_germline_removed.append(tumor)\n",
    "            else:\n",
    "                tumor = filter_lowconf_segments(tumor,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n",
    "                if tumor.shape[0] == 0:\n",
    "                       tumors_without_somatic_CNA.append(tumor_sample)\n",
    "                else:\n",
    "                    tumors_without_matching_normal.append(tumor)\n",
    "\n",
    "        print(\"\\ttumors without matched normal\",len(tumors_without_matching_normal),file= sys.stderr)\n",
    "        print(\"\\ttumors with at least one sCNA\",len(tumors_germline_removed),file= sys.stderr)\n",
    "        print(\"\\ttumors without any somatic CNA\",len(tumors_without_somatic_CNA),file= sys.stderr)\n",
    "        #dfs[cohort] = df\n",
    "        filtered_tumors = pd.concat(tumors_germline_removed+tumors_without_matching_normal)\n",
    "        dfs[cohort] = filtered_tumors\n",
    "        filtered_normals = pd.concat(filtered_normals)\n",
    "        dfs_normals[cohort] = filtered_normals\n",
    "        tumors_without_CNA[cohort] = tumors_without_somatic_CNA\n",
    "        print(\"after filtering\")\n",
    "        print(cohort,\"samples:\",len(set(filtered_tumors[\"Sample\"].values)),\n",
    "              \"Segments per sample on avg.:\",float(filtered_tumors.shape[0])/len(set(filtered_tumors[\"Sample\"].values)))\n",
    "        \n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Aggregating to gene-level\n",
    "\n",
    "Gene annotation must be:\n",
    " - with Entrez gene IDs \n",
    " - in hg19 coordinates\n",
    " - with columns \"chrom\",\"start\",\"stop\",\"gene\" (this is four-column bed format)\n",
    " \n",
    "wget ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.37.3/GFF/ref_GRCh37.p5_top_level.gff3.gz\n",
    "\n",
    "echo -e \"chrom\\tstart\\tstop\\tgene\\tname\"  > ref_GRCh37.p5_top_level.gff3.bed;\n",
    "zcat ref_GRCh37.p5_top_level.gff3.gz  | awk '$3==\"gene\"' | cut -f 1,4,5,9| sed -e 's/;/\\t/g'| cut -f 1-3,5,6 | grep  GeneID |  sed -re 's/(Dbxref=GeneID:[0-9]*),.*/\\1/' | sed -e 's/Name=//' -e 's/Dbxref=GeneID://' | awk '{print $1\"\\t\"$2\"\\t\"$3\"\\t\"$5\"\\t\"$4}'  >> \n",
    "ref_GRCh37.p5_top_level.gff3.bed\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "rename_chroms = {\"NC_000001.10\":1,\"NC_000002.11\":2,\"NC_000003.11\":3,\"NC_000004.11\":4,\n",
    "                 \"NC_000005.9\":5,\"NC_000006.11\":6,\"NC_000007.13\":7,\"NC_000008.10\":8,\n",
    "                 \"NC_000009.11\":9,\"NC_000010.10\":10,\"NC_000011.9\":11,\"NC_000012.11\":12,\"NC_000013.10\":13,\n",
    "                 \"NC_000014.8\":14,\"NC_000015.9\":15,\"NC_000016.9\":16,\"NC_000017.10\":17,\n",
    "                 \"NC_000018.9\":18,\"NC_000019.9\":19,\"NC_000020.10\":20,\"NC_000021.8\":21,\n",
    "                 \"NC_000022.10\":22,\"NC_000023.10\":23,\"NC_000024.9\":24}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(36019, 5)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>chrom</th>\n",
       "      <th>start</th>\n",
       "      <th>stop</th>\n",
       "      <th>gene</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>10954</td>\n",
       "      <td>11507</td>\n",
       "      <td>100506145</td>\n",
       "      <td>LOC100506145</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>12190</td>\n",
       "      <td>13639</td>\n",
       "      <td>100652771</td>\n",
       "      <td>LOC100652771</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>14362</td>\n",
       "      <td>29370</td>\n",
       "      <td>653635</td>\n",
       "      <td>WASH7P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>30366</td>\n",
       "      <td>30503</td>\n",
       "      <td>100302278</td>\n",
       "      <td>MIR1302-2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   chrom  start   stop       gene          name\n",
       "0      1  10954  11507  100506145  LOC100506145\n",
       "1      1  12190  13639  100652771  LOC100652771\n",
       "2      1  14362  29370     653635        WASH7P\n",
       "3      1  30366  30503  100302278     MIR1302-2"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gene_intervals = pd.read_csv(gene_coords_file, sep = \"\\t\")\n",
    "gene_intervals = gene_intervals.loc[gene_intervals[\"chrom\"].isin(rename_chroms.keys()),:]\n",
    "gene_intervals[\"chrom\"] = gene_intervals[\"chrom\"].apply(lambda x : rename_chroms[x])\n",
    "#print(\"chromosomes:\",list(set(gene_intervals[\"chrom\"].values)))\n",
    "gene_intervals = gene_intervals.sort_values(by=[\"chrom\",\"start\",\"stop\"],ascending=True)\n",
    "gene_intervals.to_csv(\"/home/olya/SFU/Hossein/v1/ref_GRCh37.p5_top_level.gff3.chroms_renamed.bed\",sep = \"\\t\",index=False)\n",
    "print(gene_intervals.shape)\n",
    "gene_intervals.head(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "gene_intervals_bed = pbt.BedTool.from_dataframe(gene_intervals[[\"chrom\",\"start\",\"stop\",\"gene\"]])\n",
    "# prepare copy-neutral table\n",
    "cnv_baseline = gene_intervals.copy()\n",
    "cnv_baseline[\"Segment_Mean\"] = [0]*cnv_baseline.shape[0]\n",
    "cnv_baseline = cnv_baseline[[\"gene\",\"Segment_Mean\"]]\n",
    "cnv_baseline.set_index(\"gene\",inplace=True,drop=True)\n",
    "cnv_baseline.sort_index(inplace=True)\n",
    "sorted_index = list(cnv_baseline.index.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ESCA\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n",
      "ESCA (36019, 185)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DLBC\n",
      "TCGA-G8-6914-14A-01D-2209-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DLBC (36019, 52)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "READ\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n",
      "READ (36019, 166)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "GBM\n",
      "TCGA-06-0165-01A-01D-0236-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n",
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-06-0119-01A-08D-0214-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 300 processed.\n",
      "... 400 processed.\n",
      "... 500 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-06-5410-01A-01D-1694-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GBM (36019, 590)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "STES\n",
      "TCGA-MX-A5UG-01A-21D-A31K-01 has no genes with altered CN\n",
      "TCGA-RD-A8NB-01A-12D-A396-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-BR-7957-01A-11D-2200-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n",
      "... 300 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-BR-6563-01A-13D-2052-01 has no genes with altered CN\n",
      "TCGA-D7-6522-01A-11D-1799-01 has no genes with altered CN\n",
      "TCGA-BR-7196-01A-11D-2052-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 400 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-D7-A6ET-01A-32D-A32M-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 500 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-HU-A4GJ-01A-11D-A253-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 600 processed.\n",
      "STES (36019, 627)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "BLCA\n",
      "TCGA-YC-A8S6-01A-31D-A38F-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-DK-A3WY-01A-11D-A22Y-01 has no genes with altered CN\n",
      "TCGA-XF-A9SL-01A-11D-A390-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-E7-A7XN-01A-11D-A34T-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 300 processed.\n",
      "... 400 processed.\n",
      "BLCA (36019, 414)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "UCEC\n",
      "TCGA-D1-A16Y-01A-31D-A12G-01 has no genes with altered CN\n",
      "TCGA-BK-A6W4-01A-12D-A34P-01 has no genes with altered CN\n",
      "TCGA-BS-A0V7-01A-21D-A120-01 has no genes with altered CN\n",
      "TCGA-B5-A11Y-01A-21D-A10L-01 has no genes with altered CN\n",
      "TCGA-D1-A17F-01A-11D-A12G-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-AX-A062-01A-11D-A00X-01 has no genes with altered CN\n",
      "TCGA-D1-A16D-01A-11D-A12G-01 has no genes with altered CN\n",
      "TCGA-BG-A0VZ-01A-11D-A107-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-AJ-A2QL-01A-11D-A18N-01 has no genes with altered CN\n",
      "TCGA-BS-A0UA-01A-11D-A120-01 has no genes with altered CN\n",
      "TCGA-B5-A11U-01A-11D-A120-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 300 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-EO-A3AU-01A-21D-A19X-01 has no genes with altered CN\n",
      "TCGA-QF-A5YS-01A-11D-A31T-01 has no genes with altered CN\n",
      "TCGA-D1-A0ZV-01A-11D-A10L-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 400 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-QS-A5YR-01A-31D-A31T-01 has no genes with altered CN\n",
      "TCGA-DI-A1BU-01A-11D-A134-01 has no genes with altered CN\n",
      "TCGA-AP-A0LG-01A-11D-A042-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 500 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-D1-A0ZS-01A-11D-A120-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "UCEC (36019, 540)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PCPG\n",
      "TCGA-RW-A7CZ-01A-11D-A35C-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-WB-A817-01A-11D-A35H-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PCPG (36019, 168)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "STAD\n",
      "TCGA-MX-A5UG-01A-21D-A31K-01 has no genes with altered CN\n",
      "TCGA-RD-A8NB-01A-12D-A396-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-BR-7957-01A-11D-2200-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-BR-6563-01A-13D-2052-01 has no genes with altered CN\n",
      "TCGA-D7-6522-01A-11D-1799-01 has no genes with altered CN\n",
      "TCGA-BR-7196-01A-11D-2052-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 300 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-D7-A6ET-01A-32D-A32M-01 has no genes with altered CN\n",
      "TCGA-HU-A4GJ-01A-11D-A253-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 400 processed.\n",
      "STAD (36019, 442)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "CESC\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n",
      "... 200 processed.\n",
      "CESC (36019, 297)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "UCS\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "UCS (36019, 56)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TGCT\n",
      "TCGA-YU-A90S-01A-11D-A434-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n",
      "TGCT (36019, 156)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "THCA\n",
      "TCGA-EL-A4JZ-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-DJ-A13X-01A-11D-A10T-01 has no genes with altered CN\n",
      "TCGA-EL-A3ZT-01A-12D-A23L-01 has no genes with altered CN\n",
      "TCGA-DE-A0XZ-01A-11D-A17S-01 has no genes with altered CN\n",
      "TCGA-DJ-A2PP-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-KS-A4I5-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-DJ-A2PS-01A-11D-A18E-01 has no genes with altered CN\n",
      "TCGA-EL-A3GW-01A-11D-A201-01 has no genes with altered CN\n",
      "TCGA-BJ-A0ZG-01A-11D-A10T-01 has no genes with altered CN\n",
      "TCGA-J8-A3O2-01A-11D-A23L-01 has no genes with altered CN\n",
      "TCGA-FY-A3RA-01A-11D-A21Y-01 has no genes with altered CN\n",
      "TCGA-CE-A483-01A-11D-A23T-01 has no genes with altered CN\n",
      "TCGA-EM-A1CW-01A-21D-A13V-01 has no genes with altered CN\n",
      "TCGA-DJ-A4V4-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-E3-A3E1-01A-11D-A20A-01 has no genes with altered CN\n",
      "TCGA-ET-A2MZ-01A-12D-A19I-01 has no genes with altered CN\n",
      "TCGA-E8-A414-01A-11D-A23L-01 has no genes with altered CN\n",
      "TCGA-EL-A3T6-01A-11D-A21Y-01 has no genes with altered CN\n",
      "TCGA-DJ-A4V5-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-DJ-A3UY-01A-21D-A22C-01 has no genes with altered CN\n",
      "TCGA-EL-A3D4-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-FY-A76V-01A-11D-A396-01 has no genes with altered CN\n",
      "TCGA-FY-A4B3-01A-11D-A23T-01 has no genes with altered CN\n",
      "TCGA-DJ-A3UO-01A-11D-A22C-01 has no genes with altered CN\n",
      "TCGA-EL-A4K7-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-DJ-A1QI-01A-11D-A14V-01 has no genes with altered CN\n",
      "TCGA-EL-A3N2-01A-11D-A20A-01 has no genes with altered CN\n",
      "TCGA-E3-A3E5-01A-11D-A20A-01 has no genes with altered CN\n",
      "TCGA-EM-A1YD-01A-11D-A14V-01 has no genes with altered CN\n",
      "TCGA-GE-A2C6-01A-11D-A16M-01 has no genes with altered CN\n",
      "TCGA-DJ-A2Q5-01A-11D-A18E-01 has no genes with altered CN\n",
      "TCGA-ET-A3DP-01A-11D-A219-01 has no genes with altered CN\n",
      "TCGA-DJ-A4UT-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-DJ-A2PT-01A-11D-A18E-01 has no genes with altered CN\n",
      "TCGA-DJ-A4V2-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-L6-A4ET-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-BJ-A0ZJ-01A-11D-A10T-01 has no genes with altered CN\n",
      "TCGA-DE-A4M9-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-EL-A4KD-01A-11D-A256-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-QD-A8IV-01A-11D-A396-01 has no genes with altered CN\n",
      "TCGA-ET-A3DV-01A-12D-A201-01 has no genes with altered CN\n",
      "TCGA-EM-A22K-01A-11D-A17S-01 has no genes with altered CN\n",
      "TCGA-DJ-A3VE-01A-11D-A23L-01 has no genes with altered CN\n",
      "TCGA-EL-A3D1-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-BJ-A2P4-01A-11D-A18E-01 has no genes with altered CN\n",
      "TCGA-CE-A3ME-01A-11D-A20A-01 has no genes with altered CN\n",
      "TCGA-E8-A417-01A-21D-A23L-01 has no genes with altered CN\n",
      "TCGA-KS-A41I-01A-11D-A23L-01 has no genes with altered CN\n",
      "TCGA-FK-A3SB-01A-11D-A22C-01 has no genes with altered CN\n",
      "TCGA-BJ-A28S-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-MK-A4N9-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-E8-A437-01A-12D-A23T-01 has no genes with altered CN\n",
      "TCGA-EM-A3AP-01A-12D-A20A-01 has no genes with altered CN\n",
      "TCGA-EL-A3TA-01A-12D-A22C-01 has no genes with altered CN\n",
      "TCGA-IM-A41Z-01A-11D-A23L-01 has no genes with altered CN\n",
      "TCGA-EM-A2CQ-01A-11D-A17S-01 has no genes with altered CN\n",
      "TCGA-EM-A3O7-01A-11D-A21Y-01 has no genes with altered CN\n",
      "TCGA-FE-A3PC-01A-11D-A21Y-01 has no genes with altered CN\n",
      "TCGA-DJ-A2PY-01A-11D-A18E-01 has no genes with altered CN\n",
      "TCGA-EM-A4FQ-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-EM-A3FO-01A-11D-A219-01 has no genes with altered CN\n",
      "TCGA-BJ-A0Z9-01A-11D-A10T-01 has no genes with altered CN\n",
      "TCGA-EM-A3FK-01A-11D-A219-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-ET-A3BU-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-BJ-A0Z5-01A-11D-A10T-01 has no genes with altered CN\n",
      "TCGA-EL-A3MY-01A-11D-A219-01 has no genes with altered CN\n",
      "TCGA-ET-A39L-01A-12D-A19I-01 has no genes with altered CN\n",
      "TCGA-E8-A415-01A-11D-A23L-01 has no genes with altered CN\n",
      "TCGA-ET-A40Q-01A-11D-A23L-01 has no genes with altered CN\n",
      "TCGA-KS-A4I7-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-MK-A4N7-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-L6-A4EQ-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-FY-A3TY-01A-11D-A22Y-01 has no genes with altered CN\n",
      "TCGA-ET-A2N1-01A-11D-A18E-01 has no genes with altered CN\n",
      "TCGA-DJ-A2PO-01A-21D-A19I-01 has no genes with altered CN\n",
      "TCGA-J8-A3O2-06A-11D-A23L-01 has no genes with altered CN\n",
      "TCGA-CE-A485-01A-11D-A23T-01 has no genes with altered CN\n",
      "TCGA-ET-A3BX-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-DJ-A3VK-01A-11D-A23L-01 has no genes with altered CN\n",
      "TCGA-DE-A4M8-01A-21D-A256-01 has no genes with altered CN\n",
      "TCGA-ET-A40T-01A-11D-A23L-01 has no genes with altered CN\n",
      "TCGA-BJ-A18Z-01A-21D-A13V-01 has no genes with altered CN\n",
      "TCGA-DJ-A3UT-01A-11D-A22C-01 has no genes with altered CN\n",
      "TCGA-DJ-A2Q2-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-BJ-A18Y-01A-11D-A13V-01 has no genes with altered CN\n",
      "TCGA-ET-A39T-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-EL-A3CL-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-DJ-A4V0-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-EL-A3H8-01A-11D-A20A-01 has no genes with altered CN\n",
      "TCGA-ET-A39J-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-FY-A3I4-01A-11D-A219-01 has no genes with altered CN\n",
      "TCGA-EM-A2CU-01A-12D-A17S-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 300 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-EM-A3FM-01A-11D-A219-01 has no genes with altered CN\n",
      "TCGA-EM-A4FF-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-EL-A3GX-01A-11D-A201-01 has no genes with altered CN\n",
      "TCGA-DJ-A3UN-01A-11D-A22C-01 has no genes with altered CN\n",
      "TCGA-EM-A4FO-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-EL-A3TB-01A-11D-A22C-01 has no genes with altered CN\n",
      "TCGA-ET-A25N-01A-11D-A16M-01 has no genes with altered CN\n",
      "TCGA-ET-A39M-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-DE-A4MA-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-ET-A39O-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-DE-A0Y2-01A-11D-A10T-01 has no genes with altered CN\n",
      "TCGA-FY-A3R8-01A-11D-A21Y-01 has no genes with altered CN\n",
      "TCGA-EM-A3AL-01A-11D-A201-01 has no genes with altered CN\n",
      "TCGA-EM-A2CN-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-FY-A3BL-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-EM-A1CS-01A-11D-A13V-01 has no genes with altered CN\n",
      "TCGA-EL-A3D0-01A-12D-A201-01 has no genes with altered CN\n",
      "TCGA-E3-A3DZ-01A-11D-A20A-01 has no genes with altered CN\n",
      "TCGA-DJ-A1QF-01A-12D-A14V-01 has no genes with altered CN\n",
      "TCGA-J8-A3YH-01A-11D-A22Y-01 has no genes with altered CN\n",
      "TCGA-EL-A4K1-01A-11D-A256-01 has no genes with altered CN\n",
      "TCGA-EM-A3O8-01A-11D-A21Y-01 has no genes with altered CN\n",
      "TCGA-DJ-A3VJ-01A-11D-A23L-01 has no genes with altered CN\n",
      "TCGA-BJ-A45D-01A-11D-A23T-01 has no genes with altered CN\n",
      "TCGA-FY-A4B4-01A-11D-A23T-01 has no genes with altered CN\n",
      "TCGA-EM-A1CU-01A-11D-A13V-01 has no genes with altered CN\n",
      "TCGA-EL-A3CX-01A-11D-A19I-01 has no genes with altered CN\n",
      "TCGA-ET-A25O-01A-11D-A16M-01 has no genes with altered CN\n",
      "TCGA-E8-A433-01A-11D-A23L-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "THCA (36019, 506)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "CHOL\n",
      "TCGA-W5-AA2H-01A-31D-A416-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CHOL (36019, 36)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "HNSC\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n",
      "... 200 processed.\n",
      "... 300 processed.\n",
      "... 400 processed.\n",
      "... 500 processed.\n",
      "HNSC (36019, 530)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "UVM\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "UVM (36019, 80)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "SKCM\n",
      "TCGA-ER-A19A-06A-21D-A191-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n",
      "... 200 processed.\n",
      "... 300 processed.\n",
      "... 400 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-EB-A4OZ-01A-12D-A25P-01 has no genes with altered CN\n",
      "TCGA-EE-A2GK-06A-11D-A194-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SKCM (36019, 472)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "COAD\n",
      "TCGA-G4-6302-01A-11D-1717-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-AA-A03F-01A-11D-A080-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n",
      "... 300 processed.\n",
      "... 400 processed.\n",
      "COAD (36019, 453)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ACC\n",
      "TCGA-OR-A5KQ-01A-11D-A309-01 has no genes with altered CN\n",
      "TCGA-OR-A5KV-01A-11D-A29H-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ACC (36019, 90)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PAAD\n",
      "TCGA-IB-AAUR-01A-21D-A38F-01 has no genes with altered CN\n",
      "TCGA-HZ-8002-01A-11D-2200-01 has no genes with altered CN\n",
      "TCGA-XD-AAUG-01A-61D-A40V-01 has no genes with altered CN\n",
      "TCGA-Z5-AAPL-01A-12D-A40V-01 has no genes with altered CN\n",
      "TCGA-IB-A5SQ-01A-11D-A32M-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-IB-AAUS-01A-12D-A38F-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PAAD (36019, 185)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "THYM\n",
      "TCGA-4V-A9QW-01A-11D-A422-01 has no genes with altered CN\n",
      "TCGA-ZB-A96B-01A-11D-A427-01 has no genes with altered CN\n",
      "TCGA-X7-A8DB-01A-11D-A422-01 has no genes with altered CN\n",
      "TCGA-X7-A8M4-01A-11D-A422-01 has no genes with altered CN\n",
      "TCGA-X7-A8D8-01A-11D-A422-01 has no genes with altered CN\n",
      "TCGA-3S-AAYX-01A-11D-A422-01 has no genes with altered CN\n",
      "TCGA-YT-A95E-01A-11D-A427-01 has no genes with altered CN\n",
      "TCGA-X7-A8M8-01A-11D-A422-01 has no genes with altered CN\n",
      "TCGA-ZT-A8OM-01A-11D-A427-01 has no genes with altered CN\n",
      "TCGA-ZB-A96E-01A-11D-A427-01 has no genes with altered CN\n",
      "TCGA-3Q-A9WF-01A-11D-A422-01 has no genes with altered CN\n",
      "TCGA-X7-A8M1-01A-11D-A422-01 has no genes with altered CN\n",
      "TCGA-ZB-A96A-01A-11D-A427-01 has no genes with altered CN\n",
      "TCGA-ZB-A96R-01A-11D-A427-01 has no genes with altered CN\n",
      "TCGA-ZB-A963-01A-11D-A427-01 has no genes with altered CN\n",
      "TCGA-ZC-AAAA-01A-11D-A427-01 has no genes with altered CN\n",
      "TCGA-XM-A8RB-01A-11D-A422-01 has no genes with altered CN\n",
      "TCGA-ZB-A96G-01A-11D-A427-01 has no genes with altered CN\n",
      "TCGA-X7-A8M7-01A-11D-A422-01 has no genes with altered CN\n",
      "TCGA-XU-AAXZ-01A-11D-A427-01 has no genes with altered CN\n",
      "TCGA-XH-A853-01A-11D-A422-01 has no genes with altered CN\n",
      "TCGA-XM-AAZ3-01A-11D-A422-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n",
      "THYM (36019, 125)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "LUSC\n",
      "TCGA-56-8623-01A-11D-2391-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n",
      "... 200 processed.\n",
      "... 300 processed.\n",
      "... 400 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-98-A53H-01A-12D-A25M-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LUSC (36019, 501)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "MESO\n",
      "TCGA-TS-A8AS-01A-21D-A39Q-01 has no genes with altered CN\n",
      "TCGA-TS-A7P8-01A-11D-A34B-01 has no genes with altered CN\n",
      "TCGA-TS-A8AV-01A-12D-A39Q-01 has no genes with altered CN\n",
      "TCGA-3H-AB3O-01A-11D-A39Q-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MESO (36019, 87)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OV\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n",
      "... 200 processed.\n",
      "... 300 processed.\n",
      "... 400 processed.\n",
      "... 500 processed.\n",
      "OV (36019, 597)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "SARC\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-WK-A8Y0-10D-01D-A419-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-WK-A8XS-10E-01D-A37E-01 has no genes with altered CN\n",
      "TCGA-QQ-A5V2-01A-11D-A32H-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SARC (36019, 263)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "KIRP\n",
      "TCGA-Y8-A8S1-01A-11D-A36W-01 has no genes with altered CN\n",
      "TCGA-GL-A4EM-01A-11D-A253-01 has no genes with altered CN\n",
      "TCGA-4A-A93Y-01A-11D-A36W-01 has no genes with altered CN\n",
      "TCGA-AL-3467-01A-02D-1348-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-A4-7828-01A-11D-2135-01 has no genes with altered CN\n",
      "TCGA-DW-7838-01A-11D-2135-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n",
      "KIRP (36019, 288)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "LGG\n",
      "TCGA-HT-8106-01A-11D-2391-01 has no genes with altered CN\n",
      "TCGA-S9-A6WI-01A-21D-A33S-01 has no genes with altered CN\n",
      "TCGA-HT-7602-01A-21D-2085-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-DU-7011-01A-11D-2023-01 has no genes with altered CN\n",
      "TCGA-TM-A84B-12A-01D-A366-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-FG-8181-01A-11D-2252-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 300 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-FG-8189-01B-11D-A288-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 400 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-DU-5872-02A-21D-A36N-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 500 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-HT-7680-01A-11D-2252-01 has no genes with altered CN\n",
      "TCGA-P5-A5EY-01A-11D-A27J-01 has no genes with altered CN\n",
      "TCGA-CS-6669-01A-11D-1892-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LGG (36019, 530)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "LAML\n",
      "TCGA-AB-2884-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2932-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2842-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2969-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2826-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2836-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2871-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2845-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2840-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2837-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2844-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2854-03A-01D-0756-21 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-AB-3006-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2931-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2851-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2978-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2880-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2922-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2947-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2998-03A-01D-0756-21 has no genes with altered CN\n",
      "TCGA-AB-2824-03A-01D-0756-21 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LAML (36019, 191)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "LIHC\n",
      "TCGA-2V-A95S-10D-01D-A36Z-01 has no genes with altered CN\n",
      "TCGA-UB-AA0V-01A-11D-A381-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-G3-A25V-01A-11D-A16U-01 has no genes with altered CN\n",
      "TCGA-DD-A3A6-01A-11D-A22E-01 has no genes with altered CN\n",
      "TCGA-DD-A4NL-01A-11D-A28W-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-ED-A5KG-01A-11D-A27H-01 has no genes with altered CN\n",
      "TCGA-CC-A9FV-01A-11D-A36W-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 300 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-MR-A520-01A-11D-A25U-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LIHC (36019, 373)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PRAD\n",
      "TCGA-J9-A52C-01A-11D-A26L-01 has no genes with altered CN\n",
      "TCGA-V1-A8MJ-01A-11D-A363-01 has no genes with altered CN\n",
      "TCGA-XJ-A9DQ-01A-11D-A376-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-J4-A6G1-01A-11D-A30W-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-J4-A67R-01A-21D-A30D-01 has no genes with altered CN\n",
      "TCGA-EJ-A7NJ-01A-22D-A34T-01 has no genes with altered CN\n",
      "TCGA-EJ-7791-01A-11D-2112-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 300 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-EJ-A8FU-01A-11D-A363-01 has no genes with altered CN\n",
      "TCGA-EJ-A6RC-01A-11D-A32A-01 has no genes with altered CN\n",
      "TCGA-HC-7740-01A-11D-2112-01 has no genes with altered CN\n",
      "TCGA-EJ-A65B-01A-12D-A30D-01 has no genes with altered CN\n",
      "TCGA-HC-8260-01A-11D-2259-01 has no genes with altered CN\n",
      "TCGA-FC-A8O0-01A-41D-A376-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 400 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-VN-A88I-01A-11D-A34T-01 has no genes with altered CN\n",
      "TCGA-EJ-A7NK-01A-12D-A34T-01 has no genes with altered CN\n",
      "TCGA-CH-5743-01A-21D-1574-01 has no genes with altered CN\n",
      "TCGA-G9-6367-01A-11D-1785-01 has no genes with altered CN\n",
      "TCGA-KC-A4BO-01A-61D-A256-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PRAD (36019, 493)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "LUAD\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n",
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-L4-A4E6-01A-11D-A24C-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 300 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-44-3398-01A-01D-1877-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 400 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-55-8619-01A-11D-2389-01 has no genes with altered CN\n",
      "TCGA-86-A4P8-01A-11D-A24O-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 500 processed.\n",
      "LUAD (36019, 518)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "BRCA\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-AO-A0JC-01A-11D-A059-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-BH-A0H5-01A-21D-A111-01 has no genes with altered CN\n",
      "TCGA-A2-A0CR-01A-11D-A227-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 300 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-BH-A1FE-06A-11D-A20R-01 has no genes with altered CN\n",
      "TCGA-AN-A0FN-01A-11D-A036-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 400 processed.\n",
      "... 500 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-PL-A8LY-01A-11D-A41E-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 600 processed.\n",
      "... 700 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-GM-A3XG-01A-31D-A242-01 has no genes with altered CN\n",
      "TCGA-LD-A74U-01A-13D-A33D-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 800 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-GM-A2DO-10D-01D-A18N-01 has no genes with altered CN\n",
      "TCGA-A2-A0EP-01A-52D-A22W-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 900 processed.\n",
      "... 1000 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-AO-A1KO-01A-31D-A13J-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BRCA (36019, 1088)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "KIRC\n",
      "TCGA-B4-5378-01A-01D-1499-01 has no genes with altered CN\n",
      "TCGA-B0-5400-01A-01D-1499-01 has no genes with altered CN\n",
      "TCGA-CJ-4890-01A-01D-1302-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 100 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-A3-A8OX-01A-11D-A36W-01 has no genes with altered CN\n",
      "TCGA-B0-4817-01A-01D-1274-01 has no genes with altered CN\n",
      "TCGA-B0-5080-01A-01D-1499-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 200 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-DV-A4VZ-01A-11D-A25U-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 300 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-CJ-4891-01A-01D-1302-01 has no genes with altered CN\n",
      "TCGA-CJ-4889-01A-01D-1302-01 has no genes with altered CN\n",
      "TCGA-BP-4769-01A-01D-1283-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 400 processed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "TCGA-BP-4760-01A-02D-1417-01 has no genes with altered CN\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "... 500 processed.\n",
      "KIRC (36019, 529)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "KICH\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KICH (36019, 66)\n"
     ]
    }
   ],
   "source": [
    "# Aggregate segment-level CNA to gene level for every TCGA cohort and write one table per cohort.\n",
    "for cohort, df in dfs.items():\n",
    "    print(cohort, file=sys.stderr)\n",
    "    cna_table = []\n",
    "    # groupby splits the cohort in a single pass instead of re-scanning the whole frame with a\n",
    "    # boolean mask per sample; sort=False keeps samples in order of first appearance, so the\n",
    "    # column order of the output is reproducible (iterating over a set gave an arbitrary order)\n",
    "    for n_samples, (sample, sample_segments) in enumerate(df.groupby(\"Sample\", sort=False), 1):\n",
    "        cnv2gene = cnv2genelevel(cnv2bed(sample_segments),gene_intervals_bed,sample,\n",
    "                                 verbose = False,sorted_index = sorted_index)\n",
    "        cna_table.append(cnv2gene)\n",
    "        if n_samples % 100 == 0:\n",
    "            print(\"...\",n_samples, \"processed.\")\n",
    "    cna_table = pd.concat(cna_table,axis =1)\n",
    "\n",
    "    # samples listed in tumors_without_CNA[cohort] get an all-zero column\n",
    "    for sample in tumors_without_CNA[cohort]:\n",
    "        cna_table[sample] = 0\n",
    "\n",
    "    # remaining missing values are treated as \"no alteration\"\n",
    "    cna_table.fillna(0, inplace = True)\n",
    "    cna_table.to_csv(preprocessed_dir+\"/TCGA-\"+cohort+\".Segment_Mean.CNA.tsv\",\n",
    "                     sep = \"\\t\",header=True,index=True)\n",
    "    print(cohort,cna_table.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'t = time.time()\\ncnv2gene = cnv2genelevel(cnv2bed(df[df.Sample == sample]),gene_intervals_bed,sample,\\n                         verbose = False,sorted_index = sorted_index)\\nprint( time.time() - t)\\ncnv2gene'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Timing check for a single sample, kept for reference but disabled.\n",
    "# It is commented out (rather than wrapped in a string literal) so that the cell\n",
    "# does not echo the code as its output. It relies on `df` and `sample` left over\n",
    "# from the cohort loop in the previous cell.\n",
    "# t = time.time()\n",
    "# cnv2gene = cnv2genelevel(cnv2bed(df[df.Sample == sample]),gene_intervals_bed,sample,\n",
    "#                          verbose = False,sorted_index = sorted_index)\n",
    "# print( time.time() - t)\n",
    "# cnv2gene"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CCLE \n",
    "\n",
    "The same pipeline as for TCGA, except that germline CNAs are not filtered out (because no matched normal samples are available).\n",
    "\n",
    "wget https://data.broadinstitute.org/ccle_legacy_data/dna_copy_number/CCLE_copynumber_2013-12-03.seg.txt\n",
    "\n",
    "Open question: should we use a stricter segment_mean threshold, given that these data come from cell lines, whose purity should be 100%?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "47 duplicated IDs in 94 rows found.\n",
      "duplicate rows removed due to low correlation of duplicated profiles 0\n",
      "Merged  94 duplicated rows into 47 rows\n",
      "CCLE: genes: 35972 samples 1043\n"
     ]
    }
   ],
   "source": [
    "# Load CCLE segments and rename the sample column to match the TCGA tables.\n",
    "df = pd.read_csv(\"../../CCLE/CCLE_copynumber_2013-12-03.seg.txt\",sep = \"\\t\")\n",
    "df.rename({\"CCLE_name\":\"Sample\"},inplace=True, axis=\"columns\")\n",
    "# vectorised cast instead of a per-element apply(int)\n",
    "df[\"End\"] = df[\"End\"].astype(\"int64\")\n",
    "ccle = []\n",
    "# groupby splits the table in a single pass instead of masking the whole frame once per cell line;\n",
    "# sort=False keeps cell lines in order of first appearance, so the column order is reproducible\n",
    "# (iterating over a set gave an arbitrary order)\n",
    "for sample_name, cl in df.groupby(\"Sample\", sort=False):\n",
    "    # keep high-conf segments \n",
    "    cl_filtered = filter_lowconf_segments(cl,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n",
    "    # map to genes \n",
    "    cnv2gene = cnv2genelevel(cnv2bed(cl_filtered),gene_intervals_bed,sample_name,\n",
    "                                 verbose = False,sorted_index = sorted_index)\n",
    "    ccle.append(cnv2gene)\n",
    "    \n",
    "ccle = pd.concat(ccle,axis =1)\n",
    "# remaining missing values are treated as \"no alteration\"\n",
    "ccle.fillna(0, inplace = True)\n",
    "ccle = handle_dups(ccle)\n",
    "ccle.to_csv(preprocessed_dir+\"/\"+\"CCLE\"+\".Segment_Mean.CNA.tsv\",\n",
    "                 sep = \"\\t\",header=True,index=True)\n",
    "print(\"CCLE:\",\"genes:\",ccle.shape[0],\"samples\",ccle.shape[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GDSC\n",
    "Assume that supplementary file with gene-level CN is downloaded :\n",
    "\n",
    "wget ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-7.0/Gene_level_CN.xlsx\n",
    "\n",
    "GDSC provides gene-level integer estimated CN, max. and min. CN over all segments covering a gene. In order to make it comparable with TCGA and CCLE, we  divide estimated CN by CN of copy-neutral state and log2-transform it. \n",
    "\n",
    "1) The copy-neutral state was defined from the average ploidy, as the median of integer CN values in non-disrupted genes.\n",
    "\n",
    "2) Compute log2(CN/neutral-CN) for min and max CN; keep the value with most extreme estimate\n",
    "\n",
    "3) Replace estimates below thresholds with zeroes. \n",
    "\n",
    "\n",
    "GDSC uses 4 comma-separated values for gene-level CN (max_cn,min_cn,zygosity,disruption): e.g. (from \"legend\" tab)\n",
    "\n",
    "2,2,H,-\tGene resides on a single genomic segment in a diploid region of the genome.\n",
    "2,0,L,D\tGene spans multiple segments, highest copy number is 2 but part of the coding sequence is homozygously deleted, the gene is disrupted.\n",
    "13,13,H,-\tGene resides on a single genomic segment of copy number 13 in a heterozygous part of the genome (amplification).\n",
    "14,12,L,D\tGene spans multiple genomic segments all of which are amplified to 12 or more copies, some or all segments have LOH, the gene is disrupted.\n",
    "0,0,0,-\tComplete gene sequence falls within a homozygous deletion.\n",
    "-1,-1,-,- gene level CN not assigned\n",
    "\n",
    "* min and max CN are integers \n",
    "* zygosity - can be L (LOH in any overlapping segment), H (heterozygous), 0 (homozygous deletion of the whole gene) or - (undefined)\n",
    "* disruption - D (if disrupted) or \"-\" (not disrupted) \n",
    "\n",
    "Average ploidies of cell lines were downloaded from COSMIC:\n",
    "\n",
    "wget https://cog.sanger.ac.uk/cosmic/GRCh37/cell_lines/v86/PICNIC_average_ploidies.tsv?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1540792525&Signature=mcSB6oFv%2BXCF4%2Fezm4a3Ds1JXo4%3D\n",
    "\n",
    "wget ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-7.0/Gene_level_CN.xlsx\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gene</th>\n",
       "      <th>chr</th>\n",
       "      <th>start</th>\n",
       "      <th>stop</th>\n",
       "      <th>201T</th>\n",
       "      <th>22RV1</th>\n",
       "      <th>23132-87</th>\n",
       "      <th>42-MG-BA</th>\n",
       "      <th>451Lu</th>\n",
       "      <th>5637</th>\n",
       "      <th>...</th>\n",
       "      <th>WSU-NHL</th>\n",
       "      <th>YAPC</th>\n",
       "      <th>YH-13</th>\n",
       "      <th>YKG-1</th>\n",
       "      <th>YMB-1-E</th>\n",
       "      <th>YT</th>\n",
       "      <th>ZR-75-30</th>\n",
       "      <th>huH-1</th>\n",
       "      <th>no-10</th>\n",
       "      <th>no-11</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1287381</td>\n",
       "      <td>924100</td>\n",
       "      <td>910924</td>\n",
       "      <td>687561</td>\n",
       "      <td>1287706</td>\n",
       "      <td>687452</td>\n",
       "      <td>...</td>\n",
       "      <td>909785</td>\n",
       "      <td>909904</td>\n",
       "      <td>909905</td>\n",
       "      <td>687592</td>\n",
       "      <td>1303911</td>\n",
       "      <td>946358</td>\n",
       "      <td>909907</td>\n",
       "      <td>1298146</td>\n",
       "      <td>908452</td>\n",
       "      <td>908450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DDX11L1</td>\n",
       "      <td>1</td>\n",
       "      <td>11869.0</td>\n",
       "      <td>14412.0</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>...</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>WASH7P</td>\n",
       "      <td>1</td>\n",
       "      <td>14363.0</td>\n",
       "      <td>29806.0</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>...</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "      <td>-1,-1,-,-</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 1000 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      gene  chr    start     stop       201T      22RV1   23132-87   42-MG-BA  \\\n",
       "0      NaN  NaN      NaN      NaN    1287381     924100     910924     687561   \n",
       "1  DDX11L1    1  11869.0  14412.0  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-   \n",
       "2   WASH7P    1  14363.0  29806.0  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-   \n",
       "\n",
       "       451Lu       5637    ...        WSU-NHL       YAPC      YH-13  \\\n",
       "0    1287706     687452    ...         909785     909904     909905   \n",
       "1  -1,-1,-,-  -1,-1,-,-    ...      -1,-1,-,-  -1,-1,-,-  -1,-1,-,-   \n",
       "2  -1,-1,-,-  -1,-1,-,-    ...      -1,-1,-,-  -1,-1,-,-  -1,-1,-,-   \n",
       "\n",
       "       YKG-1    YMB-1-E         YT   ZR-75-30      huH-1      no-10      no-11  \n",
       "0     687592    1303911     946358     909907    1298146     908452     908450  \n",
       "1  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  \n",
       "2  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  \n",
       "\n",
       "[3 rows x 1000 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Location of the GDSC gene-level copy-number workbook and the sheet to read from it\n",
    "GDSC_CNA = \"/home/olya/SFU/Hossein/GDSC/Gene_level_CN.xlsx\"\n",
    "gdsc_sheet = \"Gene_level_CN\"\n",
    "\n",
    "# Read the sheet as-is; the first data row still holds the numeric sample IDs (see the preview)\n",
    "gdsc = pd.read_excel(GDSC_CNA, gdsc_sheet)\n",
    "gdsc.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "25 gene IDs excluded due to string to datetime conversion in Excel.\n",
      "Strings containing duplicated gene IDs: 0\n"
     ]
    }
   ],
   "source": [
    "# String types under both Python 2 (`basestring`) and Python 3 (`str`)\n",
    "try:\n",
    "    string_types = (basestring,)\n",
    "except NameError:\n",
    "    string_types = (str,)\n",
    "\n",
    "# Index rows by gene symbol and drop the genomic coordinate columns.\n",
    "gdsc = gdsc.set_index(\"gene\").drop([\"chr\",\"start\",\"stop\"],axis=1)\n",
    "# The first row holds the numeric sample IDs: promote it to the column names, then drop it.\n",
    "gdsc.columns = gdsc.iloc[0,:]\n",
    "gdsc = gdsc.iloc[1:,:].copy()\n",
    "gdsc.columns.name = None\n",
    "# replace 2001-12-01 with DEC1 and remove gene names converted to datetimes\n",
    "# Index objects are immutable: writing into `index.values` is unsupported and can leave cached\n",
    "# lookups stale, so the index is rebuilt from a plain list instead.\n",
    "gene_ids = gdsc.index.tolist()\n",
    "gene_ids[37778] = \"DEC1\"\n",
    "gdsc.index = gene_ids\n",
    "df_size = gdsc.shape[0]\n",
    "# Keep only rows whose ID is still a string. A boolean mask is used because selecting the rows\n",
    "# by label with .loc would repeat rows if a gene ID occurred more than once.\n",
    "ndxs = pd.Series(gdsc.index).apply(lambda x : isinstance(x, string_types))\n",
    "gdsc = gdsc.loc[ndxs.values,:]\n",
    "print(df_size - gdsc.shape[0],\"gene IDs excluded due to string to datetime conversion in Excel.\")\n",
    "\n",
    "gdsc.index.name = \"gene_id\"\n",
    "# Report how many rows carry a gene ID that occurs more than once.\n",
    "ids = gdsc.index\n",
    "ids = list(set(ids[ids.duplicated()]))\n",
    "print(\"Strings containing duplicated gene IDs:\",gdsc.loc[ids,:].shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### distribution of averaged ploidies in GDSC\n",
    "\n",
    "We compared the average ploidies reported in PICNIC_average_ploidies.tsv (provided by COSMIC) with the average and median ploidies estimated from the GDSC gene-level CN profiles."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1020\n",
      "1016\n"
     ]
    }
   ],
   "source": [
    "# Load the PICNIC ploidy table, indexed by sample_id, and report its size before/after dropping NaNs.\n",
    "ploidy_path = \"/home/olya/SFU/Hossein/GDSC/PICNIC_average_ploidies.tsv\"\n",
    "ploidy_table = (pd.read_csv(ploidy_path,sep = \"\\t\")\n",
    "                  .drop(\"#sample_name\",axis = 1)\n",
    "                  .set_index(\"sample_id\"))\n",
    "print(ploidy_table.shape[0])\n",
    "GDSC_Ploidies = ploidy_table.dropna()\n",
    "print(GDSC_Ploidies.shape[0])\n",
    "\n",
    "# Ploidy estimates computed per sample (column) of the GDSC gene-level CN table\n",
    "est_ploidies = gdsc.apply(define_avg_ploidy).T\n",
    "# Side-by-side table of the estimated and the PICNIC-reported ploidies\n",
    "ploidy_columns = {\"est. avg. ploidy from CN profile\":est_ploidies[\"avg_pl\"],\n",
    "                  \"PICNIC avg. pl.\":GDSC_Ploidies[\"average_ploidy\"],\n",
    "                  \"est. median. ploidy\":est_ploidies[\"median_pl\"]}\n",
    "df_ploidies = pd.DataFrame.from_dict(ploidy_columns)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 1440x360 with 3 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Three histograms side by side: estimated average, PICNIC-reported average, estimated median ploidy\n",
    "fig, axes = plt.subplots(1, 3, figsize=(20,5))\n",
    "panels = [(est_ploidies[\"avg_pl\"], \"est. avg. ploidy from CN profile\"),\n",
    "          (sorted(list(GDSC_Ploidies[\"average_ploidy\"].values)), \"PICNIC avg. pl.\"),\n",
    "          (est_ploidies[\"median_pl\"], \"est. median ploidy\")]\n",
    "for ax, (values, title) in zip(axes, panels):\n",
    "    ax.hist(values, bins=30)\n",
    "    ax.set_title(title)\n",
    "\n",
    "# Separate figure: estimated average ploidy against the PICNIC-reported one\n",
    "tmp = df_ploidies.plot.scatter(x = \"est. avg. ploidy from CN profile\",y=\"PICNIC avg. pl.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Distribution of the PICNIC average ploidy within each estimated copy-neutral (median) ploidy group\n",
    "boxplot_value, boxplot_group = \"PICNIC avg. pl.\", \"est. median. ploidy\"\n",
    "tmp = df_ploidies.boxplot(column=boxplot_value, by=boxplot_group)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert gene-level integer CN into log2R-like format in order to make it compatible with TCGA and CCLE\n",
    "\n",
    "1) The copy-neutral state was defined from the average ploidy, as the median of integer CN values in non-disrupted genes.\n",
    "\n",
    "2) Compute log2(CN/neutral-CN) for min and max CN; keep the value with most extreme estimate\n",
    "\n",
    "3) Replace estimates below thresholds with zeroes. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.0"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Lookup table: sample ID -> estimated median ploidy (used as the copy-neutral CN below)\n",
    "median_ploidy = est_ploidies[\"median_pl\"]\n",
    "estimated_CN = median_ploidy.to_dict()\n",
    "# spot check for one sample\n",
    "estimated_CN[1287381]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1287381</th>\n",
       "      <th>924100</th>\n",
       "      <th>910924</th>\n",
       "      <th>687561</th>\n",
       "      <th>1287706</th>\n",
       "      <th>687452</th>\n",
       "      <th>906798</th>\n",
       "      <th>906797</th>\n",
       "      <th>906800</th>\n",
       "      <th>910922</th>\n",
       "      <th>...</th>\n",
       "      <th>909785</th>\n",
       "      <th>909904</th>\n",
       "      <th>909905</th>\n",
       "      <th>687592</th>\n",
       "      <th>1303911</th>\n",
       "      <th>946358</th>\n",
       "      <th>909907</th>\n",
       "      <th>1298146</th>\n",
       "      <th>908452</th>\n",
       "      <th>908450</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5S_rRNA</th>\n",
       "      <td>-4.320000</td>\n",
       "      <td>1.807355</td>\n",
       "      <td>2.0</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>-4.32</td>\n",
       "      <td>-4.320000</td>\n",
       "      <td>-1.00</td>\n",
       "      <td>2.807355</td>\n",
       "      <td>2.169925</td>\n",
       "      <td>-1.00</td>\n",
       "      <td>...</td>\n",
       "      <td>-4.32</td>\n",
       "      <td>2.222392</td>\n",
       "      <td>-4.32</td>\n",
       "      <td>-4.320000</td>\n",
       "      <td>-4.32</td>\n",
       "      <td>1.807355</td>\n",
       "      <td>-4.320000</td>\n",
       "      <td>1.736966</td>\n",
       "      <td>2.169925</td>\n",
       "      <td>-1.584963</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5_8S_rRNA</th>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>0.00</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.415037</td>\n",
       "      <td>0.00</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>-4.32</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.584963</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7SK</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-4.320000</td>\n",
       "      <td>-4.32</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>-4.32</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-4.32</td>\n",
       "      <td>...</td>\n",
       "      <td>-4.32</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-4.32</td>\n",
       "      <td>-4.320000</td>\n",
       "      <td>-4.32</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>-4.320000</td>\n",
       "      <td>-4.320000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-4.320000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 996 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            1287381   924100   910924    687561   1287706   687452   906798   \\\n",
       "5S_rRNA   -4.320000  1.807355      2.0 -0.415037    -4.32 -4.320000    -1.00   \n",
       "5_8S_rRNA -0.584963  0.000000      0.0 -0.415037     0.00 -0.584963     0.00   \n",
       "7SK        1.000000  0.000000      0.0 -4.320000    -4.32 -0.584963    -4.32   \n",
       "\n",
       "            906797    906800   910922     ...     909785    909904   909905   \\\n",
       "5S_rRNA    2.807355  2.169925    -1.00    ...       -4.32  2.222392    -4.32   \n",
       "5_8S_rRNA  0.000000  0.000000     0.00    ...        0.00  0.415037     0.00   \n",
       "7SK        0.584963  0.000000    -4.32    ...       -4.32  0.000000    -4.32   \n",
       "\n",
       "            687592   1303911   946358    909907    1298146   908452    908450   \n",
       "5S_rRNA   -4.320000    -4.32  1.807355 -4.320000  1.736966  2.169925 -1.584963  \n",
       "5_8S_rRNA -0.584963    -4.32  0.000000 -0.415037  0.000000  0.000000 -0.584963  \n",
       "7SK       -4.320000    -4.32 -0.415037 -4.320000 -4.320000  0.000000 -4.320000  \n",
       "\n",
       "[3 rows x 996 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def _sample_to_log2R(sample_cn):\n",
    "    \"\"\"Convert one sample column with CN2log2R, using that sample's entry in estimated_CN.\"\"\"\n",
    "    return CN2log2R(sample_cn, estimated_CN[sample_cn.name])\n",
    "\n",
    "gdsc = (gdsc.apply(_sample_to_log2R)\n",
    "            # drop genes without any determined value\n",
    "            .dropna(axis=0,how=\"all\")\n",
    "            # fill with zeroes the remaining ones\n",
    "            .fillna(0))\n",
    "gdsc.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _clean_value(value):\n",
    "    \"\"\"Apply clean_logR to a single value with the notebook-wide segment-mean thresholds.\"\"\"\n",
    "    return clean_logR(value, pos_seg_mean_thr, neg_seg_mean_thr)\n",
    "\n",
    "gdsc = gdsc.applymap(_clean_value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Ok: no empty rows detected\n",
      "Ok: no duplicated pairs detected\n",
      "Ok: All Symbol rows are not empty.\n",
      "Ok: All Symbol are mapped to GeneID\n",
      "16 Symbol mapped to multiple GeneID\n",
      "Ok: All GeneID are unique\n",
      "59266 Symbol can be mapped directly to GeneID\n"
     ]
    }
   ],
   "source": [
    "# Load the NCBI gene_info table and keep only the columns needed for ID mapping\n",
    "ncbi_columns = [\"#tax_id\",\"GeneID\",\"Symbol\",\"Synonyms\",\"type_of_gene\"]\n",
    "NCBI = pd.read_csv(root_dir+\"Homo_sapiens.gene_info\",sep = \"\\t\")[ncbi_columns]\n",
    "# Keep rows with tax_id 9606 whose gene type is not \"unknown\"\n",
    "is_taxon_9606 = NCBI[\"#tax_id\"] == 9606\n",
    "has_known_type = NCBI[\"type_of_gene\"] != \"unknown\"\n",
    "NCBI = NCBI.loc[is_taxon_9606 & has_known_type]\n",
    "# Symbol -> GeneID mapper\n",
    "ncbi_symbols = parse_mapping_table(NCBI, \"Symbol\",\"GeneID\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Ok: no empty rows detected\n",
      "Ok: no duplicated pairs detected\n",
      "Ok: All Synonyms rows are not empty.\n",
      "Ok: All Synonyms are mapped to GeneID\n",
      "3145 Synonyms mapped to multiple GeneID\n",
      "49179 different Synonyms mapped to the same GeneID\n",
      "10839 Synonyms can be mapped directly to GeneID\n"
     ]
    }
   ],
   "source": [
    "# Expand the \"|\"-separated Synonyms column with `expand`, then build the Synonyms -> GeneID mapper\n",
    "synonym_pairs = expand(NCBI[[\"Synonyms\",\"GeneID\"]],column=\"Synonyms\",sep=\"|\")\n",
    "ncbi_synonyms = parse_mapping_table(synonym_pairs, \"Synonyms\",\"GeneID\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mapped: 24545 \n",
      "\tdirectly via main_mapper 22363 \n",
      "\tvia alternative mapper 766 \n",
      "\tvia one of multiple synonyms in alternative mapper 1416 \n",
      "\tLOC 0 \n",
      "Unmapped: 21587 \n",
      "\trecognized symbols without Entrez ID 0 \n",
      "\tmultiple query_ids map to the same target_id 0 \n",
      "\tquery_ids map to multiple target_ids in the main mapper 0 \n",
      "\tquery_ids map to multiple target_ids in the alternative mapper 76 \n",
      "\tLOC not found in Entrez 0 \n",
      "\tNot found at all: 21511\n",
      "Warning: query IDs mapping to duplicated target IDs in mapping table: 156\n",
      "Warning: query IDs not mapped to any target IDs excluded: 21587\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/olya/miniconda2/lib/python2.7/site-packages/pandas/core/frame.py:3781: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  return super(DataFrame, self).rename(**kwargs)\n",
      "IDs mapped to multiple target IDs are kept:\n",
      " [143872, 286464, 140290, 414212, 414213, 51463, 642826, 84631, 574445, 399761, 100132115, 647060, 284565, 6551, 161176, 341019, 4253, 9502, 442416, 51236, 643749, 54438, 728113, 100302179, 414761, 29099, 729438, 256815, 10160, 645425, 653234, 644019, 26165, 3255, 644509, 2749, 653505, 653067, 643479, 100462820, 100418977, 26824, 79817, 6218, 728695, 100034743, 221262, 647507, 677844, 728917, 26583, 100289124, 84316, 200030, 768096, 642658, 23523, 401508, 23334, 119016, 106478953, 84458, 1517, 246126, 26095, 100033392, 92017, 374, 26871, 100132948, 125050, 387707, 653308, 79741, 728798]\n",
      "mapper.py:204: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  df.sort_index(inplace=True)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1287381</th>\n",
       "      <th>924100</th>\n",
       "      <th>910924</th>\n",
       "      <th>687561</th>\n",
       "      <th>1287706</th>\n",
       "      <th>687452</th>\n",
       "      <th>906798</th>\n",
       "      <th>906797</th>\n",
       "      <th>906800</th>\n",
       "      <th>910922</th>\n",
       "      <th>...</th>\n",
       "      <th>909785</th>\n",
       "      <th>909904</th>\n",
       "      <th>909905</th>\n",
       "      <th>687592</th>\n",
       "      <th>1303911</th>\n",
       "      <th>946358</th>\n",
       "      <th>909907</th>\n",
       "      <th>1298146</th>\n",
       "      <th>908452</th>\n",
       "      <th>908450</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.415037</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.415037</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>0.321928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.736966</td>\n",
       "      <td>0.321928</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.736966</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.415037</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 996 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    1287381   924100   910924    687561    1287706   687452    906798   \\\n",
       "1  0.000000  0.000000      0.0  0.000000  0.000000  0.000000  0.000000   \n",
       "2  0.000000  0.584963      0.0 -0.415037  0.321928  0.000000  0.584963   \n",
       "9 -0.584963  0.584963      0.0 -0.415037 -0.415037 -0.584963  0.000000   \n",
       "\n",
       "    906797   906800   910922     ...     909785    909904    909905   \\\n",
       "1  0.584963      0.0      0.0    ...         0.0 -0.584963  0.584963   \n",
       "2  0.584963      0.0      0.0    ...         0.0 -0.584963  0.000000   \n",
       "9  0.000000      0.0      0.0    ...         0.0 -0.584963  0.000000   \n",
       "\n",
       "    687592    1303911   946358   909907    1298146   908452    908450   \n",
       "1  0.415037  0.000000  0.000000      0.0  0.000000  0.584963  0.415037  \n",
       "2  0.000000  0.736966  0.321928      0.0 -0.584963  0.000000  0.736966  \n",
       "9  0.415037  0.000000  0.000000     -1.0 -0.584963  0.000000  0.000000  \n",
       "\n",
       "[3 rows x 996 columns]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run gdsc through the NCBI symbol and synonym mappers. Duplicated targets are kept here\n",
    "# (handle_duplicates = keep); they are presumably resolved by handle_dups in a later cell.\n",
    "gdsc, query2target, not_mapped = apply_mappers(\n",
    "    gdsc,\n",
    "    ncbi_symbols,\n",
    "    ncbi_synonyms,\n",
    "    verbose=True,\n",
    "    handle_duplicates=\"keep\"\n",
    ")\n",
    "gdsc.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "75 duplicated IDs in 156 rows found.\n",
      "duplicate rows removed due to low correlation of duplicated profiles 25\n",
      "Merged  131 duplicated rows into 63 rows\n"
     ]
    }
   ],
   "source": [
    "# Resolve rows of gdsc that share a gene ID, using a correlation threshold of 0.75.\n",
    "gdsc = handle_dups(gdsc, corr_thr=0.75)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>683665</th>\n",
       "      <th>683667</th>\n",
       "      <th>684052</th>\n",
       "      <th>684055</th>\n",
       "      <th>684057</th>\n",
       "      <th>684059</th>\n",
       "      <th>684062</th>\n",
       "      <th>684072</th>\n",
       "      <th>684681</th>\n",
       "      <th>687448</th>\n",
       "      <th>...</th>\n",
       "      <th>1659818</th>\n",
       "      <th>1659819</th>\n",
       "      <th>1659823</th>\n",
       "      <th>1659928</th>\n",
       "      <th>1659929</th>\n",
       "      <th>1660034</th>\n",
       "      <th>1660035</th>\n",
       "      <th>1660036</th>\n",
       "      <th>1674021</th>\n",
       "      <th>1789883</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.415037</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.321928</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>-0.415037</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.321928</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.321928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>-1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.321928</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.321928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.584963</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>-1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>0.0</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.321928</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.415037</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.415037</td>\n",
       "      <td>0.584963</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 996 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         683665    683667   684052    684055    684057   684059    684062   \\\n",
       "gene_id                                                                      \n",
       "1            0.0  0.000000      0.0  0.000000 -0.415037      0.0 -0.415037   \n",
       "2            0.0  0.000000      0.0  0.584963  0.000000      0.0  0.000000   \n",
       "9            0.0  0.321928      0.0  0.584963  0.584963      0.0  0.321928   \n",
       "10           0.0  0.321928      0.0  0.584963  0.584963      0.0  0.321928   \n",
       "12           0.0 -1.000000      0.0  0.000000 -1.000000      0.0 -0.415037   \n",
       "\n",
       "          684072    684681   687448     ...     1659818  1659819   1659823  \\\n",
       "gene_id                                 ...                                  \n",
       "1        0.000000  0.415037      0.0    ...         0.0      0.0  0.000000   \n",
       "2        0.584963  0.000000      0.0    ...         0.0      0.0  0.000000   \n",
       "9        0.000000 -0.584963      0.0    ...         0.0      0.0 -0.415037   \n",
       "10       0.000000 -0.584963      0.0    ...         0.0      0.0 -0.415037   \n",
       "12       0.000000  0.000000      0.0    ...        -1.0      0.0  0.321928   \n",
       "\n",
       "         1659928   1659929  1660034   1660035   1660036   1674021   1789883  \n",
       "gene_id                                                                      \n",
       "1            0.0 -0.415037      0.0 -0.584963  0.000000  0.000000  0.321928  \n",
       "2            0.0  0.000000      0.0  0.000000  0.000000  0.584963 -0.415037  \n",
       "9            0.0  0.000000     -1.0 -1.584963  0.000000 -1.000000 -1.000000  \n",
       "10           0.0  0.000000     -1.0 -1.584963  0.000000 -1.000000 -1.000000  \n",
       "12           0.0 -0.415037      0.0  0.415037  0.584963  0.000000  0.000000  \n",
       "\n",
       "[5 rows x 996 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gdsc.index.name = \"gene_id\"\n",
    "# Order the columns by label. Sorting along axis=1 gives the same result as\n",
    "# transposing, sorting the index and transposing back, without copying the matrix twice.\n",
    "gdsc = gdsc.sort_index(axis=1)\n",
    "gdsc.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the gdsc table as tab-separated text, keeping both the header row and the index.\n",
    "gdsc_out_file = preprocessed_dir + \"/\" + \"GDSC\" + \".Segment_Mean.CNA.tsv\"\n",
    "gdsc.to_csv(gdsc_out_file, sep=\"\\t\", header=True, index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PDX \n",
    "\n",
    "For the PDX dataset, only gene-level estimated copy numbers (non-integer) are reported. \n",
    "From the ploidy distribution, calculated as the average over all genes, we concluded that the CN estimates were called under the assumption that the copy-neutral state of each xenograft corresponds to CN = 2.\n",
    "\n",
    "\n",
    "For gene ID conversion we used the same approach as for RNA-seq."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(23852, 375)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>X-1004</th>\n",
       "      <th>X-1008</th>\n",
       "      <th>X-1027</th>\n",
       "      <th>X-1095</th>\n",
       "      <th>X-1119</th>\n",
       "      <th>X-1156</th>\n",
       "      <th>X-1167</th>\n",
       "      <th>X-1169</th>\n",
       "      <th>X-1172</th>\n",
       "      <th>X-1173</th>\n",
       "      <th>...</th>\n",
       "      <th>X-5694</th>\n",
       "      <th>X-5696</th>\n",
       "      <th>X-5713</th>\n",
       "      <th>X-5717</th>\n",
       "      <th>X-5727</th>\n",
       "      <th>X-5739</th>\n",
       "      <th>X-5808</th>\n",
       "      <th>X-5959</th>\n",
       "      <th>X-5975</th>\n",
       "      <th>X-6047</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Sample</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A1BG</th>\n",
       "      <td>2.58</td>\n",
       "      <td>1.60</td>\n",
       "      <td>2.17</td>\n",
       "      <td>2.08</td>\n",
       "      <td>2.00</td>\n",
       "      <td>3.94</td>\n",
       "      <td>2.04</td>\n",
       "      <td>11.39</td>\n",
       "      <td>2.17</td>\n",
       "      <td>2.01</td>\n",
       "      <td>...</td>\n",
       "      <td>2.08</td>\n",
       "      <td>2.10</td>\n",
       "      <td>2.14</td>\n",
       "      <td>2.95</td>\n",
       "      <td>2.06</td>\n",
       "      <td>2.07</td>\n",
       "      <td>1.99</td>\n",
       "      <td>2.07</td>\n",
       "      <td>1.43</td>\n",
       "      <td>2.03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A1BG-AS1</th>\n",
       "      <td>2.58</td>\n",
       "      <td>1.60</td>\n",
       "      <td>2.17</td>\n",
       "      <td>2.08</td>\n",
       "      <td>2.00</td>\n",
       "      <td>3.94</td>\n",
       "      <td>2.04</td>\n",
       "      <td>11.39</td>\n",
       "      <td>2.17</td>\n",
       "      <td>2.01</td>\n",
       "      <td>...</td>\n",
       "      <td>2.08</td>\n",
       "      <td>2.10</td>\n",
       "      <td>2.14</td>\n",
       "      <td>2.95</td>\n",
       "      <td>2.06</td>\n",
       "      <td>2.07</td>\n",
       "      <td>1.99</td>\n",
       "      <td>2.07</td>\n",
       "      <td>1.43</td>\n",
       "      <td>2.03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A1CF</th>\n",
       "      <td>2.87</td>\n",
       "      <td>2.97</td>\n",
       "      <td>2.01</td>\n",
       "      <td>2.06</td>\n",
       "      <td>2.10</td>\n",
       "      <td>1.58</td>\n",
       "      <td>2.01</td>\n",
       "      <td>1.64</td>\n",
       "      <td>1.89</td>\n",
       "      <td>1.99</td>\n",
       "      <td>...</td>\n",
       "      <td>2.04</td>\n",
       "      <td>0.97</td>\n",
       "      <td>1.58</td>\n",
       "      <td>2.08</td>\n",
       "      <td>1.95</td>\n",
       "      <td>1.92</td>\n",
       "      <td>1.54</td>\n",
       "      <td>1.28</td>\n",
       "      <td>1.33</td>\n",
       "      <td>2.10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A2LD1</th>\n",
       "      <td>5.74</td>\n",
       "      <td>1.64</td>\n",
       "      <td>2.06</td>\n",
       "      <td>2.01</td>\n",
       "      <td>2.07</td>\n",
       "      <td>1.74</td>\n",
       "      <td>2.06</td>\n",
       "      <td>1.59</td>\n",
       "      <td>1.40</td>\n",
       "      <td>2.53</td>\n",
       "      <td>...</td>\n",
       "      <td>2.03</td>\n",
       "      <td>2.07</td>\n",
       "      <td>2.25</td>\n",
       "      <td>2.00</td>\n",
       "      <td>1.01</td>\n",
       "      <td>2.00</td>\n",
       "      <td>1.08</td>\n",
       "      <td>1.85</td>\n",
       "      <td>1.93</td>\n",
       "      <td>1.45</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4 rows × 375 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          X-1004  X-1008  X-1027  X-1095  X-1119  X-1156  X-1167  X-1169  \\\n",
       "Sample                                                                     \n",
       "A1BG        2.58    1.60    2.17    2.08    2.00    3.94    2.04   11.39   \n",
       "A1BG-AS1    2.58    1.60    2.17    2.08    2.00    3.94    2.04   11.39   \n",
       "A1CF        2.87    2.97    2.01    2.06    2.10    1.58    2.01    1.64   \n",
       "A2LD1       5.74    1.64    2.06    2.01    2.07    1.74    2.06    1.59   \n",
       "\n",
       "          X-1172  X-1173   ...    X-5694  X-5696  X-5713  X-5717  X-5727  \\\n",
       "Sample                     ...                                             \n",
       "A1BG        2.17    2.01   ...      2.08    2.10    2.14    2.95    2.06   \n",
       "A1BG-AS1    2.17    2.01   ...      2.08    2.10    2.14    2.95    2.06   \n",
       "A1CF        1.89    1.99   ...      2.04    0.97    1.58    2.08    1.95   \n",
       "A2LD1       1.40    2.53   ...      2.03    2.07    2.25    2.00    1.01   \n",
       "\n",
       "          X-5739  X-5808  X-5959  X-5975  X-6047  \n",
       "Sample                                            \n",
       "A1BG        2.07    1.99    2.07    1.43    2.03  \n",
       "A1BG-AS1    2.07    1.99    2.07    1.43    2.03  \n",
       "A1CF        1.92    1.54    1.28    1.33    2.10  \n",
       "A2LD1       2.00    1.08    1.85    1.93    1.45  \n",
       "\n",
       "[4 rows x 375 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): absolute, machine-specific path; consider deriving it from a configurable data directory.\n",
    "PDX_xls = \"/home/olya/SFU/Hossein/PDX/nm.3954-S2.xlsx\"\n",
    "# Read the copy-number sheet and use its Sample column as the row index.\n",
    "pdx = pd.read_excel(PDX_xls, \"copy number\").set_index(\"Sample\", drop=True)\n",
    "# Keep the FocalCNScore row before the two summary rows are removed;\n",
    "# .loc picks the row directly, so the whole table does not have to be transposed.\n",
    "focal = pdx.loc[\"FocalCNScore\"]\n",
    "# Reassign instead of mutating in place, so the cell reads as a plain pipeline.\n",
    "pdx = pdx.drop([\"ArmLevelCNScore\", \"FocalCNScore\"])\n",
    "print(pdx.shape)\n",
    "pdx.head(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Strings containing duplicated gene IDs: 544\n",
      "268 duplicated IDs in 544 rows found.\n",
      "duplicate rows removed due to low correlation of duplicated profiles 134\n",
      "Merged  410 duplicated rows into 205 rows\n"
     ]
    }
   ],
   "source": [
    "pdx.index.name = \"gene_id\"\n",
    "# Unique gene IDs that label more than one row.\n",
    "ids = list(set(pdx.index[pdx.index.duplicated()]))\n",
    "# Number of rows carrying one of those IDs.\n",
    "n_dup_rows = pdx.loc[ids, :].shape[0]\n",
    "print(\"Strings containing duplicated gene IDs:\", n_dup_rows)\n",
    "# Resolve the duplicated rows using a correlation threshold of 0.75.\n",
    "pdx = handle_dups(pdx, corr_thr=0.75)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5,0,'CN Averaged over all')"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Mean copy number of each column of pdx (one value per PDX model), taken over all rows.\n",
    "average_ploidies = pdx.mean(axis=0)\n",
    "p = plt.hist(average_ploidies, bins=30)\n",
    "plt.title(\"Ploidy in PDX samples\")\n",
    "plt.ylabel(\"n samples\")\n",
    "# The trailing semicolon keeps the Text(...) repr of this last call out of the cell output.\n",
    "plt.xlabel(\"CN Averaged over all\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>X-1004</th>\n",
       "      <th>X-1008</th>\n",
       "      <th>X-1027</th>\n",
       "      <th>X-1095</th>\n",
       "      <th>X-1119</th>\n",
       "      <th>X-1156</th>\n",
       "      <th>X-1167</th>\n",
       "      <th>X-1169</th>\n",
       "      <th>X-1172</th>\n",
       "      <th>X-1173</th>\n",
       "      <th>...</th>\n",
       "      <th>X-5694</th>\n",
       "      <th>X-5696</th>\n",
       "      <th>X-5713</th>\n",
       "      <th>X-5717</th>\n",
       "      <th>X-5727</th>\n",
       "      <th>X-5739</th>\n",
       "      <th>X-5808</th>\n",
       "      <th>X-5959</th>\n",
       "      <th>X-5975</th>\n",
       "      <th>X-6047</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A1BG</th>\n",
       "      <td>0.367371</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>0.117695</td>\n",
       "      <td>0.056584</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.978196</td>\n",
       "      <td>0.028569</td>\n",
       "      <td>2.509696</td>\n",
       "      <td>0.117695</td>\n",
       "      <td>0.007196</td>\n",
       "      <td>...</td>\n",
       "      <td>0.056584</td>\n",
       "      <td>0.070389</td>\n",
       "      <td>0.097611</td>\n",
       "      <td>0.560715</td>\n",
       "      <td>0.042644</td>\n",
       "      <td>0.049631</td>\n",
       "      <td>-0.007232</td>\n",
       "      <td>0.049631</td>\n",
       "      <td>-0.483985</td>\n",
       "      <td>0.021480</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A1BG-AS1</th>\n",
       "      <td>0.367371</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>0.117695</td>\n",
       "      <td>0.056584</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.978196</td>\n",
       "      <td>0.028569</td>\n",
       "      <td>2.509696</td>\n",
       "      <td>0.117695</td>\n",
       "      <td>0.007196</td>\n",
       "      <td>...</td>\n",
       "      <td>0.056584</td>\n",
       "      <td>0.070389</td>\n",
       "      <td>0.097611</td>\n",
       "      <td>0.560715</td>\n",
       "      <td>0.042644</td>\n",
       "      <td>0.049631</td>\n",
       "      <td>-0.007232</td>\n",
       "      <td>0.049631</td>\n",
       "      <td>-0.483985</td>\n",
       "      <td>0.021480</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A1CF</th>\n",
       "      <td>0.521051</td>\n",
       "      <td>0.570463</td>\n",
       "      <td>0.007196</td>\n",
       "      <td>0.042644</td>\n",
       "      <td>0.070389</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>0.007196</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>-0.081614</td>\n",
       "      <td>-0.007232</td>\n",
       "      <td>...</td>\n",
       "      <td>0.028569</td>\n",
       "      <td>-1.043943</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>0.056584</td>\n",
       "      <td>-0.036526</td>\n",
       "      <td>-0.058894</td>\n",
       "      <td>-0.377070</td>\n",
       "      <td>-0.643856</td>\n",
       "      <td>-0.588574</td>\n",
       "      <td>0.070389</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 375 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            X-1004    X-1008    X-1027    X-1095    X-1119    X-1156  \\\n",
       "gene_id                                                                \n",
       "A1BG      0.367371 -0.321928  0.117695  0.056584  0.000000  0.978196   \n",
       "A1BG-AS1  0.367371 -0.321928  0.117695  0.056584  0.000000  0.978196   \n",
       "A1CF      0.521051  0.570463  0.007196  0.042644  0.070389 -0.340075   \n",
       "\n",
       "            X-1167    X-1169    X-1172    X-1173    ...       X-5694  \\\n",
       "gene_id                                             ...                \n",
       "A1BG      0.028569  2.509696  0.117695  0.007196    ...     0.056584   \n",
       "A1BG-AS1  0.028569  2.509696  0.117695  0.007196    ...     0.056584   \n",
       "A1CF      0.007196 -0.286304 -0.081614 -0.007232    ...     0.028569   \n",
       "\n",
       "            X-5696    X-5713    X-5717    X-5727    X-5739    X-5808  \\\n",
       "gene_id                                                                \n",
       "A1BG      0.070389  0.097611  0.560715  0.042644  0.049631 -0.007232   \n",
       "A1BG-AS1  0.070389  0.097611  0.560715  0.042644  0.049631 -0.007232   \n",
       "A1CF     -1.043943 -0.340075  0.056584 -0.036526 -0.058894 -0.377070   \n",
       "\n",
       "            X-5959    X-5975    X-6047  \n",
       "gene_id                                 \n",
       "A1BG      0.049631 -0.483985  0.021480  \n",
       "A1BG-AS1  0.049631 -0.483985  0.021480  \n",
       "A1CF     -0.643856 -0.588574  0.070389  \n",
       "\n",
       "[3 rows x 375 columns]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert copy-number estimates to log2 ratios relative to CN = 2.\n",
    "# The whole frame is transformed in one vectorised call instead of a Python lambda per element;\n",
    "# dividing by 2.0 keeps this a true division even under Python 2.\n",
    "# NOTE: this cell is not idempotent -- running it twice log-transforms pdx a second time.\n",
    "pdx = np.log2(pdx / 2.0)\n",
    "pdx.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>X-1004</th>\n",
       "      <th>X-1008</th>\n",
       "      <th>X-1027</th>\n",
       "      <th>X-1095</th>\n",
       "      <th>X-1119</th>\n",
       "      <th>X-1156</th>\n",
       "      <th>X-1167</th>\n",
       "      <th>X-1169</th>\n",
       "      <th>X-1172</th>\n",
       "      <th>X-1173</th>\n",
       "      <th>...</th>\n",
       "      <th>X-5694</th>\n",
       "      <th>X-5696</th>\n",
       "      <th>X-5713</th>\n",
       "      <th>X-5717</th>\n",
       "      <th>X-5727</th>\n",
       "      <th>X-5739</th>\n",
       "      <th>X-5808</th>\n",
       "      <th>X-5959</th>\n",
       "      <th>X-5975</th>\n",
       "      <th>X-6047</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A1BG</th>\n",
       "      <td>0.367371</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.978196</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.509696</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.560715</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.483985</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A1BG-AS1</th>\n",
       "      <td>0.367371</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.978196</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.509696</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.560715</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.483985</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A1CF</th>\n",
       "      <td>0.521051</td>\n",
       "      <td>0.570463</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-1.043943</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.37707</td>\n",
       "      <td>-0.643856</td>\n",
       "      <td>-0.588574</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 375 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            X-1004    X-1008  X-1027  X-1095  X-1119    X-1156  X-1167  \\\n",
       "gene_id                                                                  \n",
       "A1BG      0.367371 -0.321928     0.0     0.0     0.0  0.978196     0.0   \n",
       "A1BG-AS1  0.367371 -0.321928     0.0     0.0     0.0  0.978196     0.0   \n",
       "A1CF      0.521051  0.570463     0.0     0.0     0.0 -0.340075     0.0   \n",
       "\n",
       "            X-1169  X-1172  X-1173   ...    X-5694    X-5696    X-5713  \\\n",
       "gene_id                              ...                                 \n",
       "A1BG      2.509696     0.0     0.0   ...       0.0  0.000000  0.000000   \n",
       "A1BG-AS1  2.509696     0.0     0.0   ...       0.0  0.000000  0.000000   \n",
       "A1CF     -0.286304     0.0     0.0   ...       0.0 -1.043943 -0.340075   \n",
       "\n",
       "            X-5717  X-5727  X-5739   X-5808    X-5959    X-5975  X-6047  \n",
       "gene_id                                                                  \n",
       "A1BG      0.560715     0.0     0.0  0.00000  0.000000 -0.483985     0.0  \n",
       "A1BG-AS1  0.560715     0.0     0.0  0.00000  0.000000 -0.483985     0.0  \n",
       "A1CF      0.000000     0.0     0.0 -0.37707 -0.643856 -0.588574     0.0  \n",
       "\n",
       "[3 rows x 375 columns]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pass every gene-by-sample log-ratio through clean_logR together with the two\n",
    "# segment-mean thresholds. clean_logR is defined earlier in the notebook; judging by\n",
    "# the preview below it presumably returns 0 for sub-threshold values -- confirm there.\n",
    "pdx = pdx.applymap(\n",
    "    lambda log_ratio: clean_logR(log_ratio, pos_seg_mean_thr, neg_seg_mean_thr)\n",
    ")\n",
    "pdx.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mapped: 23313 \n",
      "\tdirectly via main_mapper 21188 \n",
      "\tvia alternative mapper 466 \n",
      "\tvia one of multiple synonyms in alternative mapper 926 \n",
      "\tLOC 733 \n",
      "Unmapped: 200 \n",
      "\trecognized symbols without Entrez ID 0 \n",
      "\tmultiple query_ids map to the same target_id 0 \n",
      "\tquery_ids map to multiple target_ids in the main mapper 0 \n",
      "\tquery_ids map to multiple target_ids in the alternative mapper 52 \n",
      "\tLOC not found in Entrez 29 \n",
      "\tNot found at all: 119\n",
      "Warning: query IDs mapping to duplicated target IDs in mapping table: 77\n",
      "Warning: query IDs not mapped to any target IDs excluded: 200\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "IDs mapped to multiple target IDs are kept:\n",
      " [143872, 286464, 51463, 642826, 653067, 399761, 647060, 284565, 84631, 161176, 341019, 83869, 9502, 83871, 728113, 729438, 4253, 645425, 26165, 6218, 728695, 100132948, 100134869, 84316, 200030, 642658, 100302179, 401508, 119016, 84458, 574445, 26095, 84968, 80759, 3192, 387707, 79741]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>X-1004</th>\n",
       "      <th>X-1008</th>\n",
       "      <th>X-1027</th>\n",
       "      <th>X-1095</th>\n",
       "      <th>X-1119</th>\n",
       "      <th>X-1156</th>\n",
       "      <th>X-1167</th>\n",
       "      <th>X-1169</th>\n",
       "      <th>X-1172</th>\n",
       "      <th>X-1173</th>\n",
       "      <th>...</th>\n",
       "      <th>X-5694</th>\n",
       "      <th>X-5696</th>\n",
       "      <th>X-5713</th>\n",
       "      <th>X-5717</th>\n",
       "      <th>X-5727</th>\n",
       "      <th>X-5739</th>\n",
       "      <th>X-5808</th>\n",
       "      <th>X-5959</th>\n",
       "      <th>X-5975</th>\n",
       "      <th>X-6047</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.367371</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.978196</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.509696</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.560715</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.483985</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.761285</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.500802</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.700440</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.201634</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.739848</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.739848</td>\n",
       "      <td>0.327687</td>\n",
       "      <td>-0.494109</td>\n",
       "      <td>-0.535332</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.761285</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.500802</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.700440</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.201634</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.739848</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.739848</td>\n",
       "      <td>0.327687</td>\n",
       "      <td>-0.494109</td>\n",
       "      <td>-0.535332</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 375 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           X-1004    X-1008  X-1027    X-1095  X-1119    X-1156  X-1167  \\\n",
       "gene_id                                                                   \n",
       "1        0.367371 -0.321928     0.0  0.000000     0.0  0.978196     0.0   \n",
       "2        0.761285  0.000000     0.0  0.500802     0.0  0.700440     0.0   \n",
       "3        0.761285  0.000000     0.0  0.500802     0.0  0.700440     0.0   \n",
       "\n",
       "           X-1169    X-1172  X-1173   ...    X-5694  X-5696  X-5713    X-5717  \\\n",
       "gene_id                               ...                                       \n",
       "1        2.509696  0.000000     0.0   ...       0.0     0.0     0.0  0.560715   \n",
       "2        0.000000  0.201634     0.0   ...       0.0     0.0     0.0  0.739848   \n",
       "3        0.000000  0.201634     0.0   ...       0.0     0.0     0.0  0.739848   \n",
       "\n",
       "         X-5727    X-5739    X-5808    X-5959    X-5975  X-6047  \n",
       "gene_id                                                          \n",
       "1           0.0  0.000000  0.000000  0.000000 -0.483985     0.0  \n",
       "2           0.0  0.739848  0.327687 -0.494109 -0.535332     0.0  \n",
       "3           0.0  0.739848  0.327687 -0.494109 -0.535332     0.0  \n",
       "\n",
       "[3 rows x 375 columns]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-index the table from gene symbols to Entrez Gene IDs using the NCBI symbol and\n",
    "# synonym mappers. With handle_duplicates=\"keep\" the rows whose IDs collide are left\n",
    "# in the result (see the stderr message of this cell).\n",
    "# NOTE(review): this overwrites pdx, so the cell is not meant to be re-run on its own output.\n",
    "pdx, query2target, not_mapped = apply_mappers(\n",
    "    pdx,\n",
    "    ncbi_symbols,\n",
    "    ncbi_synonyms,\n",
    "    verbose=True,\n",
    "    handle_duplicates=\"keep\"\n",
    ")\n",
    "pdx.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>X-1004</th>\n",
       "      <th>X-1008</th>\n",
       "      <th>X-1027</th>\n",
       "      <th>X-1095</th>\n",
       "      <th>X-1119</th>\n",
       "      <th>X-1156</th>\n",
       "      <th>X-1167</th>\n",
       "      <th>X-1169</th>\n",
       "      <th>X-1172</th>\n",
       "      <th>X-1173</th>\n",
       "      <th>...</th>\n",
       "      <th>X-5694</th>\n",
       "      <th>X-5696</th>\n",
       "      <th>X-5713</th>\n",
       "      <th>X-5717</th>\n",
       "      <th>X-5727</th>\n",
       "      <th>X-5739</th>\n",
       "      <th>X-5808</th>\n",
       "      <th>X-5959</th>\n",
       "      <th>X-5975</th>\n",
       "      <th>X-6047</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>143872</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.560715</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.330973</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>0.350497</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.349235</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>-0.588574</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.514573</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143872</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.560715</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.330973</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>0.350497</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.349235</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>-0.588574</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.514573</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>286464</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.550901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.524915</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.902389</td>\n",
       "      <td>-1.321928</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.718088</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.494109</td>\n",
       "      <td>-1.494109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>286464</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.550901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.524915</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.902389</td>\n",
       "      <td>-1.321928</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.718088</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.494109</td>\n",
       "      <td>-1.494109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>286464</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.550901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.524915</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.902389</td>\n",
       "      <td>-1.321928</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.718088</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.494109</td>\n",
       "      <td>-1.494109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51463</th>\n",
       "      <td>1.238787</td>\n",
       "      <td>1.090853</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.839960</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.448901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.469886</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.618239</td>\n",
       "      <td>0.201634</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.260152</td>\n",
       "      <td>-0.349235</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.469886</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51463</th>\n",
       "      <td>1.238787</td>\n",
       "      <td>1.090853</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.839960</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.448901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.438293</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.618239</td>\n",
       "      <td>0.201634</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.260152</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.469886</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>642826</th>\n",
       "      <td>0.608809</td>\n",
       "      <td>0.859970</td>\n",
       "      <td>0.531069</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.871844</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>-0.473931</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.618239</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.386468</td>\n",
       "      <td>-1.494109</td>\n",
       "      <td>-0.312939</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>642826</th>\n",
       "      <td>0.608809</td>\n",
       "      <td>0.859970</td>\n",
       "      <td>0.531069</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.871844</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>-0.473931</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.618239</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.386468</td>\n",
       "      <td>-1.494109</td>\n",
       "      <td>-0.312939</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>653067</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.550901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.434403</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.902389</td>\n",
       "      <td>1.220330</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.580145</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>-1.494109</td>\n",
       "      <td>-1.494109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>653067</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.550901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.434403</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.902389</td>\n",
       "      <td>1.220330</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.580145</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>-1.494109</td>\n",
       "      <td>-1.494109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>653067</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.550901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.434403</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.902389</td>\n",
       "      <td>1.220330</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.580145</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>-1.494109</td>\n",
       "      <td>-1.494109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>653067</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.550901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.434403</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.902389</td>\n",
       "      <td>1.220330</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.580145</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>-1.494109</td>\n",
       "      <td>-1.494109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>399761</th>\n",
       "      <td>0.531069</td>\n",
       "      <td>0.718088</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.251539</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.790772</td>\n",
       "      <td>...</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>-1.535332</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.599462</td>\n",
       "      <td>-0.588574</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>399761</th>\n",
       "      <td>0.531069</td>\n",
       "      <td>0.718088</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.251539</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.790772</td>\n",
       "      <td>...</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>-1.535332</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.599462</td>\n",
       "      <td>-0.588574</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>647060</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.312939</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.618239</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.621488</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.377070</td>\n",
       "      <td>-0.749038</td>\n",
       "      <td>-0.405451</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.395929</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>647060</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.312939</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.618239</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.621488</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.377070</td>\n",
       "      <td>-0.749038</td>\n",
       "      <td>-0.405451</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.395929</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>284565</th>\n",
       "      <td>1.238787</td>\n",
       "      <td>1.090853</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.570463</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.448901</td>\n",
       "      <td>0.411426</td>\n",
       "      <td>0.448901</td>\n",
       "      <td>...</td>\n",
       "      <td>0.469886</td>\n",
       "      <td>-0.504305</td>\n",
       "      <td>0.618239</td>\n",
       "      <td>0.201634</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.260152</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.469886</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>284565</th>\n",
       "      <td>1.238787</td>\n",
       "      <td>1.090853</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.310340</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.448901</td>\n",
       "      <td>0.411426</td>\n",
       "      <td>0.448901</td>\n",
       "      <td>...</td>\n",
       "      <td>0.469886</td>\n",
       "      <td>-0.524915</td>\n",
       "      <td>0.618239</td>\n",
       "      <td>0.201634</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.260152</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.469886</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84631</th>\n",
       "      <td>-0.823677</td>\n",
       "      <td>-0.524915</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.632268</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.074001</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.875672</td>\n",
       "      <td>-0.678072</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.500802</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.545824</td>\n",
       "      <td>-0.545824</td>\n",
       "      <td>-0.535332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84631</th>\n",
       "      <td>-0.823677</td>\n",
       "      <td>-0.524915</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.632268</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.074001</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.875672</td>\n",
       "      <td>-0.678072</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.500802</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.545824</td>\n",
       "      <td>-0.545824</td>\n",
       "      <td>-0.535332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>161176</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.358454</td>\n",
       "      <td>0.250962</td>\n",
       "      <td>0.339137</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.875672</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.483985</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.610433</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.411426</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>161176</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.358454</td>\n",
       "      <td>0.250962</td>\n",
       "      <td>0.339137</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.875672</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.483985</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.610433</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.411426</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>341019</th>\n",
       "      <td>0.959770</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.304006</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.349235</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.666576</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.304006</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>-0.610433</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.444184</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>341019</th>\n",
       "      <td>0.959770</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.304006</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.349235</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.312939</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.304006</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>-0.610433</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.444184</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83869</th>\n",
       "      <td>-3.000000</td>\n",
       "      <td>-3.184425</td>\n",
       "      <td>-3.321928</td>\n",
       "      <td>-3.000000</td>\n",
       "      <td>-2.599462</td>\n",
       "      <td>-3.556393</td>\n",
       "      <td>-2.785875</td>\n",
       "      <td>-0.875672</td>\n",
       "      <td>-2.736966</td>\n",
       "      <td>-2.514573</td>\n",
       "      <td>...</td>\n",
       "      <td>-2.556393</td>\n",
       "      <td>-3.943416</td>\n",
       "      <td>-2.785875</td>\n",
       "      <td>-2.152003</td>\n",
       "      <td>-3.556393</td>\n",
       "      <td>-3.251539</td>\n",
       "      <td>-2.736966</td>\n",
       "      <td>-3.473931</td>\n",
       "      <td>-3.120294</td>\n",
       "      <td>-3.643856</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83869</th>\n",
       "      <td>-3.000000</td>\n",
       "      <td>-3.184425</td>\n",
       "      <td>-3.321928</td>\n",
       "      <td>-3.000000</td>\n",
       "      <td>-2.599462</td>\n",
       "      <td>-3.556393</td>\n",
       "      <td>-2.785875</td>\n",
       "      <td>-0.875672</td>\n",
       "      <td>-2.736966</td>\n",
       "      <td>-2.514573</td>\n",
       "      <td>...</td>\n",
       "      <td>-2.556393</td>\n",
       "      <td>-3.943416</td>\n",
       "      <td>-2.785875</td>\n",
       "      <td>-2.152003</td>\n",
       "      <td>-3.556393</td>\n",
       "      <td>-3.251539</td>\n",
       "      <td>-2.736966</td>\n",
       "      <td>-3.473931</td>\n",
       "      <td>-3.120294</td>\n",
       "      <td>-3.643856</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9502</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.550901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.434403</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.902389</td>\n",
       "      <td>1.220330</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.580145</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>-1.494109</td>\n",
       "      <td>-1.494109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9502</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.550901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.434403</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.902389</td>\n",
       "      <td>1.220330</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.580145</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>-1.494109</td>\n",
       "      <td>-1.494109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83871</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500802</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.304006</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.349235</td>\n",
       "      <td>0.327687</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.463947</td>\n",
       "      <td>-0.358454</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.448901</td>\n",
       "      <td>-0.535332</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100134869</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.207893</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.232661</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.389567</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.463947</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>0.790772</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.535332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100134869</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.207893</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.232661</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.389567</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.463947</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>0.790772</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.535332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84316</th>\n",
       "      <td>-0.251539</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.349235</td>\n",
       "      <td>-0.588574</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.985645</td>\n",
       "      <td>-0.358454</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>-0.971431</td>\n",
       "      <td>-0.545824</td>\n",
       "      <td>0.310340</td>\n",
       "      <td>0.438293</td>\n",
       "      <td>-0.473931</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84316</th>\n",
       "      <td>-0.810966</td>\n",
       "      <td>-0.689660</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.718088</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.731183</td>\n",
       "      <td>0.367371</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.434403</td>\n",
       "      <td>1.060047</td>\n",
       "      <td>0.490570</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.632629</td>\n",
       "      <td>-0.655172</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>200030</th>\n",
       "      <td>1.238787</td>\n",
       "      <td>1.090853</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.839960</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.448901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.438293</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.618239</td>\n",
       "      <td>0.201634</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.260152</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.469886</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>200030</th>\n",
       "      <td>1.238787</td>\n",
       "      <td>1.090853</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.839960</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.448901</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.438293</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.618239</td>\n",
       "      <td>0.201634</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.260152</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.469886</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>642658</th>\n",
       "      <td>1.049631</td>\n",
       "      <td>1.358959</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.599318</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.321928</td>\n",
       "      <td>0.700440</td>\n",
       "      <td>0.298658</td>\n",
       "      <td>...</td>\n",
       "      <td>0.632268</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.769772</td>\n",
       "      <td>0.232661</td>\n",
       "      <td>1.121015</td>\n",
       "      <td>0.831877</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.121015</td>\n",
       "      <td>0.459432</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>642658</th>\n",
       "      <td>1.049631</td>\n",
       "      <td>1.358959</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.599318</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.321928</td>\n",
       "      <td>0.700440</td>\n",
       "      <td>0.298658</td>\n",
       "      <td>...</td>\n",
       "      <td>0.632268</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.769772</td>\n",
       "      <td>0.232661</td>\n",
       "      <td>1.121015</td>\n",
       "      <td>0.831877</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.121015</td>\n",
       "      <td>0.459432</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100302179</th>\n",
       "      <td>0.778209</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.201634</td>\n",
       "      <td>-0.251539</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.463947</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100302179</th>\n",
       "      <td>0.778209</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.201634</td>\n",
       "      <td>-0.251539</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.463947</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>401508</th>\n",
       "      <td>0.232661</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250962</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.207893</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.666576</td>\n",
       "      <td>1.629939</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-2.321928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.260152</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.545824</td>\n",
       "      <td>-0.330973</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>401508</th>\n",
       "      <td>0.232661</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250962</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.207893</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.666576</td>\n",
       "      <td>1.629939</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-2.321928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.260152</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.545824</td>\n",
       "      <td>-0.330973</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119016</th>\n",
       "      <td>0.378512</td>\n",
       "      <td>0.570463</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.251539</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.043943</td>\n",
       "      <td>-0.524915</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.377070</td>\n",
       "      <td>-0.643856</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119016</th>\n",
       "      <td>0.448901</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.242977</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>-0.473931</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84458</th>\n",
       "      <td>0.389567</td>\n",
       "      <td>-0.666576</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>-0.473931</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.386468</td>\n",
       "      <td>-0.610433</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84458</th>\n",
       "      <td>0.389567</td>\n",
       "      <td>-0.666576</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>-0.473931</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.386468</td>\n",
       "      <td>-0.610433</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>574445</th>\n",
       "      <td>0.490570</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.074001</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.238787</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.823677</td>\n",
       "      <td>-0.957356</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.985645</td>\n",
       "      <td>-0.454032</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.599462</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>574445</th>\n",
       "      <td>0.490570</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.074001</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.238787</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.823677</td>\n",
       "      <td>-0.957356</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.985645</td>\n",
       "      <td>-0.454032</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.599462</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26095</th>\n",
       "      <td>0.531069</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.251539</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.790772</td>\n",
       "      <td>...</td>\n",
       "      <td>0.269033</td>\n",
       "      <td>-1.043943</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.588574</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26095</th>\n",
       "      <td>0.531069</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.251539</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.286304</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.790772</td>\n",
       "      <td>...</td>\n",
       "      <td>0.269033</td>\n",
       "      <td>-1.043943</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.588574</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84968</th>\n",
       "      <td>-0.736966</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.570463</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.875672</td>\n",
       "      <td>-0.678072</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.250962</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.545824</td>\n",
       "      <td>-0.545824</td>\n",
       "      <td>-0.535332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84968</th>\n",
       "      <td>-0.736966</td>\n",
       "      <td>-0.321928</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.570463</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.875672</td>\n",
       "      <td>-0.678072</td>\n",
       "      <td>0.378512</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.250962</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.545824</td>\n",
       "      <td>-0.545824</td>\n",
       "      <td>-0.535332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>80759</th>\n",
       "      <td>-0.556393</td>\n",
       "      <td>0.599318</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250962</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.386468</td>\n",
       "      <td>-0.567041</td>\n",
       "      <td>-0.556393</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.621488</td>\n",
       "      <td>-0.454032</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.386468</td>\n",
       "      <td>-0.621488</td>\n",
       "      <td>0.761285</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>80759</th>\n",
       "      <td>-0.556393</td>\n",
       "      <td>0.599318</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.250962</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.386468</td>\n",
       "      <td>-0.567041</td>\n",
       "      <td>-0.556393</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.621488</td>\n",
       "      <td>-0.454032</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.386468</td>\n",
       "      <td>-0.621488</td>\n",
       "      <td>0.761285</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3192</th>\n",
       "      <td>1.350497</td>\n",
       "      <td>0.632268</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.220330</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.448901</td>\n",
       "      <td>-0.268817</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.480265</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.207893</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.330973</td>\n",
       "      <td>0.250962</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.021480</td>\n",
       "      <td>0.599318</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3192</th>\n",
       "      <td>1.350497</td>\n",
       "      <td>0.632268</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.220330</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.448901</td>\n",
       "      <td>-0.268817</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.480265</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.207893</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.330973</td>\n",
       "      <td>0.250962</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.021480</td>\n",
       "      <td>0.599318</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>387707</th>\n",
       "      <td>0.389567</td>\n",
       "      <td>-0.666576</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>-0.473931</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.386468</td>\n",
       "      <td>-0.610433</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>387707</th>\n",
       "      <td>0.389567</td>\n",
       "      <td>-0.666576</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.340075</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.014500</td>\n",
       "      <td>-0.473931</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.386468</td>\n",
       "      <td>-0.610433</td>\n",
       "      <td>-0.577767</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79741</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.589763</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.242977</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.207893</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.798366</td>\n",
       "      <td>-0.985645</td>\n",
       "      <td>-0.358454</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.700440</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>-0.588574</td>\n",
       "      <td>0.459432</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79741</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.589763</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.242977</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.207893</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.220330</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.798366</td>\n",
       "      <td>-0.985645</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.700440</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.367732</td>\n",
       "      <td>-0.588574</td>\n",
       "      <td>0.459432</td>\n",
       "      <td>0.269033</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>77 rows × 375 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             X-1004    X-1008    X-1027    X-1095    X-1119    X-1156  \\\n",
       "gene_id                                                                 \n",
       "143872     0.000000  0.560715  0.000000  0.000000  0.000000 -0.330973   \n",
       "143872     0.000000  0.560715  0.000000  0.000000  0.000000 -0.330973   \n",
       "286464     0.000000  0.378512  0.000000  0.550901  0.000000 -0.524915   \n",
       "286464     0.000000  0.378512  0.000000  0.550901  0.000000 -0.524915   \n",
       "286464     0.000000  0.378512  0.000000  0.550901  0.000000 -0.524915   \n",
       "51463      1.238787  1.090853  0.000000  0.000000  0.000000  1.839960   \n",
       "51463      1.238787  1.090853  0.000000  0.000000  0.000000  1.839960   \n",
       "642826     0.608809  0.859970  0.531069  0.000000  0.000000  0.871844   \n",
       "642826     0.608809  0.859970  0.531069  0.000000  0.000000  0.871844   \n",
       "653067     0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
       "653067     0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
       "653067     0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
       "653067     0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
       "399761     0.531069  0.718088  0.000000  0.000000  0.000000 -0.251539   \n",
       "399761     0.531069  0.718088  0.000000  0.000000  0.000000 -0.251539   \n",
       "647060     0.000000  0.000000  0.000000  0.000000  0.000000 -0.312939   \n",
       "647060     0.000000  0.000000  0.000000  0.000000  0.000000 -0.312939   \n",
       "284565     1.238787  1.090853  0.000000  0.000000  0.000000  1.570463   \n",
       "284565     1.238787  1.090853  0.000000  0.000000  0.000000  1.310340   \n",
       "84631     -0.823677 -0.524915  0.000000  0.632268  0.000000 -1.074001   \n",
       "84631     -0.823677 -0.524915  0.000000  0.632268  0.000000 -1.074001   \n",
       "161176     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   \n",
       "161176     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   \n",
       "341019     0.959770  0.000000  0.000000  0.000000  0.000000 -0.304006   \n",
       "341019     0.959770  0.000000  0.000000  0.000000  0.000000 -0.304006   \n",
       "83869     -3.000000 -3.184425 -3.321928 -3.000000 -2.599462 -3.556393   \n",
       "83869     -3.000000 -3.184425 -3.321928 -3.000000 -2.599462 -3.556393   \n",
       "9502       0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
       "9502       0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
       "83871      0.000000  0.000000  0.000000  0.500802  0.000000 -0.304006   \n",
       "...             ...       ...       ...       ...       ...       ...   \n",
       "100134869  0.000000  0.207893  0.000000  0.000000  0.000000  0.000000   \n",
       "100134869  0.000000  0.207893  0.000000  0.000000  0.000000  0.000000   \n",
       "84316     -0.251539  0.000000  0.000000  0.000000  0.000000 -0.340075   \n",
       "84316     -0.810966 -0.689660  0.000000  0.000000  0.000000  0.718088   \n",
       "200030     1.238787  1.090853  0.000000  0.000000  0.000000  1.839960   \n",
       "200030     1.238787  1.090853  0.000000  0.000000  0.000000  1.839960   \n",
       "642658     1.049631  1.358959  0.000000  0.599318  0.000000  0.000000   \n",
       "642658     1.049631  1.358959  0.000000  0.599318  0.000000  0.000000   \n",
       "100302179  0.778209  0.000000  0.000000  0.000000  0.000000 -0.340075   \n",
       "100302179  0.778209  0.000000  0.000000  0.000000  0.000000 -0.340075   \n",
       "401508     0.232661  0.000000  0.000000  0.000000  0.250962  0.000000   \n",
       "401508     0.232661  0.000000  0.000000  0.000000  0.250962  0.000000   \n",
       "119016     0.378512  0.570463  0.000000  0.000000  0.000000 -0.251539   \n",
       "119016     0.448901 -0.286304  0.000000  0.000000  0.000000 -0.242977   \n",
       "84458      0.389567 -0.666576  0.000000  0.000000  0.000000 -0.340075   \n",
       "84458      0.389567 -0.666576  0.000000  0.000000  0.000000 -0.340075   \n",
       "574445     0.490570  0.000000 -0.286304  0.000000  0.000000 -1.074001   \n",
       "574445     0.490570  0.000000 -0.286304  0.000000  0.000000 -1.074001   \n",
       "26095      0.531069  0.000000  0.000000  0.000000  0.000000 -0.251539   \n",
       "26095      0.531069  0.000000  0.000000  0.000000  0.000000 -0.251539   \n",
       "84968     -0.736966 -0.321928  0.000000  0.570463  0.000000  0.000000   \n",
       "84968     -0.736966 -0.321928  0.000000  0.570463  0.000000  0.000000   \n",
       "80759     -0.556393  0.599318  0.000000  0.000000  0.000000  0.250962   \n",
       "80759     -0.556393  0.599318  0.000000  0.000000  0.000000  0.250962   \n",
       "3192       1.350497  0.632268  0.000000  0.000000  0.000000  0.220330   \n",
       "3192       1.350497  0.632268  0.000000  0.000000  0.000000  0.220330   \n",
       "387707     0.389567 -0.666576  0.000000  0.000000  0.000000 -0.340075   \n",
       "387707     0.389567 -0.666576  0.000000  0.000000  0.000000 -0.340075   \n",
       "79741      0.000000  0.589763  0.000000  0.000000  0.000000 -0.242977   \n",
       "79741      0.000000  0.589763  0.000000  0.000000  0.000000 -0.242977   \n",
       "\n",
       "             X-1167    X-1169    X-1172    X-1173    ...       X-5694  \\\n",
       "gene_id                                              ...                \n",
       "143872     0.000000 -0.367732  0.350497  0.000000    ...     0.000000   \n",
       "143872     0.000000 -0.367732  0.350497  0.000000    ...     0.000000   \n",
       "286464     0.000000 -0.902389 -1.321928  0.367371    ...     0.000000   \n",
       "286464     0.000000 -0.902389 -1.321928  0.367371    ...     0.000000   \n",
       "286464     0.000000 -0.902389 -1.321928  0.367371    ...     0.000000   \n",
       "51463      0.000000  0.448901  0.000000  0.000000    ...     0.469886   \n",
       "51463      0.000000  0.448901  0.000000  0.000000    ...     0.438293   \n",
       "642826     0.000000 -0.286304  0.000000  0.000000    ...     0.000000   \n",
       "642826     0.000000 -0.286304  0.000000  0.000000    ...     0.000000   \n",
       "653067     0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
       "653067     0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
       "653067     0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
       "653067     0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
       "399761     0.000000 -0.286304  0.000000  0.790772    ...     0.367371   \n",
       "399761     0.000000 -0.286304  0.000000  0.790772    ...     0.367371   \n",
       "647060     0.000000 -0.321928  0.000000  0.618239    ...     0.000000   \n",
       "647060     0.000000 -0.321928  0.000000  0.618239    ...     0.000000   \n",
       "284565     0.000000  0.448901  0.411426  0.448901    ...     0.469886   \n",
       "284565     0.000000  0.448901  0.411426  0.448901    ...     0.469886   \n",
       "84631      0.000000 -0.875672 -0.678072  0.378512    ...     0.000000   \n",
       "84631      0.000000 -0.875672 -0.678072  0.378512    ...     0.000000   \n",
       "161176     0.000000 -0.358454  0.250962  0.339137    ...    -0.875672   \n",
       "161176     0.000000 -0.358454  0.250962  0.339137    ...    -0.875672   \n",
       "341019     0.000000 -0.349235  0.000000  0.000000    ...     0.000000   \n",
       "341019     0.000000 -0.349235  0.000000  0.000000    ...     0.000000   \n",
       "83869     -2.785875 -0.875672 -2.736966 -2.514573    ...    -2.556393   \n",
       "83869     -2.785875 -0.875672 -2.736966 -2.514573    ...    -2.556393   \n",
       "9502       0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
       "9502       0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
       "83871      0.000000 -0.349235  0.327687  0.000000    ...     0.000000   \n",
       "...             ...       ...       ...       ...    ...          ...   \n",
       "100134869  0.000000  0.000000  0.232661  0.000000    ...     0.000000   \n",
       "100134869  0.000000  0.000000  0.232661  0.000000    ...     0.000000   \n",
       "84316      0.000000 -0.349235 -0.588574  0.000000    ...     0.000000   \n",
       "84316      0.000000  0.000000  0.731183  0.367371    ...     0.000000   \n",
       "200030     0.000000  0.448901  0.000000  0.000000    ...     0.438293   \n",
       "200030     0.000000  0.448901  0.000000  0.000000    ...     0.438293   \n",
       "642658     0.000000  0.321928  0.700440  0.298658    ...     0.632268   \n",
       "642658     0.000000  0.321928  0.700440  0.298658    ...     0.632268   \n",
       "100302179  0.000000  0.000000  0.000000  0.000000    ...     0.000000   \n",
       "100302179  0.000000  0.000000  0.000000  0.000000    ...     0.000000   \n",
       "401508     0.207893  0.000000 -0.666576  1.629939    ...     0.000000   \n",
       "401508     0.207893  0.000000 -0.666576  1.629939    ...     0.000000   \n",
       "119016     0.000000 -0.286304  0.000000  0.000000    ...     0.000000   \n",
       "119016     0.000000 -0.286304  0.000000  0.000000    ...     0.000000   \n",
       "84458      0.000000 -0.367732  0.000000  0.000000    ...     0.000000   \n",
       "84458      0.000000 -0.367732  0.000000  0.000000    ...     0.000000   \n",
       "574445     0.000000  0.238787  0.000000  0.000000    ...    -0.823677   \n",
       "574445     0.000000  0.238787  0.000000  0.000000    ...    -0.823677   \n",
       "26095      0.000000 -0.286304  0.000000  0.790772    ...     0.269033   \n",
       "26095      0.000000 -0.286304  0.000000  0.790772    ...     0.269033   \n",
       "84968      0.000000 -0.875672 -0.678072  0.378512    ...     0.000000   \n",
       "84968      0.000000 -0.875672 -0.678072  0.378512    ...     0.000000   \n",
       "80759      0.000000 -0.386468 -0.567041 -0.556393    ...     0.000000   \n",
       "80759      0.000000 -0.386468 -0.567041 -0.556393    ...     0.000000   \n",
       "3192       0.000000  0.448901 -0.268817  0.000000    ...     0.480265   \n",
       "3192       0.000000  0.448901 -0.268817  0.000000    ...     0.480265   \n",
       "387707     0.000000 -0.367732  0.000000  0.000000    ...     0.000000   \n",
       "387707     0.000000 -0.367732  0.000000  0.000000    ...     0.000000   \n",
       "79741      0.000000  0.207893  0.000000  0.000000    ...    -0.798366   \n",
       "79741      0.000000  0.207893  0.000000  0.220330    ...    -0.798366   \n",
       "\n",
       "             X-5696    X-5713    X-5717    X-5727    X-5739    X-5808  \\\n",
       "gene_id                                                                 \n",
       "143872    -0.349235  0.000000 -0.321928 -1.014500 -0.588574  0.000000   \n",
       "143872    -0.349235  0.000000 -0.321928 -1.014500 -0.588574  0.000000   \n",
       "286464    -1.014500  0.718088  0.000000 -1.014500  0.000000  0.000000   \n",
       "286464    -1.014500  0.718088  0.000000 -1.014500  0.000000  0.000000   \n",
       "286464    -1.014500  0.718088  0.000000 -1.014500  0.000000  0.000000   \n",
       "51463      0.000000  0.618239  0.201634  0.000000 -0.260152 -0.349235   \n",
       "51463      0.000000  0.618239  0.201634  0.000000 -0.260152  0.000000   \n",
       "642826    -1.014500 -0.473931  0.000000  0.618239  0.000000 -0.386468   \n",
       "642826    -1.014500 -0.473931  0.000000  0.618239  0.000000 -0.386468   \n",
       "653067    -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
       "653067    -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
       "653067    -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
       "653067    -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
       "399761    -1.535332  0.000000  0.000000  0.000000  0.000000 -0.599462   \n",
       "399761    -1.535332  0.000000  0.000000  0.000000  0.000000 -0.599462   \n",
       "647060    -0.621488  0.000000 -0.377070 -0.749038 -0.405451  0.000000   \n",
       "647060    -0.621488  0.000000 -0.377070 -0.749038 -0.405451  0.000000   \n",
       "284565    -0.504305  0.618239  0.201634  0.000000 -0.260152  0.000000   \n",
       "284565    -0.524915  0.618239  0.201634  0.000000 -0.260152  0.000000   \n",
       "84631     -1.014500  0.500802  0.000000  0.000000  0.000000  0.000000   \n",
       "84631     -1.014500  0.500802  0.000000  0.000000  0.000000  0.000000   \n",
       "161176     0.000000 -0.483985  0.000000  0.000000 -0.610433  0.000000   \n",
       "161176     0.000000 -0.483985  0.000000  0.000000 -0.610433  0.000000   \n",
       "341019    -0.666576  0.000000 -0.304006 -1.000000 -0.610433  0.000000   \n",
       "341019    -0.312939  0.000000 -0.304006 -1.000000 -0.610433  0.000000   \n",
       "83869     -3.943416 -2.785875 -2.152003 -3.556393 -3.251539 -2.736966   \n",
       "83869     -3.943416 -2.785875 -2.152003 -3.556393 -3.251539 -2.736966   \n",
       "9502      -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
       "9502      -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
       "83871      0.000000 -0.463947 -0.358454  0.000000  0.000000  0.000000   \n",
       "...             ...       ...       ...       ...       ...       ...   \n",
       "100134869  0.389567  0.000000  0.000000 -0.463947  0.000000 -0.367732   \n",
       "100134869  0.389567  0.000000  0.000000 -0.463947  0.000000 -0.367732   \n",
       "84316     -0.985645 -0.358454 -0.340075 -0.971431 -0.545824  0.310340   \n",
       "84316      0.000000  0.000000 -0.434403  1.060047  0.490570  0.000000   \n",
       "200030     0.000000  0.618239  0.201634  0.000000 -0.260152  0.000000   \n",
       "200030     0.000000  0.618239  0.201634  0.000000 -0.260152  0.000000   \n",
       "642658     0.000000  0.769772  0.232661  1.121015  0.831877 -0.367732   \n",
       "642658     0.000000  0.769772  0.232661  1.121015  0.831877 -0.367732   \n",
       "100302179  0.000000  0.201634 -0.251539  0.000000 -0.463947  0.000000   \n",
       "100302179  0.000000  0.201634 -0.251539  0.000000 -0.463947  0.000000   \n",
       "401508    -2.321928  0.000000  0.000000 -0.260152  0.000000 -0.545824   \n",
       "401508    -2.321928  0.000000  0.000000 -0.260152  0.000000 -0.545824   \n",
       "119016    -1.043943 -0.524915  0.000000  0.000000  0.000000 -0.377070   \n",
       "119016    -1.014500 -0.473931  0.000000  0.000000  0.000000 -0.367732   \n",
       "84458     -1.014500 -0.473931  0.000000  0.000000  0.000000 -0.386468   \n",
       "84458     -1.014500 -0.473931  0.000000  0.000000  0.000000 -0.386468   \n",
       "574445    -0.957356 -0.321928  0.000000 -0.985645 -0.454032  0.000000   \n",
       "574445    -0.957356 -0.321928  0.000000 -0.985645 -0.454032  0.000000   \n",
       "26095     -1.043943  0.000000  0.000000  0.000000  0.000000  0.000000   \n",
       "26095     -1.043943  0.000000  0.000000  0.000000  0.000000  0.000000   \n",
       "84968     -1.000000  0.250962  0.000000  0.000000  0.000000  0.000000   \n",
       "84968     -1.000000  0.250962  0.000000  0.000000  0.000000  0.000000   \n",
       "80759      0.000000 -0.621488 -0.454032  0.000000  0.000000 -0.386468   \n",
       "80759      0.000000 -0.621488 -0.454032  0.000000  0.000000 -0.386468   \n",
       "3192       0.000000  0.207893  0.000000  0.000000 -0.330973  0.250962   \n",
       "3192       0.000000  0.207893  0.000000  0.000000 -0.330973  0.250962   \n",
       "387707    -1.014500 -0.473931  0.000000  0.000000  0.000000 -0.386468   \n",
       "387707    -1.014500 -0.473931  0.000000  0.000000  0.000000 -0.386468   \n",
       "79741     -0.985645 -0.358454  0.000000  0.700440  0.000000 -0.367732   \n",
       "79741     -0.985645  0.000000  0.000000  0.700440  0.000000 -0.367732   \n",
       "\n",
       "             X-5959    X-5975    X-6047  \n",
       "gene_id                                  \n",
       "143872     0.000000  0.000000 -0.514573  \n",
       "143872     0.000000  0.000000 -0.514573  \n",
       "286464     0.000000 -1.494109 -1.494109  \n",
       "286464     0.000000 -1.494109 -1.494109  \n",
       "286464     0.000000 -1.494109 -1.494109  \n",
       "51463      0.000000  0.469886  0.000000  \n",
       "51463      0.000000  0.469886  0.000000  \n",
       "642826    -1.494109 -0.312939  0.000000  \n",
       "642826    -1.494109 -0.312939  0.000000  \n",
       "653067    -0.577767 -1.494109 -1.494109  \n",
       "653067    -0.577767 -1.494109 -1.494109  \n",
       "653067    -0.577767 -1.494109 -1.494109  \n",
       "653067    -0.577767 -1.494109 -1.494109  \n",
       "399761    -0.588574  0.000000  0.000000  \n",
       "399761    -0.588574  0.000000  0.000000  \n",
       "647060    -0.395929  0.000000  0.000000  \n",
       "647060    -0.395929  0.000000  0.000000  \n",
       "284565     0.000000  0.469886  0.000000  \n",
       "284565     0.000000  0.469886  0.000000  \n",
       "84631     -0.545824 -0.545824 -0.535332  \n",
       "84631     -0.545824 -0.545824 -0.535332  \n",
       "161176     0.411426  0.000000  0.000000  \n",
       "161176     0.411426  0.000000  0.000000  \n",
       "341019     0.000000 -0.444184  0.000000  \n",
       "341019     0.000000 -0.444184  0.000000  \n",
       "83869     -3.473931 -3.120294 -3.643856  \n",
       "83869     -3.473931 -3.120294 -3.643856  \n",
       "9502      -0.577767 -1.494109 -1.494109  \n",
       "9502      -0.577767 -1.494109 -1.494109  \n",
       "83871      0.448901 -0.535332  0.000000  \n",
       "...             ...       ...       ...  \n",
       "100134869  0.790772  0.000000 -0.535332  \n",
       "100134869  0.790772  0.000000 -0.535332  \n",
       "84316      0.438293 -0.473931  0.000000  \n",
       "84316     -0.632629 -0.655172  0.000000  \n",
       "200030     0.000000  0.469886  0.000000  \n",
       "200030     0.000000  0.469886  0.000000  \n",
       "642658     0.000000  1.121015  0.459432  \n",
       "642658     0.000000  1.121015  0.459432  \n",
       "100302179  0.000000  0.000000  0.000000  \n",
       "100302179  0.000000  0.000000  0.000000  \n",
       "401508    -0.330973  0.000000  0.000000  \n",
       "401508    -0.330973  0.000000  0.000000  \n",
       "119016    -0.643856  0.000000  0.000000  \n",
       "119016    -0.577767  0.000000  0.000000  \n",
       "84458     -0.610433 -0.577767  0.000000  \n",
       "84458     -0.610433 -0.577767  0.000000  \n",
       "574445    -0.599462 -0.577767  0.000000  \n",
       "574445    -0.599462 -0.577767  0.000000  \n",
       "26095     -0.588574  0.000000  0.000000  \n",
       "26095     -0.588574  0.000000  0.000000  \n",
       "84968     -0.545824 -0.545824 -0.535332  \n",
       "84968     -0.545824 -0.545824 -0.535332  \n",
       "80759     -0.621488  0.761285  0.000000  \n",
       "80759     -0.621488  0.761285  0.000000  \n",
       "3192       0.000000  1.021480  0.599318  \n",
       "3192       0.000000  1.021480  0.599318  \n",
       "387707    -0.610433 -0.577767  0.000000  \n",
       "387707    -0.610433 -0.577767  0.000000  \n",
       "79741     -0.588574  0.459432  0.000000  \n",
       "79741     -0.588574  0.459432  0.269033  \n",
       "\n",
       "[77 rows x 375 columns]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dups = list(set(pdx[pdx.index.duplicated(keep=False)].index.values))\n",
    "pdx.loc[dups,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "37 duplicated IDs in 77 rows found.\n",
      "duplicate rows removed due to low correlation of duplicated profiles 4\n",
      "Merged  73 duplicated rows into 35 rows\n"
     ]
    }
   ],
   "source": [
    "# most of these dupliates correspond to genes merged in the current assembly, e.g. gene - gene-AS\n",
    "pdx = handle_dups(pdx,corr_thr = 0.75)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "pdx = pdx.T.sort_index().T\n",
    "pdx.to_csv(preprocessed_dir+\"/\"+\"PDX\"+\".Segment_Mean.CNA.tsv\",\n",
    "                 sep = \"\\t\",header=True,index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluation of the results\n",
    "1). How many common genes between four datasets?\n",
    "\n",
    "2). Do CNA profiles of the same cell line from GDSC and CCLE correlate?\n",
    "\n",
    "3). Do CNA profiles of the same cancer type from TCGA and PDX look similar?\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "ename": "IOError",
     "evalue": "File /home/olya/SFU/Hossein/v1/preprocessed/CNA/BRCA.Segment_Mean.CNA.tsv does not exist",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mIOError\u001b[0m                                   Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-39-1f476096b0ec>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m### 1). How many common genes between four datasets?\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;31m# we take BRCA from TCGA because\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtcga\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreprocessed_dir\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"BRCA\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\".Segment_Mean.CNA.tsv\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\\t\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m \u001b[0;31m#print(tcga.head(3))\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mgdsc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreprocessed_dir\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"GDSC\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\".Segment_Mean.CNA.tsv\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\\t\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[1;32m    676\u001b[0m                     skip_blank_lines=skip_blank_lines)\n\u001b[1;32m    677\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 678\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    679\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    680\u001b[0m     \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    438\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    439\u001b[0m     \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 440\u001b[0;31m     \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    441\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    442\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m    785\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    786\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 787\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    788\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    789\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m   1012\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1013\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1014\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1015\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1016\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'python'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m   1706\u001b[0m         \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'usecols'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0musecols\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1707\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1708\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1709\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1710\u001b[0m         \u001b[0mpassed_names\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnames\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mIOError\u001b[0m: File /home/olya/SFU/Hossein/v1/preprocessed/CNA/BRCA.Segment_Mean.CNA.tsv does not exist"
     ]
    }
   ],
   "source": [
    "### 1). How many common genes between four datasets?\n",
    "# we take BRCA from TCGA because  \n",
    "tcga = pd.read_csv(preprocessed_dir+\"BRCA\"+\".Segment_Mean.CNA.tsv\",sep = \"\\t\", index_col=0)\n",
    "#print(tcga.head(3))\n",
    "gdsc = pd.read_csv(preprocessed_dir+\"GDSC\"+\".Segment_Mean.CNA.tsv\",sep = \"\\t\", index_col=0)\n",
    "#print(tcga.head(3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### distribution of logR values in GDSC and CCLE \n",
    "cn_values_gdsc  = []\n",
    "for row in df.iterrows():\n",
    "    cn_values_gdsc += list(row[1].values)\n",
    "cn_values_ccle = []\n",
    "for row in cna_table.iterrows():\n",
    "    cn_values_ccle+=  list(row[1].values)\n",
    "\n",
    "cn_values_gdsc = sorted (cn_values_gdsc)\n",
    "cn_values_ccle = sorted (cn_values_ccle)\n",
    "plt.figure(figsize=(20,5))\n",
    "plt.subplot(121)\n",
    "tmp = plt.hist(cn_values_gdsc,bins=100,density = True,range=(-5,4))\n",
    "plt.title(\"GDSC\")\n",
    "plt.subplot(122)\n",
    "tmp = plt.hist(cn_values_ccle,bins=100,density = True, range=(-5,4))\n",
    "plt.title(\"CCLE\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}