a b/preprocessing_scr/CNA.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 1,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "from __future__ import print_function\n",
10
    "import pandas as pd\n",
11
    "import os,sys\n",
12
    "import pybedtools as pbt\n",
13
    "from StringIO import StringIO\n",
14
    "import numpy as np\n",
15
    "import matplotlib.pyplot as plt\n",
16
    "import numpy as np\n",
17
    "import time\n",
18
    "from mapper import expand, parse_mapping_table, apply_mappers\n",
19
    "%matplotlib inline\n",
20
    "\n",
21
    "\n",
22
    "chr_dict = dict(zip(range(1,22),map(str,range(1,22))))\n",
23
    "chr_dict.update({22: 'X', 23: \"Y\"})\n",
24
    "\n",
25
    "root_dir = \"/home/olya/SFU/Hossein/v2/\"\n",
26
    "gene_coords_file = root_dir + \"ref_GRCh37.p5_top_level.gff3.bed\" # must contain chromosome, start, end and Entrez Gene ID for hg19"
27
   ]
28
  },
29
  {
30
   "cell_type": "markdown",
31
   "metadata": {},
32
   "source": [
33
    "# TCGA \n",
34
    "\n",
35
    "Assume that segmentation files from GDAC : http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/*/*snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt are dowmnoaded\n",
36
    "\n",
37
    "1) Filtering segments:\n",
38
    " - segments containing less than 5 probes removed\n",
39
    " - keep only segments with segment mean below -0.23 or above 0.2. This means that one copy gains and losses are detectable when their CCF (canncer cell fraction) is 0.3 or higher. \n",
40
    " \n",
41
    "TODO: remove segements overlapping with germline CNA forund in normals (add this as the first step)\n",
42
    "2). For each samples aggregte to gene-level:\n",
43
    " - rename chromosomes 22 and 23 to X and Y\n",
44
    " - overpal segemntation file with Entrez gene coordinates for hg19\n",
45
    " - if a gene overlaps by multiple segments, keep the one with most extreme values"
46
   ]
47
  },
48
  {
49
   "cell_type": "code",
50
   "execution_count": 2,
51
   "metadata": {},
52
   "outputs": [],
53
   "source": [
54
    "num_marker_thr = 5\n",
55
    "# to detect 1 copy gains or losses presenting at CCF >= 0.3\n",
56
    "pos_seg_mean_thr = 0.20\n",
57
    "neg_seg_mean_thr = -0.23 \n",
58
    "\n",
59
    "preprocessed_dir = root_dir+\"preprocessed/CNA/\"\n"
60
   ]
61
  },
62
  {
63
   "cell_type": "code",
64
   "execution_count": 19,
65
   "metadata": {},
66
   "outputs": [],
67
   "source": [
68
    "### functions for TCGA and CCLE #################################\n",
69
    "def filter_lowconf_segments(df,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr ):\n",
70
    "    # filter low-confidence segments with too few probes\n",
71
    "    df = df[df[\"Num_Probes\"] >= num_marker_thr ]\n",
72
    "    # filter low-confidence segments with Segment_Mean too close to zero:\n",
73
    "    df = df[ (df[\"Segment_Mean\"] >= pos_seg_mean_thr) | (df[\"Segment_Mean\"] <= neg_seg_mean_thr)]\n",
74
    "    return df\n",
75
    "\n",
76
    "def sample_type(barcode):\n",
77
    "    if barcode[13:16] in [\"10A\",\"10B\",\"11A\",\"11B\",\"10C\",\"11C\"]:\n",
78
    "        return \"Normal\"\n",
79
    "    else:\n",
80
    "        return \"Tumor\"\n",
81
    "\n",
82
    "def find_matching_normal(tumor_barcode,barcodes_list):\n",
83
    "    patient_id = tumor_barcode[:12]\n",
84
    "    normal_barcodes = []\n",
85
    "    for barcode in barcodes_list:\n",
86
    "        if barcode.startswith(patient_id) and sample_type(barcode) == \"Normal\":\n",
87
    "            normal_barcodes.append(barcode)\n",
88
    "    return normal_barcodes\n",
89
    "\n",
90
    "def cnv2bed(seg):\n",
91
    "    #cnv_bed = seg[[\"Chromosome\",\"Start\",\"End\",\"Segment_Mean\"]]\n",
92
    "    #cnv_bed.columns = [\"chrom\",\"start\",\"stop\",\"Segment_Mean\"]\n",
93
    "    cnv_bed = seg.rename({\"Chromosome\":\"chrom\",\"Start\":\"start\",\n",
94
    "                          \"End\":\"stop\"},axis=\"columns\")\n",
95
    "    cnv_bed = cnv_bed.loc[:,[\"chrom\",\"start\",\"stop\",\"Segment_Mean\",\"Sample\",\"Num_Probes\"]]\n",
96
    "    return  pbt.BedTool.from_dataframe(cnv_bed)\n",
97
    "\n",
98
    "def bed2cnv(cnv_bed):\n",
99
    "    cnv_bed = str(cnv_bed)\n",
100
    "    if len(cnv_bed) > 0:\n",
101
    "        seg = pd.read_csv(StringIO(cnv_bed),sep = \"\\t\",header=None)\n",
102
    "        seg.columns = [\"Chromosome\",\"Start\",\"End\",\"Segment_Mean\",\"Sample\",\"Num_Probes\"]\n",
103
    "        seg = seg.loc[:,[\"Sample\",\"Chromosome\",\"Start\",\"End\",\"Num_Probes\",\"Segment_Mean\"]]\n",
104
    "    else:\n",
105
    "        seg = pd.DataFrame(columns=[\"Sample\",\"Chromosome\",\"Start\",\"End\",\"Num_Probes\",\"Segment_Mean\"])\n",
106
    "    return seg\n",
107
    "def remove_ovelapping_segments(tumor, normal,sample_name):\n",
108
    "    tumor_bed = cnv2bed(tumor)\n",
109
    "    normal_bed = cnv2bed(normal)\n",
110
    "    tumor_wo_germline = tumor_bed.subtract(normal_bed,r=True,f=0.8,A=True)\n",
111
    "    tumor_wo_germline  = bed2cnv(tumor_wo_germline)\n",
112
    "    n_segs_removed = tumor.shape[0] - tumor_wo_germline.shape[0]\n",
113
    "    if n_segs_removed*1.0/tumor.shape[0] > 0.5 and n_segs_removed>5 :\n",
114
    "        print(n_segs_removed,\"of\",tumor.shape[0],\"segments removed in\",sample_name,\"due to overlap with normal\",file = sys.stderr)\n",
115
    "    return tumor_wo_germline\n",
116
    "\n",
117
    "def cnv2genelevel(cnv_bed,gene_intervals_bed,sample_name,verbose = True,sorted_index = \"\"):\n",
118
    "    \n",
119
    "    # intersect \n",
120
    "    cnv2gene = str(gene_intervals_bed.intersect(cnv_bed,wb = True,wa=True))\n",
121
    "    if len(cnv2gene)==0: # if no intersection, return all zeroes\n",
122
    "        print(sample_name,\"has no genes with altered CN\",file = sys.stderr)\n",
123
    "        return pd.DataFrame(columns=[sample])\n",
124
    "    cnv2gene = pd.read_csv(StringIO(cnv2gene),sep = \"\\t\",header=None)\n",
125
    "    cnv2gene = cnv2gene[[3,7]].copy()\n",
126
    "    cnv2gene.columns = [\"gene\",\"Segment_Mean\"] \n",
127
    "    \n",
128
    "    # find genes overlapping with more than one segment:\n",
129
    "    # take the most exterme segement_mean value\n",
130
    "    \n",
131
    "    dups = cnv2gene.loc[cnv2gene.duplicated(subset=[\"gene\"],keep=False),]\n",
132
    "    if dups.shape[0] > 0:\n",
133
    "        cnv2gene = cnv2gene.drop_duplicates(subset=[\"gene\"],keep=False)\n",
134
    "        dups[\"abs_seg_mean\"] = abs(dups[\"Segment_Mean\"])\n",
135
    "        if verbose:\n",
136
    "            print(sample_name,\"contain \",len(set(dups[\"gene\"].values)),\"genes overalpped with more than one segment\",file=sys.stderr)\n",
137
    "            #print(dups.head(10),file=sys.stderr)\n",
138
    "        dups = dups.groupby(['gene'], group_keys=False).apply(lambda row: row.loc[row['abs_seg_mean'].idxmax()])\n",
139
    "        cnv2gene = pd.concat([cnv2gene,dups],sort=False)\n",
140
    "\n",
141
    "    cnv2gene = cnv2gene[[\"gene\",\"Segment_Mean\"]]\n",
142
    "    cnv2gene.set_index(\"gene\",inplace=True,drop=True)\n",
143
    "    cnv2gene.rename(int,axis=0,inplace=True)\n",
144
    "    # add copy-neutral genes with 0s\n",
145
    "    \n",
146
    "    cnv2gene = cnv2gene.loc[sorted_index,:]\n",
147
    "    cnv2gene.columns = [sample_name]\n",
148
    "    return cnv2gene\n",
149
    "\n",
150
    "\n",
151
    "### functions for GDSC and PDX #################################\n",
152
    "\n",
153
    "def CN2log2R(col, median_ploidy=2 ):\n",
154
    "    # this is fr GDSC only\n",
155
    "    lRs = []\n",
156
    "    genes = col.index.values\n",
157
    "    for code in col.values:\n",
158
    "        if not code == \"-1,-1,-,-\":\n",
159
    "            [max_cn,min_cn,zygosity,disruption] = code.split(\",\")\n",
160
    "            if int(max_cn) == 0:\n",
161
    "                lRs.append(-4.32) # CN=0 with 95% purity\n",
162
    "            else:\n",
163
    "                max_lR = np.log2(float(max_cn)/median_ploidy)\n",
164
    "                if not disruption == \"D\":\n",
165
    "                    lRs.append(max_lR)\n",
166
    "                else:\n",
167
    "                    if int(min_cn) == 0:\n",
168
    "                        min_lR = -4.32\n",
169
    "                    else:\n",
170
    "                        min_lR = np.log2(float(min_cn)/median_ploidy)\n",
171
    "                    if abs(min_lR) > abs(max_lR):\n",
172
    "                        lRs.append(min_lR)\n",
173
    "                    else:\n",
174
    "                        lRs.append(max_lR)\n",
175
    "                \n",
176
    "        else:\n",
177
    "            lRs.append(np.NaN)\n",
178
    "    return pd.Series(dict(zip(genes, lRs)))\n",
179
    "\n",
180
    "def define_avg_ploidy(col):\n",
181
    "    n,pl = 0,0\n",
182
    "    CN_non_disrupted = []\n",
183
    "    for code in col.values:\n",
184
    "        if not code == \"-1,-1,-,-\":\n",
185
    "            [max_cn,min_cn,zygosity,disruption] = code.split(\",\")\n",
186
    "            n+=1\n",
187
    "            cn = (int(max_cn)+int(min_cn))*0.5\n",
188
    "            pl += cn\n",
189
    "            if not disruption == \"D\":\n",
190
    "                CN_non_disrupted.append((cn))\n",
191
    "    return pd.Series({\"avg_pl\":pl/n , \"median_pl\":np.median(CN_non_disrupted)})\n",
192
    "\n",
193
    "def clean_logR(logR_value, pos_seg_mean_thr, neg_seg_mean_thr):\n",
194
    "    if logR_value >= pos_seg_mean_thr:\n",
195
    "        return logR_value \n",
196
    "    elif logR_value <= neg_seg_mean_thr:\n",
197
    "            return logR_value \n",
198
    "    else:\n",
199
    "        return 0\n",
200
    "    \n",
201
    "def handle_dups(df,corr_thr = 0.75):\n",
202
    "    '''Detect dupliated row IDs. Merge 2 or more rows with the same ID, \n",
203
    "    if averaged correlation in all pairvise comparision is >= corr_thhr;\\n\n",
204
    "    otherwise drop all duplicates.  Keeps abs. max value (negative preferred).'''\n",
205
    "    dups = df.index\n",
206
    "    dups = list(set(dups[dups.duplicated()]))\n",
207
    "    if len(dups)==0:\n",
208
    "        print(\"No duplicated row IDs. Do nothing.\")\n",
209
    "        return df\n",
210
    "    print(len(dups), \"duplicated IDs in\",df.loc[dups,:].shape[0],\"rows found.\")\n",
211
    "    dups_merge = [] # if corr > corr_thr\n",
212
    "    dups_remove = [] # corr < \n",
213
    "    for dup in dups:\n",
214
    "        r = df.loc[dup,:].T.corr()\n",
215
    "        n_dups = df.loc[dup,:].shape[0]\n",
216
    "        r_avg = []\n",
217
    "        for i in range(0,n_dups):\n",
218
    "            for j in range(i+1,n_dups):\n",
219
    "                r_avg.append(r.iloc[i,j])\n",
220
    "        if np.average(r_avg) < corr_thr :\n",
221
    "            #print(dup,r_avg, n_dups)\n",
222
    "            dups_remove.append(dup)\n",
223
    "        else:\n",
224
    "            dups_merge.append(dup)\n",
225
    "    \n",
226
    "    # remove not similar duplicates\n",
227
    "    df_size = df.shape[0]\n",
228
    "    df = df.loc[~df.index.isin(dups_remove),:]\n",
229
    "    print(\"duplicate rows removed due to low correlation of duplicated profiles\",df_size -df.shape[0] )\n",
230
    "    df_size = df.shape[0]\n",
231
    "    \n",
232
    "    # merge simialr duplicates\n",
233
    "    d1 = df.loc[~df.index.isin(dups_merge),:]\n",
234
    "    d2 = df.loc[dups_merge,:]\n",
235
    "    d2 = d2.groupby(d2.index).agg(lambda x: -max(-x.max(),-x.min(),key= abs))\n",
236
    "    df = pd.concat([d1,d2])\n",
237
    "    df.sort_index(inplace=True)\n",
238
    "    print(\"Merged \",df_size-df.shape[0]+len(dups_merge),\"duplicated rows into\",len(dups_merge),\"rows\")\n",
239
    "    return df"
240
   ]
241
  },
242
  {
243
   "cell_type": "markdown",
244
   "metadata": {},
245
   "source": [
246
    "### next few tabs demonstrate necessity of removing low-confidence and germline segments:  \n",
247
    "\n",
248
    "(e.g. fragment 11:126596926-127130276 presents in both tumor and normal\n",
249
    "therefore, it is germline; see chr11:126596926-12713027 in UCSC browser - it covers part of KIRELL3)\n"
250
   ]
251
  },
252
  {
253
   "cell_type": "code",
254
   "execution_count": 8,
255
   "metadata": {
256
    "scrolled": false
257
   },
258
   "outputs": [
259
    {
260
     "name": "stdout",
261
     "output_type": "stream",
262
     "text": [
263
      "segemtns in tumor 204 segemtns in normal 121\n"
264
     ]
265
    },
266
    {
267
     "data": {
268
      "text/html": [
269
       "<div>\n",
270
       "<style scoped>\n",
271
       "    .dataframe tbody tr th:only-of-type {\n",
272
       "        vertical-align: middle;\n",
273
       "    }\n",
274
       "\n",
275
       "    .dataframe tbody tr th {\n",
276
       "        vertical-align: top;\n",
277
       "    }\n",
278
       "\n",
279
       "    .dataframe thead th {\n",
280
       "        text-align: right;\n",
281
       "    }\n",
282
       "</style>\n",
283
       "<table border=\"1\" class=\"dataframe\">\n",
284
       "  <thead>\n",
285
       "    <tr style=\"text-align: right;\">\n",
286
       "      <th></th>\n",
287
       "      <th>Sample</th>\n",
288
       "      <th>Chromosome</th>\n",
289
       "      <th>Start</th>\n",
290
       "      <th>End</th>\n",
291
       "      <th>Num_Probes</th>\n",
292
       "      <th>Segment_Mean</th>\n",
293
       "    </tr>\n",
294
       "  </thead>\n",
295
       "  <tbody>\n",
296
       "    <tr>\n",
297
       "      <th>57803</th>\n",
298
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
299
       "      <td>11</td>\n",
300
       "      <td>456120</td>\n",
301
       "      <td>8896255</td>\n",
302
       "      <td>4489.0</td>\n",
303
       "      <td>-0.0113</td>\n",
304
       "    </tr>\n",
305
       "    <tr>\n",
306
       "      <th>57804</th>\n",
307
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
308
       "      <td>11</td>\n",
309
       "      <td>8899400</td>\n",
310
       "      <td>8899668</td>\n",
311
       "      <td>3.0</td>\n",
312
       "      <td>-1.3344</td>\n",
313
       "    </tr>\n",
314
       "    <tr>\n",
315
       "      <th>57805</th>\n",
316
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
317
       "      <td>11</td>\n",
318
       "      <td>8900394</td>\n",
319
       "      <td>126596817</td>\n",
320
       "      <td>67487.0</td>\n",
321
       "      <td>0.0010</td>\n",
322
       "    </tr>\n",
323
       "    <tr>\n",
324
       "      <th>57806</th>\n",
325
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
326
       "      <td>11</td>\n",
327
       "      <td>126596926</td>\n",
328
       "      <td>127130276</td>\n",
329
       "      <td>453.0</td>\n",
330
       "      <td>-1.0306</td>\n",
331
       "    </tr>\n",
332
       "    <tr>\n",
333
       "      <th>57807</th>\n",
334
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
335
       "      <td>11</td>\n",
336
       "      <td>127132920</td>\n",
337
       "      <td>128342803</td>\n",
338
       "      <td>864.0</td>\n",
339
       "      <td>-0.0031</td>\n",
340
       "    </tr>\n",
341
       "    <tr>\n",
342
       "      <th>57808</th>\n",
343
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
344
       "      <td>11</td>\n",
345
       "      <td>128342819</td>\n",
346
       "      <td>128350888</td>\n",
347
       "      <td>44.0</td>\n",
348
       "      <td>0.2824</td>\n",
349
       "    </tr>\n",
350
       "    <tr>\n",
351
       "      <th>57809</th>\n",
352
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
353
       "      <td>11</td>\n",
354
       "      <td>128353007</td>\n",
355
       "      <td>134142530</td>\n",
356
       "      <td>3708.0</td>\n",
357
       "      <td>0.0082</td>\n",
358
       "    </tr>\n",
359
       "  </tbody>\n",
360
       "</table>\n",
361
       "</div>"
362
      ],
363
      "text/plain": [
364
       "                             Sample  Chromosome      Start        End  \\\n",
365
       "57803  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11     456120    8896255   \n",
366
       "57804  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11    8899400    8899668   \n",
367
       "57805  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11    8900394  126596817   \n",
368
       "57806  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11  126596926  127130276   \n",
369
       "57807  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11  127132920  128342803   \n",
370
       "57808  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11  128342819  128350888   \n",
371
       "57809  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11  128353007  134142530   \n",
372
       "\n",
373
       "       Num_Probes  Segment_Mean  \n",
374
       "57803      4489.0       -0.0113  \n",
375
       "57804         3.0       -1.3344  \n",
376
       "57805     67487.0        0.0010  \n",
377
       "57806       453.0       -1.0306  \n",
378
       "57807       864.0       -0.0031  \n",
379
       "57808        44.0        0.2824  \n",
380
       "57809      3708.0        0.0082  "
381
      ]
382
     },
383
     "execution_count": 8,
384
     "metadata": {},
385
     "output_type": "execute_result"
386
    }
387
   ],
388
   "source": [
389
    "#file_path = \"../../TCGA/CNA/data/gdac.broadinstitute.org_CESC.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015082100.0.0/CESC.snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt\"\n",
390
    "file_path = \"../../TCGA/CNA/data__2016_01_28/gdac.broadinstitute.org_CESC.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2016012800.0.0/CESC.snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt\"\n",
391
    "df = pd.read_csv(file_path, sep = \"\\t\")\n",
392
    "tumor_barcode = \"TCGA-ZJ-AAXJ-01A-11D-A42N-01\"\n",
393
    "t = df.loc[df[\"Sample\"]==tumor_barcode,:]\n",
394
    "t_shape = t.shape[0]\n",
395
    "n = find_matching_normal(tumor_barcode,list(set(df[\"Sample\"].values)))\n",
396
    "n = df.loc[df[\"Sample\"]==n[0],:]\n",
397
    "print(\"segemtns in tumor\",t.shape[0],\"segemtns in normal\",n.shape[0])\n",
398
    "\n",
399
    "n.loc[n['Chromosome']==11,:]"
400
   ]
401
  },
402
  {
403
   "cell_type": "code",
404
   "execution_count": 9,
405
   "metadata": {},
406
   "outputs": [
407
    {
408
     "data": {
409
      "text/html": [
410
       "<div>\n",
411
       "<style scoped>\n",
412
       "    .dataframe tbody tr th:only-of-type {\n",
413
       "        vertical-align: middle;\n",
414
       "    }\n",
415
       "\n",
416
       "    .dataframe tbody tr th {\n",
417
       "        vertical-align: top;\n",
418
       "    }\n",
419
       "\n",
420
       "    .dataframe thead th {\n",
421
       "        text-align: right;\n",
422
       "    }\n",
423
       "</style>\n",
424
       "<table border=\"1\" class=\"dataframe\">\n",
425
       "  <thead>\n",
426
       "    <tr style=\"text-align: right;\">\n",
427
       "      <th></th>\n",
428
       "      <th>Sample</th>\n",
429
       "      <th>Chromosome</th>\n",
430
       "      <th>Start</th>\n",
431
       "      <th>End</th>\n",
432
       "      <th>Num_Probes</th>\n",
433
       "      <th>Segment_Mean</th>\n",
434
       "    </tr>\n",
435
       "  </thead>\n",
436
       "  <tbody>\n",
437
       "    <tr>\n",
438
       "      <th>57960</th>\n",
439
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
440
       "      <td>11</td>\n",
441
       "      <td>456120</td>\n",
442
       "      <td>64200041</td>\n",
443
       "      <td>34710.0</td>\n",
444
       "      <td>0.0054</td>\n",
445
       "    </tr>\n",
446
       "    <tr>\n",
447
       "      <th>57961</th>\n",
448
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
449
       "      <td>11</td>\n",
450
       "      <td>64208988</td>\n",
451
       "      <td>64319750</td>\n",
452
       "      <td>61.0</td>\n",
453
       "      <td>-0.6748</td>\n",
454
       "    </tr>\n",
455
       "    <tr>\n",
456
       "      <th>57962</th>\n",
457
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
458
       "      <td>11</td>\n",
459
       "      <td>64325209</td>\n",
460
       "      <td>126596817</td>\n",
461
       "      <td>37207.0</td>\n",
462
       "      <td>0.0571</td>\n",
463
       "    </tr>\n",
464
       "    <tr>\n",
465
       "      <th>57963</th>\n",
466
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
467
       "      <td>11</td>\n",
468
       "      <td>126596926</td>\n",
469
       "      <td>127130276</td>\n",
470
       "      <td>454.0</td>\n",
471
       "      <td>-1.0760</td>\n",
472
       "    </tr>\n",
473
       "    <tr>\n",
474
       "      <th>57964</th>\n",
475
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
476
       "      <td>11</td>\n",
477
       "      <td>127132920</td>\n",
478
       "      <td>132080656</td>\n",
479
       "      <td>3591.0</td>\n",
480
       "      <td>0.0449</td>\n",
481
       "    </tr>\n",
482
       "    <tr>\n",
483
       "      <th>57965</th>\n",
484
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
485
       "      <td>11</td>\n",
486
       "      <td>132080885</td>\n",
487
       "      <td>132099465</td>\n",
488
       "      <td>15.0</td>\n",
489
       "      <td>-0.6123</td>\n",
490
       "    </tr>\n",
491
       "    <tr>\n",
492
       "      <th>57966</th>\n",
493
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
494
       "      <td>11</td>\n",
495
       "      <td>132099856</td>\n",
496
       "      <td>134142530</td>\n",
497
       "      <td>1010.0</td>\n",
498
       "      <td>0.0483</td>\n",
499
       "    </tr>\n",
500
       "  </tbody>\n",
501
       "</table>\n",
502
       "</div>"
503
      ],
504
      "text/plain": [
505
       "                             Sample  Chromosome      Start        End  \\\n",
506
       "57960  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11     456120   64200041   \n",
507
       "57961  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11   64208988   64319750   \n",
508
       "57962  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11   64325209  126596817   \n",
509
       "57963  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  126596926  127130276   \n",
510
       "57964  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  127132920  132080656   \n",
511
       "57965  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  132080885  132099465   \n",
512
       "57966  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  132099856  134142530   \n",
513
       "\n",
514
       "       Num_Probes  Segment_Mean  \n",
515
       "57960     34710.0        0.0054  \n",
516
       "57961        61.0       -0.6748  \n",
517
       "57962     37207.0        0.0571  \n",
518
       "57963       454.0       -1.0760  \n",
519
       "57964      3591.0        0.0449  \n",
520
       "57965        15.0       -0.6123  \n",
521
       "57966      1010.0        0.0483  "
522
      ]
523
     },
524
     "execution_count": 9,
525
     "metadata": {},
526
     "output_type": "execute_result"
527
    }
528
   ],
529
   "source": [
530
    "t.loc[t[\"Chromosome\"] ==11,:]"
531
   ]
532
  },
533
  {
534
   "cell_type": "code",
535
   "execution_count": 10,
536
   "metadata": {},
537
   "outputs": [
538
    {
539
     "name": "stdout",
540
     "output_type": "stream",
541
     "text": [
542
      "segemtns in normal after dropping low.conf.: 38\n"
543
     ]
544
    },
545
    {
546
     "data": {
547
      "text/html": [
548
       "<div>\n",
549
       "<style scoped>\n",
550
       "    .dataframe tbody tr th:only-of-type {\n",
551
       "        vertical-align: middle;\n",
552
       "    }\n",
553
       "\n",
554
       "    .dataframe tbody tr th {\n",
555
       "        vertical-align: top;\n",
556
       "    }\n",
557
       "\n",
558
       "    .dataframe thead th {\n",
559
       "        text-align: right;\n",
560
       "    }\n",
561
       "</style>\n",
562
       "<table border=\"1\" class=\"dataframe\">\n",
563
       "  <thead>\n",
564
       "    <tr style=\"text-align: right;\">\n",
565
       "      <th></th>\n",
566
       "      <th>Sample</th>\n",
567
       "      <th>Chromosome</th>\n",
568
       "      <th>Start</th>\n",
569
       "      <th>End</th>\n",
570
       "      <th>Num_Probes</th>\n",
571
       "      <th>Segment_Mean</th>\n",
572
       "    </tr>\n",
573
       "  </thead>\n",
574
       "  <tbody>\n",
575
       "    <tr>\n",
576
       "      <th>57804</th>\n",
577
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
578
       "      <td>11</td>\n",
579
       "      <td>8899400</td>\n",
580
       "      <td>8899668</td>\n",
581
       "      <td>3.0</td>\n",
582
       "      <td>-1.3344</td>\n",
583
       "    </tr>\n",
584
       "    <tr>\n",
585
       "      <th>57806</th>\n",
586
       "      <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n",
587
       "      <td>11</td>\n",
588
       "      <td>126596926</td>\n",
589
       "      <td>127130276</td>\n",
590
       "      <td>453.0</td>\n",
591
       "      <td>-1.0306</td>\n",
592
       "    </tr>\n",
593
       "  </tbody>\n",
594
       "</table>\n",
595
       "</div>"
596
      ],
597
      "text/plain": [
598
       "                             Sample  Chromosome      Start        End  \\\n",
599
       "57804  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11    8899400    8899668   \n",
600
       "57806  TCGA-ZJ-AAXJ-10A-01D-A42Q-01          11  126596926  127130276   \n",
601
       "\n",
602
       "       Num_Probes  Segment_Mean  \n",
603
       "57804         3.0       -1.3344  \n",
604
       "57806       453.0       -1.0306  "
605
      ]
606
     },
607
     "execution_count": 10,
608
     "metadata": {},
609
     "output_type": "execute_result"
610
    }
611
   ],
612
   "source": [
613
    "n = filter_lowconf_segments(n,0,0.46, -0.68 )\n",
614
    "print(\"segemtns in normal after dropping low.conf.:\",n.shape[0])\n",
615
    "n.loc[n['Chromosome']==11,:]"
616
   ]
617
  },
618
  {
619
   "cell_type": "code",
620
   "execution_count": 11,
621
   "metadata": {},
622
   "outputs": [
623
    {
624
     "name": "stdout",
625
     "output_type": "stream",
626
     "text": [
627
      "segemtns in tumor after removing germlines: 194\n"
628
     ]
629
    },
630
    {
631
     "data": {
632
      "text/html": [
633
       "<div>\n",
634
       "<style scoped>\n",
635
       "    .dataframe tbody tr th:only-of-type {\n",
636
       "        vertical-align: middle;\n",
637
       "    }\n",
638
       "\n",
639
       "    .dataframe tbody tr th {\n",
640
       "        vertical-align: top;\n",
641
       "    }\n",
642
       "\n",
643
       "    .dataframe thead th {\n",
644
       "        text-align: right;\n",
645
       "    }\n",
646
       "</style>\n",
647
       "<table border=\"1\" class=\"dataframe\">\n",
648
       "  <thead>\n",
649
       "    <tr style=\"text-align: right;\">\n",
650
       "      <th></th>\n",
651
       "      <th>Sample</th>\n",
652
       "      <th>Chromosome</th>\n",
653
       "      <th>Start</th>\n",
654
       "      <th>End</th>\n",
655
       "      <th>Num_Probes</th>\n",
656
       "      <th>Segment_Mean</th>\n",
657
       "    </tr>\n",
658
       "  </thead>\n",
659
       "  <tbody>\n",
660
       "    <tr>\n",
661
       "      <th>96</th>\n",
662
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
663
       "      <td>11</td>\n",
664
       "      <td>456120</td>\n",
665
       "      <td>64200041</td>\n",
666
       "      <td>34710.0</td>\n",
667
       "      <td>0.0054</td>\n",
668
       "    </tr>\n",
669
       "    <tr>\n",
670
       "      <th>97</th>\n",
671
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
672
       "      <td>11</td>\n",
673
       "      <td>64208988</td>\n",
674
       "      <td>64319750</td>\n",
675
       "      <td>61.0</td>\n",
676
       "      <td>-0.6748</td>\n",
677
       "    </tr>\n",
678
       "    <tr>\n",
679
       "      <th>98</th>\n",
680
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
681
       "      <td>11</td>\n",
682
       "      <td>64325209</td>\n",
683
       "      <td>126596817</td>\n",
684
       "      <td>37207.0</td>\n",
685
       "      <td>0.0571</td>\n",
686
       "    </tr>\n",
687
       "    <tr>\n",
688
       "      <th>99</th>\n",
689
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
690
       "      <td>11</td>\n",
691
       "      <td>127132920</td>\n",
692
       "      <td>132080656</td>\n",
693
       "      <td>3591.0</td>\n",
694
       "      <td>0.0449</td>\n",
695
       "    </tr>\n",
696
       "    <tr>\n",
697
       "      <th>100</th>\n",
698
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
699
       "      <td>11</td>\n",
700
       "      <td>132080885</td>\n",
701
       "      <td>132099465</td>\n",
702
       "      <td>15.0</td>\n",
703
       "      <td>-0.6123</td>\n",
704
       "    </tr>\n",
705
       "    <tr>\n",
706
       "      <th>101</th>\n",
707
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
708
       "      <td>11</td>\n",
709
       "      <td>132099856</td>\n",
710
       "      <td>134142530</td>\n",
711
       "      <td>1010.0</td>\n",
712
       "      <td>0.0483</td>\n",
713
       "    </tr>\n",
714
       "  </tbody>\n",
715
       "</table>\n",
716
       "</div>"
717
      ],
718
      "text/plain": [
719
       "                           Sample  Chromosome      Start        End  \\\n",
720
       "96   TCGA-ZJ-AAXJ-01A-11D-A42N-01          11     456120   64200041   \n",
721
       "97   TCGA-ZJ-AAXJ-01A-11D-A42N-01          11   64208988   64319750   \n",
722
       "98   TCGA-ZJ-AAXJ-01A-11D-A42N-01          11   64325209  126596817   \n",
723
       "99   TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  127132920  132080656   \n",
724
       "100  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  132080885  132099465   \n",
725
       "101  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  132099856  134142530   \n",
726
       "\n",
727
       "     Num_Probes  Segment_Mean  \n",
728
       "96      34710.0        0.0054  \n",
729
       "97         61.0       -0.6748  \n",
730
       "98      37207.0        0.0571  \n",
731
       "99       3591.0        0.0449  \n",
732
       "100        15.0       -0.6123  \n",
733
       "101      1010.0        0.0483  "
734
      ]
735
     },
736
     "execution_count": 11,
737
     "metadata": {},
738
     "output_type": "execute_result"
739
    }
740
   ],
741
   "source": [
742
    "\n",
743
    "t = remove_ovelapping_segments(t, n,tumor_barcode)\n",
744
    "print(\"segemtns in tumor after removing germlines:\",t.shape[0])\n",
745
    "t.loc[t[\"Chromosome\"] ==11,:]"
746
   ]
747
  },
748
  {
749
   "cell_type": "code",
750
   "execution_count": 12,
751
   "metadata": {},
752
   "outputs": [
753
    {
754
     "name": "stdout",
755
     "output_type": "stream",
756
     "text": [
757
      "segemtns in tumor after dropping low.conf.: 101\n"
758
     ]
759
    },
760
    {
761
     "data": {
762
      "text/html": [
763
       "<div>\n",
764
       "<style scoped>\n",
765
       "    .dataframe tbody tr th:only-of-type {\n",
766
       "        vertical-align: middle;\n",
767
       "    }\n",
768
       "\n",
769
       "    .dataframe tbody tr th {\n",
770
       "        vertical-align: top;\n",
771
       "    }\n",
772
       "\n",
773
       "    .dataframe thead th {\n",
774
       "        text-align: right;\n",
775
       "    }\n",
776
       "</style>\n",
777
       "<table border=\"1\" class=\"dataframe\">\n",
778
       "  <thead>\n",
779
       "    <tr style=\"text-align: right;\">\n",
780
       "      <th></th>\n",
781
       "      <th>Sample</th>\n",
782
       "      <th>Chromosome</th>\n",
783
       "      <th>Start</th>\n",
784
       "      <th>End</th>\n",
785
       "      <th>Num_Probes</th>\n",
786
       "      <th>Segment_Mean</th>\n",
787
       "    </tr>\n",
788
       "  </thead>\n",
789
       "  <tbody>\n",
790
       "    <tr>\n",
791
       "      <th>97</th>\n",
792
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
793
       "      <td>11</td>\n",
794
       "      <td>64208988</td>\n",
795
       "      <td>64319750</td>\n",
796
       "      <td>61.0</td>\n",
797
       "      <td>-0.6748</td>\n",
798
       "    </tr>\n",
799
       "    <tr>\n",
800
       "      <th>100</th>\n",
801
       "      <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n",
802
       "      <td>11</td>\n",
803
       "      <td>132080885</td>\n",
804
       "      <td>132099465</td>\n",
805
       "      <td>15.0</td>\n",
806
       "      <td>-0.6123</td>\n",
807
       "    </tr>\n",
808
       "  </tbody>\n",
809
       "</table>\n",
810
       "</div>"
811
      ],
812
      "text/plain": [
813
       "                           Sample  Chromosome      Start        End  \\\n",
814
       "97   TCGA-ZJ-AAXJ-01A-11D-A42N-01          11   64208988   64319750   \n",
815
       "100  TCGA-ZJ-AAXJ-01A-11D-A42N-01          11  132080885  132099465   \n",
816
       "\n",
817
       "     Num_Probes  Segment_Mean  \n",
818
       "97         61.0       -0.6748  \n",
819
       "100        15.0       -0.6123  "
820
      ]
821
     },
822
     "execution_count": 12,
823
     "metadata": {},
824
     "output_type": "execute_result"
825
    }
826
   ],
827
   "source": [
828
    "# Apply the low-confidence filter with the notebook-wide thresholds (defined outside this cell),\n",
    "# then show what is left on chromosome 11\n",
    "t = filter_lowconf_segments(t, num_marker_thr, pos_seg_mean_thr, neg_seg_mean_thr)\n",
    "print(\"segemtns in tumor after dropping low.conf.:\", len(t))\n",
    "t[t[\"Chromosome\"] == 11]"
831
   ]
832
  },
833
  {
834
   "cell_type": "markdown",
835
   "metadata": {},
836
   "source": [
837
    "# TCGA "
838
   ]
839
  },
840
  {
841
   "cell_type": "code",
842
   "execution_count": 14,
843
   "metadata": {
844
    "scrolled": true
845
   },
846
   "outputs": [
847
    {
848
     "name": "stdout",
849
     "output_type": "stream",
850
     "text": [
851
      "HNSC samples: 1089 CNA events per sample on avg.: 101.275482094\n"
852
     ]
853
    },
854
    {
855
     "name": "stderr",
856
     "output_type": "stream",
857
     "text": [
858
      "total samples: 1089 tumors: 530 normals: 559\n",
859
      "\ttumors without matched normal 28\n",
860
      "\ttumors with at least one sCNA 497\n",
861
      "\ttumors without any somatic CNA 5\n"
862
     ]
863
    },
864
    {
865
     "name": "stdout",
866
     "output_type": "stream",
867
     "text": [
868
      "after filtering\n",
869
      "HNSC samples: 525 Segments per sample on avg.: 60.6876190476\n",
870
      "ESCA samples: 373 CNA events per sample on avg.: 163.010723861\n"
871
     ]
872
    },
873
    {
874
     "name": "stderr",
875
     "output_type": "stream",
876
     "text": [
877
      "total samples: 373 tumors: 185 normals: 188\n",
878
      "\ttumors without matched normal 3\n",
879
      "\ttumors with at least one sCNA 181\n",
880
      "\ttumors without any somatic CNA 1\n",
881
      "total samples: 248 tumors: 125 normals: 123\n"
882
     ]
883
    },
884
    {
885
     "name": "stdout",
886
     "output_type": "stream",
887
     "text": [
888
      "after filtering\n",
889
      "ESCA samples: 184 Segments per sample on avg.: 141.836956522\n",
890
      "THYM samples: 248 CNA events per sample on avg.: 62.7862903226\n"
891
     ]
892
    },
893
    {
894
     "name": "stderr",
895
     "output_type": "stream",
896
     "text": [
897
      "\ttumors without matched normal 5\n",
898
      "\ttumors with at least one sCNA 95\n",
899
      "\ttumors without any somatic CNA 25\n",
900
      "total samples: 132 tumors: 66 normals: 66\n"
901
     ]
902
    },
903
    {
904
     "name": "stdout",
905
     "output_type": "stream",
906
     "text": [
907
      "after filtering\n",
908
      "THYM samples: 100 Segments per sample on avg.: 9.41\n",
909
      "KICH samples: 132 CNA events per sample on avg.: 77.0\n"
910
     ]
911
    },
912
    {
913
     "name": "stderr",
914
     "output_type": "stream",
915
     "text": [
916
      "\ttumors without matched normal 0\n",
917
      "\ttumors with at least one sCNA 65\n",
918
      "\ttumors without any somatic CNA 1\n"
919
     ]
920
    },
921
    {
922
     "name": "stdout",
923
     "output_type": "stream",
924
     "text": [
925
      "after filtering\n",
926
      "KICH samples: 65 Segments per sample on avg.: 51.4923076923\n",
927
      "LUSC samples: 1032 CNA events per sample on avg.: 130.682170543\n"
928
     ]
929
    },
930
    {
931
     "name": "stderr",
932
     "output_type": "stream",
933
     "text": [
934
      "total samples: 1032 tumors: 501 normals: 531\n",
935
      "\ttumors without matched normal 23\n",
936
      "\ttumors with at least one sCNA 476\n",
937
      "\ttumors without any somatic CNA 2\n"
938
     ]
939
    },
940
    {
941
     "name": "stdout",
942
     "output_type": "stream",
943
     "text": [
944
      "after filtering\n",
945
      "LUSC samples: 499 Segments per sample on avg.: 94.6533066132\n",
946
      "BLCA samples: 797 CNA events per sample on avg.: 130.927227102\n"
947
     ]
948
    },
949
    {
950
     "name": "stderr",
951
     "output_type": "stream",
952
     "text": [
953
      "total samples: 797 tumors: 414 normals: 383\n",
954
      "\ttumors without matched normal 46\n",
955
      "\ttumors with at least one sCNA 366\n",
956
      "\ttumors without any somatic CNA 2\n"
957
     ]
958
    },
959
    {
960
     "name": "stdout",
961
     "output_type": "stream",
962
     "text": [
963
      "after filtering\n",
964
      "BLCA samples: 412 Segments per sample on avg.: 94.8859223301\n",
965
      "GBM samples: 1104 CNA events per sample on avg.: 133.018115942\n"
966
     ]
967
    },
968
    {
969
     "name": "stderr",
970
     "output_type": "stream",
971
     "text": [
972
      "total samples: 1104 tumors: 590 normals: 514\n",
973
      "\ttumors without matched normal 78\n",
974
      "\ttumors with at least one sCNA 511\n",
975
      "\ttumors without any somatic CNA 1\n",
976
      "total samples: 85 tumors: 36 normals: 49\n"
977
     ]
978
    },
979
    {
980
     "name": "stdout",
981
     "output_type": "stream",
982
     "text": [
983
      "after filtering\n",
984
      "GBM samples: 589 Segments per sample on avg.: 70.2139219015\n",
985
      "CHOL samples: 85 CNA events per sample on avg.: 89.0588235294\n"
986
     ]
987
    },
988
    {
989
     "name": "stderr",
990
     "output_type": "stream",
991
     "text": [
992
      "\ttumors without matched normal 0\n",
993
      "\ttumors with at least one sCNA 36\n",
994
      "\ttumors without any somatic CNA 0\n",
995
      "total samples: 111 tumors: 56 normals: 55\n"
996
     ]
997
    },
998
    {
999
     "name": "stdout",
1000
     "output_type": "stream",
1001
     "text": [
1002
      "after filtering\n",
1003
      "CHOL samples: 36 Segments per sample on avg.: 56.6944444444\n",
1004
      "UCS samples: 111 CNA events per sample on avg.: 173.855855856\n"
1005
     ]
1006
    },
1007
    {
1008
     "name": "stderr",
1009
     "output_type": "stream",
1010
     "text": [
1011
      "\ttumors without matched normal 2\n",
1012
      "\ttumors with at least one sCNA 54\n",
1013
      "\ttumors without any somatic CNA 0\n"
1014
     ]
1015
    },
1016
    {
1017
     "name": "stdout",
1018
     "output_type": "stream",
1019
     "text": [
1020
      "after filtering\n",
1021
      "UCS samples: 56 Segments per sample on avg.: 179.125\n",
1022
      "LGG samples: 1015 CNA events per sample on avg.: 78.6118226601\n"
1023
     ]
1024
    },
1025
    {
1026
     "name": "stderr",
1027
     "output_type": "stream",
1028
     "text": [
1029
      "total samples: 1015 tumors: 530 normals: 485\n",
1030
      "\ttumors without matched normal 33\n",
1031
      "\ttumors with at least one sCNA 494\n",
1032
      "\ttumors without any somatic CNA 3\n"
1033
     ]
1034
    },
1035
    {
1036
     "name": "stdout",
1037
     "output_type": "stream",
1038
     "text": [
1039
      "after filtering\n",
1040
      "LGG samples: 527 Segments per sample on avg.: 29.1157495256\n",
1041
      "THCA samples: 1013 CNA events per sample on avg.: 54.4096742349\n"
1042
     ]
1043
    },
1044
    {
1045
     "name": "stderr",
1046
     "output_type": "stream",
1047
     "text": [
1048
      "total samples: 1013 tumors: 506 normals: 507\n",
1049
      "\ttumors without matched normal 15\n",
1050
      "\ttumors with at least one sCNA 367\n",
1051
      "\ttumors without any somatic CNA 124\n",
1052
      "total samples: 365 tumors: 185 normals: 180\n"
1053
     ]
1054
    },
1055
    {
1056
     "name": "stdout",
1057
     "output_type": "stream",
1058
     "text": [
1059
      "after filtering\n",
1060
      "THCA samples: 382 Segments per sample on avg.: 3.8219895288\n",
1061
      "PAAD samples: 365 CNA events per sample on avg.: 95.3643835616\n"
1062
     ]
1063
    },
1064
    {
1065
     "name": "stderr",
1066
     "output_type": "stream",
1067
     "text": [
1068
      "\ttumors without matched normal 10\n",
1069
      "\ttumors with at least one sCNA 161\n",
1070
      "\ttumors without any somatic CNA 14\n",
1071
      "total samples: 1059 tumors: 529 normals: 530\n"
1072
     ]
1073
    },
1074
    {
1075
     "name": "stdout",
1076
     "output_type": "stream",
1077
     "text": [
1078
      "after filtering\n",
1079
      "PAAD samples: 171 Segments per sample on avg.: 32.4093567251\n",
1080
      "KIRC samples: 1059 CNA events per sample on avg.: 80.298394712\n"
1081
     ]
1082
    },
1083
    {
1084
     "name": "stderr",
1085
     "output_type": "stream",
1086
     "text": [
1087
      "\ttumors without matched normal 22\n",
1088
      "\ttumors with at least one sCNA 501\n",
1089
      "\ttumors without any somatic CNA 6\n",
1090
      "total samples: 160 tumors: 80 normals: 80\n"
1091
     ]
1092
    },
1093
    {
1094
     "name": "stdout",
1095
     "output_type": "stream",
1096
     "text": [
1097
      "after filtering\n",
1098
      "KIRC samples: 523 Segments per sample on avg.: 20.5009560229\n",
1099
      "UVM samples: 160 CNA events per sample on avg.: 81.08125\n"
1100
     ]
1101
    },
1102
    {
1103
     "name": "stderr",
1104
     "output_type": "stream",
1105
     "text": [
1106
      "\ttumors without matched normal 0\n",
1107
      "\ttumors with at least one sCNA 80\n",
1108
      "\ttumors without any somatic CNA 0\n",
1109
      "total samples: 586 tumors: 297 normals: 289\n"
1110
     ]
1111
    },
1112
    {
1113
     "name": "stdout",
1114
     "output_type": "stream",
1115
     "text": [
1116
      "after filtering\n",
1117
      "UVM samples: 80 Segments per sample on avg.: 38.425\n",
1118
      "CESC samples: 586 CNA events per sample on avg.: 101.450511945\n"
1119
     ]
1120
    },
1121
    {
1122
     "name": "stderr",
1123
     "output_type": "stream",
1124
     "text": [
1125
      "\ttumors without matched normal 16\n",
1126
      "\ttumors with at least one sCNA 280\n",
1127
      "\ttumors without any somatic CNA 1\n"
1128
     ]
1129
    },
1130
    {
1131
     "name": "stdout",
1132
     "output_type": "stream",
1133
     "text": [
1134
      "after filtering\n",
1135
      "CESC samples: 296 Segments per sample on avg.: 58.1351351351\n",
1136
      "LUAD samples: 1095 CNA events per sample on avg.: 105.78630137\n"
1137
     ]
1138
    },
1139
    {
1140
     "name": "stderr",
1141
     "output_type": "stream",
1142
     "text": [
1143
      "total samples: 1095 tumors: 518 normals: 577\n",
1144
      "\ttumors without matched normal 19\n",
1145
      "\ttumors with at least one sCNA 494\n",
1146
      "\ttumors without any somatic CNA 5\n"
1147
     ]
1148
    },
1149
    {
1150
     "name": "stdout",
1151
     "output_type": "stream",
1152
     "text": [
1153
      "after filtering\n",
1154
      "LUAD samples: 513 Segments per sample on avg.: 70.469785575\n",
1155
      "STAD samples: 904 CNA events per sample on avg.: 130.961283186\n"
1156
     ]
1157
    },
1158
    {
1159
     "name": "stderr",
1160
     "output_type": "stream",
1161
     "text": [
1162
      "total samples: 904 tumors: 442 normals: 462\n",
1163
      "\ttumors without matched normal 26\n",
1164
      "\ttumors with at least one sCNA 410\n",
1165
      "\ttumors without any somatic CNA 6\n"
1166
     ]
1167
    },
1168
    {
1169
     "name": "stdout",
1170
     "output_type": "stream",
1171
     "text": [
1172
      "after filtering\n",
1173
      "STAD samples: 436 Segments per sample on avg.: 96.4220183486\n",
1174
      "UCEC samples: 1089 CNA events per sample on avg.: 116.707070707\n"
1175
     ]
1176
    },
1177
    {
1178
     "name": "stderr",
1179
     "output_type": "stream",
1180
     "text": [
1181
      "total samples: 1089 tumors: 540 normals: 549\n",
1182
      "\ttumors without matched normal 23\n",
1183
      "\ttumors with at least one sCNA 504\n",
1184
      "\ttumors without any somatic CNA 13\n"
1185
     ]
1186
    },
1187
    {
1188
     "name": "stdout",
1189
     "output_type": "stream",
1190
     "text": [
1191
      "after filtering\n",
1192
      "UCEC samples: 527 Segments per sample on avg.: 78.89943074\n",
1193
      "SKCM samples: 937 CNA events per sample on avg.: 115.351120598\n"
1194
     ]
1195
    },
1196
    {
1197
     "name": "stderr",
1198
     "output_type": "stream",
1199
     "text": [
1200
      "total samples: 937 tumors: 472 normals: 465\n",
1201
      "\ttumors without matched normal 7\n",
1202
      "\ttumors with at least one sCNA 463\n",
1203
      "\ttumors without any somatic CNA 2\n",
1204
      "total samples: 172 tumors: 87 normals: 85\n"
1205
     ]
1206
    },
1207
    {
1208
     "name": "stdout",
1209
     "output_type": "stream",
1210
     "text": [
1211
      "after filtering\n",
1212
      "SKCM samples: 470 Segments per sample on avg.: 82.9957446809\n",
1213
      "MESO samples: 172 CNA events per sample on avg.: 106.598837209\n"
1214
     ]
1215
    },
1216
    {
1217
     "name": "stderr",
1218
     "output_type": "stream",
1219
     "text": [
1220
      "\ttumors without matched normal 2\n",
1221
      "\ttumors with at least one sCNA 82\n",
1222
      "\ttumors without any somatic CNA 3\n",
1223
      "total samples: 346 tumors: 168 normals: 178\n"
1224
     ]
1225
    },
1226
    {
1227
     "name": "stdout",
1228
     "output_type": "stream",
1229
     "text": [
1230
      "after filtering\n",
1231
      "MESO samples: 84 Segments per sample on avg.: 60.8333333333\n",
1232
      "PCPG samples: 346 CNA events per sample on avg.: 90.3352601156\n"
1233
     ]
1234
    },
1235
    {
1236
     "name": "stderr",
1237
     "output_type": "stream",
1238
     "text": [
1239
      "\ttumors without matched normal 6\n",
1240
      "\ttumors with at least one sCNA 159\n",
1241
      "\ttumors without any somatic CNA 3\n"
1242
     ]
1243
    },
1244
    {
1245
     "name": "stdout",
1246
     "output_type": "stream",
1247
     "text": [
1248
      "after filtering\n",
1249
      "PCPG samples: 165 Segments per sample on avg.: 43.5878787879\n",
1250
      "STES samples: 1277 CNA events per sample on avg.: 140.322631167\n"
1251
     ]
1252
    },
1253
    {
1254
     "name": "stderr",
1255
     "output_type": "stream",
1256
     "text": [
1257
      "total samples: 1277 tumors: 627 normals: 650\n",
1258
      "\ttumors without matched normal 29\n",
1259
      "\ttumors with at least one sCNA 591\n",
1260
      "\ttumors without any somatic CNA 7\n"
1261
     ]
1262
    },
1263
    {
1264
     "name": "stdout",
1265
     "output_type": "stream",
1266
     "text": [
1267
      "after filtering\n",
1268
      "STES samples: 620 Segments per sample on avg.: 109.9\n",
1269
      "SARC samples: 513 CNA events per sample on avg.: 208.068226121\n"
1270
     ]
1271
    },
1272
    {
1273
     "name": "stderr",
1274
     "output_type": "stream",
1275
     "text": [
1276
      "total samples: 513 tumors: 263 normals: 250\n",
1277
      "\ttumors without matched normal 17\n",
1278
      "\ttumors with at least one sCNA 245\n",
1279
      "\ttumors without any somatic CNA 1\n",
1280
      "total samples: 380 tumors: 191 normals: 189\n"
1281
     ]
1282
    },
1283
    {
1284
     "name": "stdout",
1285
     "output_type": "stream",
1286
     "text": [
1287
      "after filtering\n",
1288
      "SARC samples: 262 Segments per sample on avg.: 187.057251908\n",
1289
      "LAML samples: 380 CNA events per sample on avg.: 74.5368421053\n"
1290
     ]
1291
    },
1292
    {
1293
     "name": "stderr",
1294
     "output_type": "stream",
1295
     "text": [
1296
      "\ttumors without matched normal 3\n",
1297
      "\ttumors with at least one sCNA 167\n",
1298
      "\ttumors without any somatic CNA 21\n",
1299
      "total samples: 590 tumors: 288 normals: 302\n"
1300
     ]
1301
    },
1302
    {
1303
     "name": "stdout",
1304
     "output_type": "stream",
1305
     "text": [
1306
      "after filtering\n",
1307
      "LAML samples: 170 Segments per sample on avg.: 7.18823529412\n",
1308
      "KIRP samples: 590 CNA events per sample on avg.: 79.5152542373\n"
1309
     ]
1310
    },
1311
    {
1312
     "name": "stderr",
1313
     "output_type": "stream",
1314
     "text": [
1315
      "\ttumors without matched normal 15\n",
1316
      "\ttumors with at least one sCNA 271\n",
1317
      "\ttumors without any somatic CNA 2\n"
1318
     ]
1319
    },
1320
    {
1321
     "name": "stdout",
1322
     "output_type": "stream",
1323
     "text": [
1324
      "after filtering\n",
1325
      "KIRP samples: 286 Segments per sample on avg.: 21.8846153846\n",
1326
      "LIHC samples: 760 CNA events per sample on avg.: 122.8\n"
1327
     ]
1328
    },
1329
    {
1330
     "name": "stderr",
1331
     "output_type": "stream",
1332
     "text": [
1333
      "total samples: 760 tumors: 373 normals: 387\n",
1334
      "\ttumors without matched normal 21\n",
1335
      "\ttumors with at least one sCNA 348\n",
1336
      "\ttumors without any somatic CNA 4\n"
1337
     ]
1338
    },
1339
    {
1340
     "name": "stdout",
1341
     "output_type": "stream",
1342
     "text": [
1343
      "after filtering\n",
1344
      "LIHC samples: 369 Segments per sample on avg.: 81.1327913279\n",
1345
      "OV samples: 1168 CNA events per sample on avg.: 224.04109589\n"
1346
     ]
1347
    },
1348
    {
1349
     "name": "stderr",
1350
     "output_type": "stream",
1351
     "text": [
1352
      "total samples: 1168 tumors: 597 normals: 571\n",
1353
      "\ttumors without matched normal 26\n",
1354
      "\ttumors with at least one sCNA 571\n",
1355
      "\ttumors without any somatic CNA 0\n"
1356
     ]
1357
    },
1358
    {
1359
     "name": "stdout",
1360
     "output_type": "stream",
1361
     "text": [
1362
      "after filtering\n",
1363
      "OV samples: 597 Segments per sample on avg.: 207.924623116\n",
1364
      "TGCT samples: 304 CNA events per sample on avg.: 83.8125\n"
1365
     ]
1366
    },
1367
    {
1368
     "name": "stderr",
1369
     "output_type": "stream",
1370
     "text": [
1371
      "total samples: 304 tumors: 156 normals: 148\n",
1372
      "\ttumors without matched normal 2\n",
1373
      "\ttumors with at least one sCNA 154\n",
1374
      "\ttumors without any somatic CNA 0\n"
1375
     ]
1376
    },
1377
    {
1378
     "name": "stdout",
1379
     "output_type": "stream",
1380
     "text": [
1381
      "after filtering\n",
1382
      "TGCT samples: 156 Segments per sample on avg.: 37.7820512821\n",
1383
      "COAD samples: 918 CNA events per sample on avg.: 98.6209150327\n"
1384
     ]
1385
    },
1386
    {
1387
     "name": "stderr",
1388
     "output_type": "stream",
1389
     "text": [
1390
      "total samples: 918 tumors: 453 normals: 465\n",
1391
      "\ttumors without matched normal 44\n",
1392
      "\ttumors with at least one sCNA 406\n",
1393
      "\ttumors without any somatic CNA 3\n"
1394
     ]
1395
    },
1396
    {
1397
     "name": "stdout",
1398
     "output_type": "stream",
1399
     "text": [
1400
      "after filtering\n",
1401
      "COAD samples: 450 Segments per sample on avg.: 48.4755555556\n",
1402
      "BRCA samples: 2199 CNA events per sample on avg.: 129.35788995\n"
1403
     ]
1404
    },
1405
    {
1406
     "name": "stderr",
1407
     "output_type": "stream",
1408
     "text": [
1409
      "total samples: 2199 tumors: 1088 normals: 1111\n",
1410
      "\ttumors without matched normal 35\n",
1411
      "\ttumors with at least one sCNA 1046\n",
1412
      "\ttumors without any somatic CNA 7\n"
1413
     ]
1414
    },
1415
    {
1416
     "name": "stdout",
1417
     "output_type": "stream",
1418
     "text": [
1419
      "after filtering\n",
1420
      "BRCA samples: 1081 Segments per sample on avg.: 102.808510638\n",
1421
      "PRAD samples: 1023 CNA events per sample on avg.: 114.706744868\n"
1422
     ]
1423
    },
1424
    {
1425
     "name": "stderr",
1426
     "output_type": "stream",
1427
     "text": [
1428
      "total samples: 1023 tumors: 493 normals: 530\n",
1429
      "\ttumors without matched normal 17\n",
1430
      "\ttumors with at least one sCNA 458\n",
1431
      "\ttumors without any somatic CNA 18\n",
1432
      "total samples: 96 tumors: 52 normals: 44\n"
1433
     ]
1434
    },
1435
    {
1436
     "name": "stdout",
1437
     "output_type": "stream",
1438
     "text": [
1439
      "after filtering\n",
1440
      "PRAD samples: 475 Segments per sample on avg.: 60.3831578947\n",
1441
      "DLBC samples: 96 CNA events per sample on avg.: 97.3229166667\n"
1442
     ]
1443
    },
1444
    {
1445
     "name": "stderr",
1446
     "output_type": "stream",
1447
     "text": [
1448
      "\ttumors without matched normal 10\n",
1449
      "\ttumors with at least one sCNA 40\n",
1450
      "\ttumors without any somatic CNA 2\n"
1451
     ]
1452
    },
1453
    {
1454
     "name": "stdout",
1455
     "output_type": "stream",
1456
     "text": [
1457
      "after filtering\n",
1458
      "DLBC samples: 50 Segments per sample on avg.: 44.44\n",
1459
      "READ samples: 316 CNA events per sample on avg.: 113.180379747\n"
1460
     ]
1461
    },
1462
    {
1463
     "name": "stderr",
1464
     "output_type": "stream",
1465
     "text": [
1466
      "total samples: 316 tumors: 166 normals: 150\n",
1467
      "\ttumors without matched normal 23\n",
1468
      "\ttumors with at least one sCNA 141\n",
1469
      "\ttumors without any somatic CNA 2\n"
1470
     ]
1471
    },
1472
    {
1473
     "name": "stdout",
1474
     "output_type": "stream",
1475
     "text": [
1476
      "after filtering\n",
1477
      "READ samples: 164 Segments per sample on avg.: 70.012195122\n",
1478
      "ACC samples: 180 CNA events per sample on avg.: 116.955555556\n"
1479
     ]
1480
    },
1481
    {
1482
     "name": "stderr",
1483
     "output_type": "stream",
1484
     "text": [
1485
      "total samples: 180 tumors: 90 normals: 90\n"
1486
     ]
1487
    },
1488
    {
1489
     "name": "stdout",
1490
     "output_type": "stream",
1491
     "text": [
1492
      "after filtering\n",
1493
      "ACC samples: 89 Segments per sample on avg.: 107.449438202\n"
1494
     ]
1495
    },
1496
    {
1497
     "name": "stderr",
1498
     "output_type": "stream",
1499
     "text": [
1500
      "\ttumors without matched normal 2\n",
1501
      "\ttumors with at least one sCNA 87\n",
1502
      "\ttumors without any somatic CNA 1\n"
1503
     ]
1504
    }
1505
   ],
1506
   "source": [
1507
    "# Filter the TCGA segmentation files cohort by cohort. For every *.tar.gz archive name in data_dir\n",
    "# (the .seg.txt is read from the folder of the same name, so archives must already be unpacked):\n",
    "#   1) remove tumor segments overlapping segments of the matching normal sample(s),\n",
    "#   2) drop low-confidence tumor segments (num_marker_thr, pos_seg_mean_thr, neg_seg_mean_thr).\n",
    "# Per-cohort results are stored in:\n",
    "#   dfs                - filtered tumor segments\n",
    "#   dfs_normals        - filtered normal segments that were used for germline removal\n",
    "#   tumors_without_CNA - barcodes of tumors left without any segment\n",
    "data_dir = \"../../TCGA/CNA/data__2016_01_28//\"\n",
    "seg_columns = [\"Sample\",\"Chromosome\",\"Start\",\"End\",\"Num_Probes\",\"Segment_Mean\"]\n",
    "\n",
    "dfs = {}\n",
    "dfs_normals = {}\n",
    "tumors_without_CNA = {}\n",
    "# os.listdir returns names in arbitrary order; sorting makes the run (and its log) reproducible\n",
    "for f in sorted(os.listdir(data_dir)):\n",
    "    if not f.endswith(\"tar.gz\"):\n",
    "        continue\n",
    "    # derive the cohort label and the path of the segmentation table from the archive name\n",
    "    fp = f.replace(\".tar.gz\",\"\")\n",
    "    name_parts = fp.split(\".\")\n",
    "    cohort = name_parts[2].replace(\"org_\",\"\")\n",
    "    file_path = fp+\"/\"+cohort+\".\"+name_parts[3].replace(\"Merge_\",\"\")+\".seg.txt\"\n",
    "    df = pd.read_csv(data_dir+file_path, sep = \"\\t\")\n",
    "\n",
    "    # chr_dict (first cell) has keys 1..23 with 22 -> 'X' and 23 -> 'Y'; any other code becomes NaN here.\n",
    "    # NOTE(review): human autosomes are 1-22 - confirm that the seg files really encode X/Y as 22/23.\n",
    "    df[\"Chromosome\"] = df[\"Chromosome\"].map(chr_dict)\n",
    "    n_samples = len(set(df[\"Sample\"].values))\n",
    "    # max(..., 1) avoids ZeroDivisionError on an empty table\n",
    "    print(cohort,\"samples:\",n_samples,\n",
    "          \"CNA events per sample on avg.:\",float(df.shape[0])/max(n_samples,1))\n",
    "\n",
    "    #### remove segments overlapping with segments in normals by 80% or more reciprocally ####\n",
    "    df[\"type\"] = df[\"Sample\"].apply(sample_type)\n",
    "    df_normals = df.loc[df[\"type\"]== \"Normal\",seg_columns]\n",
    "    df_tumors = df.loc[df[\"type\"]== \"Tumor\",seg_columns]\n",
    "    # sorted for a reproducible processing order (and row order of the concatenated results)\n",
    "    normal_samples = sorted(set(df_normals[\"Sample\"].values))\n",
    "    tumor_samples = sorted(set(df_tumors[\"Sample\"].values))\n",
    "    print(\"total samples:\", n_samples,\n",
    "          \"tumors:\",len(tumor_samples),\"normals:\",len(normal_samples),file= sys.stderr)\n",
    "\n",
    "    tumors_without_somatic_CNA = []\n",
    "    tumors_germline_removed = []\n",
    "    tumors_without_matching_normal = []\n",
    "    normal_frames = []\n",
    "    for tumor_sample in tumor_samples:\n",
    "        tumor = df_tumors.loc[df_tumors[\"Sample\"]== tumor_sample,:]\n",
    "        matching_normals = find_matching_normal(tumor_sample,normal_samples)\n",
    "        # no-op when the tumor has no matching normal\n",
    "        for normal_sample in matching_normals:\n",
    "            normal = df_normals.loc[df_normals[\"Sample\"]== normal_sample,:]\n",
    "            # thresholds for +1 and -1 copy in 75% of normal cells;\n",
    "            # this is to retain segments appeared due to slight tumor contamination\n",
    "            normal = filter_lowconf_segments(normal,0,0.46, -0.68 )\n",
    "            # NOTE(review): a normal matched by several tumors is appended once per tumor,\n",
    "            # so dfs_normals may contain repeated rows - kept as in the original\n",
    "            normal_frames.append(normal)\n",
    "            tumor = remove_ovelapping_segments(tumor, normal,tumor_sample)\n",
    "        tumor = filter_lowconf_segments(tumor,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n",
    "        if tumor.shape[0] == 0:\n",
    "            tumors_without_somatic_CNA.append(tumor_sample)\n",
    "        elif len(matching_normals) > 0:\n",
    "            tumors_germline_removed.append(tumor)\n",
    "        else:\n",
    "            tumors_without_matching_normal.append(tumor)\n",
    "\n",
    "    # the first two counters only include tumors that kept at least one segment;\n",
    "    # tumors left with no segments are counted in the third one, matched or not\n",
    "    print(\"\\ttumors without matched normal\",len(tumors_without_matching_normal),file= sys.stderr)\n",
    "    print(\"\\ttumors with at least one sCNA\",len(tumors_germline_removed),file= sys.stderr)\n",
    "    print(\"\\ttumors without any somatic CNA\",len(tumors_without_somatic_CNA),file= sys.stderr)\n",
    "\n",
    "    # pd.concat raises ValueError on an empty list, so fall back to an empty frame with the same columns\n",
    "    tumor_frames = tumors_germline_removed+tumors_without_matching_normal\n",
    "    filtered_tumors = pd.concat(tumor_frames) if tumor_frames else df_tumors.iloc[0:0]\n",
    "    dfs[cohort] = filtered_tumors\n",
    "    filtered_normals = pd.concat(normal_frames) if normal_frames else df_normals.iloc[0:0]\n",
    "    dfs_normals[cohort] = filtered_normals\n",
    "    tumors_without_CNA[cohort] = tumors_without_somatic_CNA\n",
    "\n",
    "    n_kept = len(set(filtered_tumors[\"Sample\"].values))\n",
    "    print(\"after filtering\")\n",
    "    print(cohort,\"samples:\",n_kept,\n",
    "          \"Segments per sample on avg.:\",float(filtered_tumors.shape[0])/max(n_kept,1))"
1579
   ]
1580
  },
1581
  {
1582
   "cell_type": "code",
1583
   "execution_count": null,
1584
   "metadata": {},
1585
   "outputs": [],
1586
   "source": []
1587
  },
1588
  {
1589
   "cell_type": "markdown",
1590
   "metadata": {},
1591
   "source": [
1592
    "### Aggregating to gene-level\n",
1593
    "\n",
1594
    "Gene annotation must be:\n",
1595
    " - with Entrez gene IDs \n",
1596
    " - in hg19 coordinates\n",
1597
    " - with columns \"chrom\",\"start\",\"stop\",\"gene\" (a four-column BED-like format; the commands below also add a fifth \"name\" column)\n",
1598
    " \n",
1599
    "wget ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.37.3/GFF/ref_GRCh37.p5_top_level.gff3.gz\n",
1600
    "\n",
1601
    "echo -e \"chrom\\tstart\\tstop\\tgene\\tname\"  > ref_GRCh37.p5_top_level.gff3.bed;\n",
1602
    "zcat ref_GRCh37.p5_top_level.gff3.gz  | awk '$3==\"gene\"' | cut -f 1,4,5,9| sed -e 's/;/\\t/g'| cut -f 1-3,5,6 | grep  GeneID |  sed -re 's/(Dbxref=GeneID:[0-9]*),.*/\\1/' | sed -e 's/Name=//' -e 's/Dbxref=GeneID://' | awk '{print $1\"\\t\"$2\"\\t\"$3\"\\t\"$5\"\\t\"$4}'  >> ref_GRCh37.p5_top_level.gff3.bed\n"
1604
   ]
1605
  },
1606
  {
1607
   "cell_type": "code",
1608
   "execution_count": 15,
1609
   "metadata": {},
1610
   "outputs": [],
1611
   "source": [
1612
    "# RefSeq accessions of the reference chromosomes, listed in chromosome order;\n",
    "# rename_chroms maps each accession to its 1-based position in this list (1..24)\n",
    "chrom_accessions = [\n",
    "    \"NC_000001.10\", \"NC_000002.11\", \"NC_000003.11\", \"NC_000004.11\",\n",
    "    \"NC_000005.9\", \"NC_000006.11\", \"NC_000007.13\", \"NC_000008.10\",\n",
    "    \"NC_000009.11\", \"NC_000010.10\", \"NC_000011.9\", \"NC_000012.11\",\n",
    "    \"NC_000013.10\", \"NC_000014.8\", \"NC_000015.9\", \"NC_000016.9\",\n",
    "    \"NC_000017.10\", \"NC_000018.9\", \"NC_000019.9\", \"NC_000020.10\",\n",
    "    \"NC_000021.8\", \"NC_000022.10\", \"NC_000023.10\", \"NC_000024.9\",\n",
    "]\n",
    "rename_chroms = {accession: number for number, accession in enumerate(chrom_accessions, 1)}"
1618
   ]
1619
  },
1620
  {
1621
   "cell_type": "code",
1622
   "execution_count": 16,
1623
   "metadata": {},
1624
   "outputs": [
1625
    {
1626
     "name": "stdout",
1627
     "output_type": "stream",
1628
     "text": [
1629
      "(36019, 5)\n"
1630
     ]
1631
    },
1632
    {
1633
     "data": {
1634
      "text/html": [
1635
       "<div>\n",
1636
       "<style scoped>\n",
1637
       "    .dataframe tbody tr th:only-of-type {\n",
1638
       "        vertical-align: middle;\n",
1639
       "    }\n",
1640
       "\n",
1641
       "    .dataframe tbody tr th {\n",
1642
       "        vertical-align: top;\n",
1643
       "    }\n",
1644
       "\n",
1645
       "    .dataframe thead th {\n",
1646
       "        text-align: right;\n",
1647
       "    }\n",
1648
       "</style>\n",
1649
       "<table border=\"1\" class=\"dataframe\">\n",
1650
       "  <thead>\n",
1651
       "    <tr style=\"text-align: right;\">\n",
1652
       "      <th></th>\n",
1653
       "      <th>chrom</th>\n",
1654
       "      <th>start</th>\n",
1655
       "      <th>stop</th>\n",
1656
       "      <th>gene</th>\n",
1657
       "      <th>name</th>\n",
1658
       "    </tr>\n",
1659
       "  </thead>\n",
1660
       "  <tbody>\n",
1661
       "    <tr>\n",
1662
       "      <th>0</th>\n",
1663
       "      <td>1</td>\n",
1664
       "      <td>10954</td>\n",
1665
       "      <td>11507</td>\n",
1666
       "      <td>100506145</td>\n",
1667
       "      <td>LOC100506145</td>\n",
1668
       "    </tr>\n",
1669
       "    <tr>\n",
1670
       "      <th>1</th>\n",
1671
       "      <td>1</td>\n",
1672
       "      <td>12190</td>\n",
1673
       "      <td>13639</td>\n",
1674
       "      <td>100652771</td>\n",
1675
       "      <td>LOC100652771</td>\n",
1676
       "    </tr>\n",
1677
       "    <tr>\n",
1678
       "      <th>2</th>\n",
1679
       "      <td>1</td>\n",
1680
       "      <td>14362</td>\n",
1681
       "      <td>29370</td>\n",
1682
       "      <td>653635</td>\n",
1683
       "      <td>WASH7P</td>\n",
1684
       "    </tr>\n",
1685
       "    <tr>\n",
1686
       "      <th>3</th>\n",
1687
       "      <td>1</td>\n",
1688
       "      <td>30366</td>\n",
1689
       "      <td>30503</td>\n",
1690
       "      <td>100302278</td>\n",
1691
       "      <td>MIR1302-2</td>\n",
1692
       "    </tr>\n",
1693
       "  </tbody>\n",
1694
       "</table>\n",
1695
       "</div>"
1696
      ],
1697
      "text/plain": [
1698
       "   chrom  start   stop       gene          name\n",
1699
       "0      1  10954  11507  100506145  LOC100506145\n",
1700
       "1      1  12190  13639  100652771  LOC100652771\n",
1701
       "2      1  14362  29370     653635        WASH7P\n",
1702
       "3      1  30366  30503  100302278     MIR1302-2"
1703
      ]
1704
     },
1705
     "execution_count": 16,
1706
     "metadata": {},
1707
     "output_type": "execute_result"
1708
    }
1709
   ],
1710
   "source": [
1711
    "# Load the gene coordinate table, keep only rows whose `chrom` value is a key of\n",
    "# rename_chroms, and replace that value with its integer chromosome code.\n",
    "gene_intervals = pd.read_csv(gene_coords_file, sep=\"\\t\")\n",
    "# .copy() detaches the filtered rows from the frame read from disk, so the column\n",
    "# assignment below is not a write to a slice (avoids SettingWithCopyWarning).\n",
    "gene_intervals = gene_intervals.loc[gene_intervals[\"chrom\"].isin(list(rename_chroms))].copy()\n",
    "# Every remaining value is a key of rename_chroms, so the lookup leaves no gaps.\n",
    "gene_intervals[\"chrom\"] = gene_intervals[\"chrom\"].map(rename_chroms)\n",
    "gene_intervals = gene_intervals.sort_values(by=[\"chrom\", \"start\", \"stop\"], ascending=True)\n",
    "# NOTE(review): this absolute path points at .../v1/ while root_dir is .../v2/ --\n",
    "# confirm the target directory; left unchanged because later readers are not visible here.\n",
    "gene_intervals.to_csv(\"/home/olya/SFU/Hossein/v1/ref_GRCh37.p5_top_level.gff3.chroms_renamed.bed\", sep=\"\\t\", index=False)\n",
    "print(gene_intervals.shape)\n",
    "gene_intervals.head(4)"
1719
   ]
1720
  },
1721
  {
1722
   "cell_type": "code",
1723
   "execution_count": 17,
1724
   "metadata": {},
1725
   "outputs": [],
1726
   "source": [
1727
    "# BedTool built from the four coordinate columns of the gene table.\n",
    "gene_intervals_bed = pbt.BedTool.from_dataframe(\n",
    "    gene_intervals[[\"chrom\", \"start\", \"stop\", \"gene\"]]\n",
    ")\n",
    "\n",
    "# Copy-neutral table: one row per gene, Segment_Mean fixed at 0,\n",
    "# indexed by gene ID and sorted on that index.\n",
    "cnv_baseline = (\n",
    "    gene_intervals[[\"gene\"]]\n",
    "    .assign(Segment_Mean=0)\n",
    "    .set_index(\"gene\", drop=True)\n",
    "    .sort_index()\n",
    ")\n",
    "# Gene IDs in the sorted order of the copy-neutral table.\n",
    "sorted_index = list(cnv_baseline.index.values)"
1735
   ]
1736
  },
1737
  {
1738
   "cell_type": "code",
1739
   "execution_count": 20,
1740
   "metadata": {
1741
    "scrolled": false
1742
   },
1743
   "outputs": [
1744
    {
1745
     "name": "stderr",
1746
     "output_type": "stream",
1747
     "text": [
1748
      "ESCA\n"
1749
     ]
1750
    },
1751
    {
1752
     "name": "stdout",
1753
     "output_type": "stream",
1754
     "text": [
1755
      "... 100 processed.\n",
1756
      "ESCA (36019, 185)\n"
1757
     ]
1758
    },
1759
    {
1760
     "name": "stderr",
1761
     "output_type": "stream",
1762
     "text": [
1763
      "DLBC\n",
1764
      "TCGA-G8-6914-14A-01D-2209-01 has no genes with altered CN\n"
1765
     ]
1766
    },
1767
    {
1768
     "name": "stdout",
1769
     "output_type": "stream",
1770
     "text": [
1771
      "DLBC (36019, 52)\n"
1772
     ]
1773
    },
1774
    {
1775
     "name": "stderr",
1776
     "output_type": "stream",
1777
     "text": [
1778
      "READ\n"
1779
     ]
1780
    },
1781
    {
1782
     "name": "stdout",
1783
     "output_type": "stream",
1784
     "text": [
1785
      "... 100 processed.\n",
1786
      "READ (36019, 166)\n"
1787
     ]
1788
    },
1789
    {
1790
     "name": "stderr",
1791
     "output_type": "stream",
1792
     "text": [
1793
      "GBM\n",
1794
      "TCGA-06-0165-01A-01D-0236-01 has no genes with altered CN\n"
1795
     ]
1796
    },
1797
    {
1798
     "name": "stdout",
1799
     "output_type": "stream",
1800
     "text": [
1801
      "... 100 processed.\n",
1802
      "... 200 processed.\n"
1803
     ]
1804
    },
1805
    {
1806
     "name": "stderr",
1807
     "output_type": "stream",
1808
     "text": [
1809
      "TCGA-06-0119-01A-08D-0214-01 has no genes with altered CN\n"
1810
     ]
1811
    },
1812
    {
1813
     "name": "stdout",
1814
     "output_type": "stream",
1815
     "text": [
1816
      "... 300 processed.\n",
1817
      "... 400 processed.\n",
1818
      "... 500 processed.\n"
1819
     ]
1820
    },
1821
    {
1822
     "name": "stderr",
1823
     "output_type": "stream",
1824
     "text": [
1825
      "TCGA-06-5410-01A-01D-1694-01 has no genes with altered CN\n"
1826
     ]
1827
    },
1828
    {
1829
     "name": "stdout",
1830
     "output_type": "stream",
1831
     "text": [
1832
      "GBM (36019, 590)\n"
1833
     ]
1834
    },
1835
    {
1836
     "name": "stderr",
1837
     "output_type": "stream",
1838
     "text": [
1839
      "STES\n",
1840
      "TCGA-MX-A5UG-01A-21D-A31K-01 has no genes with altered CN\n",
1841
      "TCGA-RD-A8NB-01A-12D-A396-01 has no genes with altered CN\n"
1842
     ]
1843
    },
1844
    {
1845
     "name": "stdout",
1846
     "output_type": "stream",
1847
     "text": [
1848
      "... 100 processed.\n"
1849
     ]
1850
    },
1851
    {
1852
     "name": "stderr",
1853
     "output_type": "stream",
1854
     "text": [
1855
      "TCGA-BR-7957-01A-11D-2200-01 has no genes with altered CN\n"
1856
     ]
1857
    },
1858
    {
1859
     "name": "stdout",
1860
     "output_type": "stream",
1861
     "text": [
1862
      "... 200 processed.\n",
1863
      "... 300 processed.\n"
1864
     ]
1865
    },
1866
    {
1867
     "name": "stderr",
1868
     "output_type": "stream",
1869
     "text": [
1870
      "TCGA-BR-6563-01A-13D-2052-01 has no genes with altered CN\n",
1871
      "TCGA-D7-6522-01A-11D-1799-01 has no genes with altered CN\n",
1872
      "TCGA-BR-7196-01A-11D-2052-01 has no genes with altered CN\n"
1873
     ]
1874
    },
1875
    {
1876
     "name": "stdout",
1877
     "output_type": "stream",
1878
     "text": [
1879
      "... 400 processed.\n"
1880
     ]
1881
    },
1882
    {
1883
     "name": "stderr",
1884
     "output_type": "stream",
1885
     "text": [
1886
      "TCGA-D7-A6ET-01A-32D-A32M-01 has no genes with altered CN\n"
1887
     ]
1888
    },
1889
    {
1890
     "name": "stdout",
1891
     "output_type": "stream",
1892
     "text": [
1893
      "... 500 processed.\n"
1894
     ]
1895
    },
1896
    {
1897
     "name": "stderr",
1898
     "output_type": "stream",
1899
     "text": [
1900
      "TCGA-HU-A4GJ-01A-11D-A253-01 has no genes with altered CN\n"
1901
     ]
1902
    },
1903
    {
1904
     "name": "stdout",
1905
     "output_type": "stream",
1906
     "text": [
1907
      "... 600 processed.\n",
1908
      "STES (36019, 627)\n"
1909
     ]
1910
    },
1911
    {
1912
     "name": "stderr",
1913
     "output_type": "stream",
1914
     "text": [
1915
      "BLCA\n",
1916
      "TCGA-YC-A8S6-01A-31D-A38F-01 has no genes with altered CN\n"
1917
     ]
1918
    },
1919
    {
1920
     "name": "stdout",
1921
     "output_type": "stream",
1922
     "text": [
1923
      "... 100 processed.\n"
1924
     ]
1925
    },
1926
    {
1927
     "name": "stderr",
1928
     "output_type": "stream",
1929
     "text": [
1930
      "TCGA-DK-A3WY-01A-11D-A22Y-01 has no genes with altered CN\n",
1931
      "TCGA-XF-A9SL-01A-11D-A390-01 has no genes with altered CN\n"
1932
     ]
1933
    },
1934
    {
1935
     "name": "stdout",
1936
     "output_type": "stream",
1937
     "text": [
1938
      "... 200 processed.\n"
1939
     ]
1940
    },
1941
    {
1942
     "name": "stderr",
1943
     "output_type": "stream",
1944
     "text": [
1945
      "TCGA-E7-A7XN-01A-11D-A34T-01 has no genes with altered CN\n"
1946
     ]
1947
    },
1948
    {
1949
     "name": "stdout",
1950
     "output_type": "stream",
1951
     "text": [
1952
      "... 300 processed.\n",
1953
      "... 400 processed.\n",
1954
      "BLCA (36019, 414)\n"
1955
     ]
1956
    },
1957
    {
1958
     "name": "stderr",
1959
     "output_type": "stream",
1960
     "text": [
1961
      "UCEC\n",
1962
      "TCGA-D1-A16Y-01A-31D-A12G-01 has no genes with altered CN\n",
1963
      "TCGA-BK-A6W4-01A-12D-A34P-01 has no genes with altered CN\n",
1964
      "TCGA-BS-A0V7-01A-21D-A120-01 has no genes with altered CN\n",
1965
      "TCGA-B5-A11Y-01A-21D-A10L-01 has no genes with altered CN\n",
1966
      "TCGA-D1-A17F-01A-11D-A12G-01 has no genes with altered CN\n"
1967
     ]
1968
    },
1969
    {
1970
     "name": "stdout",
1971
     "output_type": "stream",
1972
     "text": [
1973
      "... 100 processed.\n"
1974
     ]
1975
    },
1976
    {
1977
     "name": "stderr",
1978
     "output_type": "stream",
1979
     "text": [
1980
      "TCGA-AX-A062-01A-11D-A00X-01 has no genes with altered CN\n",
1981
      "TCGA-D1-A16D-01A-11D-A12G-01 has no genes with altered CN\n",
1982
      "TCGA-BG-A0VZ-01A-11D-A107-01 has no genes with altered CN\n"
1983
     ]
1984
    },
1985
    {
1986
     "name": "stdout",
1987
     "output_type": "stream",
1988
     "text": [
1989
      "... 200 processed.\n"
1990
     ]
1991
    },
1992
    {
1993
     "name": "stderr",
1994
     "output_type": "stream",
1995
     "text": [
1996
      "TCGA-AJ-A2QL-01A-11D-A18N-01 has no genes with altered CN\n",
1997
      "TCGA-BS-A0UA-01A-11D-A120-01 has no genes with altered CN\n",
1998
      "TCGA-B5-A11U-01A-11D-A120-01 has no genes with altered CN\n"
1999
     ]
2000
    },
2001
    {
2002
     "name": "stdout",
2003
     "output_type": "stream",
2004
     "text": [
2005
      "... 300 processed.\n"
2006
     ]
2007
    },
2008
    {
2009
     "name": "stderr",
2010
     "output_type": "stream",
2011
     "text": [
2012
      "TCGA-EO-A3AU-01A-21D-A19X-01 has no genes with altered CN\n",
2013
      "TCGA-QF-A5YS-01A-11D-A31T-01 has no genes with altered CN\n",
2014
      "TCGA-D1-A0ZV-01A-11D-A10L-01 has no genes with altered CN\n"
2015
     ]
2016
    },
2017
    {
2018
     "name": "stdout",
2019
     "output_type": "stream",
2020
     "text": [
2021
      "... 400 processed.\n"
2022
     ]
2023
    },
2024
    {
2025
     "name": "stderr",
2026
     "output_type": "stream",
2027
     "text": [
2028
      "TCGA-QS-A5YR-01A-31D-A31T-01 has no genes with altered CN\n",
2029
      "TCGA-DI-A1BU-01A-11D-A134-01 has no genes with altered CN\n",
2030
      "TCGA-AP-A0LG-01A-11D-A042-01 has no genes with altered CN\n"
2031
     ]
2032
    },
2033
    {
2034
     "name": "stdout",
2035
     "output_type": "stream",
2036
     "text": [
2037
      "... 500 processed.\n"
2038
     ]
2039
    },
2040
    {
2041
     "name": "stderr",
2042
     "output_type": "stream",
2043
     "text": [
2044
      "TCGA-D1-A0ZS-01A-11D-A120-01 has no genes with altered CN\n"
2045
     ]
2046
    },
2047
    {
2048
     "name": "stdout",
2049
     "output_type": "stream",
2050
     "text": [
2051
      "UCEC (36019, 540)\n"
2052
     ]
2053
    },
2054
    {
2055
     "name": "stderr",
2056
     "output_type": "stream",
2057
     "text": [
2058
      "PCPG\n",
2059
      "TCGA-RW-A7CZ-01A-11D-A35C-01 has no genes with altered CN\n"
2060
     ]
2061
    },
2062
    {
2063
     "name": "stdout",
2064
     "output_type": "stream",
2065
     "text": [
2066
      "... 100 processed.\n"
2067
     ]
2068
    },
2069
    {
2070
     "name": "stderr",
2071
     "output_type": "stream",
2072
     "text": [
2073
      "TCGA-WB-A817-01A-11D-A35H-01 has no genes with altered CN\n"
2074
     ]
2075
    },
2076
    {
2077
     "name": "stdout",
2078
     "output_type": "stream",
2079
     "text": [
2080
      "PCPG (36019, 168)\n"
2081
     ]
2082
    },
2083
    {
2084
     "name": "stderr",
2085
     "output_type": "stream",
2086
     "text": [
2087
      "STAD\n",
2088
      "TCGA-MX-A5UG-01A-21D-A31K-01 has no genes with altered CN\n",
2089
      "TCGA-RD-A8NB-01A-12D-A396-01 has no genes with altered CN\n"
2090
     ]
2091
    },
2092
    {
2093
     "name": "stdout",
2094
     "output_type": "stream",
2095
     "text": [
2096
      "... 100 processed.\n"
2097
     ]
2098
    },
2099
    {
2100
     "name": "stderr",
2101
     "output_type": "stream",
2102
     "text": [
2103
      "TCGA-BR-7957-01A-11D-2200-01 has no genes with altered CN\n"
2104
     ]
2105
    },
2106
    {
2107
     "name": "stdout",
2108
     "output_type": "stream",
2109
     "text": [
2110
      "... 200 processed.\n"
2111
     ]
2112
    },
2113
    {
2114
     "name": "stderr",
2115
     "output_type": "stream",
2116
     "text": [
2117
      "TCGA-BR-6563-01A-13D-2052-01 has no genes with altered CN\n",
2118
      "TCGA-D7-6522-01A-11D-1799-01 has no genes with altered CN\n",
2119
      "TCGA-BR-7196-01A-11D-2052-01 has no genes with altered CN\n"
2120
     ]
2121
    },
2122
    {
2123
     "name": "stdout",
2124
     "output_type": "stream",
2125
     "text": [
2126
      "... 300 processed.\n"
2127
     ]
2128
    },
2129
    {
2130
     "name": "stderr",
2131
     "output_type": "stream",
2132
     "text": [
2133
      "TCGA-D7-A6ET-01A-32D-A32M-01 has no genes with altered CN\n",
2134
      "TCGA-HU-A4GJ-01A-11D-A253-01 has no genes with altered CN\n"
2135
     ]
2136
    },
2137
    {
2138
     "name": "stdout",
2139
     "output_type": "stream",
2140
     "text": [
2141
      "... 400 processed.\n",
2142
      "STAD (36019, 442)\n"
2143
     ]
2144
    },
2145
    {
2146
     "name": "stderr",
2147
     "output_type": "stream",
2148
     "text": [
2149
      "CESC\n"
2150
     ]
2151
    },
2152
    {
2153
     "name": "stdout",
2154
     "output_type": "stream",
2155
     "text": [
2156
      "... 100 processed.\n",
2157
      "... 200 processed.\n",
2158
      "CESC (36019, 297)\n"
2159
     ]
2160
    },
2161
    {
2162
     "name": "stderr",
2163
     "output_type": "stream",
2164
     "text": [
2165
      "UCS\n"
2166
     ]
2167
    },
2168
    {
2169
     "name": "stdout",
2170
     "output_type": "stream",
2171
     "text": [
2172
      "UCS (36019, 56)\n"
2173
     ]
2174
    },
2175
    {
2176
     "name": "stderr",
2177
     "output_type": "stream",
2178
     "text": [
2179
      "TGCT\n",
2180
      "TCGA-YU-A90S-01A-11D-A434-01 has no genes with altered CN\n"
2181
     ]
2182
    },
2183
    {
2184
     "name": "stdout",
2185
     "output_type": "stream",
2186
     "text": [
2187
      "... 100 processed.\n",
2188
      "TGCT (36019, 156)\n"
2189
     ]
2190
    },
2191
    {
2192
     "name": "stderr",
2193
     "output_type": "stream",
2194
     "text": [
2195
      "THCA\n",
2196
      "TCGA-EL-A4JZ-01A-11D-A256-01 has no genes with altered CN\n",
2197
      "TCGA-DJ-A13X-01A-11D-A10T-01 has no genes with altered CN\n",
2198
      "TCGA-EL-A3ZT-01A-12D-A23L-01 has no genes with altered CN\n",
2199
      "TCGA-DE-A0XZ-01A-11D-A17S-01 has no genes with altered CN\n",
2200
      "TCGA-DJ-A2PP-01A-11D-A19I-01 has no genes with altered CN\n",
2201
      "TCGA-KS-A4I5-01A-11D-A256-01 has no genes with altered CN\n",
2202
      "TCGA-DJ-A2PS-01A-11D-A18E-01 has no genes with altered CN\n",
2203
      "TCGA-EL-A3GW-01A-11D-A201-01 has no genes with altered CN\n",
2204
      "TCGA-BJ-A0ZG-01A-11D-A10T-01 has no genes with altered CN\n",
2205
      "TCGA-J8-A3O2-01A-11D-A23L-01 has no genes with altered CN\n",
2206
      "TCGA-FY-A3RA-01A-11D-A21Y-01 has no genes with altered CN\n",
2207
      "TCGA-CE-A483-01A-11D-A23T-01 has no genes with altered CN\n",
2208
      "TCGA-EM-A1CW-01A-21D-A13V-01 has no genes with altered CN\n",
2209
      "TCGA-DJ-A4V4-01A-11D-A256-01 has no genes with altered CN\n",
2210
      "TCGA-E3-A3E1-01A-11D-A20A-01 has no genes with altered CN\n",
2211
      "TCGA-ET-A2MZ-01A-12D-A19I-01 has no genes with altered CN\n",
2212
      "TCGA-E8-A414-01A-11D-A23L-01 has no genes with altered CN\n",
2213
      "TCGA-EL-A3T6-01A-11D-A21Y-01 has no genes with altered CN\n",
2214
      "TCGA-DJ-A4V5-01A-11D-A256-01 has no genes with altered CN\n",
2215
      "TCGA-DJ-A3UY-01A-21D-A22C-01 has no genes with altered CN\n",
2216
      "TCGA-EL-A3D4-01A-11D-A19I-01 has no genes with altered CN\n",
2217
      "TCGA-FY-A76V-01A-11D-A396-01 has no genes with altered CN\n",
2218
      "TCGA-FY-A4B3-01A-11D-A23T-01 has no genes with altered CN\n",
2219
      "TCGA-DJ-A3UO-01A-11D-A22C-01 has no genes with altered CN\n",
2220
      "TCGA-EL-A4K7-01A-11D-A256-01 has no genes with altered CN\n",
2221
      "TCGA-DJ-A1QI-01A-11D-A14V-01 has no genes with altered CN\n",
2222
      "TCGA-EL-A3N2-01A-11D-A20A-01 has no genes with altered CN\n",
2223
      "TCGA-E3-A3E5-01A-11D-A20A-01 has no genes with altered CN\n",
2224
      "TCGA-EM-A1YD-01A-11D-A14V-01 has no genes with altered CN\n",
2225
      "TCGA-GE-A2C6-01A-11D-A16M-01 has no genes with altered CN\n",
2226
      "TCGA-DJ-A2Q5-01A-11D-A18E-01 has no genes with altered CN\n",
2227
      "TCGA-ET-A3DP-01A-11D-A219-01 has no genes with altered CN\n",
2228
      "TCGA-DJ-A4UT-01A-11D-A256-01 has no genes with altered CN\n",
2229
      "TCGA-DJ-A2PT-01A-11D-A18E-01 has no genes with altered CN\n",
2230
      "TCGA-DJ-A4V2-01A-11D-A256-01 has no genes with altered CN\n",
2231
      "TCGA-L6-A4ET-01A-11D-A256-01 has no genes with altered CN\n",
2232
      "TCGA-BJ-A0ZJ-01A-11D-A10T-01 has no genes with altered CN\n",
2233
      "TCGA-DE-A4M9-01A-11D-A256-01 has no genes with altered CN\n",
2234
      "TCGA-EL-A4KD-01A-11D-A256-01 has no genes with altered CN\n"
2235
     ]
2236
    },
2237
    {
2238
     "name": "stdout",
2239
     "output_type": "stream",
2240
     "text": [
2241
      "... 100 processed.\n"
2242
     ]
2243
    },
2244
    {
2245
     "name": "stderr",
2246
     "output_type": "stream",
2247
     "text": [
2248
      "TCGA-QD-A8IV-01A-11D-A396-01 has no genes with altered CN\n",
2249
      "TCGA-ET-A3DV-01A-12D-A201-01 has no genes with altered CN\n",
2250
      "TCGA-EM-A22K-01A-11D-A17S-01 has no genes with altered CN\n",
2251
      "TCGA-DJ-A3VE-01A-11D-A23L-01 has no genes with altered CN\n",
2252
      "TCGA-EL-A3D1-01A-11D-A19I-01 has no genes with altered CN\n",
2253
      "TCGA-BJ-A2P4-01A-11D-A18E-01 has no genes with altered CN\n",
2254
      "TCGA-CE-A3ME-01A-11D-A20A-01 has no genes with altered CN\n",
2255
      "TCGA-E8-A417-01A-21D-A23L-01 has no genes with altered CN\n",
2256
      "TCGA-KS-A41I-01A-11D-A23L-01 has no genes with altered CN\n",
2257
      "TCGA-FK-A3SB-01A-11D-A22C-01 has no genes with altered CN\n",
2258
      "TCGA-BJ-A28S-01A-11D-A19I-01 has no genes with altered CN\n",
2259
      "TCGA-MK-A4N9-01A-11D-A256-01 has no genes with altered CN\n",
2260
      "TCGA-E8-A437-01A-12D-A23T-01 has no genes with altered CN\n",
2261
      "TCGA-EM-A3AP-01A-12D-A20A-01 has no genes with altered CN\n",
2262
      "TCGA-EL-A3TA-01A-12D-A22C-01 has no genes with altered CN\n",
2263
      "TCGA-IM-A41Z-01A-11D-A23L-01 has no genes with altered CN\n",
2264
      "TCGA-EM-A2CQ-01A-11D-A17S-01 has no genes with altered CN\n",
2265
      "TCGA-EM-A3O7-01A-11D-A21Y-01 has no genes with altered CN\n",
2266
      "TCGA-FE-A3PC-01A-11D-A21Y-01 has no genes with altered CN\n",
2267
      "TCGA-DJ-A2PY-01A-11D-A18E-01 has no genes with altered CN\n",
2268
      "TCGA-EM-A4FQ-01A-11D-A256-01 has no genes with altered CN\n",
2269
      "TCGA-EM-A3FO-01A-11D-A219-01 has no genes with altered CN\n",
2270
      "TCGA-BJ-A0Z9-01A-11D-A10T-01 has no genes with altered CN\n",
2271
      "TCGA-EM-A3FK-01A-11D-A219-01 has no genes with altered CN\n"
2272
     ]
2273
    },
2274
    {
2275
     "name": "stdout",
2276
     "output_type": "stream",
2277
     "text": [
2278
      "... 200 processed.\n"
2279
     ]
2280
    },
2281
    {
2282
     "name": "stderr",
2283
     "output_type": "stream",
2284
     "text": [
2285
      "TCGA-ET-A3BU-01A-11D-A19I-01 has no genes with altered CN\n",
2286
      "TCGA-BJ-A0Z5-01A-11D-A10T-01 has no genes with altered CN\n",
2287
      "TCGA-EL-A3MY-01A-11D-A219-01 has no genes with altered CN\n",
2288
      "TCGA-ET-A39L-01A-12D-A19I-01 has no genes with altered CN\n",
2289
      "TCGA-E8-A415-01A-11D-A23L-01 has no genes with altered CN\n",
2290
      "TCGA-ET-A40Q-01A-11D-A23L-01 has no genes with altered CN\n",
2291
      "TCGA-KS-A4I7-01A-11D-A256-01 has no genes with altered CN\n",
2292
      "TCGA-MK-A4N7-01A-11D-A256-01 has no genes with altered CN\n",
2293
      "TCGA-L6-A4EQ-01A-11D-A256-01 has no genes with altered CN\n",
2294
      "TCGA-FY-A3TY-01A-11D-A22Y-01 has no genes with altered CN\n",
2295
      "TCGA-ET-A2N1-01A-11D-A18E-01 has no genes with altered CN\n",
2296
      "TCGA-DJ-A2PO-01A-21D-A19I-01 has no genes with altered CN\n",
2297
      "TCGA-J8-A3O2-06A-11D-A23L-01 has no genes with altered CN\n",
2298
      "TCGA-CE-A485-01A-11D-A23T-01 has no genes with altered CN\n",
2299
      "TCGA-ET-A3BX-01A-11D-A19I-01 has no genes with altered CN\n",
2300
      "TCGA-DJ-A3VK-01A-11D-A23L-01 has no genes with altered CN\n",
2301
      "TCGA-DE-A4M8-01A-21D-A256-01 has no genes with altered CN\n",
2302
      "TCGA-ET-A40T-01A-11D-A23L-01 has no genes with altered CN\n",
2303
      "TCGA-BJ-A18Z-01A-21D-A13V-01 has no genes with altered CN\n",
2304
      "TCGA-DJ-A3UT-01A-11D-A22C-01 has no genes with altered CN\n",
2305
      "TCGA-DJ-A2Q2-01A-11D-A19I-01 has no genes with altered CN\n",
2306
      "TCGA-BJ-A18Y-01A-11D-A13V-01 has no genes with altered CN\n",
2307
      "TCGA-ET-A39T-01A-11D-A19I-01 has no genes with altered CN\n",
2308
      "TCGA-EL-A3CL-01A-11D-A19I-01 has no genes with altered CN\n",
2309
      "TCGA-DJ-A4V0-01A-11D-A256-01 has no genes with altered CN\n",
2310
      "TCGA-EL-A3H8-01A-11D-A20A-01 has no genes with altered CN\n",
2311
      "TCGA-ET-A39J-01A-11D-A19I-01 has no genes with altered CN\n",
2312
      "TCGA-FY-A3I4-01A-11D-A219-01 has no genes with altered CN\n",
2313
      "TCGA-EM-A2CU-01A-12D-A17S-01 has no genes with altered CN\n"
2314
     ]
2315
    },
2316
    {
2317
     "name": "stdout",
2318
     "output_type": "stream",
2319
     "text": [
2320
      "... 300 processed.\n"
2321
     ]
2322
    },
2323
    {
2324
     "name": "stderr",
2325
     "output_type": "stream",
2326
     "text": [
2327
      "TCGA-EM-A3FM-01A-11D-A219-01 has no genes with altered CN\n",
2328
      "TCGA-EM-A4FF-01A-11D-A256-01 has no genes with altered CN\n",
2329
      "TCGA-EL-A3GX-01A-11D-A201-01 has no genes with altered CN\n",
2330
      "TCGA-DJ-A3UN-01A-11D-A22C-01 has no genes with altered CN\n",
2331
      "TCGA-EM-A4FO-01A-11D-A256-01 has no genes with altered CN\n",
2332
      "TCGA-EL-A3TB-01A-11D-A22C-01 has no genes with altered CN\n",
2333
      "TCGA-ET-A25N-01A-11D-A16M-01 has no genes with altered CN\n",
2334
      "TCGA-ET-A39M-01A-11D-A19I-01 has no genes with altered CN\n",
2335
      "TCGA-DE-A4MA-01A-11D-A256-01 has no genes with altered CN\n",
2336
      "TCGA-ET-A39O-01A-11D-A19I-01 has no genes with altered CN\n",
2337
      "TCGA-DE-A0Y2-01A-11D-A10T-01 has no genes with altered CN\n",
2338
      "TCGA-FY-A3R8-01A-11D-A21Y-01 has no genes with altered CN\n",
2339
      "TCGA-EM-A3AL-01A-11D-A201-01 has no genes with altered CN\n",
2340
      "TCGA-EM-A2CN-01A-11D-A19I-01 has no genes with altered CN\n",
2341
      "TCGA-FY-A3BL-01A-11D-A19I-01 has no genes with altered CN\n",
2342
      "TCGA-EM-A1CS-01A-11D-A13V-01 has no genes with altered CN\n",
2343
      "TCGA-EL-A3D0-01A-12D-A201-01 has no genes with altered CN\n",
2344
      "TCGA-E3-A3DZ-01A-11D-A20A-01 has no genes with altered CN\n",
2345
      "TCGA-DJ-A1QF-01A-12D-A14V-01 has no genes with altered CN\n",
2346
      "TCGA-J8-A3YH-01A-11D-A22Y-01 has no genes with altered CN\n",
2347
      "TCGA-EL-A4K1-01A-11D-A256-01 has no genes with altered CN\n",
2348
      "TCGA-EM-A3O8-01A-11D-A21Y-01 has no genes with altered CN\n",
2349
      "TCGA-DJ-A3VJ-01A-11D-A23L-01 has no genes with altered CN\n",
2350
      "TCGA-BJ-A45D-01A-11D-A23T-01 has no genes with altered CN\n",
2351
      "TCGA-FY-A4B4-01A-11D-A23T-01 has no genes with altered CN\n",
2352
      "TCGA-EM-A1CU-01A-11D-A13V-01 has no genes with altered CN\n",
2353
      "TCGA-EL-A3CX-01A-11D-A19I-01 has no genes with altered CN\n",
2354
      "TCGA-ET-A25O-01A-11D-A16M-01 has no genes with altered CN\n",
2355
      "TCGA-E8-A433-01A-11D-A23L-01 has no genes with altered CN\n"
2356
     ]
2357
    },
2358
    {
2359
     "name": "stdout",
2360
     "output_type": "stream",
2361
     "text": [
2362
      "THCA (36019, 506)\n"
2363
     ]
2364
    },
2365
    {
2366
     "name": "stderr",
2367
     "output_type": "stream",
2368
     "text": [
2369
      "CHOL\n",
2370
      "TCGA-W5-AA2H-01A-31D-A416-01 has no genes with altered CN\n"
2371
     ]
2372
    },
2373
    {
2374
     "name": "stdout",
2375
     "output_type": "stream",
2376
     "text": [
2377
      "CHOL (36019, 36)\n"
2378
     ]
2379
    },
2380
    {
2381
     "name": "stderr",
2382
     "output_type": "stream",
2383
     "text": [
2384
      "HNSC\n"
2385
     ]
2386
    },
2387
    {
2388
     "name": "stdout",
2389
     "output_type": "stream",
2390
     "text": [
2391
      "... 100 processed.\n",
2392
      "... 200 processed.\n",
2393
      "... 300 processed.\n",
2394
      "... 400 processed.\n",
2395
      "... 500 processed.\n",
2396
      "HNSC (36019, 530)\n"
2397
     ]
2398
    },
2399
    {
2400
     "name": "stderr",
2401
     "output_type": "stream",
2402
     "text": [
2403
      "UVM\n"
2404
     ]
2405
    },
2406
    {
2407
     "name": "stdout",
2408
     "output_type": "stream",
2409
     "text": [
2410
      "UVM (36019, 80)\n"
2411
     ]
2412
    },
2413
    {
2414
     "name": "stderr",
2415
     "output_type": "stream",
2416
     "text": [
2417
      "SKCM\n",
2418
      "TCGA-ER-A19A-06A-21D-A191-01 has no genes with altered CN\n"
2419
     ]
2420
    },
2421
    {
2422
     "name": "stdout",
2423
     "output_type": "stream",
2424
     "text": [
2425
      "... 100 processed.\n",
2426
      "... 200 processed.\n",
2427
      "... 300 processed.\n",
2428
      "... 400 processed.\n"
2429
     ]
2430
    },
2431
    {
2432
     "name": "stderr",
2433
     "output_type": "stream",
2434
     "text": [
2435
      "TCGA-EB-A4OZ-01A-12D-A25P-01 has no genes with altered CN\n",
2436
      "TCGA-EE-A2GK-06A-11D-A194-01 has no genes with altered CN\n"
2437
     ]
2438
    },
2439
    {
2440
     "name": "stdout",
2441
     "output_type": "stream",
2442
     "text": [
2443
      "SKCM (36019, 472)\n"
2444
     ]
2445
    },
2446
    {
2447
     "name": "stderr",
2448
     "output_type": "stream",
2449
     "text": [
2450
      "COAD\n",
2451
      "TCGA-G4-6302-01A-11D-1717-01 has no genes with altered CN\n"
2452
     ]
2453
    },
2454
    {
2455
     "name": "stdout",
2456
     "output_type": "stream",
2457
     "text": [
2458
      "... 100 processed.\n"
2459
     ]
2460
    },
2461
    {
2462
     "name": "stderr",
2463
     "output_type": "stream",
2464
     "text": [
2465
      "TCGA-AA-A03F-01A-11D-A080-01 has no genes with altered CN\n"
2466
     ]
2467
    },
2468
    {
2469
     "name": "stdout",
2470
     "output_type": "stream",
2471
     "text": [
2472
      "... 200 processed.\n",
2473
      "... 300 processed.\n",
2474
      "... 400 processed.\n",
2475
      "COAD (36019, 453)\n"
2476
     ]
2477
    },
2478
    {
2479
     "name": "stderr",
2480
     "output_type": "stream",
2481
     "text": [
2482
      "ACC\n",
2483
      "TCGA-OR-A5KQ-01A-11D-A309-01 has no genes with altered CN\n",
2484
      "TCGA-OR-A5KV-01A-11D-A29H-01 has no genes with altered CN\n"
2485
     ]
2486
    },
2487
    {
2488
     "name": "stdout",
2489
     "output_type": "stream",
2490
     "text": [
2491
      "ACC (36019, 90)\n"
2492
     ]
2493
    },
2494
    {
2495
     "name": "stderr",
2496
     "output_type": "stream",
2497
     "text": [
2498
      "PAAD\n",
2499
      "TCGA-IB-AAUR-01A-21D-A38F-01 has no genes with altered CN\n",
2500
      "TCGA-HZ-8002-01A-11D-2200-01 has no genes with altered CN\n",
2501
      "TCGA-XD-AAUG-01A-61D-A40V-01 has no genes with altered CN\n",
2502
      "TCGA-Z5-AAPL-01A-12D-A40V-01 has no genes with altered CN\n",
2503
      "TCGA-IB-A5SQ-01A-11D-A32M-01 has no genes with altered CN\n"
2504
     ]
2505
    },
2506
    {
2507
     "name": "stdout",
2508
     "output_type": "stream",
2509
     "text": [
2510
      "... 100 processed.\n"
2511
     ]
2512
    },
2513
    {
2514
     "name": "stderr",
2515
     "output_type": "stream",
2516
     "text": [
2517
      "TCGA-IB-AAUS-01A-12D-A38F-01 has no genes with altered CN\n"
2518
     ]
2519
    },
2520
    {
2521
     "name": "stdout",
2522
     "output_type": "stream",
2523
     "text": [
2524
      "PAAD (36019, 185)\n"
2525
     ]
2526
    },
2527
    {
2528
     "name": "stderr",
2529
     "output_type": "stream",
2530
     "text": [
2531
      "THYM\n",
2532
      "TCGA-4V-A9QW-01A-11D-A422-01 has no genes with altered CN\n",
2533
      "TCGA-ZB-A96B-01A-11D-A427-01 has no genes with altered CN\n",
2534
      "TCGA-X7-A8DB-01A-11D-A422-01 has no genes with altered CN\n",
2535
      "TCGA-X7-A8M4-01A-11D-A422-01 has no genes with altered CN\n",
2536
      "TCGA-X7-A8D8-01A-11D-A422-01 has no genes with altered CN\n",
2537
      "TCGA-3S-AAYX-01A-11D-A422-01 has no genes with altered CN\n",
2538
      "TCGA-YT-A95E-01A-11D-A427-01 has no genes with altered CN\n",
2539
      "TCGA-X7-A8M8-01A-11D-A422-01 has no genes with altered CN\n",
2540
      "TCGA-ZT-A8OM-01A-11D-A427-01 has no genes with altered CN\n",
2541
      "TCGA-ZB-A96E-01A-11D-A427-01 has no genes with altered CN\n",
2542
      "TCGA-3Q-A9WF-01A-11D-A422-01 has no genes with altered CN\n",
2543
      "TCGA-X7-A8M1-01A-11D-A422-01 has no genes with altered CN\n",
2544
      "TCGA-ZB-A96A-01A-11D-A427-01 has no genes with altered CN\n",
2545
      "TCGA-ZB-A96R-01A-11D-A427-01 has no genes with altered CN\n",
2546
      "TCGA-ZB-A963-01A-11D-A427-01 has no genes with altered CN\n",
2547
      "TCGA-ZC-AAAA-01A-11D-A427-01 has no genes with altered CN\n",
2548
      "TCGA-XM-A8RB-01A-11D-A422-01 has no genes with altered CN\n",
2549
      "TCGA-ZB-A96G-01A-11D-A427-01 has no genes with altered CN\n",
2550
      "TCGA-X7-A8M7-01A-11D-A422-01 has no genes with altered CN\n",
2551
      "TCGA-XU-AAXZ-01A-11D-A427-01 has no genes with altered CN\n",
2552
      "TCGA-XH-A853-01A-11D-A422-01 has no genes with altered CN\n",
2553
      "TCGA-XM-AAZ3-01A-11D-A422-01 has no genes with altered CN\n"
2554
     ]
2555
    },
2556
    {
2557
     "name": "stdout",
2558
     "output_type": "stream",
2559
     "text": [
2560
      "... 100 processed.\n",
2561
      "THYM (36019, 125)\n"
2562
     ]
2563
    },
2564
    {
2565
     "name": "stderr",
2566
     "output_type": "stream",
2567
     "text": [
2568
      "LUSC\n",
2569
      "TCGA-56-8623-01A-11D-2391-01 has no genes with altered CN\n"
2570
     ]
2571
    },
2572
    {
2573
     "name": "stdout",
2574
     "output_type": "stream",
2575
     "text": [
2576
      "... 100 processed.\n",
2577
      "... 200 processed.\n",
2578
      "... 300 processed.\n",
2579
      "... 400 processed.\n"
2580
     ]
2581
    },
2582
    {
2583
     "name": "stderr",
2584
     "output_type": "stream",
2585
     "text": [
2586
      "TCGA-98-A53H-01A-12D-A25M-01 has no genes with altered CN\n"
2587
     ]
2588
    },
2589
    {
2590
     "name": "stdout",
2591
     "output_type": "stream",
2592
     "text": [
2593
      "LUSC (36019, 501)\n"
2594
     ]
2595
    },
2596
    {
2597
     "name": "stderr",
2598
     "output_type": "stream",
2599
     "text": [
2600
      "MESO\n",
2601
      "TCGA-TS-A8AS-01A-21D-A39Q-01 has no genes with altered CN\n",
2602
      "TCGA-TS-A7P8-01A-11D-A34B-01 has no genes with altered CN\n",
2603
      "TCGA-TS-A8AV-01A-12D-A39Q-01 has no genes with altered CN\n",
2604
      "TCGA-3H-AB3O-01A-11D-A39Q-01 has no genes with altered CN\n"
2605
     ]
2606
    },
2607
    {
2608
     "name": "stdout",
2609
     "output_type": "stream",
2610
     "text": [
2611
      "MESO (36019, 87)\n"
2612
     ]
2613
    },
2614
    {
2615
     "name": "stderr",
2616
     "output_type": "stream",
2617
     "text": [
2618
      "OV\n"
2619
     ]
2620
    },
2621
    {
2622
     "name": "stdout",
2623
     "output_type": "stream",
2624
     "text": [
2625
      "... 100 processed.\n",
2626
      "... 200 processed.\n",
2627
      "... 300 processed.\n",
2628
      "... 400 processed.\n",
2629
      "... 500 processed.\n",
2630
      "OV (36019, 597)\n"
2631
     ]
2632
    },
2633
    {
2634
     "name": "stderr",
2635
     "output_type": "stream",
2636
     "text": [
2637
      "SARC\n"
2638
     ]
2639
    },
2640
    {
2641
     "name": "stdout",
2642
     "output_type": "stream",
2643
     "text": [
2644
      "... 100 processed.\n"
2645
     ]
2646
    },
2647
    {
2648
     "name": "stderr",
2649
     "output_type": "stream",
2650
     "text": [
2651
      "TCGA-WK-A8Y0-10D-01D-A419-01 has no genes with altered CN\n"
2652
     ]
2653
    },
2654
    {
2655
     "name": "stdout",
2656
     "output_type": "stream",
2657
     "text": [
2658
      "... 200 processed.\n"
2659
     ]
2660
    },
2661
    {
2662
     "name": "stderr",
2663
     "output_type": "stream",
2664
     "text": [
2665
      "TCGA-WK-A8XS-10E-01D-A37E-01 has no genes with altered CN\n",
2666
      "TCGA-QQ-A5V2-01A-11D-A32H-01 has no genes with altered CN\n"
2667
     ]
2668
    },
2669
    {
2670
     "name": "stdout",
2671
     "output_type": "stream",
2672
     "text": [
2673
      "SARC (36019, 263)\n"
2674
     ]
2675
    },
2676
    {
2677
     "name": "stderr",
2678
     "output_type": "stream",
2679
     "text": [
2680
      "KIRP\n",
2681
      "TCGA-Y8-A8S1-01A-11D-A36W-01 has no genes with altered CN\n",
2682
      "TCGA-GL-A4EM-01A-11D-A253-01 has no genes with altered CN\n",
2683
      "TCGA-4A-A93Y-01A-11D-A36W-01 has no genes with altered CN\n",
2684
      "TCGA-AL-3467-01A-02D-1348-01 has no genes with altered CN\n"
2685
     ]
2686
    },
2687
    {
2688
     "name": "stdout",
2689
     "output_type": "stream",
2690
     "text": [
2691
      "... 100 processed.\n"
2692
     ]
2693
    },
2694
    {
2695
     "name": "stderr",
2696
     "output_type": "stream",
2697
     "text": [
2698
      "TCGA-A4-7828-01A-11D-2135-01 has no genes with altered CN\n",
2699
      "TCGA-DW-7838-01A-11D-2135-01 has no genes with altered CN\n"
2700
     ]
2701
    },
2702
    {
2703
     "name": "stdout",
2704
     "output_type": "stream",
2705
     "text": [
2706
      "... 200 processed.\n",
2707
      "KIRP (36019, 288)\n"
2708
     ]
2709
    },
2710
    {
2711
     "name": "stderr",
2712
     "output_type": "stream",
2713
     "text": [
2714
      "LGG\n",
2715
      "TCGA-HT-8106-01A-11D-2391-01 has no genes with altered CN\n",
2716
      "TCGA-S9-A6WI-01A-21D-A33S-01 has no genes with altered CN\n",
2717
      "TCGA-HT-7602-01A-21D-2085-01 has no genes with altered CN\n"
2718
     ]
2719
    },
2720
    {
2721
     "name": "stdout",
2722
     "output_type": "stream",
2723
     "text": [
2724
      "... 100 processed.\n"
2725
     ]
2726
    },
2727
    {
2728
     "name": "stderr",
2729
     "output_type": "stream",
2730
     "text": [
2731
      "TCGA-DU-7011-01A-11D-2023-01 has no genes with altered CN\n",
2732
      "TCGA-TM-A84B-12A-01D-A366-01 has no genes with altered CN\n"
2733
     ]
2734
    },
2735
    {
2736
     "name": "stdout",
2737
     "output_type": "stream",
2738
     "text": [
2739
      "... 200 processed.\n"
2740
     ]
2741
    },
2742
    {
2743
     "name": "stderr",
2744
     "output_type": "stream",
2745
     "text": [
2746
      "TCGA-FG-8181-01A-11D-2252-01 has no genes with altered CN\n"
2747
     ]
2748
    },
2749
    {
2750
     "name": "stdout",
2751
     "output_type": "stream",
2752
     "text": [
2753
      "... 300 processed.\n"
2754
     ]
2755
    },
2756
    {
2757
     "name": "stderr",
2758
     "output_type": "stream",
2759
     "text": [
2760
      "TCGA-FG-8189-01B-11D-A288-01 has no genes with altered CN\n"
2761
     ]
2762
    },
2763
    {
2764
     "name": "stdout",
2765
     "output_type": "stream",
2766
     "text": [
2767
      "... 400 processed.\n"
2768
     ]
2769
    },
2770
    {
2771
     "name": "stderr",
2772
     "output_type": "stream",
2773
     "text": [
2774
      "TCGA-DU-5872-02A-21D-A36N-01 has no genes with altered CN\n"
2775
     ]
2776
    },
2777
    {
2778
     "name": "stdout",
2779
     "output_type": "stream",
2780
     "text": [
2781
      "... 500 processed.\n"
2782
     ]
2783
    },
2784
    {
2785
     "name": "stderr",
2786
     "output_type": "stream",
2787
     "text": [
2788
      "TCGA-HT-7680-01A-11D-2252-01 has no genes with altered CN\n",
2789
      "TCGA-P5-A5EY-01A-11D-A27J-01 has no genes with altered CN\n",
2790
      "TCGA-CS-6669-01A-11D-1892-01 has no genes with altered CN\n"
2791
     ]
2792
    },
2793
    {
2794
     "name": "stdout",
2795
     "output_type": "stream",
2796
     "text": [
2797
      "LGG (36019, 530)\n"
2798
     ]
2799
    },
2800
    {
2801
     "name": "stderr",
2802
     "output_type": "stream",
2803
     "text": [
2804
      "LAML\n",
2805
      "TCGA-AB-2884-03A-01D-0756-21 has no genes with altered CN\n",
2806
      "TCGA-AB-2932-03A-01D-0756-21 has no genes with altered CN\n",
2807
      "TCGA-AB-2842-03A-01D-0756-21 has no genes with altered CN\n",
2808
      "TCGA-AB-2969-03A-01D-0756-21 has no genes with altered CN\n",
2809
      "TCGA-AB-2826-03A-01D-0756-21 has no genes with altered CN\n",
2810
      "TCGA-AB-2836-03A-01D-0756-21 has no genes with altered CN\n",
2811
      "TCGA-AB-2871-03A-01D-0756-21 has no genes with altered CN\n",
2812
      "TCGA-AB-2845-03A-01D-0756-21 has no genes with altered CN\n",
2813
      "TCGA-AB-2840-03A-01D-0756-21 has no genes with altered CN\n",
2814
      "TCGA-AB-2837-03A-01D-0756-21 has no genes with altered CN\n",
2815
      "TCGA-AB-2844-03A-01D-0756-21 has no genes with altered CN\n",
2816
      "TCGA-AB-2854-03A-01D-0756-21 has no genes with altered CN\n"
2817
     ]
2818
    },
2819
    {
2820
     "name": "stdout",
2821
     "output_type": "stream",
2822
     "text": [
2823
      "... 100 processed.\n"
2824
     ]
2825
    },
2826
    {
2827
     "name": "stderr",
2828
     "output_type": "stream",
2829
     "text": [
2830
      "TCGA-AB-3006-03A-01D-0756-21 has no genes with altered CN\n",
2831
      "TCGA-AB-2931-03A-01D-0756-21 has no genes with altered CN\n",
2832
      "TCGA-AB-2851-03A-01D-0756-21 has no genes with altered CN\n",
2833
      "TCGA-AB-2978-03A-01D-0756-21 has no genes with altered CN\n",
2834
      "TCGA-AB-2880-03A-01D-0756-21 has no genes with altered CN\n",
2835
      "TCGA-AB-2922-03A-01D-0756-21 has no genes with altered CN\n",
2836
      "TCGA-AB-2947-03A-01D-0756-21 has no genes with altered CN\n",
2837
      "TCGA-AB-2998-03A-01D-0756-21 has no genes with altered CN\n",
2838
      "TCGA-AB-2824-03A-01D-0756-21 has no genes with altered CN\n"
2839
     ]
2840
    },
2841
    {
2842
     "name": "stdout",
2843
     "output_type": "stream",
2844
     "text": [
2845
      "LAML (36019, 191)\n"
2846
     ]
2847
    },
2848
    {
2849
     "name": "stderr",
2850
     "output_type": "stream",
2851
     "text": [
2852
      "LIHC\n",
2853
      "TCGA-2V-A95S-10D-01D-A36Z-01 has no genes with altered CN\n",
2854
      "TCGA-UB-AA0V-01A-11D-A381-01 has no genes with altered CN\n"
2855
     ]
2856
    },
2857
    {
2858
     "name": "stdout",
2859
     "output_type": "stream",
2860
     "text": [
2861
      "... 100 processed.\n"
2862
     ]
2863
    },
2864
    {
2865
     "name": "stderr",
2866
     "output_type": "stream",
2867
     "text": [
2868
      "TCGA-G3-A25V-01A-11D-A16U-01 has no genes with altered CN\n",
2869
      "TCGA-DD-A3A6-01A-11D-A22E-01 has no genes with altered CN\n",
2870
      "TCGA-DD-A4NL-01A-11D-A28W-01 has no genes with altered CN\n"
2871
     ]
2872
    },
2873
    {
2874
     "name": "stdout",
2875
     "output_type": "stream",
2876
     "text": [
2877
      "... 200 processed.\n"
2878
     ]
2879
    },
2880
    {
2881
     "name": "stderr",
2882
     "output_type": "stream",
2883
     "text": [
2884
      "TCGA-ED-A5KG-01A-11D-A27H-01 has no genes with altered CN\n",
2885
      "TCGA-CC-A9FV-01A-11D-A36W-01 has no genes with altered CN\n"
2886
     ]
2887
    },
2888
    {
2889
     "name": "stdout",
2890
     "output_type": "stream",
2891
     "text": [
2892
      "... 300 processed.\n"
2893
     ]
2894
    },
2895
    {
2896
     "name": "stderr",
2897
     "output_type": "stream",
2898
     "text": [
2899
      "TCGA-MR-A520-01A-11D-A25U-01 has no genes with altered CN\n"
2900
     ]
2901
    },
2902
    {
2903
     "name": "stdout",
2904
     "output_type": "stream",
2905
     "text": [
2906
      "LIHC (36019, 373)\n"
2907
     ]
2908
    },
2909
    {
2910
     "name": "stderr",
2911
     "output_type": "stream",
2912
     "text": [
2913
      "PRAD\n",
2914
      "TCGA-J9-A52C-01A-11D-A26L-01 has no genes with altered CN\n",
2915
      "TCGA-V1-A8MJ-01A-11D-A363-01 has no genes with altered CN\n",
2916
      "TCGA-XJ-A9DQ-01A-11D-A376-01 has no genes with altered CN\n"
2917
     ]
2918
    },
2919
    {
2920
     "name": "stdout",
2921
     "output_type": "stream",
2922
     "text": [
2923
      "... 100 processed.\n"
2924
     ]
2925
    },
2926
    {
2927
     "name": "stderr",
2928
     "output_type": "stream",
2929
     "text": [
2930
      "TCGA-J4-A6G1-01A-11D-A30W-01 has no genes with altered CN\n"
2931
     ]
2932
    },
2933
    {
2934
     "name": "stdout",
2935
     "output_type": "stream",
2936
     "text": [
2937
      "... 200 processed.\n"
2938
     ]
2939
    },
2940
    {
2941
     "name": "stderr",
2942
     "output_type": "stream",
2943
     "text": [
2944
      "TCGA-J4-A67R-01A-21D-A30D-01 has no genes with altered CN\n",
2945
      "TCGA-EJ-A7NJ-01A-22D-A34T-01 has no genes with altered CN\n",
2946
      "TCGA-EJ-7791-01A-11D-2112-01 has no genes with altered CN\n"
2947
     ]
2948
    },
2949
    {
2950
     "name": "stdout",
2951
     "output_type": "stream",
2952
     "text": [
2953
      "... 300 processed.\n"
2954
     ]
2955
    },
2956
    {
2957
     "name": "stderr",
2958
     "output_type": "stream",
2959
     "text": [
2960
      "TCGA-EJ-A8FU-01A-11D-A363-01 has no genes with altered CN\n",
2961
      "TCGA-EJ-A6RC-01A-11D-A32A-01 has no genes with altered CN\n",
2962
      "TCGA-HC-7740-01A-11D-2112-01 has no genes with altered CN\n",
2963
      "TCGA-EJ-A65B-01A-12D-A30D-01 has no genes with altered CN\n",
2964
      "TCGA-HC-8260-01A-11D-2259-01 has no genes with altered CN\n",
2965
      "TCGA-FC-A8O0-01A-41D-A376-01 has no genes with altered CN\n"
2966
     ]
2967
    },
2968
    {
2969
     "name": "stdout",
2970
     "output_type": "stream",
2971
     "text": [
2972
      "... 400 processed.\n"
2973
     ]
2974
    },
2975
    {
2976
     "name": "stderr",
2977
     "output_type": "stream",
2978
     "text": [
2979
      "TCGA-VN-A88I-01A-11D-A34T-01 has no genes with altered CN\n",
2980
      "TCGA-EJ-A7NK-01A-12D-A34T-01 has no genes with altered CN\n",
2981
      "TCGA-CH-5743-01A-21D-1574-01 has no genes with altered CN\n",
2982
      "TCGA-G9-6367-01A-11D-1785-01 has no genes with altered CN\n",
2983
      "TCGA-KC-A4BO-01A-61D-A256-01 has no genes with altered CN\n"
2984
     ]
2985
    },
2986
    {
2987
     "name": "stdout",
2988
     "output_type": "stream",
2989
     "text": [
2990
      "PRAD (36019, 493)\n"
2991
     ]
2992
    },
2993
    {
2994
     "name": "stderr",
2995
     "output_type": "stream",
2996
     "text": [
2997
      "LUAD\n"
2998
     ]
2999
    },
3000
    {
3001
     "name": "stdout",
3002
     "output_type": "stream",
3003
     "text": [
3004
      "... 100 processed.\n",
3005
      "... 200 processed.\n"
3006
     ]
3007
    },
3008
    {
3009
     "name": "stderr",
3010
     "output_type": "stream",
3011
     "text": [
3012
      "TCGA-L4-A4E6-01A-11D-A24C-01 has no genes with altered CN\n"
3013
     ]
3014
    },
3015
    {
3016
     "name": "stdout",
3017
     "output_type": "stream",
3018
     "text": [
3019
      "... 300 processed.\n"
3020
     ]
3021
    },
3022
    {
3023
     "name": "stderr",
3024
     "output_type": "stream",
3025
     "text": [
3026
      "TCGA-44-3398-01A-01D-1877-01 has no genes with altered CN\n"
3027
     ]
3028
    },
3029
    {
3030
     "name": "stdout",
3031
     "output_type": "stream",
3032
     "text": [
3033
      "... 400 processed.\n"
3034
     ]
3035
    },
3036
    {
3037
     "name": "stderr",
3038
     "output_type": "stream",
3039
     "text": [
3040
      "TCGA-55-8619-01A-11D-2389-01 has no genes with altered CN\n",
3041
      "TCGA-86-A4P8-01A-11D-A24O-01 has no genes with altered CN\n"
3042
     ]
3043
    },
3044
    {
3045
     "name": "stdout",
3046
     "output_type": "stream",
3047
     "text": [
3048
      "... 500 processed.\n",
3049
      "LUAD (36019, 518)\n"
3050
     ]
3051
    },
3052
    {
3053
     "name": "stderr",
3054
     "output_type": "stream",
3055
     "text": [
3056
      "BRCA\n"
3057
     ]
3058
    },
3059
    {
3060
     "name": "stdout",
3061
     "output_type": "stream",
3062
     "text": [
3063
      "... 100 processed.\n"
3064
     ]
3065
    },
3066
    {
3067
     "name": "stderr",
3068
     "output_type": "stream",
3069
     "text": [
3070
      "TCGA-AO-A0JC-01A-11D-A059-01 has no genes with altered CN\n"
3071
     ]
3072
    },
3073
    {
3074
     "name": "stdout",
3075
     "output_type": "stream",
3076
     "text": [
3077
      "... 200 processed.\n"
3078
     ]
3079
    },
3080
    {
3081
     "name": "stderr",
3082
     "output_type": "stream",
3083
     "text": [
3084
      "TCGA-BH-A0H5-01A-21D-A111-01 has no genes with altered CN\n",
3085
      "TCGA-A2-A0CR-01A-11D-A227-01 has no genes with altered CN\n"
3086
     ]
3087
    },
3088
    {
3089
     "name": "stdout",
3090
     "output_type": "stream",
3091
     "text": [
3092
      "... 300 processed.\n"
3093
     ]
3094
    },
3095
    {
3096
     "name": "stderr",
3097
     "output_type": "stream",
3098
     "text": [
3099
      "TCGA-BH-A1FE-06A-11D-A20R-01 has no genes with altered CN\n",
3100
      "TCGA-AN-A0FN-01A-11D-A036-01 has no genes with altered CN\n"
3101
     ]
3102
    },
3103
    {
3104
     "name": "stdout",
3105
     "output_type": "stream",
3106
     "text": [
3107
      "... 400 processed.\n",
3108
      "... 500 processed.\n"
3109
     ]
3110
    },
3111
    {
3112
     "name": "stderr",
3113
     "output_type": "stream",
3114
     "text": [
3115
      "TCGA-PL-A8LY-01A-11D-A41E-01 has no genes with altered CN\n"
3116
     ]
3117
    },
3118
    {
3119
     "name": "stdout",
3120
     "output_type": "stream",
3121
     "text": [
3122
      "... 600 processed.\n",
3123
      "... 700 processed.\n"
3124
     ]
3125
    },
3126
    {
3127
     "name": "stderr",
3128
     "output_type": "stream",
3129
     "text": [
3130
      "TCGA-GM-A3XG-01A-31D-A242-01 has no genes with altered CN\n",
3131
      "TCGA-LD-A74U-01A-13D-A33D-01 has no genes with altered CN\n"
3132
     ]
3133
    },
3134
    {
3135
     "name": "stdout",
3136
     "output_type": "stream",
3137
     "text": [
3138
      "... 800 processed.\n"
3139
     ]
3140
    },
3141
    {
3142
     "name": "stderr",
3143
     "output_type": "stream",
3144
     "text": [
3145
      "TCGA-GM-A2DO-10D-01D-A18N-01 has no genes with altered CN\n",
3146
      "TCGA-A2-A0EP-01A-52D-A22W-01 has no genes with altered CN\n"
3147
     ]
3148
    },
3149
    {
3150
     "name": "stdout",
3151
     "output_type": "stream",
3152
     "text": [
3153
      "... 900 processed.\n",
3154
      "... 1000 processed.\n"
3155
     ]
3156
    },
3157
    {
3158
     "name": "stderr",
3159
     "output_type": "stream",
3160
     "text": [
3161
      "TCGA-AO-A1KO-01A-31D-A13J-01 has no genes with altered CN\n"
3162
     ]
3163
    },
3164
    {
3165
     "name": "stdout",
3166
     "output_type": "stream",
3167
     "text": [
3168
      "BRCA (36019, 1088)\n"
3169
     ]
3170
    },
3171
    {
3172
     "name": "stderr",
3173
     "output_type": "stream",
3174
     "text": [
3175
      "KIRC\n",
3176
      "TCGA-B4-5378-01A-01D-1499-01 has no genes with altered CN\n",
3177
      "TCGA-B0-5400-01A-01D-1499-01 has no genes with altered CN\n",
3178
      "TCGA-CJ-4890-01A-01D-1302-01 has no genes with altered CN\n"
3179
     ]
3180
    },
3181
    {
3182
     "name": "stdout",
3183
     "output_type": "stream",
3184
     "text": [
3185
      "... 100 processed.\n"
3186
     ]
3187
    },
3188
    {
3189
     "name": "stderr",
3190
     "output_type": "stream",
3191
     "text": [
3192
      "TCGA-A3-A8OX-01A-11D-A36W-01 has no genes with altered CN\n",
3193
      "TCGA-B0-4817-01A-01D-1274-01 has no genes with altered CN\n",
3194
      "TCGA-B0-5080-01A-01D-1499-01 has no genes with altered CN\n"
3195
     ]
3196
    },
3197
    {
3198
     "name": "stdout",
3199
     "output_type": "stream",
3200
     "text": [
3201
      "... 200 processed.\n"
3202
     ]
3203
    },
3204
    {
3205
     "name": "stderr",
3206
     "output_type": "stream",
3207
     "text": [
3208
      "TCGA-DV-A4VZ-01A-11D-A25U-01 has no genes with altered CN\n"
3209
     ]
3210
    },
3211
    {
3212
     "name": "stdout",
3213
     "output_type": "stream",
3214
     "text": [
3215
      "... 300 processed.\n"
3216
     ]
3217
    },
3218
    {
3219
     "name": "stderr",
3220
     "output_type": "stream",
3221
     "text": [
3222
      "TCGA-CJ-4891-01A-01D-1302-01 has no genes with altered CN\n",
3223
      "TCGA-CJ-4889-01A-01D-1302-01 has no genes with altered CN\n",
3224
      "TCGA-BP-4769-01A-01D-1283-01 has no genes with altered CN\n"
3225
     ]
3226
    },
3227
    {
3228
     "name": "stdout",
3229
     "output_type": "stream",
3230
     "text": [
3231
      "... 400 processed.\n"
3232
     ]
3233
    },
3234
    {
3235
     "name": "stderr",
3236
     "output_type": "stream",
3237
     "text": [
3238
      "TCGA-BP-4760-01A-02D-1417-01 has no genes with altered CN\n"
3239
     ]
3240
    },
3241
    {
3242
     "name": "stdout",
3243
     "output_type": "stream",
3244
     "text": [
3245
      "... 500 processed.\n",
3246
      "KIRC (36019, 529)\n"
3247
     ]
3248
    },
3249
    {
3250
     "name": "stderr",
3251
     "output_type": "stream",
3252
     "text": [
3253
      "KICH\n"
3254
     ]
3255
    },
3256
    {
3257
     "name": "stdout",
3258
     "output_type": "stream",
3259
     "text": [
3260
      "KICH (36019, 66)\n"
3261
     ]
3262
    }
3263
   ],
3264
   "source": [
3265
    "for cohort in dfs.keys():\n",
3266
    "    print(cohort, file=sys.stderr)\n",
3267
    "    df = dfs[cohort]\n",
3268
    "    cna_table = []\n",
3269
    "    n_samples = 0\n",
3270
    "    for sample in list(set(df.Sample.values)):\n",
3271
    "        n_samples +=1\n",
3272
    "        cnv2gene = cnv2genelevel(cnv2bed(df[df.Sample == sample]),gene_intervals_bed,sample,\n",
3273
    "                                 verbose = False,sorted_index = sorted_index)\n",
3274
    "        cna_table.append(cnv2gene)\n",
3275
    "        if n_samples % 100 == 0:\n",
3276
    "            print(\"...\",n_samples, \"processed.\")\n",
3277
    "    cna_table = pd.concat(cna_table,axis =1)\n",
3278
    "    \n",
3279
    "\n",
3280
    "    for sample in tumors_without_CNA[cohort]:\n",
3281
    "        cna_table[sample] = 0\n",
3282
    "    \n",
3283
    "    cna_table.fillna(0, inplace = True)\n",
3284
    "    cna_table.to_csv(preprocessed_dir+\"/TCGA-\"+cohort+\".Segment_Mean.CNA.tsv\",\n",
3285
    "                     sep = \"\\t\",header=True,index=True)\n",
3286
    "    print(cohort,cna_table.shape)\n"
3287
   ]
3288
  },
3289
  {
3290
   "cell_type": "code",
3291
   "execution_count": 14,
3292
   "metadata": {},
3293
   "outputs": [
3294
    {
3295
     "data": {
3296
      "text/plain": [
3297
       "'t = time.time()\\ncnv2gene = cnv2genelevel(cnv2bed(df[df.Sample == sample]),gene_intervals_bed,sample,\\n                         verbose = False,sorted_index = sorted_index)\\nprint( time.time() - t)\\ncnv2gene'"
3298
      ]
3299
     },
3300
     "execution_count": 14,
3301
     "metadata": {},
3302
     "output_type": "execute_result"
3303
    }
3304
   ],
3305
   "source": [
3306
    "\"\"\"t = time.time()\n",
3307
    "cnv2gene = cnv2genelevel(cnv2bed(df[df.Sample == sample]),gene_intervals_bed,sample,\n",
3308
    "                         verbose = False,sorted_index = sorted_index)\n",
3309
    "print( time.time() - t)\n",
3310
    "cnv2gene\"\"\""
3311
   ]
3312
  },
3313
  {
3314
   "cell_type": "markdown",
3315
   "metadata": {},
3316
   "source": [
3317
    "# CCLE \n",
3318
    "\n",
3319
    "The same pipeline as for TCGA, except that germline CNAs are not filtered out (because no matched normal samples are available for cell lines).\n",
3320
    "\n",
3321
    "wget https://data.broadinstitute.org/ccle_legacy_data/dna_copy_number/CCLE_copynumber_2013-12-03.seg.txt\n",
3322
    "\n",
3323
    "TODO: should we use a stricter segment_mean threshold, since these data come from cell lines and purity must be 100%?"
3324
   ]
3325
  },
3326
  {
3327
   "cell_type": "code",
3328
   "execution_count": 15,
3329
   "metadata": {
3330
    "scrolled": true
3331
   },
3332
   "outputs": [
3333
    {
3334
     "name": "stdout",
3335
     "output_type": "stream",
3336
     "text": [
3337
      "47 duplicated IDs in 94 rows found.\n",
3338
      "duplicate rows removed due to low correlation of duplicated profiles 0\n",
3339
      "Merged  94 duplicated rows into 47 rows\n",
3340
      "CCLE: genes: 35972 samples 1043\n"
3341
     ]
3342
    }
3343
   ],
3344
   "source": [
3345
    "df = pd.read_csv(\"../../CCLE/CCLE_copynumber_2013-12-03.seg.txt\",sep = \"\\t\")\n",
3346
    "df.rename({\"CCLE_name\":\"Sample\"},inplace=True, axis=\"columns\")\n",
3347
    "df[\"End\"] = df[\"End\"].apply(int)\n",
3348
    "ccle = []\n",
3349
    "for sample_name in list(set(df[\"Sample\"].values)):\n",
3350
    "    cl = df.loc[df[\"Sample\"]==sample_name, :]\n",
3351
    "    # keep high-conf segments \n",
3352
    "    cl_filtered = filter_lowconf_segments(cl,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n",
3353
    "    #print(sample_name, cl.shape[0], \"after filtration\",cl_filtered.shape[0])\n",
3354
    "    # map to genes \n",
3355
    "    cnv2gene = cnv2genelevel(cnv2bed(cl_filtered),gene_intervals_bed,sample_name,\n",
3356
    "                                 verbose = False,sorted_index = sorted_index)\n",
3357
    "    ccle.append(cnv2gene)\n",
3358
    "    \n",
3359
    "ccle = pd.concat(ccle,axis =1)\n",
3360
    "ccle.fillna(0, inplace = True)\n",
3361
    "ccle = handle_dups(ccle)\n",
3362
    "ccle.to_csv(preprocessed_dir+\"/\"+\"CCLE\"+\".Segment_Mean.CNA.tsv\",\n",
3363
    "                 sep = \"\\t\",header=True,index=True)\n",
3364
    "print(\"CCLE:\",\"genes:\",ccle.shape[0],\"samples\",ccle.shape[1])"
3365
   ]
3366
  },
3367
  {
3368
   "cell_type": "markdown",
3369
   "metadata": {},
3370
   "source": [
3371
    "# GDSC\n",
3372
    "Assume that the supplementary file with gene-level CN has been downloaded:\n",
3373
    "\n",
3374
    "wget \n",
3375
    "\n",
3376
    "GDSC provides gene-level integer estimated CN, max. and min. CN over all segments covering a gene. In order to make it comparable with TCGA and CCLE, we divide the estimated CN by the CN of the copy-neutral state and log2-transform it. \n",
3377
    "\n",
3378
    "1) Copy-neutral state was defined from average ploidy, as median of integer CN values in non-disrupted genes.\n",
3379
    "\n",
3380
    "2) Compute log2(CN/neutral-CN) for min and max CN; keep the value with most extreme estimate\n",
3381
    "\n",
3382
    "3) Replace estimates below thresholds with zeroes. \n",
3383
    "\n",
3384
    "\n",
3385
    "GDSC uses 4 comma-separated values for gene-level CN (max_cn,min_cn,zygosity,disruption): e.g. (from \"legend\" tab)\n",
3386
    "\n",
3387
    "2,2,H,-\tGene resides on a single genomic segment in a diploid region of the genome.\n",
3388
    "2,0,L,D\tGene spans multiple segments, highest copy number is 2 but part of the coding sequence is homozygously deleted, the gene is disrupted.\n",
3389
    "13,13,H,-\tGene resides on a single genomic segment of copy number 13 in a heterozygous part of the genome (amplification).\n",
3390
    "14,12,L,D\tGene spans multiple genomic segments all of which are amplified to 12 or more copies, some or all segments have LOH, the gene is disrupted.\n",
3391
    "0,0,0,-\tComplete gene sequence falls within a homozygous deletion.\n",
3392
    "-1,-1,-,- gene level CN not assigned\n",
3393
    "\n",
3394
    "* min and max CN are integers \n",
3395
    "* zygosity - can be L (LOH in any overlapping segment) or H (heterozygous) or 0 (homozygous deletion of the whole gene) or - (undefined)\n",
3396
    "* disruption - D (if disrupted) or \"-\" (not disrupted) \n",
3397
    "\n",
3398
    "Average ploidies of cell lines were downloaded from COSMIC:\n",
3399
    "\n",
3400
    "wget https://cog.sanger.ac.uk/cosmic/GRCh37/cell_lines/v86/PICNIC_average_ploidies.tsv?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1540792525&Signature=mcSB6oFv%2BXCF4%2Fezm4a3Ds1JXo4%3D\n",
3401
    "\n",
3402
    "wget ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-7.0/Gene_level_CN.xlsx\n"
3403
   ]
3404
  },
3405
  {
3406
   "cell_type": "code",
3407
   "execution_count": 16,
3408
   "metadata": {},
3409
   "outputs": [
3410
    {
3411
     "data": {
3412
      "text/html": [
3413
       "<div>\n",
3414
       "<style scoped>\n",
3415
       "    .dataframe tbody tr th:only-of-type {\n",
3416
       "        vertical-align: middle;\n",
3417
       "    }\n",
3418
       "\n",
3419
       "    .dataframe tbody tr th {\n",
3420
       "        vertical-align: top;\n",
3421
       "    }\n",
3422
       "\n",
3423
       "    .dataframe thead th {\n",
3424
       "        text-align: right;\n",
3425
       "    }\n",
3426
       "</style>\n",
3427
       "<table border=\"1\" class=\"dataframe\">\n",
3428
       "  <thead>\n",
3429
       "    <tr style=\"text-align: right;\">\n",
3430
       "      <th></th>\n",
3431
       "      <th>gene</th>\n",
3432
       "      <th>chr</th>\n",
3433
       "      <th>start</th>\n",
3434
       "      <th>stop</th>\n",
3435
       "      <th>201T</th>\n",
3436
       "      <th>22RV1</th>\n",
3437
       "      <th>23132-87</th>\n",
3438
       "      <th>42-MG-BA</th>\n",
3439
       "      <th>451Lu</th>\n",
3440
       "      <th>5637</th>\n",
3441
       "      <th>...</th>\n",
3442
       "      <th>WSU-NHL</th>\n",
3443
       "      <th>YAPC</th>\n",
3444
       "      <th>YH-13</th>\n",
3445
       "      <th>YKG-1</th>\n",
3446
       "      <th>YMB-1-E</th>\n",
3447
       "      <th>YT</th>\n",
3448
       "      <th>ZR-75-30</th>\n",
3449
       "      <th>huH-1</th>\n",
3450
       "      <th>no-10</th>\n",
3451
       "      <th>no-11</th>\n",
3452
       "    </tr>\n",
3453
       "  </thead>\n",
3454
       "  <tbody>\n",
3455
       "    <tr>\n",
3456
       "      <th>0</th>\n",
3457
       "      <td>NaN</td>\n",
3458
       "      <td>NaN</td>\n",
3459
       "      <td>NaN</td>\n",
3460
       "      <td>NaN</td>\n",
3461
       "      <td>1287381</td>\n",
3462
       "      <td>924100</td>\n",
3463
       "      <td>910924</td>\n",
3464
       "      <td>687561</td>\n",
3465
       "      <td>1287706</td>\n",
3466
       "      <td>687452</td>\n",
3467
       "      <td>...</td>\n",
3468
       "      <td>909785</td>\n",
3469
       "      <td>909904</td>\n",
3470
       "      <td>909905</td>\n",
3471
       "      <td>687592</td>\n",
3472
       "      <td>1303911</td>\n",
3473
       "      <td>946358</td>\n",
3474
       "      <td>909907</td>\n",
3475
       "      <td>1298146</td>\n",
3476
       "      <td>908452</td>\n",
3477
       "      <td>908450</td>\n",
3478
       "    </tr>\n",
3479
       "    <tr>\n",
3480
       "      <th>1</th>\n",
3481
       "      <td>DDX11L1</td>\n",
3482
       "      <td>1</td>\n",
3483
       "      <td>11869.0</td>\n",
3484
       "      <td>14412.0</td>\n",
3485
       "      <td>-1,-1,-,-</td>\n",
3486
       "      <td>-1,-1,-,-</td>\n",
3487
       "      <td>-1,-1,-,-</td>\n",
3488
       "      <td>-1,-1,-,-</td>\n",
3489
       "      <td>-1,-1,-,-</td>\n",
3490
       "      <td>-1,-1,-,-</td>\n",
3491
       "      <td>...</td>\n",
3492
       "      <td>-1,-1,-,-</td>\n",
3493
       "      <td>-1,-1,-,-</td>\n",
3494
       "      <td>-1,-1,-,-</td>\n",
3495
       "      <td>-1,-1,-,-</td>\n",
3496
       "      <td>-1,-1,-,-</td>\n",
3497
       "      <td>-1,-1,-,-</td>\n",
3498
       "      <td>-1,-1,-,-</td>\n",
3499
       "      <td>-1,-1,-,-</td>\n",
3500
       "      <td>-1,-1,-,-</td>\n",
3501
       "      <td>-1,-1,-,-</td>\n",
3502
       "    </tr>\n",
3503
       "    <tr>\n",
3504
       "      <th>2</th>\n",
3505
       "      <td>WASH7P</td>\n",
3506
       "      <td>1</td>\n",
3507
       "      <td>14363.0</td>\n",
3508
       "      <td>29806.0</td>\n",
3509
       "      <td>-1,-1,-,-</td>\n",
3510
       "      <td>-1,-1,-,-</td>\n",
3511
       "      <td>-1,-1,-,-</td>\n",
3512
       "      <td>-1,-1,-,-</td>\n",
3513
       "      <td>-1,-1,-,-</td>\n",
3514
       "      <td>-1,-1,-,-</td>\n",
3515
       "      <td>...</td>\n",
3516
       "      <td>-1,-1,-,-</td>\n",
3517
       "      <td>-1,-1,-,-</td>\n",
3518
       "      <td>-1,-1,-,-</td>\n",
3519
       "      <td>-1,-1,-,-</td>\n",
3520
       "      <td>-1,-1,-,-</td>\n",
3521
       "      <td>-1,-1,-,-</td>\n",
3522
       "      <td>-1,-1,-,-</td>\n",
3523
       "      <td>-1,-1,-,-</td>\n",
3524
       "      <td>-1,-1,-,-</td>\n",
3525
       "      <td>-1,-1,-,-</td>\n",
3526
       "    </tr>\n",
3527
       "  </tbody>\n",
3528
       "</table>\n",
3529
       "<p>3 rows × 1000 columns</p>\n",
3530
       "</div>"
3531
      ],
3532
      "text/plain": [
3533
       "      gene  chr    start     stop       201T      22RV1   23132-87   42-MG-BA  \\\n",
3534
       "0      NaN  NaN      NaN      NaN    1287381     924100     910924     687561   \n",
3535
       "1  DDX11L1    1  11869.0  14412.0  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-   \n",
3536
       "2   WASH7P    1  14363.0  29806.0  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-   \n",
3537
       "\n",
3538
       "       451Lu       5637    ...        WSU-NHL       YAPC      YH-13  \\\n",
3539
       "0    1287706     687452    ...         909785     909904     909905   \n",
3540
       "1  -1,-1,-,-  -1,-1,-,-    ...      -1,-1,-,-  -1,-1,-,-  -1,-1,-,-   \n",
3541
       "2  -1,-1,-,-  -1,-1,-,-    ...      -1,-1,-,-  -1,-1,-,-  -1,-1,-,-   \n",
3542
       "\n",
3543
       "       YKG-1    YMB-1-E         YT   ZR-75-30      huH-1      no-10      no-11  \n",
3544
       "0     687592    1303911     946358     909907    1298146     908452     908450  \n",
3545
       "1  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  \n",
3546
       "2  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  -1,-1,-,-  \n",
3547
       "\n",
3548
       "[3 rows x 1000 columns]"
3549
      ]
3550
     },
3551
     "execution_count": 16,
3552
     "metadata": {},
3553
     "output_type": "execute_result"
3554
    }
3555
   ],
3556
   "source": [
3557
    "GDSC_CNA = \"/home/olya/SFU/Hossein/GDSC/Gene_level_CN.xlsx\"\n",
3558
    "\n",
3559
    "gdsc = pd.read_excel(GDSC_CNA,\"Gene_level_CN\")\n",
3560
    "gdsc.head(3)"
3561
   ]
3562
  },
3563
  {
3564
   "cell_type": "code",
3565
   "execution_count": 17,
3566
   "metadata": {},
3567
   "outputs": [
3568
    {
3569
     "name": "stdout",
3570
     "output_type": "stream",
3571
     "text": [
3572
      "25 gene IDs excluded due to string to datetime conversion in Excel.\n",
3573
      "Strings containing duplicated gene IDs: 0\n"
3574
     ]
3575
    }
3576
   ],
3577
   "source": [
3578
    "gdsc.set_index(\"gene\",inplace = True)\n",
3579
    "gdsc.drop([\"chr\",\"start\",\"stop\"],inplace=True,axis=1)\n",
3580
    "gdsc.columns = gdsc.iloc[0,:]\n",
3581
    "gdsc = gdsc.iloc[1:,:]\n",
3582
    "gdsc.columns.name = None\n",
3583
    "# replace 2001-12-01 with DEC1 and get remove gene names converted to datetimes\n",
3584
    "gdsc.index.values[37778] = \"DEC1\"\n",
3585
    "df_size = gdsc.shape[0]\n",
3586
    "ndxs=pd.Series(gdsc.index).apply(lambda x : type(x) == unicode or type(x) == str)\n",
3587
    "gdsc = gdsc.loc[gdsc.index.values[ndxs[ndxs].index],:]\n",
3588
    "print(df_size - gdsc.shape[0],\"gene IDs excluded due to string to datetime conversion in Excel.\")\n",
3589
    "\n",
3590
    "gdsc.index.name = \"gene_id\"\n",
3591
    "ids = gdsc.index\n",
3592
    "ids = list(set(ids[ids.duplicated()]))\n",
3593
    "print(\"Strings containing duplicated gene IDs:\",gdsc.loc[ids,:].shape[0])"
3594
   ]
3595
  },
3596
  {
3597
   "cell_type": "markdown",
3598
   "metadata": {},
3599
   "source": [
3600
    "### distribution of averaged ploidies in GDSC\n",
3601
    "\n",
3602
    "We compared the average ploidies reported in PICNIC_average_ploidies.tsv (provided by COSMIC) with the average and median ploidies estimated from the gene-level copy-number profiles."
3603
   ]
3604
  },
3605
  {
3606
   "cell_type": "code",
3607
   "execution_count": 18,
3608
   "metadata": {},
3609
   "outputs": [
3610
    {
3611
     "name": "stdout",
3612
     "output_type": "stream",
3613
     "text": [
3614
      "1020\n",
3615
      "1016\n"
3616
     ]
3617
    }
3618
   ],
3619
   "source": [
3620
    "GDSC_Ploidies = \"/home/olya/SFU/Hossein/GDSC/PICNIC_average_ploidies.tsv\"\n",
3621
    "GDSC_Ploidies = pd.read_csv(GDSC_Ploidies,sep = \"\\t\")\n",
3622
    "GDSC_Ploidies.drop(\"#sample_name\",axis = 1, inplace= True)\n",
3623
    "GDSC_Ploidies.set_index(\"sample_id\",inplace=True)\n",
3624
    "print(GDSC_Ploidies.shape[0])\n",
3625
    "GDSC_Ploidies.dropna(inplace=True)\n",
3626
    "print(GDSC_Ploidies.shape[0])\n",
3627
    "\n",
3628
    "est_ploidies = gdsc.apply(define_avg_ploidy).T\n",
3629
    "df_ploidies = pd.DataFrame.from_dict({\"est. avg. ploidy from CN profile\":est_ploidies[\"avg_pl\"],\"PICNIC avg. pl.\":GDSC_Ploidies[\"average_ploidy\"],\n",
3630
    "                                     \"est. median. ploidy\":est_ploidies[\"median_pl\"]})\n"
3631
   ]
3632
  },
3633
  {
3634
   "cell_type": "code",
3635
   "execution_count": 19,
3636
   "metadata": {},
3637
   "outputs": [
3638
    {
3639
     "data": {
3640
      "image/png": "iVBORw0KGgoAAAANSUhEUgAABIgAAAE/CAYAAAAt2/ipAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xu8pWVd///XO/CUoihMiBwcD2ip1agTah4i0UJA0VKEDNGo0cKy7Psr1L6imd8fVkaWqV8UBEoQBElU6it5zBJtUEIO+hVw/DE4MiPIQVEL+Pz+uK8Na/Zh9prZex1m7tfz8diPfa/rvtZan33PrOu61+e+rutOVSFJkiRJkqT++rFJByBJkiRJkqTJMkEkSZIkSZLUcyaIJEmSJEmSes4EkSRJkiRJUs+ZIJIkSZIkSeo5E0SSJEmSJEk9Z4JIUy3JAUnWD1n33Un+5xb2V5JHbkMM90nykSQ3J/ng1j6/D9J5X5LvJvlikqcn+drA/nVJnjXJGCVpR5fk00l+c9JxSFJfJHlZks8NPP5ekodPII6hvuckeUmSj29hv/1Iz5kg2kElWdkaip0nHcu4VNUrq+rNI3jpFwJ7ALtV1YtG8PqLSvKoJB9M8p2WqLo0yWuS7DTwb33BrOf8Q5I3jinEpwHPBvauqv2r6l+r6tFjem9JO4CWSP5BO7m+PsmpSe7X9m12wprk/kn+Osn/1+pf3R7vPvBaG5Pcd+A5v5nk0wOPNzuZ3lI7O5YDIEla1LR/x6mq+1XVNZOOYyFV9f6q+qVJx6HpZYJIWtxDgf9bVbfPt3PUHVSSRwBfAK4FfrqqHgC8CFgN7DJQ9UlJfn4E758ki7UVDwXWVdX3l/v9JfXKc6vqfsAT6Nq4P5ldIck9gU8AjwUOAu4PPAW4Adh/oOpOwKuHedOtaGclSZJ2WCaItgNJHpLk3CSbknwjye8N7Ns/ydokt7Qrrn/Vdn22/b6pXV19yhDv8/IkVya5Nck1SV4xsO/KJIcOPN65xfOE9vilSb6Z5IYk/3NrphS1uq9NckWbovS+JPdeoO5PtSvJNyW5PMnzBvadmuTPBh7/P0k2JPlWkt8YKP+5dqx2Gij7lST/Oc/7vQl4A/DidhyPaUNJ/y3JiUluAN6Y5MeS/Ek7BhuTnJ7kAe01Zq50vDzJte1vfGWL49L2t7xjC4foTcC/V9VrqmoDQFV9rap+rapuGqj358BbFjncM3/XzN/wjnal/KtJDhzY/+kkb0nyb8BtwMPb/8Pzk9yY5Kokv9XqHgO8F3hKO0ZvyhamBrZjdVy6K/43JDk7yYOGiVtSP1TVdcA/AY+bZ/dLgX2BF1TVFVV1Z1VtrKo3V9XgSMq/AP5Hkl2HeMth21kAkjwwyUdbP/jdtr132/fiJGtn1f+DJOe37d3STVu+Jcl/JPmzDExP2JLF2m5J2p6M8TvOp1tb++/tOR9pbfH7B9rilQP1fzLJhe2c92tJDh/Yt1s7H74lyReBR8x6r7tGpyY5JMmXW91rMzCyf+D7wdHpRsN+J8nrt/A3nJpuOY0L031X+0yShy5Q9wHtu8im9t3kT9Iu9mbulLhnt77k5vZ9JK38nu3v/+mBuj+R5LYkKxY75tp+mSCacu3D/BHgP4G9gAOB30/yy63K24G3V9X96Rqos1v5M9rvXdtQx88P8XYbgUPprsa+HDgxLQEEnAkcOVD3l4HvVNWXkjwGeCfwEmBP4AEt1q3xkvaajwAexfxXje9Bdyw+DvwE8LvA+5PMmcqU5CDgf9BNe9oPuCtZVVX/QXeleXB45VHA6bNfp6qOB/4XcFY7jie3XU8CrqGbevYW4GXt5xeBhwP3A2YnfZ7UYnkx8NfA61tcjwUOT/IL8xwXWp1zFtg36J3AozL8Wj9PAq4GdgeOBz40K1FzFLCG7ur5N4EPAOuBh9BNu/tfSZ7Zjskrgc+3Y3T8Iu/7u8Dz
gV9or/Vd4O+GjFlSDyTZBzgY+PI8u58F/HNVfW+Rl1kLfJquL1jMsO3sjB8D3kc3enJf4Afc3eZ/BHh0kv0G6v8acEbb/jvg+8CDgaPbz9ZYrO2WpKk35u84AEfQndvu1V7v83Tt+IOAK+naU9JNTb6Qrs3+ifa8d7bvO9C14T+k+87zG+1nId+nu6ixK3AI8NtJnj+rztOAR7e//w1JfmoLr/cS4M107f8lwPsXqPe3dN/HHk53vv1Suu92m0k3LftDdN+7dqfrW54KUFX/RXfu/+sDTzkS+ERVbdpCjNrOmSCafj8HrKiqP62q/2pzWt9D11gB/DfwyCS7V9X3quqibX2jqvpYVV1dnc/QJWKe3nafATwvyY+3x79GlzSCLlnwkar6XGtM3gDUVr79O6rq2qq6kS7hcuQ8dZ5Ml3g5oR2LTwIfXaDu4cD7quqyNu3pjbP2n0Zr8NqJ9S9z98n7ML5VVX9bVbdX1Q/oGuy/qqpr2peW1wJHZPPpZ2+uqh9W1cfpOowz21Xv64B/BR6/wHvtBmwYIqYf0B27P1usYrMR+Ouq+u+qOgv4Gl3nNePUqrq8Ta17MF2H8cftb7iEbtTQS4d8r0GvBF5fVeur6kd0/zYvzJTOJZc0Vv+Y5Cbgc8Bn6BL0sw3bJkLXH/3uEFc7t+Y1qaobqurcqrqtqm6la3t/oe27DfgwrW9qiaKfBM5PN3L1V4Hj23OvoOuPtsZibbckbQ/G9h2neV/7nnMz3QjVq6vqX9p57ge5+zz8ULplE97XzvO/DJwLvGigDX9DVX2/qi5jC214VX26qr7SRrpeSvfdafYF4TdV1Q+q6j/pkmU/u4W/4WNV9dl2/vx6utH7+wxWaDEeAby2qm6tqnXA2+iSY7MdDFxeVedU1X/TXcD+9sD+04Ajk6Q9Pgr4+y3Epx2ACaLp91DgIemmId3UTpxfRzdyBeAYuhE3X23DIw9d6IUWk+Q5SS5qwwlvoms0dgeoqqvosuvPbUmi53F3QuUhdOs20OreRjdCZ2tcO7D9zfaasz0EuLaq7pxVd77RSpvF1OoN+ge6v+W+dMmkf52ZVrAN8c683+B7fBPYmbv/nQCuH9j+wTyP77fAe91Ad5ViGO8F9kjy3CHqXldVg4m82cd98G98CHBj+yI0WH9rR4pB93/6vIH/z1cCd7D5sZLUT8+vql2r6qFV9TstAT/b0G1iO3n/KHDcIlW3pp0lyY8n+d9t6P4tdFMeds3dU5fP4O6LF78G/GPrG1fQ9Q2D7evs/mQxi7XdkrQ9GNt3nGbY8/CH0q3rORjXS+guls7Xhs/+jnGXJE9K8qk21etmuouku8+qNpiQuY2Fvw/A5t+3vgfcyNz2f3fgHsz9XrLo96XWtww+/kKL6YAkPwk8Ejh/C/FpB2CCaPpdC3yjnTDP/OxSVQcDVNXXq+pIuiGQbwXOaUmPrRrBk+RedNnxvwT2qKpdgQto81CbmWlmhwFXtKQRdFdd9x54rfvQXY3dGoPZ732Bb81T51vAPtl8weR9gevmqbthnte8Sxu183ngV9i2bPjs4/stug5l8P1uZ/POZ1v9C93VisWD6kZwvYlu+GkWqb7XwBUBmHvcB//GbwEPSrLLrPrzHfvFXAs8Z9b/6Xu3fxNJWsy/AL+cgTuULeJ44LfYckJ76Ha2+UO6KQFPatMfZqY8zLSpFwIrkqyi6zdnLqhsousb9h54rc2u/g5hsbZbkrYHY/mOs41xfWZWXPerqt/m7jZ8we8Ys5xBl1DZp7qbH7ybxc/Pt+Su9013l88HMbf9/w7d6KvZ30sW/b7U+pbZfdLMrIujgHOq6ofbGry2DyaIpt8XgVuT/HGS+6S7rfnjkvwcQJJfT7KijaqZWUjzTroG7E66uafDuCdwr/a825M8h83X6IFuHuovAb/N5tOxzqEbjfPz6e4u80a2vvE7NsnebbrX64Gz5qkzk8X+oyT3SHIA8NwW12xnAy9L8pg24mm+dXFO
B/4I+Gm6+bdLcSbwB0ke1hrsmXWL5r3z2VY6Hvj5JH+R5MEASR6Z7jb28y2++vfAvenu7rMlPwH8XjuWLwJ+ii4pOEdVXQv8O/D/Jrl3kp+hu7LzD9vw97wbeEvawnpJViQ5bBteR1I//T3dCfy56RYS/bF0i4a+LsnBsyu3ixlnAb83e9+ArW1nd6G74nxT67c262PaUP0P0i2U/SC6hBFVdQddf/PGNgrpJ9n6qbpDt92SNMXG9R1na32Ubk3Po1o7e490N5b5qXna8Mew5XXkdqEbgf/DJPvTjShdioOTPK1933ozcFE7R79Li/FsunPtXdr59muY/5z9Y8Bj092sZ2e6fvLBs+r8A/ACuiTRnPVateMxQTTl2of8UGAV8A26rPB76RYegy4JcHmS79Et5nZEm8d6G92aCP/Whkc+OcnTW7353udWukbhbLpFg3+NWUMI2xSszwM/z0ACp6oup1t4+AN0mejv0a2R8COAJC9Jcvkif+oZdGseXUO3QNqcdXTa6JjnAs9px+GdwEur6qvz1P0nunm0nwSuar9nO4823akdr6U4he5Ly2fp/p1+SHdMlqyqrqa7hfNKun/rm+lGe60Fbp2n/h10624stmjpF+gWzf4O3f+VF1bVlqYGHtli+BbdsTu+qv5la/6W5u10/7c+nuRW4CK6RVclaVFt7YVnAV+lS7zcQvdFY3e6dm0+fwosOOJoa9tZuv7lPnTt50XAP89T54wW5wdnXSx4FV0f/m26fuNMWn8JkO4OnS9ZKFaGbLu31OdL0qSN6zvONsR1K90F8SPoznm/TTeC6V6tyqvopoF9GziVbqHrhfwO8KftfPcN3L3Q9rY6g+6CxI3AE9l8AelBv0u33uk1dGv6nUH3XWUzVfUd4EXACXRTrfcD/m1WnWuBL9GN3PrXJcav7UA2n8YuLV0bQXMTsF9VfWOI+uuA39zGZMOSJLkaeMUk3nuSkryM7pg/bdKxSFKfJXkr8OCqWvRuZrbdktRPSU4F1lfVnDs9j+G9T6G7Qc/Y31vj5wgiLYskz21DLe9Lt47RV4B1k41qy5L8Kl02fL7RRZIkLbs2Le5n0tmfbrrueZOOS5Kk2ZKspFuz9eTJRqJxMUGk5XIY3TDMb9ENTzyipnh4WpJPA+8Cjp11VzRJkkZpF7o1LL5PN137bcCHJxqRJEmzJHkzcBnwF8PMCtGOwSlmkiRJkiRJPecIIkmSJEmSpJ4zQSRJkiRJktRzO086AIDdd9+9Vq5cOekwJGkqXXzxxd+pqhWTjmOS7CckaX72ER37CUma39b0E1ORIFq5ciVr166ddBiSNJWSfHPSMUya/YQkzc8+omM/IUnz25p+wilmkiRJkiRJPWeCSJIkSZIkqedMEEmSJEmSJPWcCSJJkiRJkqSeM0EkSZIkSZLUcyaIJEmSJEmSes4EkSRJkiRJUs+ZIJIkSZIkSeo5E0SSJEmSJEk9Z4JIkiRJkiSp50wQSZIkSZIk9dzOkw5g2qw87mND1Vt3wiEjjkSSNI3sJyRpx2K7LkkdRxBJkpYkySlJNia5bKDsrCSXtJ91SS5p5SuT/GBg37snF7kkSZKkGY4gkiQt1anAO4DTZwqq6sUz20neBtw8UP/qqlo1tugkSZIkLcoEkSRpSarqs0lWzrcvSYDDgWeOMyZJkiRJW8cpZpKkUXo6cH1VfX2g7GFJvpzkM0mevtATk6xJsjbJ2k2bNo0+UkmSJKnHTBBJkkbpSODMgccbgH2r6vHAa4Azktx/vidW1UlVtbqqVq9YsWIMoUqSJEn9ZYJIkjQSSXYGfgU4a6asqn5UVTe07YuBq4FHTSZCSZIkSTNMEEmSRuVZwFerav1MQZIVSXZq2w8H9gOumVB8kiRJkhoTRJKkJUlyJvB54NFJ1ic5pu06gs2nlwE8A7i03fb+HOCVVXXj+KKVJEmSNB/vYiZJWpKqOnKB8pfNU3YucO6oY5IkSZK0dRYdQZTklCQbk1w2
UHZWkkvaz7p2JZgkK5P8YGDfu0cZvCRJkiRJkpZumBFEpwLvAE6fKaiqF89sJ3kbcPNA/auratVyBShJkiRJkqTRWjRBVFWfTbJyvn1JAhwOPHN5w5IkSZIkSdK4LHWR6qcD11fV1wfKHpbky0k+k+TpS3x9SZIkSZIkjdhSF6k+ks3vULMB2LeqbkjyROAfkzy2qm6Z/cQka4A1APvuu+8Sw5AkSZIkSdK22uYRREl2Bn4FOGumrKp+VFU3tO2LgauBR833/Ko6qapWV9XqFStWbGsYkiRJkiRJWqKlTDF7FvDVqlo/U5BkRZKd2vbDgf2Aa5YWoiRJkiRJkkZpmNvcnwl8Hnh0kvVJjmm7jmDz6WUAzwAubbe9Pwd4ZVXduJwBS5IkSZIkaXkNcxezIxcof9k8ZecC5y49LEmSJEk7kjbTYC1wXVUdmuRhwAeA3YCLgaOq6r+S3As4HXgicAPw4qpaN6GwJak3lnoXM0mSJEkaxquBKwcevxU4saoeCXwXmJmpcAzw3VZ+YqsnSRoxE0SSJEmSRirJ3sAhwHvb4wDPpFuWAuA04Plt+7D2mLb/wFZfkjRCJogkSZIkjdpfA38E3Nke7wbcVFW3t8frgb3a9l7AtQBt/82tviRphEwQSZIkSRqZJIcCG6vq4mV+3TVJ1iZZu2nTpuV8aUnqJRNEkiRJkkbpqcDzkqyjW5T6mcDbgV2TzNw0Z2/gurZ9HbAPQNv/ALrFqjdTVSdV1eqqWr1ixYrR/gWS1AMmiCRJkiSNTFW9tqr2rqqVwBHAJ6vqJcCngBe2akcDH27b57fHtP2frKoaY8iS1EsmiCRJkiRNwh8Dr0lyFd0aQye38pOB3Vr5a4DjJhSfJPXKzotXkSRJkqSlq6pPA59u29cA+89T54fAi8YamCTJEUSSJEmSJEl9Z4JIkiRJkiSp50wQSZIkSZIk9ZwJIkmSJEmSpJ4zQSRJkiRJktRzJogkSZIkSZJ6zgSRJEmSJElSz+086QAkSZIkSdNl5XEfG6reuhMOGXEkksbFEUSSJEmSJEk9Z4JIkiRJkiSp50wQSZKWJMkpSTYmuWyg7I1JrktySfs5eGDfa5NcleRrSX55MlFLkiRJGmSCSJK0VKcCB81TfmJVrWo/FwAkeQxwBPDY9px3JtlpbJFKkiRJmpcJIknSklTVZ4Ebh6x+GPCBqvpRVX0DuArYf2TBSZIkSRqKCSJJ0qi8KsmlbQraA1vZXsC1A3XWtzJJkiRJE2SCSJI0Cu8CHgGsAjYAb9vaF0iyJsnaJGs3bdq03PFJkiRJGmCCSJK07Krq+qq6o6ruBN7D3dPIrgP2Gai6dyub7zVOqqrVVbV6xYoVow1YkiRJ6jkTRJKkZZdkz4GHLwBm7nB2PnBEknsleRiwH/DFcccnSZIkaXM7TzoASdL2LcmZwAHA7knWA8cDByRZBRSwDngFQFVdnuRs4ArgduDYqrpjEnFLkiRJutuiCaIkpwCHAhur6nGt7I3AbwEzi0K8buAWxq8FjgHuAH6vqv7PCOKWJE2JqjpynuKTt1D/LcBbRheRJEmSpK01zBSzU4GD5ik/sapWtZ+Z5NBjgCOAx7bnvDPJTssVrCRJkiRJkpbfogmiqvoscOOQr3cY8IGq+lFVfQO4irsXJpUkSZIkSdIUWsoi1a9KcmmSU5I8sJXtBVw7UGd9K5MkSZIkSdKU2tYE0buARwCrgA3A27b2BZKsSbI2ydpNmzYt/gRJkiRJkiSNxDYliKrq+qq6o6ruBN7D3dPIrgP2Gai6dyub7zVOqqrVVbV6xYoV2xKGJEmSJEmSlsE2JYiS7Dnw8AXAZW37fOCIJPdK8jBgP+CLSwtRkiRJkiRJozTMbe7PBA4Adk+yHjgeOCDJKqCAdcArAKrq8iRnA1cAtwPHVtUdowldkiRJkiRJy2HRBFFVHTlP8clbqP8W4C1LCUqSJEmSJEnjs5S7mEmSJEmSJGkHYIJIkiRJkiSp50wQ
SZIkSZIk9ZwJIkmSJEmSpJ4zQSRJkiRJktRzJogkSZIkSZJ6zgSRJEmSJElSz5kgkiRJkiRJ6jkTRJIkSZIkST1ngkiSJEmSJKnnTBBJkiRJkiT1nAkiSZIkSZKknjNBJEmSJEmS1HMmiCRJkiRJknrOBJEkSZIkSVLPmSCSJEmSJEnqORNEkiRJkiRJPWeCSJIkSZIkqedMEEmSJEmSJPWcCSJJ0pIkOSXJxiSXDZT9RZKvJrk0yXlJdm3lK5P8IMkl7efdk4tckiRJ0gwTRJKkpToVOGhW2YXA46rqZ4D/C7x2YN/VVbWq/bxyTDFKkiRJ2gITRJKkJamqzwI3zir7eFXd3h5eBOw99sAkSZIkDc0EkSRp1H4D+KeBxw9L8uUkn0ny9EkFJUmSJOluJogkSSOT5PXA7cD7W9EGYN+qejzwGuCMJPdf4LlrkqxNsnbTpk3jCViStOyS3DvJF5P8Z5LLk7yplT8syReSXJXkrCT3bOX3ao+vavtXTjJ+SeoLE0SSpJFI8jLgUOAlVVUAVfWjqrqhbV8MXA08ar7nV9VJVbW6qlavWLFiTFFLkkbgR8Azq+pngVXAQUmeDLwVOLGqHgl8Fzim1T8G+G4rP7HVkySNmAkiSdKyS3IQ8EfA86rqtoHyFUl2atsPB/YDrplMlJKkcajO99rDe7SfAp4JnNPKTwOe37YPa49p+w9MkjGFK0m9tWiCyNsXS5K2JMmZwOeBRydZn+QY4B3ALsCFs/qDZwCXJrmE7qT/lVV147wvLEnaYSTZqbX9G+nudHk1cNPADQ3WA3u17b2AawHa/puB3cYbsST1z85D1DmV7kT/9IGyC4HXVtXtSd5Kd/viP277rq6qVcsapSRpalXVkfMUn7xA3XOBc0cbkSRp2lTVHcCqdmH5POAnl/qaSdYAawD23Xffpb6cJPXeoiOIvH2xJEmSpOVQVTcBnwKeAuyaZOaC9d7AdW37OmAfgLb/AcAN87yWa9VJ0jJajjWIvH2xJEmSpHm19edmlqS4D/Bs4Eq6RNELW7WjgQ+37fPbY9r+T87c7ECSNDrDTDFb0BZuX3xDkicC/5jksVV1yzzPdUioJEmStOPbEzit3aTgx4Czq+qjSa4APpDkz4Avc/f05JOBv09yFd1MhiMmEbQk9c02J4gGbl984ODti+luY0lVXZxk5vbFa2c/v6pOAk4CWL16tVcEJEmSpB1QVV0KPH6e8muA/ecp/yHwojGEJkkasE1TzLx9sSRJkiRJ0o5j0RFE7fbFBwC7J1kPHE9317J70d2+GOCiqnol3e2L/zTJfwN34u2LJUmSJEmSpt6iCSJvXyxJkiRJkrRjW467mEmSJEmSJGk7ZoJIkiRJkiSp50wQSZIkSZIk9ZwJIkmSJEmSpJ4zQSRJkiRJktRzJogkSZIkSZJ6zgSRJEmSJElSz5kgkiRJkiRJ6jkTRJIkSZIkST1ngkiSJEmSJKnnTBBJkiRJkiT1nAkiSZIkSZKknjNBJEmSJEmS1HMmiCRJkiRJknrOBJEkSZIkSVLPmSCSJEmSJEnqORNEkiRJkiRJPWeCSJIkSZIkqedMEEmSJEmSJPWcCSJJkiRJkqSeM0EkSZIkSZLUcyaIJElLkuSUJBuTXDZQ9qAkFyb5evv9wFaeJH+T5KoklyZ5wuQilyRJkjTDBJEkaalOBQ6aVXYc8Imq2g/4RHsM8Bxgv/azBnjXmGKUJEmStAUmiCRJS1JVnwVunFV8GHBa2z4NeP5A+enVuQjYNcme44lUkiRJ0kJMEEmSRmGPqtrQtr8N7NG29wKuHai3vpXNkWRNkrVJ1m7atGl0kUqSJEkyQSRJGq2qKqC24XknVdXqqlq9YsWKEUQmSZIkacZQCSIXIJUkbaXrZ6aOtd8bW/l1wD4D9fZuZZIkSZImaNgRRKfiAqSSpOGdDxzdto8GPjxQ/tJ2MeHJwM0DU9EkSZIkTchQCSIXIJUkLSTJmcDngUcnWZ/kGOAE
4NlJvg48qz0GuAC4BrgKeA/wOxMIWZIkSdIsOy/huVu7AKlXiCVpB1RVRy6w68B56hZw7GgjkiRJkrS1lmWR6m1ZgNS700iSJEmSJE2HpSSIlrQAqXenkSRJkiRJmg5LSRC5AKkkSZIkSdIOYKg1iNoCpAcAuydZDxxPt+Do2W0x0m8Ch7fqFwAH0y1Aehvw8mWOWZIkSZIkSctoqASRC5BKkiRJkiTtuJZlkWpJkiRJkiRtv0wQSZIkSZIk9ZwJIkmSJEmSpJ4zQSRJkiRJktRzQy1SLUmSts7K4z42VL11Jxwy4kgkSZKkxTmCSJIkSZIkqedMEEmSJEmSJPWcCSJJkiRJkqSeM0EkSZIkSZLUcyaIJEmSJEmSes4EkSRJkiRJUs+ZIJIkSZIkSeo5E0SSJEmSJEk9Z4JIkiRJ0sgk2SfJp5JckeTyJK9u5Q9KcmGSr7ffD2zlSfI3Sa5KcmmSJ0z2L5CkfjBBJEmSJGmUbgf+sKoeAzwZODbJY4DjgE9U1X7AJ9pjgOcA+7WfNcC7xh+yJPWPCSJJkiRJI1NVG6rqS237VuBKYC/gMOC0Vu004Plt+zDg9OpcBOyaZM8xhy1JvWOCSJIkSdJYJFkJPB74ArBHVW1ou74N7NG29wKuHXja+lY2+7XWJFmbZO2mTZtGFrMk9cXOkw5AkiRt/1Ye97Gh6q074ZARRyJpWiW5H3Au8PtVdUuSu/ZVVSWprXm9qjoJOAlg9erVW/VcSdJcjiCSJEmSNFJJ7kGXHHp/VX2oFV8/M3Ws/d7Yyq8D9hl4+t6tTJI0QiaIJEmSJI1MuqFCJwNXVtVfDew6Hzi6bR8NfHig/KXtbmZPBm4emIomSRoRp5hJkiRJGqWnAkcBX0lySSt7HXACcHaSY4BvAoe3fRcABwNXAbcBLx9vuJLUTyaIJEmSJI1MVX0OyAK7D5ynfgHHjjQoSdIcJogkSSOR5NHAWQNFDwfeAOwK/BYwc8uZ11XVBWMOT5IkSdIAE0SSpJGoqq8BqwCS7ES3wOh5dFMFTqyqv5xgeJIkSZIGuEi1JGkcDgSurqpvTjoQSZIkSXOZIJIkjcMRwJkDj1+V5NIkpyR54KSCkiRJktTZ5gRRkkcnuWTg55Ykv5/kjUm+scBIAAAQ5klEQVSuGyg/eDkDliRtX5LcE3ge8MFW9C7gEXTTzzYAb1vgeWuSrE2ydtOmTfNVkSRJkrRMtjlBVFVfq6pVVbUKeCLdLSjPa7tPnNnnwqOS1HvPAb5UVdcDVNX1VXVHVd0JvAfYf74nVdVJVbW6qlavWLFijOFKkiRJ/bNcU8xcW0KStJAjGZhelmTPgX0vAC4be0SSJEmSNrNcCSLXlpAkzZHkvsCzgQ8NFP95kq8kuRT4ReAPJhKcJEmSpLssOUHk2hKSpIVU1ferarequnmg7Kiq+umq+pmqel5VbZhkjJIkSZKWZwSRa0tIkiRJkiRtx3ZehteYs7bEwNVg15aQJGkKrTzuY0PVW3fCISOORJIkSdNgSQmigbUlXjFQ/OdJVgEFrJu1T9spv0hIkiRJkrTjWlKCqKq+D+w2q+yoJUUkSZKmxrAXCCRJkrR9W667mEmSJEmSJGk7tRxrEEmSJEmSJFyeQ9svE0Q7KBslSZIkSZI0LKeYSZIkSZIk9ZwJIkmSJEmSpJ4zQSRJkiRJktRzJogkSZIkSZJ6zgSRJEmSJElSz5kgkiRJkiRJ6jlvc99zK4/72KRDkCRJkiRJE+YIIkmSJEmSpJ4zQSRJkiRJktRzTjHbRsNOzVp3wiEjjkSSJEmSJGlpTBBpIkywSZIkSZI0PUwQSZKksfECgSRJ0nRyDSJJkiRJkqSeM0EkSZIkSZLUc04xkyRpBzLsFK5pN8zf4TQ0SZKk5eMIIkmSJEmSpJ7rzQiiHeWKqiRJkiRJ0nLrTYJoR2GiS5IkSZIkLTenmEmSJEmSJPWcCSJJkiRJkqSec4qZlpVT4CQNSrIOuBW4A7i9qlYn
eRBwFrASWAccXlXfnVSMkiRJkhxBJEkavV+sqlVVtbo9Pg74RFXtB3yiPZYkSZI0QSaIJEnjdhhwWts+DXj+BGORJEmSxDJMMXP6gCRpCwr4eJIC/ndVnQTsUVUb2v5vA3tMLLrtiFN4JUmSNErLNYLI6QOSpPk8raqeADwHODbJMwZ3VlXRJZHmSLImydokazdt2jSGUCVJkqT+GtUUM6cPSJKoquva743AecD+wPVJ9gRovzcu8NyTqmp1Va1esWLFuEKWJEmSemk5EkQz0wcuTrKmlTl9QJJ6Lsl9k+wysw38EnAZcD5wdKt2NPDhyUQoSZIkacZy3Ob+aVV1XZKfAC5M8tXBnVVVbe2JzbRk0hqAfffddxnCkCRNmT2A85JA19+cUVX/nOQ/gLOTHAN8Ezh8gjFKkiRJYhkSRIPTB5JsNn2gqjYsNH2gLVR6EsDq1avnXX+iT1x8VNKOpqquAX52nvIbgAPHH5EkSZKkhSxpipnTByRJkiRJkrZ/Sx1B5PQBSZIkSZKk7dySEkROH5AkSTuKYad7rzvhkBFHIkmSNH6jus29JEmSJEmSthMmiCRJkiRJknpuOW5zL0nSds+7SUrSaCQ5BTgU2FhVj2tlDwLOAlYC64DDq+q76RY3fTtwMHAb8LKq+tIk4pakvnEEkSRJkqRROhU4aFbZccAnqmo/4BPtMcBzgP3azxrgXWOKUZJ6zwSRJEmSpJGpqs8CN84qPgw4rW2fBjx/oPz06lwE7Jpkz/FEKkn9ZoJIkiRJ0rjtUVUb2va3gT3a9l7AtQP11rcySdKImSCSJEmSNDFVVUBt7fOSrEmyNsnaTZs2jSAySeoXE0SSJEmSxu36malj7ffGVn4dsM9Avb1b2RxVdVJVra6q1StWrBhpsJLUByaIJEmSJI3b+cDRbfto4MMD5S9N58nAzQNT0SRJI+Rt7jXVhr3t9LoTDhnra0mSJGk4Sc4EDgB2T7IeOB44ATg7yTHAN4HDW/UL6G5xfxXdbe5fPvaAJamnTBBJkqTtkol/aftQVUcusOvAeeoWcOxoI5IkzccEkSRJ2qENm0iSJEnqM9cgkiRJkiRJ6jkTRJIkSZIkST3nFDPtEJw+IEmSJEnStnMEkSRJkiRJUs+ZIJIkSZIkSeo5E0SSJEmSJEk95xpEkiRNkGuoSZIkaRqYIBoxT/wlSZIkSdK0M0EkSZK0FYa9+LPuhENGHIkkSdLyMUEkzeKJvyRJkiSpb1ykWpIkSZIkqedMEEmSJEmSJPWcCSJJkiRJkqSeM0EkSRqJJPsk+VSSK5JcnuTVrfyNSa5Lckn7OXjSsUqSJEl9t80JIk/8JUmLuB34w6p6DPBk4Ngkj2n7TqyqVe3ngsmFKEmSJAmWdhezmRP/LyXZBbg4yYVt34lV9ZdLD0+StL2qqg3AhrZ9a5Irgb0mG5UkSZKk+WzzCKKq2lBVX2rbtwKe+EuS5pVkJfB44Aut6FVJLk1ySpIHLvCcNUnWJlm7adOmMUUqSZIk9dOyrEG0LSf+kqR+SHI/4Fzg96vqFuBdwCOAVXQjjN423/Oq6qSqWl1Vq1esWDG2eCVJkqQ+WnKCaFtP/L0yLEk7viT3oOsj3l9VHwKoquur6o6quhN4D7D/JGOUJEmStMQE0VJO/L0yLEk7tiQBTgaurKq/Gijfc6DaC4DLxh2bJEmSpM1t8yLVWzrxbwuTgif+ktRnTwWOAr6S5JJW9jrgyCSrgALWAa+YTHiSJEmSZizlLmae+EuSFlRVnwMyzy5vay9JkiRNmW1OEHniL0mSJEmStGNYlruYSZIkSZIkaftlgkiSJEmSJKnnlrIGkdRrK4/72FD11p1wyIgjkSRJkiRpaRxBJEmSJEmS1HOOIJIkSRoBR5pKkqTtiSOIJEmSJEmSem67H0E07NU5SZIkSZIkzc8RRJIkSZIkST233Y8gkiRpSxxpKkmSJC3OBJEkSdJ2wEWvJUnSKDnFTJIkSZIkqedMEEmS
JEmSJPWcCSJJkiRJkqSeM0EkSZIkSZLUcyaIJEmSJEmSes4EkSRJkiRJUs95m3tpSnj7YkmSJEnSpJggkrYzJpIkSZIkScvNKWaSJEmSJEk9Z4JIkiRJkiSp50wQSZIkSZIk9ZxrEEkjNuyaQZKkflrufmK516pz7TtJkvrBEUSSJEmSJEk95wgiSZIkSZI0Uo5InX4miKQdlFMMJEmSJEnDMkEkSZLUQ66RJ0mSBo0sQZTkIODtwE7Ae6vqhFG9l6Rt5xcETYr9hCRpIfYRkjR+I0kQJdkJ+Dvg2cB64D+SnF9VV4zi/SRtf3aUKWs7yt8xbvYT0o5nuS847Cjtpv3E1rOPkKTJGNUIov2Bq6rqGoAkHwAOA2zUpR3ctI9I8kR9athPSNqiSfUntv9TwT5CUi9M23eTUd3mfi/g2oHH61uZJElgPyFJWph9hCRNwMQWqU6yBljTHn4vydcmFcsidge+M+kgppDHZS6PyfyWdFzy1mWMZILvO+v1tvaYPHRZg9lObEf9xCDbgbk8JvPzuMw1FcdkyvqdYY5JL/sIWNZ+Yqj/e2P+vzEVn4d5zIlrUp+ZAdN4rKbx/xRsJ8fK/1Nz5a1LimnofmJUCaLrgH0GHu/dyu5SVScBJ43o/ZdNkrVVtXrScUwbj8tcHpP5eVzm8pgAO1A/Mch/27k8JvPzuMzlMZmrx8dk0T4Clq+fmMbjPI0xwXTGZUzDm8a4jGk444ppVFPM/gPYL8nDktwTOAI4f0TvJUna/thPSJIWYh8hSRMwkhFEVXV7klcB/4fu1pSnVNXlo3gvSdL2x35CkrQQ+whJmoyRrUFUVRcAF4zq9cdou5reMEYel7k8JvPzuMzlMWGH6icG+W87l8dkfh6XuTwmc/X2mIy5j5jG4zyNMcF0xmVMw5vGuIxpOGOJKVU1jveRJEmSJEnSlBrVGkSSJEmSJEnaTpggWkCSfZJ8KskVSS5P8upJxzRpSe6d5ItJ/rMdkzdNOqZpkmSnJF9O8tFJxzINkqxL8pUklyRZO+l4pkWSXZOck+SrSa5M8pRJx6Sls8+Yyz5jYfYXc9lnzGV/sbyGaafT+ZskVyW5NMkTpiCmA5Lc3D4blyR5w4hjWrTtTnKvJGe14/SFJCtHGdNWxPWyJJsGjtVvjjqu9r4LtumTOFZDxDSp47TFdn7cn78hYxrr56+95xbb/lEfp5GtQbQDuB34w6r6UpJdgIuTXFhVV0w6sAn6EfDMqvpeknsAn0vyT1V10aQDmxKvBq4E7j/pQKbIL1bVdyYdxJR5O/DPVfXCdHdm+fFJB6RlYZ8xl33Gwuwv5mefsTn7i+U1TDv9HGC/9vMk4F3t9yRjAvjXqjp0hHEMGqbtPgb4blU9MskRwFuBF09BXABnVdWrRhzLbFtq0ydxrBaLCSZznGDL7fy4P3/DxATj/fzB4m3/SI+TI4gWUFUbqupLbftWug/YXpONarKq87328B7tx0WsgCR7A4cA7510LJpeSR4APAM4GaCq/quqbppsVFoO9hlz2WfMz/5Cw7C/WH5DttOHAae39usiYNcke044prEasu0+DDitbZ8DHJgkUxDX2A3Rpo/9WG3H/cxYP3/TaMi2f6THyQTRENpQwMcDX5hsJJPXhiteAmwELqyq3h+T5q+BPwLunHQgU6SAjye5OMmaSQczJR4GbALe14b9vjfJfScdlJaXfcbd7DPmZX8xP/uMzdlfjNAW2um9gGsHHq9nTAmbRfqOp7SpVf+U5LFjiGWxtvuu41RVtwM3A7tNQVwAv9qm3ZyTZJ9Rx8TibfokjtUw/cy4jxMs3s5P4vM3TN8zzs/fMG3/SI+TCaJFJLkfcC7w+1V1y6TjmbSquqOqVgF7A/snedykY5q0JIcCG6vq4knHMmWeVlVPoBsGeWySZ0w6oCmwM/AE4F1V9Xjg+8Bxkw1Jy8k+Y3P2GZuzv9gi+4zN2V+MyDS204vE9CXgoVX1s8DfAv846nimte0e
Iq6PACur6meAC7l75M5ITGObPmRMYz1OA6axnV8spnF//ibe9psg2oI2v/Vc4P1V9aFJxzNN2lC3TwEHTTqWKfBU4HlJ1gEfAJ6Z5B8mG9LkVdV17fdG4Dxg/8lGNBXWA+sHrnidQ9cJaAdgn7Ew+4y72F8swD5jDvuLERiinb4OGBxNsXcrm1hMVXXLzNSqqroAuEeS3UcZ08B7L9R233WckuwMPAC4YRwxbSmuqrqhqn7UHr4XeOKIQxmmTR/3sVo0pgkcp5n3XaydH/vnb7GYJvD5G6btH+lxMkG0gDY39GTgyqr6q0nHMw2SrEiya9u+D/Bs4KuTjWryquq1VbV3Va0EjgA+WVW/PuGwJirJfdtii7Rhkb8EXDbZqCavqr4NXJvk0a3oQKDPixjvMOwz5rLPmMv+Yn72GXPZXyy/Idvp84GXpvNk4Oaq2jDJmJI8eGbNmiT7031/G1mCYci2+3zg6Lb9Qrq2bKTrAQ0T16x1WJ5Ht6bTyAzZpo/1WA0T07iPU3vPYdr5cX/+Fo1p3J+/Idv+kR4n72K2sKcCRwFfaXNdAV7XMod9tSdwWpKd6D4cZ1eVt+jVfPYAzmvt6c7AGVX1z5MNaWr8LvD+dHcluAZ4+YTj0fKwz5jLPkPDss+Yn/3F8pq3nQb2BaiqdwMXAAcDVwG3MfpjPkxMLwR+O8ntwA+AI0acjJm37U7yp8DaqjqfLqn190muAm6kS0SM2jBx/V6S59HdHe5G4GVjiGuOKThWi8U0ieM0bzuf5JUwsc/fMDGN+/MH87T94zxOGf3fJ0mSJEmSpGnmFDNJkiRJkqSeM0EkSZIkSZLUcyaIJEmSJEmSes4EkSRJkiRJUs+ZIJIkSZIkSeo5E0SSJEmSJEk9Z4JIkiRJkiSp50wQSZIkSZIk9dz/D24mQN1DAO6HAAAAAElFTkSuQmCC\n",
3641
      "text/plain": [
3642
       "<Figure size 1440x360 with 3 Axes>"
3643
      ]
3644
     },
3645
     "metadata": {},
3646
     "output_type": "display_data"
3647
    },
3648
    {
3649
     "data": {
3650
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEKCAYAAAARnO4WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3X2clXWd//HX55wzDMi9gBjgCC4/1wABa5RclDbNtryhWslKy1W3tX67P1ZNozVLsx7tbppbamVrWmpLKUKKlFmaN0mJCTogN+5KKQoq6iwio3iGmfn8/riug2eGM2fO3TXnnLnez8djHpxznevmMwf4nO/5Xt/v52vujoiIDHyJagcgIiL9QwlfRCQmlPBFRGJCCV9EJCaU8EVEYkIJX0QkJpTwRURiQglfRCQmlPBFRGIiVe0Aso0dO9YnT55c7TBEROrGmjVrXnX3cYXsW1MJf/LkyaxevbraYYiI1A0z21LovurSERGJiUgTvpmNMrOlZvaUmW0ys6OjvJ6IiPQu6i6dq4F73H2BmQ0C9ov4eiIi0ovIEr6ZjQTmAWcBuHs70B7V9UREJL8ou3SmAK8APzazJ8zsBjMbGuH1REQkjygTfgp4F3Cdux8BvAH8S8+dzOxcM1ttZqtfeeWVCMMREYm3KBP+VmCruz8aPl9K8AHQjbtf7+7N7t48blxBQ0lFROpCa1uatc+/Rmtbuqx9KiWyPnx3f8nMnjezv3T3/waOBzZGdT0RkVqyvGUbX1y2joZEgj1dXVxx6kzmz55Y9D6VFPU4/IXAYjNbB8wG/jXi64mIVF1rW5ovLlvHW3u62JXu4K09XSxatq5bK76QfSot0mGZ7t4CNEd5DRGRWrN1x24aEgneomvvtoZEgq07djNmWGPB+1SaZtqKiFTYpNFD2NPV1W3bnq4uJo0eUtQ+laaELyJSYWOGNXLFqTMZ3JBgeGOKwQ0Jrjh1ZreWeyH7VJq5e2QnL1Zzc7OreJqIDBStbWm27tjNpNFDek3kheyTj5mtcfeCus5rqlqmiMhAMmZYY59JvJB9KkVdOiIiMaGELyISE0r4IiIxoYQvIhITSvgiIjGhhC8iEhNK+CIiMaGELyISE0r4IiIxoYQvIhITSvgiIjGhhC8iEhNK+CIiof5cX7YaVC1TRIT+X1+2GtTCF5HYq8b6stWghC8isZdZXzZbZn3ZgUQJX0Rirxrry1aDEr6IxF411petBt20FREB5s+eyNypY8taX7bWKeGLiIT6c33ZalCXjohITCjhi4jEhBK+iNSNgT4TNmrqwxeRuhCHmbBRUwtfRGpeXGbCRk0JX0RqXlxmwkZNCV9Eal5cZsJGTQlfRGpeXGbCRk03bUWkLsRhJmzUlPBFpG4M9JmwUVOXjohITCjhi0jN04SrylCXjojUNE24qpxIW/hm9qyZPWlmLWa2OspricjAowlXldUfLfz3ufur/XAdEakTrW3pgkbbZCZcvcXbY/AzE65087Z46tIRkX5VTBeNJlxVVtQ3bR34jZmtMbNzI76WiNSozE3Xzdt3FdVFowlXlRV1C/8Yd99mZgcA95rZU+7+u+wdwg+CcwGampoiDkdE+lt2iz7d2YW5d3u9ry4aTbiqnEhb+O6+LfzzZeAO4Kgc+1zv7s3u3jxu3LgowxGRftbzpmt7Rxfpzu4Jv5AumjHDGpl10Cgl+zJFlvDNbKiZDc88Bj4ArI/qeiJSe3JVuRzckGBQ0tRFUwVRdumMB+4ws8x1furu90R4PRGpMbluugLc/c/H8kZ7p7po+llkCd/d/wzMiur8IlL7MjddF/UYlTN1/PBqhxZLGpYpIpHSTdfaoYQvIpFTlcvaoOJpIlKSzdt3sXT182zevqvaoUiB1MIXkaJdeueT3LLqub3PT5pxIF/7yAy14mucWvgiUpTN23d1S/YAv1z/Ekf/233c1bKtSlFJIZTwRaQo
Lc+/lnN7eyeqZFnjlPBFpFe5Fh6ZfdCoXvfPlEmQ2qQ+fBHJqbeqllPHD+fMo5u45ZHn9jlGlSxrm1r4IjFTyHKBfS088rUPH859F8zjyINHdzvutOZJunFbw5TwRWJkecs25n7zfj51w6PM/eb9vd5kzVUDp2d3zaPP/C+PbdnRbZ8lq7eqD7+GKeGLxESuVvtFt6/NOY6+r4VHWtvSXP6LjfsclzRTH34NU8IXiYlcrfb2TufEa1fu09Lva+GRrTt2Myhp+1xjT6f68GuZbtqKxERvlSvbO7q4aOk6pr1jRLeiZvlq4EwaPYSOLt/nXJedMl19+DVMLXyRmMi02nO1zNs7ujjxmoe5q2Vbt5u6vS08kv0NYGhjkkGpBN/46AzOeM/B/fXrSAnMfd9P6Wppbm721atXVzsMkQFt8/ZdnHjtSto79m3tpxKQTCQYlOx7gXEI+vJVBbO6zGyNuzcXsq9a+CIxM3X8cL61YCaDUvv+9+/ognRHYQuMg5YerDclJXwzO7DSgYhI/5k/eyJ3LzwmZ/dONs2cHVhKbeHfWNEoRKTfTR0/nG99bNbekTiNKaOhxweAZs4OLCWN0nH3kyodiIhUTq6+9Vzbeo7E+f3mV/dZjlDdNQNHrwnfzPbPd6C7/2/lwxGRcrS2pVn86HN874GnGZRM7k3aDjnr4kD31ai0HOHA1usoHTN7BnAgVyefu/shlQ5Go3RESre8ZRtfuH0t7Z3d/083phKAk+54e/vghgS//+JxSugDQDGjdHpt4bv7lMqFJCJRaW1Ls+GFnVxwawv7DrSEZMLADejcuy1zM1YJP14K6sM3s78FjiFo8T/s7ndGGpWIFCRTwriry3Mme4DOLif4r/s23YyNpz4Tvpl9H5gK/Czc9DkzO8Hd/ynSyEQkr+xiaL1pSMKVC2YC6GasFNTCPw54p4ed/WZ2M7Ah0qhEpE+ZYmhv9dq2h5995j00TxkDoJuxUlDC3ww0AVvC5weF20QkYvlKF0waPYS3Ojp7OTIok9CQSu59nj0aR+KpkIQ/HNhkZn8k6Ag8ClhtZncBuPv8COMTia3Fq7Zw+YoNNCSNPV1w2SnTOGPOwXs/BIYOSpKvFlYyYeqnl24KSfiXRh6FiHSzeNUWLrlzPcDeYZaX3LGetc+/xl1rX6AhkSDd2UXSjA5yJ/1LT1apYumuz4Tv7g/1RyAiEti8fRdfCZN9T0tWbwXI228PMLQxyYyJIysem9Q3VcsUqSHLW7Zx4jUr+0jnbxvckNin/g0EQzHVnSM9KeGL1IjMMMv2zkLTfeBX/3wsF55wKI0py7kcoUiGljgUiVAxC4RseOF1EjkrmeTWkDSuOHUmU8cPZ+H44Zw+p0nDLiWvkhK+mX3V3b9a4VhEBpTMLNhcBcty7bto6TrSOVah6k3CgrH1GRp2KX0ptUtnTUWjEBlgsmfB9rV6VGbfYpI9wKBkUouTSFFKrYe/otKBiAwkuWbB9lawbOuO3eRaeGpQ0vjpZ+aw/oXXaUwl+OqKDd0qXqoejhSrkFo61+TYvBNY7e7LKx+SSP2bNHoIe7q6t9h7S9Drt+3kjfZ9W/cNqQQNqSRnzQ0K1w5tTKkejpSlkBb+YOAw4Pbw+anAM8AsM3ufu58fVXAi9WrMsEauOHVmnwm6tS3N13+5Mec5eg6t1OIkUq5CEv5MYK67dwKY2XXAwwTlkp/s62AzSwKrgW3ufnIZsYrUvOxROfNnT2TaO0awcvMrjB02mKP/Ysw++/dWAG1QKvfQSt2YlXIUkvBHA8MIunEAhgL7u3unme17B2pf5wGbgBGlhShSH4KRNmtJWoKOrk7mHXoADzz1MpkFqAz4/AmHcvqcpr1JO1fXz6CkcffCY5g6fng//wYy0BUySucKoMXMfmxmNwFPAFea2VDgvnwHmtkk4CTghnIDFallrW1pLlzSQrrDeXNPJ+2dcN+mt5M9BJUHr7r3f/irf7+fu1q2AW93/QxuSOydNPWt
j81SspdIFFJL50Yzu5ugSibAl9z9hfDxF/o4/DvAIoKKmzmZ2bnAuQBNTU19BixSix75UyuFjqpMdwRDNOdOHcuYYY3qm5d+02cL38xWAH8N3Ofuy7OSfV/HnQy87O55x+y7+/Xu3uzuzePGjSvk1CI1ZfGqLZx/W0tRx2SGaGaMGdbIrINGKdlLpArp0vkWcCyw0cyWmtkCMxtcwHFzgflm9ixwK3Ccmf1X6aGK1J5MGeOOrt7r0ufyVkenxtBLv+sz4bv7Q+7+j8AhwH8CpwEvF3Dcxe4+yd0nA58A7nf3T5UZr0hVtbalWfv8a7S2pWltS/daxrgv+RYuEYlKQTNtzWwIcArwceBdwM1RBiVSizK1cZJm7Ons4vSjmgouY9xTYyqZc9atSJQKmWm7hOCG7T3Ad4GH3L2of+fu/iDwYAnxiVTV5u27aHn+NSaP2Y9FS9d2K21w0yNb8hyZ355OlUWQ/ldIC/9G4JOZiVcicXHpnU9yy6rnIjn3Zado+UHpf4UMy/y1mc0ws2kEZRYy22+JNDKRKtq8fVfJyd6AOVNGs+qZHd227deYZE+n712MXKS/FdKlcxnBsMxpwN3Ah4CVgBK+DFgtz79W8rE3n3Mk8w49YG930OyDRjF66CCNs5eqK6RLZwEwC3jC3c82s/GAhlfKgHbvppdKPnbCyKBvfur44d1mzCrRS7UVkvB3u3uXmXWY2QiCIZkHRRyXSEUUs8RgZt9frt3Grzf0OfI4p8ENCd5o1+0uqU2FJPzVZjYK+CHBSldtwCORRiVSAcUsMXj9Q3/iil8/BQ4dZQ6R1+gbqVWF3LT9x/DhD8zsHmCEu6+LNiyR8mQvMZgpPbxo2TqmvWMEb7R3dmvxL1q6liWrt5Z0ndmTRvLU9l1alETqQlFLHLr7sxHFIVJRvdWZP/Gah2lMJWnv7OKkww9k8pj9Sk72SYMbzzpy7/V0Q1ZqXUlr2orUulx15t/aEzxv7+wA4OdPFFQHMK/fb36V+bMnKtFLXSikeJpI3elZZ35QKpFzofBydHrQTdTaVsg6QCLV12sL38yOBMa6+696bD8R2N5X2WORasuuM3//pu1cff/mil8jU+ZYLXypB/la+N8Ecq2uvAG4MppwRCprzLBGJo0ewnW/+3NFztfzS8KeLtXEkfqRrw9/uLvvUx3K3beY2dgIYxIpWa5x98ENXKO9iPMMShpmhjucPucgJo8ZyjFTx7LxxddZ1GOop1r3Ui/yJfzReV7br9KBiJSjtS3Ntb99mp+s2kJjKkEX8JWTpjFj4kiGDkqSLnT9QeCaT8zudcnBqeOHazlCqVv5Ev59ZvYN4MsertZgZgZcDtzfH8GJFGJ5yzYuuK2FzKJTb4ajcS65cz3JhOHuFLog1dLPvofmKWOA3kshjBnWqEQvdSlfwr8QuAHYbGaZBTtnAauBz0QdmEghWtvSfOH2tb0m9M4ilh4c3JCgIZWsUGQitafXhO/ubwCfNLNDgOnh5g3uXpm7XyIVsPjR52jvrNxygboBKwNZvmGZ78p6ui38c1Rmu7s/HmVgIn1pbUtz9X3/U/Z59mtI0oXrBqwMePm6dK7K85oDx1U4FpGCtbalWbH2Bcpt3DemEvzg0+9m+oQRSvYy4OXr0nlffwYi0pfMkMtVf27lqnv/h47OUpcQh8akYQnjilNnMu/QcRWMUqR25evSmZfnOHf3hyOIRyiuhntcZEodu3u3hcQLlQDOPPpgjnvneCaMHLxPxUyROMjXpfOFHNscmEmwAIqGM0SgmBrucZFd6rgUZxzVxOc/cKiSu8Revi6dU7Kfm9lc4MvAS8DCiOOKpd5quM+dOjaWySrzTef5/32zqOGV2RoSKNmLhApZxPx44CsErft/dfd7I48qpnLVcI9Lca6e3ViLV23h8hUb6Op0Oso472nNTQP+vRMpVL4+/JOAS4CdBLNtV/ZbVDGVq4Z7HIpzZXdjtXd2ctSU/Xn46daKnPvsuZMrch6R
gSBftcwVwCSgA1hkZndl//RPePHSs4b74IbEgB8bnt2NtSvdQbrDK5bszzy6ianjh1fkXCIDQb4uHQ3LrILsGu5xGEXS21KEpRiUSnDd6Uew4809zD5olJK9SA/5Ev5GYJy7d6uJb2bTgFcijSrm4lSca9LoIexuL6eXHoY2JunsCmbKHj/twApFJjLw5Ev41wLfz7F9DMFondMjiUhi5aIlaylhWD1DUoab7S2BHIdvQyLlypfwp7r773pudPeHzey6CGOSGFj9TCt/f9Nj7Ex3Fn1syuDrHzmc9x12gJK8SBHyrniV57WGSgciA0/PoZatbWke+VMr/373RrbuLH3h70TClOxFSpAv4W82sxPd/e7sjWb2IUAlkiWvzFDLpBntnV28Z8r+/H5zawVuzcLC4/6Pkr1ICfIl/POBX5rZacCacFszcDRwctSBSf3KVQrh4c2VGWrZmEpw+pymipxLJG7ylVZ42swOJ7g5OyPc/BDwWXd/qz+Ck/qS6cLZuXtP3gkepdhvUIIuZ8DPSxCJUt7SCu6eBn7cT7FIHcueLftme0fZdeozGlMJLj1lGjMmaCSOSLnylVbYRVA/Z5+XCMojj4gsKqkbrW1pNrywk0VL15Lu8IpMoDKgIQkLjzuU0+eoFo5IpeTr0ilrmqKZDQZ+BzSG11nq7peVc06pLZlWvUFJNepzGZSEG/7uSKZPGKlEL1Jh+Vr4g4HPAVOBdcCP3L2YKZFp4Dh3bzOzBmClmf3K3VeVFbHUhHJr1PfUfQWqAypyThHpLl8f/s3AHuBh4ERgOnBeoSd2dwfawqcN4U+Fenalv/UcU7/hhddLrlGfbVDS+Oln5tCQSqqPXiRi+RL+NHc/HMDMbgT+WOzJzSxJMKRzKvA9d3+0pCilqnquwjV/1gR+/vg2OspI+I2pBGbBqJvmKWMqGK2I9CZfwt+TeeDuHWZW9MndvROYbWajgDvMbIa7r8/ex8zOBc4FaGrS+Opak2sVriWrt5Z8PgMuPvEw5kwZoxa9SD/Ll/Bnmdnr4WMDhoTPix6l4+6vmdkDwAeB9T1eux64HqC5uVldPjXmNxteorNCYywTBr85f57KFotUSb5ROmUtUm5m44A9YbIfApwAfLOcc0r0svvqz7v1CVZWYIZsKgHJhHHlgllK9iJV1OeatmV4B3Bz2I+fAJa4+y8ivJ6UaXnLNhYtDYZZdnR5WX30GY2pBD88s5npE0ao+0akyiJL+O6+DjgiqvNL5QSTp17n87etpdMr16vWmEpw5YKZzDt0XMXOKSKli7KFL3UgMwIH94ol+1QCzjtes2RFao0SfoxVevKUAZed8k5OmTVRiV6kBinhx0j2DVmAB556mVSi+OG2vWlsSCjZi9QwJfyYyJ489VZHJ52dTkPSSFdgyOV+g5J0uat0sUiNU8KPgVyTp4Cyk31jyrj05OlaRFykTijhx8DWHbuDln1FFhgM+upvPkcVLUXqjRJ+DEwaPYR0R/nJPgE0pIIJVKpoKVJ/lPAHuNa2NF+580naO0tP+Of8VRMnHj5BFS1F6pwS/gC2eNUWvrpiA3vK6Ks/duoYLp1/eAWjEpFqUcIfgFrb0lz726e56ZEtJZ9j7iH7c8EJh6p0scgAooQ/wCxetYVL79pQ8uIkSYOvfWQGZ8w5uMKRiUi1KeHXuezJVLf84Vmuvn9zyecy4MdnH6XaNyIDlBJ+HcueTPVmewflzqFqbEgwfULByxyISJ1Rwq9TvU2mKlVjKqGZsiIDnBJ+HWptS/PAUy/TUebY+iSQTMLC41TZUiQOlPDrzOJVW7h8xQbay+i/MeDiDx3GnEO0rqxInCjh15HFq7ZwyZ3r+94xh3OPncLIIQ0cPGYoR//FGCV5kRhSwq8Tm7fv4svLS0v273/nAXzppGkVjkhE6o0Sfg3KLDkIzoSRQ/jJqi3cXOIkqlTC+OapMysboIjUJSX8GrO8ZRsXLmmhArXOaEgaV31slrpvRARQwq8p
rW1pFi1dW1ayTxl88UOH8ZcHDlf5YhHpRgm/hmzdsZt0R3mzp8zgb981SYleRPaRqHYAEti8fRcLf7qm7PMMSiXZumN3BSISkYFGLfwqya6B87UVG1i+9sWiz5Ew6FkjrdN97yLlIiLZlPCrIFMDJ2nGG+2dRR9/2AFD+dLJ05g+YST3rH+Jy1dsoCGZoFMLiYtIHkr4/Sy7Bk6xPjh9PJ85Zkq3GvVnvOdgPjjjwL3fFpTsRaQ3Svj9bMMLr5dUA6cxZXzjo4fnTOhjhjUq0YtIn5Tw+0EwkWonv930Mj95ZEvRtS2TBlcu0Hh6ESmPEn7Elrds46Lb15a8rmzC4Nfnz2Pq+OEVjkxE4kYJP0KtbWkuXLKWjhKWGxzSkKDL4coFM5XsRaQilPAjsnn7Li6+48mik30qYVz+4enMmDBSN2FFpKKU8CustS3Nebe2sHLzq0UdZ8C1nzxCpYtFJDJK+GXI3IwFY7+GBD977HmWPb6t6PMkgO98YjYnz5pQ8RhFRDKU8EtUiaqWCeCz7z2Ezxx7iFr1IhI5JfwStLal+fxtLZSxyiDJhPHr847VDVkR6TcqnlaCnz++taxk35hK8O3TZinZi0i/Ugu/CK1taU77zz/wp1feLPkc/1ddOCJSJZElfDM7CLgFGA84cL27Xx3V9aK0+plWfvjwM/x64/aSz5FMGF/78HTOmHNwBSMTESlclC38DuBCd3/czIYDa8zsXnffGOE1K6q1Lc05Nz3G2q07Sz7Hv310BhNHD9HqUyJSdZElfHd/EXgxfLzLzDYBE4G6SPjLW7Zx4W0tlLoAVdLg2x+fzfzZEysbmIhIifqlD9/MJgNHAI/2x/XKlZk8VaqTDz+Qyz88Qy16EakpkSd8MxsGLAPOd/fXc7x+LnAuQFNTU9Th5JWZSHX2jx4r+RwNSVOyF5GaFGnCN7MGgmS/2N1/nmsfd78euB6gubm5vBW8y7C8ZRufX7KWzhIKnQEMThmOceUCrTglIrUpylE6BtwIbHL3/4jqOpXQ2pbm/FtbKCXVn/e+Qzhz7iFacUpEal6ULfy5wKeBJ80s0yH+JXe/O8JrFq21Lc3ffPuhopO9GVyddVNWiV5Eal2Uo3RWEhSBrDmtbWm27tjN+m07+cry9RTTi5MA/vWjMzhh+oFK8iJSV2I303bxqi1cdtd63Cm6PMKsicNZvnBeNIGJiEQsVgl/8aotXHLn+pKObUjAj86eU+GIRET6TywSfmtbmkf+9GrJyT6VgKtOm60uHBGpawM+4S9etYWv3LmeYsrWD07CiTMncOTk/Zk4ej+mTxihZC8idW9AJ/xSu3B+f/H7leBFZMAZkAk/6MJpLSnZf+MjmiUrIgPTgEv4y1u2cdHta9lTxBAcAxpSCS47ZZrKF4vIgDVgEn5QB+d1LlrSwp4i15m9+ZwjVb5YRAa8AZHwl7ds44vL1vFWEZm+IVzc8arTZjPv0AMiikxEpHbUfcJvbUuzaOk60h2FJfvGVIJLT57GjIkjVftGRGKl7hP+4kefKyjZJ4ALTjiU0+c0KcmLSCzVdcJvbUvzvQeezrtPAvisFg4XEanvhL91x24GJZOkOzpyvj570khuPOtIJXoREeo84U8aPYQ9Xd27c1IGZx8zmb+ZdiDNU8ZUKTIRkdqTqHYA5RgzrJErTp3J4IYEwxtTDG5I8B8fn80lJ01XshcR6aGuW/gA82dPZO7UsVpxSkSkD3Wf8CFo6SvRi4jkV9ddOiIiUjglfBGRmFDCFxGJCSV8EZGYUMIXEYkJJXwRkZgw98IXComamb0CbKl2HHmMBV6tdhA1SO9L7/Te9E7vTW7Fvi8Hu/u4QnasqYRf68xstbs3VzuOWqP3pXd6b3qn9ya3KN8XdemIiMSEEr6ISEwo4Rfn+moHUKP0vvRO703v9N7kFtn7oj58EZGYUAtfRCQmlPD7YGYHmdkDZrbRzDaY2XnVjqlWmNlgM/ujma0N35vLqx1TLTGzpJk9
YWa/qHYstcTMnjWzJ82sxcxWVzueWmJmo8xsqZk9ZWabzOzoSp5/QJRHjlgHcKG7P25mw4E1Znavu2+sdmA1IA0c5+5tZtYArDSzX7n7qmoHViPOAzYBI6odSA16n7trDP6+rgbucfcFZjYI2K+SJ1cLvw/u/qK7Px4+3kXwH3hidaOqDR5oC582hD+6KQSY2STgJOCGasci9cHMRgLzgBsB3L3d3V+r5DWU8ItgZpOBI4BHqxtJ7Qi7LVqAl4F73V3vTeA7wCKgq68dY8iB35jZGjM7t9rB1JApwCvAj8OuwBvMbGglL6CEXyAzGwYsA85399erHU+tcPdOd58NTAKOMrMZ1Y6p2szsZOBld19T7Vhq1DHu/i7gQ8A/mdm8agdUI1LAu4Dr3P0I4A3gXyp5ASX8AoT908uAxe7+82rHU4vCr54PAB+sdiw1YC4w38yeBW4FjjOz/6puSLXD3beFf74M3AEcVd2IasZWYGvWt+SlBB8AFaOE3wczM4I+tU3u/h/VjqeWmNk4MxsVPh4CnAA8Vd2oqs/dL3b3Se4+GfgEcL+7f6rKYdUEMxsaDn4g7K74ALC+ulHVBnd/CXjezP4y3HQ8UNHBIRql07e5wKeBJ8O+aoAvufvdVYypVrwDuNnMkgSNhyXuriGIks944I6gHUUK+Km731PdkGrKQmBxOELnz8DZlTy5ZtqKiMSEunRERGJCCV9EJCaU8EVEYkIJX0QkJpTwRURiQglf8jKzs8xsQrXjyCeM8bt97PM5Mzszx/bJZlbUOHAz+5mZrTOzC4qNtcjrNJjZv5vZ02b2uJk9YmYfCl971syWZe27wMxuiiiOj4WVGx8ws2Yzuybc3uf7LrVF4/ClL2cRTIx5ocpxlMXdf1CJ85jZgcCR7j41x2spd++oxHVCXyeY6zDD3dNmNh54b9br7zazaeVWbg0nF5q791b35++Bf3D3leFzlTSuU2rhx5CZfSqsY99iZv8ZFkBLmtlNZrY+rFV+gZktAJoJJoK0hLNpezvnUWEL9Akz+0NmtqCZrTKz6Vn7PRi2EseZ2b1hHf0bzGyLmY3tI+4HzezqMJb1ZrbPlPywxX5/2AL/rZk1hdu/amYXhY/fHdbwXwv8U9axvzOz2VnQD22HAAAEa0lEQVTPV5rZrB6X+A0wMYzh2DCm74R13c/Lc/2bzOy68P34s5n9tZn9KGw535Tj99gP+AdgobunAdx9u7svydrtKuCSPt6zs8xseRjn02Z2Wdb79N9mdgvBB/pBZvbJ8O9+vZl9M9zvUuAY4EYzuzKMe5/JdeHf5zIzeyz8mZsvLqkSd9dPjH6AdwIrgIbw+feBM4F3E1S7zOw3KvzzQaC5gPOOAFLh4/cDy8LHFwCXh4/fAfx3+Pi7wMXh4w8SVFAc28c1HgR+GD6eB6wPH58FfDd8vAL4u/DxOcCd4eOvAheFj9cB88LHV2ad5++A74SPDwVW54hhcmb/rJi+n/W8t+vfRFBXx4APA68DhxM0utYAs3tcZybwRJ734lmCWaubgKnAAuCmHPudBbwIjAGGECT35vD36ALeE+43AXgOGEfwzf9+4CM9/w0Afw38Isf7/lOComgATQSlSKr+710/3X/Uwo+f4wmS+2NhqYjjgUMIpnEfYmbXmtkHCRJSMUYCt4f94d8GMq36JQTJCOA0goJQELQabwXwYGr9jgKv87PwmN8BIyys5ZPlaILkA/CT8Dp7hfuPCo/P7JNxO3CyBcXyziFI0oW4rcDrr/AgIz4JbHf3Jz3oRtlAkICL1UnwgXVxH/vd6+6t7r4b+HlWTFv87cVqjgQedPdXPOiWWkzwoVqo9wPfDf9N3UXwdzOsiOOlH6gPP34MuNnd90kSYffF3wCfI0jO5xRx3q8DD7j7Ry1YN+BBCCojmlmrmc0EPh6euxw9a4FUrDaIu79pZvcStMBPI/hgLMQbBe6XDv/synqced7z/+JmoMnMRnj+ctw/IUj4+W489/ae
FRp3IRIE3xbequA5pcLUwo+f3wILzOwAADPb38wODvvPE+6+DPgyb5dl3QUML+C8I4Ft4eOzerx2G8FiICPdfV247fcESRUz+wAwusD4Px4ecwyw09139nj9DwQVKgHOAB7OftGDMs6vhcdn9sl2A3AN8Ji7F/qto+DrF8rd3ySo0nq1BYW0Mv3kH+ux3x6Cb1T5RgydEP49DwE+QvDe9/RH4L1mNtaCYnifBB4qIuTfEBT+Iox1dp59pUqU8GPGgxEdXyZYcWgdcC9B3/pE4MHwK/l/8XY3wU3ADzI3bc3sa2Y2P8eprwD+zcyeYN/W6lKCJJh9w/Fy4ANhF9DHgJcIPlwws7ut96Ggb4XX+AHB6JGeFgJnh7/bpwnWle3pbOB74e9q2S94sGjJ68CPe7l+Xwq5fqG+TLAC0sbwffoFubvabiT/t/U/EqznsI7g3so+o2zc/UWCxTYeANYCa9x9eRGx/jPQHN6s3kj53+QkAqqWKVVhZo1Ap7t3mNnRBKv85G0VmtmDBDdeIxsWGH7QPAgc5r0PU6wbZnYWwQ3X/1ftWKT61Icv1dIELDGzBNBOMASxqiyYmPUN4PMDIdmL9KQWvohITKgPX0QkJpTwRURiQglfRCQmlPBFRGJCCV9EJCaU8EVEYuL/AwAdTVsfmGjjAAAAAElFTkSuQmCC\n",
3651
      "text/plain": [
3652
       "<Figure size 432x288 with 1 Axes>"
3653
      ]
3654
     },
3655
     "metadata": {},
3656
     "output_type": "display_data"
3657
    }
3658
   ],
3659
   "source": [
3660
    "plt.figure(figsize=(20,5))\n",
3661
    "plt.subplot(131)\n",
3662
    "tmp = plt.hist(est_ploidies[\"avg_pl\"],bins=30)\n",
3663
    "plt.title(\"est. avg. ploidy from CN profile\")\n",
3664
    "plt.subplot(132)\n",
3665
    "tmp = plt.hist(sorted(list(GDSC_Ploidies[\"average_ploidy\"].values)),bins=30)\n",
3666
    "plt.title(\"PICNIC avg. pl.\")\n",
3667
    "plt.subplot(133)\n",
3668
    "tmp = plt.hist(est_ploidies[\"median_pl\"],bins=30)\n",
3669
    "plt.title(\"est. median ploidy\")\n",
3670
    "\n",
3671
    "tmp = df_ploidies.plot.scatter(x = \"est. avg. ploidy from CN profile\",y=\"PICNIC avg. pl.\")"
3672
   ]
3673
  },
3674
  {
3675
   "cell_type": "code",
3676
   "execution_count": 20,
3677
   "metadata": {},
3678
   "outputs": [
3679
    {
3680
     "data": {
3681
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAEcCAYAAADN+K/qAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3X+cXFV9//HXmyQmQGJQwCAQiT+oBoP8SL5gMUKWoFKgylexNv7EpqbxYbdY6ENi0wqkphKrVgtKFSMJgonID8FQ8gXLrjRYwUSCCEuLhYBAEEEICVBM4uf7xzkbJrOzO7OZ2Z3ZO+/n4zGPnZl75tzPnJ35zL3nnnuuIgIzMyue3ZodgJmZDQ0neDOzgnKCNzMrKCd4M7OCcoI3MysoJ3gzs4Jygh8hJC2T9Nlmx9FsA7WDpNMlrRnumFqVpFmSHi55fLekWU0MCUlTJIWk0TWU/VtJ3xxg+QZJJzQ2wmKp2si2M0kbgEnAdmAr8GNgfkT8qplxlZIUwMER8ctmx9JuWrntI+KNzY5hMCLiH5sdw0jnLfhd88cRMR54JfBr4IImxzNklPhzYjYC+Ytbh4j4X+BK4JDe5yRNlHSppN9IelDS3/UmSEkXSbqqpOwSSf+ek+gsSQ/n3dIn8u7nB/pbt6SPSfqlpN9Kuk7S/vn5W3KROyVtkfS+Cq8dJemLeT0PSPrL0t1mSd2SFku6FXgOeI2k/fN6fpvX+7GS+nbqNqnQNbBB0qcl3SPpKUmXSBpXsvwUSeslPS3px5LeVLLsCEk/k7RZ0neBHa/rv2l0oaRNku6VNDs/+V5J68oKninp2n4qmShpqaSNkh6R9FlJo/Ky10n6UV7HEzmumtq+wnqWSfqapBvya26VtJ+kL+e2ulfSESXl95d0Vf58PSDpr0qW7Z7re0rSPcD/KVvXji4NSUdJ+s/c5htzm72kpGxImi/pvlzmq5JU7f3k13ZL+pyk2yU9I+laSS/vp+xAn6tzJV1W8vhDSt+pJyUtLHl+P0nPSdq75LkjcxuNqSXmwooI3wZxAzYAJ+T7ewDLgUtLll8KXAtMAKYA/w3MLSn/38DpwFuBJ4AD87JZwDbgS8BY4DjgWeD1efky4LP5/vH5tUfmshcAt5TEEMDrBngP84F7gAOBlwE/zK8ZnZd3Aw8BbyR1440BbgG+RkqwhwO/AY4vj63kvTxc1ma/ACYDLwduLXkvRwCPA0cDo4CP5PJjgZcADwJ/nWM4jdQt9tl+3tfpuQ17y78P2JTXORb4LTC1pPwdwHv6qesa4OvAnsArgNuBv8jLVgALSRtI44CZtbZ9hfUsy//L6bmum4EHgA/n9vgs0JXL7gasAz6T2+Y1wP3AO/Ly84H/yO93cm7z8v9D72d3OvDm/P+dAvQAnyx7H6uAvYBX5f/3iTW+p27gEWBabr+rgMvysins/Fkb6HN1bsnrDgG2AMfm/+WX8v+69/38G/Dxkhj+Gbig2fmi2bemBzDSbvlLsgV4OiebR4FD87JRwO+AQ0rK/wXQXfL46JxoHgTmlDw/K39g9yx57grg7/P9ZbyYFJcCny8pNz7HMiU/rpbgbyYnq/z4BPom+EUlyyeTjjlMKHnuc8Cy8thK3kt5Yplf8vgk4H/y/YuAfyiL779IP3DH5vZVybIfM3CCLy9/O/ChknUtzvffCDwFjK1QzyTgBWD3kufm8GKivRT4BvnHuey1u5LgLy553An0lDw+FHi65LPzUNnrPw1cku/fT0kSBuZV+D+c0E8cnwSuKXsfpT9cVwALanxP3cD5JY8PIX0vRlGS4Gv4XJ3Liwn+M8DKknJ75jp7E/z7gFtLvoePAUfV810vws1dNLvm1IjYi7TV8ZfAjyTtB+xD2nJ8sKTsg8ABvQ8i4jbSF1GkL02ppyLi2bLX7l9h/fuXriMitgBPlq6niv2B0oPClQ4Qlz63P/DbiNhcFlut
6yuvr/R9HQSclbsBnpb0NOmLv3++PRL5W1vy2oFUKt+7ruXA+3NXw4eAKyLihQp1HET6P24sienrpC15gE+R/n+3K41M+bMqMVXz65L7z1d4PL4krv3L2upvST9I0Pf/2m9bSfoDSaskPSbpGeAfSZ/fUo+V3H+uJI5alMcxpkL9g/lc7fTe8vfkyZLl1wKHSHo18DZgU0TcPoh4C8kJvg4RsT0iriZthcwk7WpvJX0Re72KtLsKgKRPkHYxHyUlilIvk7Rn2WsfrbDqR0vXkV+zd+l6qthI6p7pNblCmdIk+SjwckkTymLrXd+zpO6nXvtVqK90HaXv61ekreq9Sm57RMSKHOcBZX2/r+rvTWWVyj8KEBE/IW31vRV4P/Dtfur4FWkLfp+SmF4aeRRKRDwWER+LiP1Je2hfk/S6KnENmqRu4L1lcT1Q1lYTIuKkvHwjfdu5PxcB95JG/LyU9ENRUx97jcrj2Er6fpSq9rkqtdN7k7QH6TMP7DgedgXwQdKPd3//27biBF8HJe8i9WP3RMR20odssaQJkg4CzgQuy+X/gNSn2vsh/JSkw8uqPU/SSyS9FTgF+F6FVa8APirpcEljSVtft0XEhrz816T+2f5cAZwh6QBJewFnD/Q+Iw0B/THwOUnjlA6Czu19X8B64CRJL897Mp8E9pP0vKQtpC/m5/JW48uBlaTkAnAx8HFJKyU9lA80bpT0VdLxim3AE5IelzQHOCq35Z/nBEh+HKQt7FcAfyXpEEn/STrQ+FVJP5d0JumLfyGwNSIqjpmPiI3AjcAXJb1U0m6SXivpuLyu90rq/YF8ivRj+Pv8uFrb1+N2YLOks/MB1VGSpknqPZh6BfBpSS/L8XUOUNcE4Blgi6Q3AB9vcKwfzP+DPYBFwJX5+7FDDZ+rUlcCp0iamQ8GL6Jv/rqU1E33TpzgASf4XfWDnLieARYDH4mIu/OyTtIW7f3AGuA7wLeURqhcBiyJiDsj4j7SVtO3c5KGtEv8FGnL5nJSv3VvItwhIn4I/D3p4NVG4LXAn5YUORdYnnfj/6RC/BeTEtjPSQca/42USLdXKNtrDqn/9FHSAchzchyQvkx3kvp4bwS+m5/vHU76KGmL+Me5XZ4Hrs9lfp7b649IB/Q2Az8lHac4Ang3qWtgX1J/89UDxAhwG+kA4t3AG4D35y3U9wIzgO+TDv5VSiKlPkw6kHkP6X9yJWlYLKQfjdvyZ+A64IyIuD8vO5eStpf0qvyjVW3Po6qcIE8hHYx8gLRF/E1gYi5yHqmL4wHS/2GgJPc3pL2YzaTPw3cHKLsTSW/N730g3yYdX3iM1JX5V/2UG+hztUP+fn2C9H3aSPqfPFxW5lbSD+3PIqJaV157aPZBAN/SjbIDk8O87j8CHmxwnRt48QDYBtLW5ar8uBv483z/z0lbveOr1LWAlPT3Knldd0mZHQc3Scn7+n7q2p2U1A4uee5lpBEjvyEljlW8OLrpfcDasjr+Grgu398b+AHpx/6npD20NTW20emkEUUXkkb73AvMLlm+o51G0q2ZcZMGEIy4Nhuqm7fg21DevT9J0mhJBwDnkLaehsooUtfKHRWWnQCsjnSgeCBrSYnjb2pY3wmkLe5KPg78NNIeVK/dgEtIxzVeRdrDuDAv+wHwekkHl5R/P2lLEuCrpD2Q/UhDPD9SQ3yljgb+h3QA8hzgavUzZtwGlruqjmQQeyNF5wTfnkTanX+KlHR7SMPQGu37eaTHK0ldMZVOPd+btMtdi88AnZL2rVKuYp1K00ycAZxV+nxEPBkRV0XEc5FGdCwmDdMkIp4jjdCYk+s4mNT1c53SiU/vIXUrPBcR95BG6gzG48CXI2JrRHyXNET05EHW0fYkLSedz/HJ2HlUTlvzXDQtIiK62Xlky1Cu6znKznIcIqdGhf7UMk/yYt/2gCLiF5JWkbpregZbZ0RMqVQ4Hwj8Z+BEUncNwARJoyL1e38H+CLpwN77ge9HxHP5gPJo
qg85HchAwzpHpIiY1YR1DnbPqS14C96a7YfAO8qGhw7kHOBjDDwG/4ekLetanQW8Hjg60gHZY/PzvcMGbwL2zSOe5vBi98xvSAenqw05HUi/wzrN6uUEb832bdJW71WS3pCHJO6tNCfPSeWFI83S+F36H5UB6UfgGEn/lLeye+ePuSwPCy03gdTv/nTu/z6nbJ1bScNV/4k0DcBN+fntpFE950raIw83/PCg3v2LwzrHSHovMJU0qsmsbk7w1lSRziQ9gTSC5CbSaJTbSQcdb+vnZYtIp6r3V+f/AH9IGn53t6RNpCGla0kjaMp9mTS65gngJ8DqCmW+k+P8XkRsK3n+L0nDFB8j/VitIA0JBXbMwd7vpHGk93hwXvdi4LSIeLK8UI1DE812op27/8ysHpKWAPvV0ics6XTSkL6ZQx6YtSVvwZvVIXcrvSmf1XwU6UzMoRxyalYzj6Ixq88EUrfM/qQTtr5IGlZp1nTuojEzKyh30ZiZFZQTvJlZQQ1JH/w+++wTU6ZMGYqqB+XZZ59lzz1rPX+mPbhN+nKb9OU2qaxV2mXdunVPRES1KTuGJsFPmTKFtWvXDkXVg9Ld3c2sWbOaHUZLcZv05Tbpy21SWau0i6SapkOuqYtG0l6SrlS6wnuPpD+sLzwzMxtqtW7Bf4U0petp+Woqe1R7gZmZNVfVBC9pImnypdMBIuJ3pOtamplZC6uli+bVpFnzLpF0h6RvDmLmPzMza5KqJzpJmkGagOktEXGbpK8Az0TE35eVmwfMA5g0adL0lStXDlHItduyZQvjx49vdhgtxW3Sl9ukL7dJZa3SLh0dHesiYka1crUk+P2An/ReLEHSW4EFEdHvVWdmzJgRHkXTmtwmfblN+nKbVNYq7SKppgRftYsmIh4DfiXp9fmp2aQrzZuZWQurdRRNJ3B5HkFzP/DRoQvJzMwaoaYEHxHrgaq7A2ZmrejQ5Yc2rrLBXla9grs+clf9ldTA0wWbWeFt7jmfDef3e9iwZo3og5+y4Pq646iVJxszMysoJ3gzs4JygjczKygneDOzgnKCNzMrKCd4M7OCcoI3MysoJ3gzs4LyiU5m1hYadoLR6vrqmbj7mMbEUQMneDMrvEacxQrpR6JRdQ0Hd9GYmRWUE7yZWUE5wZuZFZQTvJlZQTnBm5kVlBO8mVlBOcGbmRWUE7yZWUH5RCczM0BSbeWWDLw8IhoQTWN4C97MjJSYq926urqqlmklTvBmZgXlBG9mVlBO8GZmBeUEb2ZWUE7wZmYF5QRvZlZQTvBmZgXlBG9mVlA+k9WsDdV61mY1rXZij+3MW/Bmbaja2ZgHnb2qpjM7rbU5wZuZFZQTvJlZQTnBm5kVlBO8mVlBOcGbmRWUE7yZWUHVNA5e0gZgM7Ad2BYRM4YyKDMzq99gTnTqiIgnhiwSMzNrKHfRmJkVlGo5G03SA8BTQABfj4hvVCgzD5gHMGnSpOkrV65scKiDt2XLFsaPH9/sMFqK26Qvt0lfp69+lmUn7tnsMFpOq3xWOjo61tXSVV5rF83MiHhE0iuAmyTdGxG3lBbISf8bADNmzIhZs2YNNuaG6+7uphXiaCVuk77cJhWsvt5tUsFI+6zUlOAj4pH893FJ1wBHAbcM/Coza4bDzruRTc9vrbueKQuur7uOibuP4c5z3l53PbZrqiZ4SXsCu0XE5nz/7cCiIY/MzHbJpue3suH8k+uqo1Fbqo34kbBdV8sW/CTgmjy96GjgOxGxekijMjOzulVN8BFxP3DYMMRiZmYN5GGSZmYF5QRvZlZQTvDWtlasWMG0adOYPXs206ZNY8WKFc0OyayhfE1Wa0srVqxg4cKFLF26lO3btzNq1Cjmzp0LwJw5c5ocnVljeAve2tLixYtZunQpHR0djB49mo6ODpYuXcrixYubHZpZwzjBW1vq6elh5syZOz03c+ZMenp6mhSRWeO5i8ba0tSpU1mzZg0dHR07nluzZg1Tp05tYlSNMWHq
Ag5dvqD+ipY3IhaA+k66sl3nBG9taeHChcydO3dHH3xXVxdz584tRBfN5p7zfSarAU7w1qbmzJnDsmXLmD17NhGBJN72trf5AKsVivvgrS11dnZy880384UvfIEbbriBL3zhC9x88810dnY2OzSzhnGCt7Z08cUXs2TJEs4880zGjRvHmWeeyZIlS7j44oubHZpZw7iLxtrSCy+8wPz583d6bv78+Zx11llNiqixGtL3vbox0wVb8zjBW1saO3Ys8+bNY/369fT09DB16lQOP/xwxo4d2+zQ6lbvAVZIPxCNqMeay1001paOO+44Lr/8co499liuvfZajj32WC6//HKOO+64Zodm1jDegre29Mgjj3DqqafyrW99i4suuoixY8dy6qmnct999zU7NLOGcYK3ttTT08Mdd9zBmDFjdoz53rp1K+PGjWt2aGYN4y4aa0u9Z7KWKsqZrGa9nOCtLfWeydrV1cW2bdt2nMm6cOHCZodm1jDuorG21HvGamdn545RNIsXL/aZrFYoTvDWtubMmcOcOXMaNu+KWatxgrfCk9SQeiKiIfWYDRf3wVvhRcSAt4POXlW1TNGSu6QBbw8uOaVqmUb9cNrQcYI3a0PVfsy6urra7keviJzgzcwKygnezKygnODNzArKCd7MrKCc4M3MCsoJ3sysoJzgzcwKygnezKygnODNzArKCd7MrKCc4M3MCsoJ3sysoJzgzcwKquYEL2mUpDskrRrKgMzMrDEGswV/BtAzVIGYmVlj1ZTgJR0InAx8c2jDMTOzRql1C/7LwKeA3w9hLGZm1kBVr8kq6RTg8YhYJ2nWAOXmAfMAJk2aRHd3d6Ni3GVbtmxpiThaidukMrfJzvw5qWyktYuqXXZL0ueADwHbgHHAS4GrI+KD/b1mxowZsXbt2kbGuUu6u7uZNWtWs8NoKW6TvqYsuJ4N55/c7DBaij8nlbVKu0haFxEzqpWr2kUTEZ+OiAMjYgrwp8DNAyV3MzNrDVW7aMxa2WHn3cim57fWXc+UBdfXXcfE3cdw5zlvr7ses0YZVIKPiG6ge0giMdsFm57fWnf3SqN2uxvxI2HWSD6T1cysoJzgzcwKygnezKygnODNzArKCd7MrKCc4M3MCsoJ3sysoHyik41oE6Yu4NDlC+qvaHkjYoE06apZa3CCLxhJDamn2hxFrWJzz/k+0cmsH+6iKZiIGPB20NmrqpYZKcndzAbmBG9mVlBO8GZmBeUEb2ZWUE7wZmYF5VE0NuI1ZPTK6sbMB2/WSpzgbURrxKX2fMk+Kyp30ZiZFZQTvJlZQTnBm5kVlBO8mVlBOcGbmRVUIRP8ihUrmDZtGrNnz2batGmsWLGi2SGZmQ27wg2TXLFiBQsXLmTp0qVs376dUaNGMXfuXADmzJnT5OjMzIZP4bbgFy9ezNKlS+no6GD06NF0dHSwdOlSFi9e3OzQzMyGVeESfE9PDzNnztzpuZkzZ9LT09OkiMzMmqNwCX7q1KmsWbNmp+fWrFnD1KlTmxSRmVlzFC7BL1y4kLlz59LV1cW2bdvo6upi7ty5LFy4sNmhmZkNq8IdZO09kNrZ2UlPTw9Tp05l8eLFPsBqZm2ncAkeUpKfM2dOw661aWY2EhWui8bMzBIneDOzgnKCNzMrKCd4M7OCKuRB1qI67Lwb2fT81rrracQl7ibuPoY7z3l73fUMB0nVyyypXk9ENCAas+HjBD+CbHp+a92XlmvUyKKGXAd1mFRLzB5tZUXlLhozs4JygjczK6iqCV7SOEm3S7pT0t2SzhuOwMzMrD619MG/ABwfEVskjQHWSLohIn4yxLGZmVkdqib4SEeotuSHY/LNwwnMzFpcTX3wkkZJWg88DtwUEbcNbVhmZlYvDWZsr6S9gGuAzoj4RdmyecA8gEmTJk1fuXJlI+PcJVu2bGH8+PHNDqNhTl/9LMtO3LOuOhrVJo2IpVUU7XPSCG6TylqlXTo6OtZFxIyqBSNiUDfgM8Df
DFRm+vTp0Qq6urqaHUJDHXT2qrrraFSbNCKWVlG0z0kjuE0qa5V2AdZGDfm6ah+8pH2BrRHxtKTdgbcBNZz3Z402YeoCDl2+oP6KljciFoD6Troys6FVyyiaVwLLJY0i9dlfERGrhjYsq2Rzz/k+k9XMalbLKJqfA0cMQyxmZtZAPpPVzKygnODNzArKCd7MrKCc4M3MCsoJ3sysoHzBjxGmIcMTVzfmik5m1tqc4EeQesfAQ/qBaEQ9Ztb63EVjZlZQhUzwnZ2djBs3jo6ODsaNG0dnZ2ezQzIzG3aF66Lp7Ozkwgsv3PH4hRde2PH4ggsuaFZYZmbDrnBb8L3J/JhjjuF73/sexxxzzE7Pm5m1i8IleICjjz6aW2+9lX322Ydbb72Vo48+utkhmZkNu0Im+He84x0DPjYzawcjtg9eUr/LFi1axKJFi2p+TQziqlZmZiPFiN2C7+8KJpMnT65YfvLkyQNdpcrMrHBGbILvz0MPPdQnyU+ePJmHHnqoSRGZmTVH4RI8pCQfERx09ioiwsndzNpSIRO8mZk5wZuZFZYTvJlZQTnBm5kVlBO8mVlBjdgTnayygU4A21FmSfV6fH6A2cjnLfiC6e9krt5bV1dX1TJO7mbF4ARvZlZQTvBmZgXlBG9mVlBO8GZmBeUEb2ZWUE7wZmYF1bLj4A8770Y2Pb+17nqmLLi+rtdP3H0Md57z9rrjMDMbbi2b4Dc9v5UN559cVx3d3d3MmjWrrjrq/YEwM2sWd9GYmRWUE7yZWUE5wZuZFZQTvJlZQTnBm5kVVNUEL2mypC5J90i6W9IZwxGYmZnVp5ZhktuAsyLiZ5ImAOsk3RQR9wxxbGZmVoeqCT4iNgIb8/3NknqAA4AhTfATpi7g0OUL6q9oeb1xANQ3Ht/MrBk0mIs7SJoC3AJMi4hnypbNA+YBTJo0afrKlSvrCuz01c+y7MQ966pjy5YtjB8/vulxtJJGtEnRuE36cptU1irt0tHRsS4iZlQtWMvVffKPwHhgHfDuamWnT58e9Tro7FV119HV1dUScbSSRrRJ0bhN+nKbVNYq7QKsjRrydk1TFUgaA1wFXB4RV+/6787gNGSagNX1z0VjZjYSVU3wSldxXgr0RMSXhj6kpN55aCD9QDSiHjOzkaiWcfBvAT4EHC9pfb6dNMRxmZlZnWoZRbMG0DDEYmZmDeQzWc3MCsoJ3sysoJzgzcwKygnezKygnODNzArKCd7MrKCc4M3MCsoJ3sysoJzgzcwKygnezKygnODNzArKCd7MrKBqmg++FaVZjGsot6R6mRjEVa3MzEaKEbsFX8vVTLq6umq9WpWZWeGM2ARvZmYDc4I3MysoJ3gzs4JygjczKygneDOzgnKCNzMrKCd4M7OCcoI3MysoDcWJPpJ+AzzY8IoHbx/giWYH0WLcJn25Tfpym1TWKu1yUETsW63QkCT4ViFpbUTMaHYcrcRt0pfbpC+3SWUjrV3cRWNmVlBO8GZmBVX0BP+NZgfQgtwmfblN+nKbVDai2qXQffBmZu2s6FvwZmZta8QneEmTJXVJukfS3ZLOqFBGkv5F0i8l/VzSkc2IdbhIGifpdkl35jY5r0KZsZK+m9vkNklThj/S4SdplKQ7JK2qsKzt2kTSBkl3SVovaW2F5W313QGQtJekKyXdK6lH0h+WLR8xbTLiEzywDTgrIg4B3gx8QtIhZWX+CDg43+YBFw1viMPuBeD4iDgMOBw4UdKby8rMBZ6KiNcB/wzUcO2rQjgD6OlnWbu2SUdEHN7P8L92++4AfAVYHRFvAA6j7+dlxLTJiE/wEbExIn6W728m/TMOKCv2LuDSSH4C7CXplcMc6rDJ73NLfjgm38oPtrwLWJ7vXwnMVq3XQRyhJB0InAx8s58ibdcmNWir746kicCxwFKAiPhdRDxdVmzEtMmIT/Cl8i71EcBtZYsOAH5V8vhh+v4IFEruilgPPA7cFBH9tklEbAM2AXsPb5TD7svA
p4Df97O8HdskgBslrZM0r8LydvvuvBr4DXBJ7sr7pqQ9y8qMmDYpTIKXNB64CvhkRDzT7HiaLSK2R8ThwIHAUZKmNTumZpJ0CvB4RKxrdiwtZmZEHEnqdviEpGObHVCTjQaOBC6KiCOAZ4EFzQ1p1xUiwUsaQ0rul0fE1RWKPAJMLnl8YH6u8PLuZRdwYtmiHW0iaTQwEXhyeKMbVm8B3ilpA7ASOF7SZWVl2q1NiIhH8t/HgWuAo8qKtNt352Hg4ZI93itJCb/UiGmTEZ/gcx/pUqAnIr7UT7HrgA/no99vBjZFxMZhC3KYSdpX0l75/u7A24B7y4pdB3wk3z8NuDkKfFJERHw6Ig6MiCnAn5Le7wfLirVVm0jaU9KE3vvA24FflBVrq+9ORDwG/ErS6/NTs4F7yoqNmDYZ3ewAGuAtwIeAu3KfM8DfAq8CiIh/Bf4NOAn4JfAc8NEmxDmcXgkslzSK9CN+RUSskrQIWBsR15F+FL8t6ZfAb0lJr+20eZtMAq7Jx5FHA9+JiNWS5kPbfncAOoHLJb0EuB/46EhtE5/JamZWUCO+i8bMzCpzgjczKygneDOzgnKCNzMrKCd4M7OCcoK3ppB0uqT9h3mdUyT9It+fIelfhnHdsyrNYFlW5p2SKp41KWlLpefNBlKEcfA2Mp1OOqnm0WasPCLWAn2mx22mPBb/umbHYcXhLXhrGEkfzPPQr5f09Tzh2ShJyyT9Is87/teSTgNmkE4mWZ/Ptu2vznMlLZf0H5IelPRuSZ/Pda3O01QgabqkH+VJs/5f7+x++fk7Jd0JfKKk3h1b1JKOkvSfeXKpH/eexZj3Mq7O67lP0udraINlkv5V0lpJ/53nwCkv83JJ31eaS/wnkt5Usr4L8/1X55jukvTZktdeKunUkseXS3pXtbisPTnBW0NImgq8D3hLnuRsO/AB0nz0B0TEtIg4FLgkIq4kbT1/IM9D/nyV6l8LHA+8E7gM6Mp1PQ+cnJP8BcBpETEd+BawOL/2EqAzz43fn3uBt+bJpT4D/GPJssPz+zoUeJ+kyRVeX24KaU6Xk4F/lTSubPl5wB0R8SbSWdeXVqjjK6QJrw4FSk+DX0ra++md2vYY4PoaYrI25C4aa5TZwHTgp/l4vMN0AAACI0lEQVTU991JUxX/AHiNpAtIiejGXaj7hojYKukuYBSwOj9/FymZvh6YBtyU1z0K2Jjn49krIm7J5b9NmjWx3ETS1A4Hk6bPHVOy7N8jYhOApHuAg9h5qthKroiI3wP3SbofeEPZ8pnAewAi4mZJe0t6aVmZt/SWyXEvyeV/JOlrkvbNy6/KUxub9eEEb40iYHlEfLrPAukw4B3AfOBPgD8bZN0vAETE7yVtLZkA7Pekz7CAuyOi/NJqe9VY/z+Q9gr+r9I1BbrL151tp7bvTPn8H7s6H0h/r7sU+CBprpyWnQfFms9dNNYo/w6cJukVsKOf+SBJ+wC7RcRVwN/x4tSrm4EJDVr3fwH7Kl87U9IYSW/MUyU/LWlmLveBfl4/kRenez29AfG8V9Jukl4LvCbHV+o/emORNAt4osI1DG7lxcnOyuNeBnwSICLKZzo028EJ3hoiJ5q/I10d6OfATaRZLQ8AuvNMn5cBvVv4y0j90+sl7S5pkaR37uK6f0ea3ndJPpi6ntQ3DWkL96t5/f1dfu/zwOck3UGNe7VKV/qpdA1TgIeA24EbgPkR8b9ly88Fpud2Op8XpygudQbpAhx3UXa1oIj4NenSlJfUEqu1L88madZAkpYBq/KB5KFaxx6k4w9H9h4fMKvEW/BmI4ikE0hb7xc4uVs13oI3Mysob8GbmRWUE7yZWUE5wZuZFZQTvJlZQTnBm5kVlBO8mVlB/X+KIRNPxs+29gAAAABJRU5ErkJggg==\n",
3682
      "text/plain": [
3683
       "<Figure size 432x288 with 1 Axes>"
3684
      ]
3685
     },
3686
     "metadata": {},
3687
     "output_type": "display_data"
3688
    }
3689
   ],
3690
   "source": [
3691
    "# PICNIC average ploidy vs estimated copy-neutral \n",
3692
    "tmp = df_ploidies.boxplot(column=\"PICNIC avg. pl.\", by = \"est. median. ploidy\" )"
3693
   ]
3694
  },
3695
  {
3696
   "cell_type": "markdown",
3697
   "metadata": {},
3698
   "source": [
3699
    "### Convert gene-level integer CN into log2R-like format in order to make it compatible with TCGA and CCLE\n",
3700
    "\n",
3701
    "1) Copy-neutral state was defined from average ploiy, as median of integer CN values in non-disrupted genes.\n",
3702
    "\n",
3703
    "2) Compute log2(CN/neutral-CN) for min and max CN; keep the value with most extreme estimate\n",
3704
    "\n",
3705
    "3) Replace estimates below thresholds with zeroes. "
3706
   ]
3707
  },
3708
  {
3709
   "cell_type": "code",
3710
   "execution_count": 21,
3711
   "metadata": {},
3712
   "outputs": [
3713
    {
3714
     "data": {
3715
      "text/plain": [
3716
       "3.0"
3717
      ]
3718
     },
3719
     "execution_count": 21,
3720
     "metadata": {},
3721
     "output_type": "execute_result"
3722
    }
3723
   ],
3724
   "source": [
3725
    "estimated_CN = est_ploidies[\"median_pl\"].to_dict()\n",
3726
    "estimated_CN[1287381]"
3727
   ]
3728
  },
3729
  {
3730
   "cell_type": "code",
3731
   "execution_count": null,
3732
   "metadata": {},
3733
   "outputs": [],
3734
   "source": []
3735
  },
3736
  {
3737
   "cell_type": "code",
3738
   "execution_count": 22,
3739
   "metadata": {},
3740
   "outputs": [
3741
    {
3742
     "data": {
3743
      "text/html": [
3744
       "<div>\n",
3745
       "<style scoped>\n",
3746
       "    .dataframe tbody tr th:only-of-type {\n",
3747
       "        vertical-align: middle;\n",
3748
       "    }\n",
3749
       "\n",
3750
       "    .dataframe tbody tr th {\n",
3751
       "        vertical-align: top;\n",
3752
       "    }\n",
3753
       "\n",
3754
       "    .dataframe thead th {\n",
3755
       "        text-align: right;\n",
3756
       "    }\n",
3757
       "</style>\n",
3758
       "<table border=\"1\" class=\"dataframe\">\n",
3759
       "  <thead>\n",
3760
       "    <tr style=\"text-align: right;\">\n",
3761
       "      <th></th>\n",
3762
       "      <th>1287381</th>\n",
3763
       "      <th>924100</th>\n",
3764
       "      <th>910924</th>\n",
3765
       "      <th>687561</th>\n",
3766
       "      <th>1287706</th>\n",
3767
       "      <th>687452</th>\n",
3768
       "      <th>906798</th>\n",
3769
       "      <th>906797</th>\n",
3770
       "      <th>906800</th>\n",
3771
       "      <th>910922</th>\n",
3772
       "      <th>...</th>\n",
3773
       "      <th>909785</th>\n",
3774
       "      <th>909904</th>\n",
3775
       "      <th>909905</th>\n",
3776
       "      <th>687592</th>\n",
3777
       "      <th>1303911</th>\n",
3778
       "      <th>946358</th>\n",
3779
       "      <th>909907</th>\n",
3780
       "      <th>1298146</th>\n",
3781
       "      <th>908452</th>\n",
3782
       "      <th>908450</th>\n",
3783
       "    </tr>\n",
3784
       "  </thead>\n",
3785
       "  <tbody>\n",
3786
       "    <tr>\n",
3787
       "      <th>5S_rRNA</th>\n",
3788
       "      <td>-4.320000</td>\n",
3789
       "      <td>1.807355</td>\n",
3790
       "      <td>2.0</td>\n",
3791
       "      <td>-0.415037</td>\n",
3792
       "      <td>-4.32</td>\n",
3793
       "      <td>-4.320000</td>\n",
3794
       "      <td>-1.00</td>\n",
3795
       "      <td>2.807355</td>\n",
3796
       "      <td>2.169925</td>\n",
3797
       "      <td>-1.00</td>\n",
3798
       "      <td>...</td>\n",
3799
       "      <td>-4.32</td>\n",
3800
       "      <td>2.222392</td>\n",
3801
       "      <td>-4.32</td>\n",
3802
       "      <td>-4.320000</td>\n",
3803
       "      <td>-4.32</td>\n",
3804
       "      <td>1.807355</td>\n",
3805
       "      <td>-4.320000</td>\n",
3806
       "      <td>1.736966</td>\n",
3807
       "      <td>2.169925</td>\n",
3808
       "      <td>-1.584963</td>\n",
3809
       "    </tr>\n",
3810
       "    <tr>\n",
3811
       "      <th>5_8S_rRNA</th>\n",
3812
       "      <td>-0.584963</td>\n",
3813
       "      <td>0.000000</td>\n",
3814
       "      <td>0.0</td>\n",
3815
       "      <td>-0.415037</td>\n",
3816
       "      <td>0.00</td>\n",
3817
       "      <td>-0.584963</td>\n",
3818
       "      <td>0.00</td>\n",
3819
       "      <td>0.000000</td>\n",
3820
       "      <td>0.000000</td>\n",
3821
       "      <td>0.00</td>\n",
3822
       "      <td>...</td>\n",
3823
       "      <td>0.00</td>\n",
3824
       "      <td>0.415037</td>\n",
3825
       "      <td>0.00</td>\n",
3826
       "      <td>-0.584963</td>\n",
3827
       "      <td>-4.32</td>\n",
3828
       "      <td>0.000000</td>\n",
3829
       "      <td>-0.415037</td>\n",
3830
       "      <td>0.000000</td>\n",
3831
       "      <td>0.000000</td>\n",
3832
       "      <td>-0.584963</td>\n",
3833
       "    </tr>\n",
3834
       "    <tr>\n",
3835
       "      <th>7SK</th>\n",
3836
       "      <td>1.000000</td>\n",
3837
       "      <td>0.000000</td>\n",
3838
       "      <td>0.0</td>\n",
3839
       "      <td>-4.320000</td>\n",
3840
       "      <td>-4.32</td>\n",
3841
       "      <td>-0.584963</td>\n",
3842
       "      <td>-4.32</td>\n",
3843
       "      <td>0.584963</td>\n",
3844
       "      <td>0.000000</td>\n",
3845
       "      <td>-4.32</td>\n",
3846
       "      <td>...</td>\n",
3847
       "      <td>-4.32</td>\n",
3848
       "      <td>0.000000</td>\n",
3849
       "      <td>-4.32</td>\n",
3850
       "      <td>-4.320000</td>\n",
3851
       "      <td>-4.32</td>\n",
3852
       "      <td>-0.415037</td>\n",
3853
       "      <td>-4.320000</td>\n",
3854
       "      <td>-4.320000</td>\n",
3855
       "      <td>0.000000</td>\n",
3856
       "      <td>-4.320000</td>\n",
3857
       "    </tr>\n",
3858
       "  </tbody>\n",
3859
       "</table>\n",
3860
       "<p>3 rows × 996 columns</p>\n",
3861
       "</div>"
3862
      ],
3863
      "text/plain": [
3864
       "            1287381   924100   910924    687561   1287706   687452   906798   \\\n",
3865
       "5S_rRNA   -4.320000  1.807355      2.0 -0.415037    -4.32 -4.320000    -1.00   \n",
3866
       "5_8S_rRNA -0.584963  0.000000      0.0 -0.415037     0.00 -0.584963     0.00   \n",
3867
       "7SK        1.000000  0.000000      0.0 -4.320000    -4.32 -0.584963    -4.32   \n",
3868
       "\n",
3869
       "            906797    906800   910922     ...     909785    909904   909905   \\\n",
3870
       "5S_rRNA    2.807355  2.169925    -1.00    ...       -4.32  2.222392    -4.32   \n",
3871
       "5_8S_rRNA  0.000000  0.000000     0.00    ...        0.00  0.415037     0.00   \n",
3872
       "7SK        0.584963  0.000000    -4.32    ...       -4.32  0.000000    -4.32   \n",
3873
       "\n",
3874
       "            687592   1303911   946358    909907    1298146   908452    908450   \n",
3875
       "5S_rRNA   -4.320000    -4.32  1.807355 -4.320000  1.736966  2.169925 -1.584963  \n",
3876
       "5_8S_rRNA -0.584963    -4.32  0.000000 -0.415037  0.000000  0.000000 -0.584963  \n",
3877
       "7SK       -4.320000    -4.32 -0.415037 -4.320000 -4.320000  0.000000 -4.320000  \n",
3878
       "\n",
3879
       "[3 rows x 996 columns]"
3880
      ]
3881
     },
3882
     "execution_count": 22,
3883
     "metadata": {},
3884
     "output_type": "execute_result"
3885
    }
3886
   ],
3887
   "source": [
3888
    "gdsc = gdsc.apply(lambda x : CN2log2R(x,estimated_CN[x.name] ))\n",
3889
    "# drop genes without any determined value\n",
3890
    "gdsc = gdsc.dropna(axis=0,how=\"all\")\n",
3891
    "# fill with zeroes the remaining ones\n",
3892
    "gdsc.fillna(0,inplace=True)\n",
3893
    "gdsc.head(3)"
3894
   ]
3895
  },
3896
  {
3897
   "cell_type": "code",
3898
   "execution_count": 23,
3899
   "metadata": {},
3900
   "outputs": [],
3901
   "source": [
3902
    "\n",
3903
    "gdsc = gdsc.applymap(lambda x :  clean_logR(x, pos_seg_mean_thr, neg_seg_mean_thr))"
3904
   ]
3905
  },
3906
  {
3907
   "cell_type": "code",
3908
   "execution_count": 24,
3909
   "metadata": {},
3910
   "outputs": [
3911
    {
3912
     "name": "stderr",
3913
     "output_type": "stream",
3914
     "text": [
3915
      "Ok: no empty rows detected\n",
3916
      "Ok: no duplicated pairs detected\n",
3917
      "Ok: All Symbol rows are not empty.\n",
3918
      "Ok: All Symbol are mapped to GeneID\n",
3919
      "16 Symbol mapped to multiple GeneID\n",
3920
      "Ok: All GeneID are unique\n",
3921
      "59266 Symbol can be mapped directly to GeneID\n"
3922
     ]
3923
    }
3924
   ],
3925
   "source": [
3926
    "NCBI = pd.read_csv(root_dir+\"Homo_sapiens.gene_info\",sep = \"\\t\")\n",
3927
    "NCBI = NCBI[[\"#tax_id\",\"GeneID\",\"Symbol\",\"Synonyms\",\"type_of_gene\"]]\n",
3928
    "NCBI = NCBI.loc[NCBI[\"#tax_id\"] == 9606]\n",
3929
    "NCBI = NCBI.loc[NCBI[\"type_of_gene\"] != \"unknown\"]\n",
3930
    "ncbi_symbols = parse_mapping_table(NCBI, \"Symbol\",\"GeneID\")"
3931
   ]
3932
  },
3933
  {
3934
   "cell_type": "code",
3935
   "execution_count": 25,
3936
   "metadata": {},
3937
   "outputs": [
3938
    {
3939
     "name": "stderr",
3940
     "output_type": "stream",
3941
     "text": [
3942
      "Ok: no empty rows detected\n",
3943
      "Ok: no duplicated pairs detected\n",
3944
      "Ok: All Synonyms rows are not empty.\n",
3945
      "Ok: All Synonyms are mapped to GeneID\n",
3946
      "3145 Synonyms mapped to multiple GeneID\n",
3947
      "49179 different Synonyms mapped to the same GeneID\n",
3948
      "10839 Synonyms can be mapped directly to GeneID\n"
3949
     ]
3950
    }
3951
   ],
3952
   "source": [
3953
    "ncbi_synonyms = expand(NCBI[[\"Synonyms\",\"GeneID\"]],column=\"Synonyms\",sep=\"|\") \n",
3954
    "ncbi_synonyms = parse_mapping_table(ncbi_synonyms, \"Synonyms\",\"GeneID\")"
3955
   ]
3956
  },
3957
  {
3958
   "cell_type": "code",
3959
   "execution_count": 26,
3960
   "metadata": {},
3961
   "outputs": [
3962
    {
3963
     "name": "stdout",
3964
     "output_type": "stream",
3965
     "text": [
3966
      "Mapped: 24545 \n",
3967
      "\tdirectly via main_mapper 22363 \n",
3968
      "\tvia alternative mapper 766 \n",
3969
      "\tvia one of multiple synonyms in alternative mapper 1416 \n",
3970
      "\tLOC 0 \n",
3971
      "Unmapped: 21587 \n",
3972
      "\trecognized symbols without Entrez ID 0 \n",
3973
      "\tmultiple query_ids map to the same target_id 0 \n",
3974
      "\tquery_ids map to multiple target_ids in the main mapper 0 \n",
3975
      "\tquery_ids map to multiple target_ids in the alternative mapper 76 \n",
3976
      "\tLOC not found in Entrez 0 \n",
3977
      "\tNot found at all: 21511\n",
3978
      "Warning: query IDs mapping to duplicated target IDs in mapping table: 156\n",
3979
      "Warning: query IDs not mapped to any target IDs excluded: 21587\n"
3980
     ]
3981
    },
3982
    {
3983
     "name": "stderr",
3984
     "output_type": "stream",
3985
     "text": [
3986
      "/home/olya/miniconda2/lib/python2.7/site-packages/pandas/core/frame.py:3781: SettingWithCopyWarning: \n",
3987
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
3988
      "\n",
3989
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
3990
      "  return super(DataFrame, self).rename(**kwargs)\n",
3991
      "IDs mapped to multiple target IDs are kept:\n",
3992
      " [143872, 286464, 140290, 414212, 414213, 51463, 642826, 84631, 574445, 399761, 100132115, 647060, 284565, 6551, 161176, 341019, 4253, 9502, 442416, 51236, 643749, 54438, 728113, 100302179, 414761, 29099, 729438, 256815, 10160, 645425, 653234, 644019, 26165, 3255, 644509, 2749, 653505, 653067, 643479, 100462820, 100418977, 26824, 79817, 6218, 728695, 100034743, 221262, 647507, 677844, 728917, 26583, 100289124, 84316, 200030, 768096, 642658, 23523, 401508, 23334, 119016, 106478953, 84458, 1517, 246126, 26095, 100033392, 92017, 374, 26871, 100132948, 125050, 387707, 653308, 79741, 728798]\n",
3993
      "mapper.py:204: SettingWithCopyWarning: \n",
3994
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
3995
      "\n",
3996
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
3997
      "  df.sort_index(inplace=True)\n"
3998
     ]
3999
    },
4000
    {
4001
     "data": {
4002
      "text/html": [
4003
       "<div>\n",
4004
       "<style scoped>\n",
4005
       "    .dataframe tbody tr th:only-of-type {\n",
4006
       "        vertical-align: middle;\n",
4007
       "    }\n",
4008
       "\n",
4009
       "    .dataframe tbody tr th {\n",
4010
       "        vertical-align: top;\n",
4011
       "    }\n",
4012
       "\n",
4013
       "    .dataframe thead th {\n",
4014
       "        text-align: right;\n",
4015
       "    }\n",
4016
       "</style>\n",
4017
       "<table border=\"1\" class=\"dataframe\">\n",
4018
       "  <thead>\n",
4019
       "    <tr style=\"text-align: right;\">\n",
4020
       "      <th></th>\n",
4021
       "      <th>1287381</th>\n",
4022
       "      <th>924100</th>\n",
4023
       "      <th>910924</th>\n",
4024
       "      <th>687561</th>\n",
4025
       "      <th>1287706</th>\n",
4026
       "      <th>687452</th>\n",
4027
       "      <th>906798</th>\n",
4028
       "      <th>906797</th>\n",
4029
       "      <th>906800</th>\n",
4030
       "      <th>910922</th>\n",
4031
       "      <th>...</th>\n",
4032
       "      <th>909785</th>\n",
4033
       "      <th>909904</th>\n",
4034
       "      <th>909905</th>\n",
4035
       "      <th>687592</th>\n",
4036
       "      <th>1303911</th>\n",
4037
       "      <th>946358</th>\n",
4038
       "      <th>909907</th>\n",
4039
       "      <th>1298146</th>\n",
4040
       "      <th>908452</th>\n",
4041
       "      <th>908450</th>\n",
4042
       "    </tr>\n",
4043
       "  </thead>\n",
4044
       "  <tbody>\n",
4045
       "    <tr>\n",
4046
       "      <th>1</th>\n",
4047
       "      <td>0.000000</td>\n",
4048
       "      <td>0.000000</td>\n",
4049
       "      <td>0.0</td>\n",
4050
       "      <td>0.000000</td>\n",
4051
       "      <td>0.000000</td>\n",
4052
       "      <td>0.000000</td>\n",
4053
       "      <td>0.000000</td>\n",
4054
       "      <td>0.584963</td>\n",
4055
       "      <td>0.0</td>\n",
4056
       "      <td>0.0</td>\n",
4057
       "      <td>...</td>\n",
4058
       "      <td>0.0</td>\n",
4059
       "      <td>-0.584963</td>\n",
4060
       "      <td>0.584963</td>\n",
4061
       "      <td>0.415037</td>\n",
4062
       "      <td>0.000000</td>\n",
4063
       "      <td>0.000000</td>\n",
4064
       "      <td>0.0</td>\n",
4065
       "      <td>0.000000</td>\n",
4066
       "      <td>0.584963</td>\n",
4067
       "      <td>0.415037</td>\n",
4068
       "    </tr>\n",
4069
       "    <tr>\n",
4070
       "      <th>2</th>\n",
4071
       "      <td>0.000000</td>\n",
4072
       "      <td>0.584963</td>\n",
4073
       "      <td>0.0</td>\n",
4074
       "      <td>-0.415037</td>\n",
4075
       "      <td>0.321928</td>\n",
4076
       "      <td>0.000000</td>\n",
4077
       "      <td>0.584963</td>\n",
4078
       "      <td>0.584963</td>\n",
4079
       "      <td>0.0</td>\n",
4080
       "      <td>0.0</td>\n",
4081
       "      <td>...</td>\n",
4082
       "      <td>0.0</td>\n",
4083
       "      <td>-0.584963</td>\n",
4084
       "      <td>0.000000</td>\n",
4085
       "      <td>0.000000</td>\n",
4086
       "      <td>0.736966</td>\n",
4087
       "      <td>0.321928</td>\n",
4088
       "      <td>0.0</td>\n",
4089
       "      <td>-0.584963</td>\n",
4090
       "      <td>0.000000</td>\n",
4091
       "      <td>0.736966</td>\n",
4092
       "    </tr>\n",
4093
       "    <tr>\n",
4094
       "      <th>9</th>\n",
4095
       "      <td>-0.584963</td>\n",
4096
       "      <td>0.584963</td>\n",
4097
       "      <td>0.0</td>\n",
4098
       "      <td>-0.415037</td>\n",
4099
       "      <td>-0.415037</td>\n",
4100
       "      <td>-0.584963</td>\n",
4101
       "      <td>0.000000</td>\n",
4102
       "      <td>0.000000</td>\n",
4103
       "      <td>0.0</td>\n",
4104
       "      <td>0.0</td>\n",
4105
       "      <td>...</td>\n",
4106
       "      <td>0.0</td>\n",
4107
       "      <td>-0.584963</td>\n",
4108
       "      <td>0.000000</td>\n",
4109
       "      <td>0.415037</td>\n",
4110
       "      <td>0.000000</td>\n",
4111
       "      <td>0.000000</td>\n",
4112
       "      <td>-1.0</td>\n",
4113
       "      <td>-0.584963</td>\n",
4114
       "      <td>0.000000</td>\n",
4115
       "      <td>0.000000</td>\n",
4116
       "    </tr>\n",
4117
       "  </tbody>\n",
4118
       "</table>\n",
4119
       "<p>3 rows × 996 columns</p>\n",
4120
       "</div>"
4121
      ],
4122
      "text/plain": [
4123
       "    1287381   924100   910924    687561    1287706   687452    906798   \\\n",
4124
       "1  0.000000  0.000000      0.0  0.000000  0.000000  0.000000  0.000000   \n",
4125
       "2  0.000000  0.584963      0.0 -0.415037  0.321928  0.000000  0.584963   \n",
4126
       "9 -0.584963  0.584963      0.0 -0.415037 -0.415037 -0.584963  0.000000   \n",
4127
       "\n",
4128
       "    906797   906800   910922     ...     909785    909904    909905   \\\n",
4129
       "1  0.584963      0.0      0.0    ...         0.0 -0.584963  0.584963   \n",
4130
       "2  0.584963      0.0      0.0    ...         0.0 -0.584963  0.000000   \n",
4131
       "9  0.000000      0.0      0.0    ...         0.0 -0.584963  0.000000   \n",
4132
       "\n",
4133
       "    687592    1303911   946358   909907    1298146   908452    908450   \n",
4134
       "1  0.415037  0.000000  0.000000      0.0  0.000000  0.584963  0.415037  \n",
4135
       "2  0.000000  0.736966  0.321928      0.0 -0.584963  0.000000  0.736966  \n",
4136
       "9  0.415037  0.000000  0.000000     -1.0 -0.584963  0.000000  0.000000  \n",
4137
       "\n",
4138
       "[3 rows x 996 columns]"
4139
      ]
4140
     },
4141
     "execution_count": 26,
4142
     "metadata": {},
4143
     "output_type": "execute_result"
4144
    }
4145
   ],
4146
   "source": [
4147
    "gdsc,query2target,not_mapped  = apply_mappers(gdsc, ncbi_symbols, ncbi_synonyms, verbose = True,handle_duplicates = \"keep\")\n",
4148
    "gdsc.head(3)"
4149
   ]
4150
  },
4151
  {
4152
   "cell_type": "code",
4153
   "execution_count": 27,
4154
   "metadata": {
4155
    "scrolled": true
4156
   },
4157
   "outputs": [
4158
    {
4159
     "name": "stdout",
4160
     "output_type": "stream",
4161
     "text": [
4162
      "75 duplicated IDs in 156 rows found.\n",
4163
      "duplicate rows removed due to low correlation of duplicated profiles 25\n",
4164
      "Merged  131 duplicated rows into 63 rows\n"
4165
     ]
4166
    }
4167
   ],
4168
   "source": [
4169
    "gdsc = handle_dups(gdsc,corr_thr = 0.75)"
4170
   ]
4171
  },
4172
  {
4173
   "cell_type": "code",
4174
   "execution_count": 28,
4175
   "metadata": {},
4176
   "outputs": [
4177
    {
4178
     "data": {
4179
      "text/html": [
4180
       "<div>\n",
4181
       "<style scoped>\n",
4182
       "    .dataframe tbody tr th:only-of-type {\n",
4183
       "        vertical-align: middle;\n",
4184
       "    }\n",
4185
       "\n",
4186
       "    .dataframe tbody tr th {\n",
4187
       "        vertical-align: top;\n",
4188
       "    }\n",
4189
       "\n",
4190
       "    .dataframe thead th {\n",
4191
       "        text-align: right;\n",
4192
       "    }\n",
4193
       "</style>\n",
4194
       "<table border=\"1\" class=\"dataframe\">\n",
4195
       "  <thead>\n",
4196
       "    <tr style=\"text-align: right;\">\n",
4197
       "      <th></th>\n",
4198
       "      <th>683665</th>\n",
4199
       "      <th>683667</th>\n",
4200
       "      <th>684052</th>\n",
4201
       "      <th>684055</th>\n",
4202
       "      <th>684057</th>\n",
4203
       "      <th>684059</th>\n",
4204
       "      <th>684062</th>\n",
4205
       "      <th>684072</th>\n",
4206
       "      <th>684681</th>\n",
4207
       "      <th>687448</th>\n",
4208
       "      <th>...</th>\n",
4209
       "      <th>1659818</th>\n",
4210
       "      <th>1659819</th>\n",
4211
       "      <th>1659823</th>\n",
4212
       "      <th>1659928</th>\n",
4213
       "      <th>1659929</th>\n",
4214
       "      <th>1660034</th>\n",
4215
       "      <th>1660035</th>\n",
4216
       "      <th>1660036</th>\n",
4217
       "      <th>1674021</th>\n",
4218
       "      <th>1789883</th>\n",
4219
       "    </tr>\n",
4220
       "    <tr>\n",
4221
       "      <th>gene_id</th>\n",
4222
       "      <th></th>\n",
4223
       "      <th></th>\n",
4224
       "      <th></th>\n",
4225
       "      <th></th>\n",
4226
       "      <th></th>\n",
4227
       "      <th></th>\n",
4228
       "      <th></th>\n",
4229
       "      <th></th>\n",
4230
       "      <th></th>\n",
4231
       "      <th></th>\n",
4232
       "      <th></th>\n",
4233
       "      <th></th>\n",
4234
       "      <th></th>\n",
4235
       "      <th></th>\n",
4236
       "      <th></th>\n",
4237
       "      <th></th>\n",
4238
       "      <th></th>\n",
4239
       "      <th></th>\n",
4240
       "      <th></th>\n",
4241
       "      <th></th>\n",
4242
       "      <th></th>\n",
4243
       "    </tr>\n",
4244
       "  </thead>\n",
4245
       "  <tbody>\n",
4246
       "    <tr>\n",
4247
       "      <th>1</th>\n",
4248
       "      <td>0.0</td>\n",
4249
       "      <td>0.000000</td>\n",
4250
       "      <td>0.0</td>\n",
4251
       "      <td>0.000000</td>\n",
4252
       "      <td>-0.415037</td>\n",
4253
       "      <td>0.0</td>\n",
4254
       "      <td>-0.415037</td>\n",
4255
       "      <td>0.000000</td>\n",
4256
       "      <td>0.415037</td>\n",
4257
       "      <td>0.0</td>\n",
4258
       "      <td>...</td>\n",
4259
       "      <td>0.0</td>\n",
4260
       "      <td>0.0</td>\n",
4261
       "      <td>0.000000</td>\n",
4262
       "      <td>0.0</td>\n",
4263
       "      <td>-0.415037</td>\n",
4264
       "      <td>0.0</td>\n",
4265
       "      <td>-0.584963</td>\n",
4266
       "      <td>0.000000</td>\n",
4267
       "      <td>0.000000</td>\n",
4268
       "      <td>0.321928</td>\n",
4269
       "    </tr>\n",
4270
       "    <tr>\n",
4271
       "      <th>2</th>\n",
4272
       "      <td>0.0</td>\n",
4273
       "      <td>0.000000</td>\n",
4274
       "      <td>0.0</td>\n",
4275
       "      <td>0.584963</td>\n",
4276
       "      <td>0.000000</td>\n",
4277
       "      <td>0.0</td>\n",
4278
       "      <td>0.000000</td>\n",
4279
       "      <td>0.584963</td>\n",
4280
       "      <td>0.000000</td>\n",
4281
       "      <td>0.0</td>\n",
4282
       "      <td>...</td>\n",
4283
       "      <td>0.0</td>\n",
4284
       "      <td>0.0</td>\n",
4285
       "      <td>0.000000</td>\n",
4286
       "      <td>0.0</td>\n",
4287
       "      <td>0.000000</td>\n",
4288
       "      <td>0.0</td>\n",
4289
       "      <td>0.000000</td>\n",
4290
       "      <td>0.000000</td>\n",
4291
       "      <td>0.584963</td>\n",
4292
       "      <td>-0.415037</td>\n",
4293
       "    </tr>\n",
4294
       "    <tr>\n",
4295
       "      <th>9</th>\n",
4296
       "      <td>0.0</td>\n",
4297
       "      <td>0.321928</td>\n",
4298
       "      <td>0.0</td>\n",
4299
       "      <td>0.584963</td>\n",
4300
       "      <td>0.584963</td>\n",
4301
       "      <td>0.0</td>\n",
4302
       "      <td>0.321928</td>\n",
4303
       "      <td>0.000000</td>\n",
4304
       "      <td>-0.584963</td>\n",
4305
       "      <td>0.0</td>\n",
4306
       "      <td>...</td>\n",
4307
       "      <td>0.0</td>\n",
4308
       "      <td>0.0</td>\n",
4309
       "      <td>-0.415037</td>\n",
4310
       "      <td>0.0</td>\n",
4311
       "      <td>0.000000</td>\n",
4312
       "      <td>-1.0</td>\n",
4313
       "      <td>-1.584963</td>\n",
4314
       "      <td>0.000000</td>\n",
4315
       "      <td>-1.000000</td>\n",
4316
       "      <td>-1.000000</td>\n",
4317
       "    </tr>\n",
4318
       "    <tr>\n",
4319
       "      <th>10</th>\n",
4320
       "      <td>0.0</td>\n",
4321
       "      <td>0.321928</td>\n",
4322
       "      <td>0.0</td>\n",
4323
       "      <td>0.584963</td>\n",
4324
       "      <td>0.584963</td>\n",
4325
       "      <td>0.0</td>\n",
4326
       "      <td>0.321928</td>\n",
4327
       "      <td>0.000000</td>\n",
4328
       "      <td>-0.584963</td>\n",
4329
       "      <td>0.0</td>\n",
4330
       "      <td>...</td>\n",
4331
       "      <td>0.0</td>\n",
4332
       "      <td>0.0</td>\n",
4333
       "      <td>-0.415037</td>\n",
4334
       "      <td>0.0</td>\n",
4335
       "      <td>0.000000</td>\n",
4336
       "      <td>-1.0</td>\n",
4337
       "      <td>-1.584963</td>\n",
4338
       "      <td>0.000000</td>\n",
4339
       "      <td>-1.000000</td>\n",
4340
       "      <td>-1.000000</td>\n",
4341
       "    </tr>\n",
4342
       "    <tr>\n",
4343
       "      <th>12</th>\n",
4344
       "      <td>0.0</td>\n",
4345
       "      <td>-1.000000</td>\n",
4346
       "      <td>0.0</td>\n",
4347
       "      <td>0.000000</td>\n",
4348
       "      <td>-1.000000</td>\n",
4349
       "      <td>0.0</td>\n",
4350
       "      <td>-0.415037</td>\n",
4351
       "      <td>0.000000</td>\n",
4352
       "      <td>0.000000</td>\n",
4353
       "      <td>0.0</td>\n",
4354
       "      <td>...</td>\n",
4355
       "      <td>-1.0</td>\n",
4356
       "      <td>0.0</td>\n",
4357
       "      <td>0.321928</td>\n",
4358
       "      <td>0.0</td>\n",
4359
       "      <td>-0.415037</td>\n",
4360
       "      <td>0.0</td>\n",
4361
       "      <td>0.415037</td>\n",
4362
       "      <td>0.584963</td>\n",
4363
       "      <td>0.000000</td>\n",
4364
       "      <td>0.000000</td>\n",
4365
       "    </tr>\n",
4366
       "  </tbody>\n",
4367
       "</table>\n",
4368
       "<p>5 rows × 996 columns</p>\n",
4369
       "</div>"
4370
      ],
4371
      "text/plain": [
4372
       "         683665    683667   684052    684055    684057   684059    684062   \\\n",
4373
       "gene_id                                                                      \n",
4374
       "1            0.0  0.000000      0.0  0.000000 -0.415037      0.0 -0.415037   \n",
4375
       "2            0.0  0.000000      0.0  0.584963  0.000000      0.0  0.000000   \n",
4376
       "9            0.0  0.321928      0.0  0.584963  0.584963      0.0  0.321928   \n",
4377
       "10           0.0  0.321928      0.0  0.584963  0.584963      0.0  0.321928   \n",
4378
       "12           0.0 -1.000000      0.0  0.000000 -1.000000      0.0 -0.415037   \n",
4379
       "\n",
4380
       "          684072    684681   687448     ...     1659818  1659819   1659823  \\\n",
4381
       "gene_id                                 ...                                  \n",
4382
       "1        0.000000  0.415037      0.0    ...         0.0      0.0  0.000000   \n",
4383
       "2        0.584963  0.000000      0.0    ...         0.0      0.0  0.000000   \n",
4384
       "9        0.000000 -0.584963      0.0    ...         0.0      0.0 -0.415037   \n",
4385
       "10       0.000000 -0.584963      0.0    ...         0.0      0.0 -0.415037   \n",
4386
       "12       0.000000  0.000000      0.0    ...        -1.0      0.0  0.321928   \n",
4387
       "\n",
4388
       "         1659928   1659929  1660034   1660035   1660036   1674021   1789883  \n",
4389
       "gene_id                                                                      \n",
4390
       "1            0.0 -0.415037      0.0 -0.584963  0.000000  0.000000  0.321928  \n",
4391
       "2            0.0  0.000000      0.0  0.000000  0.000000  0.584963 -0.415037  \n",
4392
       "9            0.0  0.000000     -1.0 -1.584963  0.000000 -1.000000 -1.000000  \n",
4393
       "10           0.0  0.000000     -1.0 -1.584963  0.000000 -1.000000 -1.000000  \n",
4394
       "12           0.0 -0.415037      0.0  0.415037  0.584963  0.000000  0.000000  \n",
4395
       "\n",
4396
       "[5 rows x 996 columns]"
4397
      ]
4398
     },
4399
     "execution_count": 28,
4400
     "metadata": {},
4401
     "output_type": "execute_result"
4402
    }
4403
   ],
4404
   "source": [
4405
    "gdsc.index.name = \"gene_id\"\n",
4406
    "gdsc = gdsc.T.sort_index().T\n",
4407
    "gdsc.head()"
4408
   ]
4409
  },
4410
  {
4411
   "cell_type": "code",
4412
   "execution_count": 29,
4413
   "metadata": {},
4414
   "outputs": [],
4415
   "source": [
4416
    "gdsc.to_csv(preprocessed_dir+\"/\"+\"GDSC\"+\".Segment_Mean.CNA.tsv\",\n",
4417
    "                 sep = \"\\t\",header=True,index=True)"
4418
   ]
4419
  },
4420
  {
4421
   "cell_type": "markdown",
4422
   "metadata": {},
4423
   "source": [
4424
    "# PDX \n",
4425
    "\n",
4426
    "For the PDX dataset, only gene-level estimated copy numbers (non-integer) are reported. \n",
4427
    "From the distribution of per-sample ploidy, calculated as the average copy number over all genes, we concluded that the CN estimates were called under the assumption that the copy-neutral state of each xenograft corresponds to CN = 2.\n",
4428
    "\n",
4429
    "\n",
4430
    "For gene ID conversion we used the same approach as for RNA-seq."
4431
   ]
4432
  },
4433
  {
4434
   "cell_type": "code",
4435
   "execution_count": 30,
4436
   "metadata": {},
4437
   "outputs": [
4438
    {
4439
     "name": "stdout",
4440
     "output_type": "stream",
4441
     "text": [
4442
      "(23852, 375)\n"
4443
     ]
4444
    },
4445
    {
4446
     "data": {
4447
      "text/html": [
4448
       "<div>\n",
4449
       "<style scoped>\n",
4450
       "    .dataframe tbody tr th:only-of-type {\n",
4451
       "        vertical-align: middle;\n",
4452
       "    }\n",
4453
       "\n",
4454
       "    .dataframe tbody tr th {\n",
4455
       "        vertical-align: top;\n",
4456
       "    }\n",
4457
       "\n",
4458
       "    .dataframe thead th {\n",
4459
       "        text-align: right;\n",
4460
       "    }\n",
4461
       "</style>\n",
4462
       "<table border=\"1\" class=\"dataframe\">\n",
4463
       "  <thead>\n",
4464
       "    <tr style=\"text-align: right;\">\n",
4465
       "      <th></th>\n",
4466
       "      <th>X-1004</th>\n",
4467
       "      <th>X-1008</th>\n",
4468
       "      <th>X-1027</th>\n",
4469
       "      <th>X-1095</th>\n",
4470
       "      <th>X-1119</th>\n",
4471
       "      <th>X-1156</th>\n",
4472
       "      <th>X-1167</th>\n",
4473
       "      <th>X-1169</th>\n",
4474
       "      <th>X-1172</th>\n",
4475
       "      <th>X-1173</th>\n",
4476
       "      <th>...</th>\n",
4477
       "      <th>X-5694</th>\n",
4478
       "      <th>X-5696</th>\n",
4479
       "      <th>X-5713</th>\n",
4480
       "      <th>X-5717</th>\n",
4481
       "      <th>X-5727</th>\n",
4482
       "      <th>X-5739</th>\n",
4483
       "      <th>X-5808</th>\n",
4484
       "      <th>X-5959</th>\n",
4485
       "      <th>X-5975</th>\n",
4486
       "      <th>X-6047</th>\n",
4487
       "    </tr>\n",
4488
       "    <tr>\n",
4489
       "      <th>Sample</th>\n",
4490
       "      <th></th>\n",
4491
       "      <th></th>\n",
4492
       "      <th></th>\n",
4493
       "      <th></th>\n",
4494
       "      <th></th>\n",
4495
       "      <th></th>\n",
4496
       "      <th></th>\n",
4497
       "      <th></th>\n",
4498
       "      <th></th>\n",
4499
       "      <th></th>\n",
4500
       "      <th></th>\n",
4501
       "      <th></th>\n",
4502
       "      <th></th>\n",
4503
       "      <th></th>\n",
4504
       "      <th></th>\n",
4505
       "      <th></th>\n",
4506
       "      <th></th>\n",
4507
       "      <th></th>\n",
4508
       "      <th></th>\n",
4509
       "      <th></th>\n",
4510
       "      <th></th>\n",
4511
       "    </tr>\n",
4512
       "  </thead>\n",
4513
       "  <tbody>\n",
4514
       "    <tr>\n",
4515
       "      <th>A1BG</th>\n",
4516
       "      <td>2.58</td>\n",
4517
       "      <td>1.60</td>\n",
4518
       "      <td>2.17</td>\n",
4519
       "      <td>2.08</td>\n",
4520
       "      <td>2.00</td>\n",
4521
       "      <td>3.94</td>\n",
4522
       "      <td>2.04</td>\n",
4523
       "      <td>11.39</td>\n",
4524
       "      <td>2.17</td>\n",
4525
       "      <td>2.01</td>\n",
4526
       "      <td>...</td>\n",
4527
       "      <td>2.08</td>\n",
4528
       "      <td>2.10</td>\n",
4529
       "      <td>2.14</td>\n",
4530
       "      <td>2.95</td>\n",
4531
       "      <td>2.06</td>\n",
4532
       "      <td>2.07</td>\n",
4533
       "      <td>1.99</td>\n",
4534
       "      <td>2.07</td>\n",
4535
       "      <td>1.43</td>\n",
4536
       "      <td>2.03</td>\n",
4537
       "    </tr>\n",
4538
       "    <tr>\n",
4539
       "      <th>A1BG-AS1</th>\n",
4540
       "      <td>2.58</td>\n",
4541
       "      <td>1.60</td>\n",
4542
       "      <td>2.17</td>\n",
4543
       "      <td>2.08</td>\n",
4544
       "      <td>2.00</td>\n",
4545
       "      <td>3.94</td>\n",
4546
       "      <td>2.04</td>\n",
4547
       "      <td>11.39</td>\n",
4548
       "      <td>2.17</td>\n",
4549
       "      <td>2.01</td>\n",
4550
       "      <td>...</td>\n",
4551
       "      <td>2.08</td>\n",
4552
       "      <td>2.10</td>\n",
4553
       "      <td>2.14</td>\n",
4554
       "      <td>2.95</td>\n",
4555
       "      <td>2.06</td>\n",
4556
       "      <td>2.07</td>\n",
4557
       "      <td>1.99</td>\n",
4558
       "      <td>2.07</td>\n",
4559
       "      <td>1.43</td>\n",
4560
       "      <td>2.03</td>\n",
4561
       "    </tr>\n",
4562
       "    <tr>\n",
4563
       "      <th>A1CF</th>\n",
4564
       "      <td>2.87</td>\n",
4565
       "      <td>2.97</td>\n",
4566
       "      <td>2.01</td>\n",
4567
       "      <td>2.06</td>\n",
4568
       "      <td>2.10</td>\n",
4569
       "      <td>1.58</td>\n",
4570
       "      <td>2.01</td>\n",
4571
       "      <td>1.64</td>\n",
4572
       "      <td>1.89</td>\n",
4573
       "      <td>1.99</td>\n",
4574
       "      <td>...</td>\n",
4575
       "      <td>2.04</td>\n",
4576
       "      <td>0.97</td>\n",
4577
       "      <td>1.58</td>\n",
4578
       "      <td>2.08</td>\n",
4579
       "      <td>1.95</td>\n",
4580
       "      <td>1.92</td>\n",
4581
       "      <td>1.54</td>\n",
4582
       "      <td>1.28</td>\n",
4583
       "      <td>1.33</td>\n",
4584
       "      <td>2.10</td>\n",
4585
       "    </tr>\n",
4586
       "    <tr>\n",
4587
       "      <th>A2LD1</th>\n",
4588
       "      <td>5.74</td>\n",
4589
       "      <td>1.64</td>\n",
4590
       "      <td>2.06</td>\n",
4591
       "      <td>2.01</td>\n",
4592
       "      <td>2.07</td>\n",
4593
       "      <td>1.74</td>\n",
4594
       "      <td>2.06</td>\n",
4595
       "      <td>1.59</td>\n",
4596
       "      <td>1.40</td>\n",
4597
       "      <td>2.53</td>\n",
4598
       "      <td>...</td>\n",
4599
       "      <td>2.03</td>\n",
4600
       "      <td>2.07</td>\n",
4601
       "      <td>2.25</td>\n",
4602
       "      <td>2.00</td>\n",
4603
       "      <td>1.01</td>\n",
4604
       "      <td>2.00</td>\n",
4605
       "      <td>1.08</td>\n",
4606
       "      <td>1.85</td>\n",
4607
       "      <td>1.93</td>\n",
4608
       "      <td>1.45</td>\n",
4609
       "    </tr>\n",
4610
       "  </tbody>\n",
4611
       "</table>\n",
4612
       "<p>4 rows × 375 columns</p>\n",
4613
       "</div>"
4614
      ],
4615
      "text/plain": [
4616
       "          X-1004  X-1008  X-1027  X-1095  X-1119  X-1156  X-1167  X-1169  \\\n",
4617
       "Sample                                                                     \n",
4618
       "A1BG        2.58    1.60    2.17    2.08    2.00    3.94    2.04   11.39   \n",
4619
       "A1BG-AS1    2.58    1.60    2.17    2.08    2.00    3.94    2.04   11.39   \n",
4620
       "A1CF        2.87    2.97    2.01    2.06    2.10    1.58    2.01    1.64   \n",
4621
       "A2LD1       5.74    1.64    2.06    2.01    2.07    1.74    2.06    1.59   \n",
4622
       "\n",
4623
       "          X-1172  X-1173   ...    X-5694  X-5696  X-5713  X-5717  X-5727  \\\n",
4624
       "Sample                     ...                                             \n",
4625
       "A1BG        2.17    2.01   ...      2.08    2.10    2.14    2.95    2.06   \n",
4626
       "A1BG-AS1    2.17    2.01   ...      2.08    2.10    2.14    2.95    2.06   \n",
4627
       "A1CF        1.89    1.99   ...      2.04    0.97    1.58    2.08    1.95   \n",
4628
       "A2LD1       1.40    2.53   ...      2.03    2.07    2.25    2.00    1.01   \n",
4629
       "\n",
4630
       "          X-5739  X-5808  X-5959  X-5975  X-6047  \n",
4631
       "Sample                                            \n",
4632
       "A1BG        2.07    1.99    2.07    1.43    2.03  \n",
4633
       "A1BG-AS1    2.07    1.99    2.07    1.43    2.03  \n",
4634
       "A1CF        1.92    1.54    1.28    1.33    2.10  \n",
4635
       "A2LD1       2.00    1.08    1.85    1.93    1.45  \n",
4636
       "\n",
4637
       "[4 rows x 375 columns]"
4638
      ]
4639
     },
4640
     "execution_count": 30,
4641
     "metadata": {},
4642
     "output_type": "execute_result"
4643
    }
4644
   ],
4645
   "source": [
4646
    "PDX_xls = \"/home/olya/SFU/Hossein/PDX/nm.3954-S2.xlsx\"\n",
4647
    "pdx = pd.read_excel(PDX_xls,\"copy number\")\n",
4648
    "pdx.set_index(\"Sample\",drop=True,inplace=True)\n",
4649
    "focal  = pdx.T[\"FocalCNScore\"]\n",
4650
    "pdx.drop([\"ArmLevelCNScore\",\"FocalCNScore\"],inplace = True)\n",
4651
    "print(pdx.shape)\n",
4652
    "pdx.head(4)"
4653
   ]
4654
  },
4655
  {
4656
   "cell_type": "code",
4657
   "execution_count": 31,
4658
   "metadata": {},
4659
   "outputs": [
4660
    {
4661
     "name": "stdout",
4662
     "output_type": "stream",
4663
     "text": [
4664
      "Strings containing duplicated gene IDs: 544\n",
4665
      "268 duplicated IDs in 544 rows found.\n",
4666
      "duplicate rows removed due to low correlation of duplicated profiles 134\n",
4667
      "Merged  410 duplicated rows into 205 rows\n"
4668
     ]
4669
    }
4670
   ],
4671
   "source": [
4672
    "pdx.index.name = \"gene_id\"\n",
4673
    "ids = pdx.index\n",
4674
    "ids = list(set(ids[ids.duplicated()]))\n",
4675
    "print(\"Strings containing duplicated gene IDs:\",pdx.loc[ids,:].shape[0])\n",
4676
    "pdx = handle_dups(pdx,corr_thr = 0.75)\n"
4677
   ]
4678
  },
4679
  {
4680
   "cell_type": "code",
4681
   "execution_count": 32,
4682
   "metadata": {},
4683
   "outputs": [
4684
    {
4685
     "data": {
4686
      "text/plain": [
4687
       "Text(0.5,0,'CN Averaged over all')"
4688
      ]
4689
     },
4690
     "execution_count": 32,
4691
     "metadata": {},
4692
     "output_type": "execute_result"
4693
    },
4694
    {
4695
     "data": {
4696
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAGyJJREFUeJzt3XuYHVWd7vHvCwRFAwaGNicCMXhFFAnQIgoqIKjgwwFGUaMgcJyJjkfUOaMDelRgvEyYR1ARj+cEQcELl1FuCioIAl6RBCMhIIoQFIgkCMhFAQPv+aNWk01Pd3Z10rV3uuv9PM9+eu+qVVW/vZ6kfnutWrVKtomIiPZar98BREREfyURRES0XBJBRETLJRFERLRcEkFERMslEUREtFwSQTRC0uWS/qFGuZmSHpC0/ijrj5H0tTU4/mr3G//VmtZ1THxJBLHGJC2V9Ndywr1T0lckTR3LPmz/3vZU24+OZ2xrs19Ju0t6rHyv+yXdKOnwsm6WJJd1Q9/7O5L27th+aqmbt3Us21jS7yW9cXy+YcT4SSKItbWf7anAjsAg8JE+xzNe7ijfaxPgSOBkSdt2rJ9W1m8PXAKcK+kwANsPAO8EPitpoJT/D2CB7W/26gtE1JVEEOPC9u3Ad4EXDV8naT1JH5F0q6Tlkk6X9LSybugX9gbl89aSrii/xC8BNu/Yz4WSjhi272slHTjCMYfv93JJH5f0k7LviyVtPny7Eb6XbZ8H3ANsO8L6P9r+HHAMcJyk9cry7wMXAidK2h14E/Du0Y4j6TBJN5fYbhlqTUh6tqTLJP1J0l2Svi5pWsd2SyV9sNTDg5JOkTRd0nfLvn4gadNhdTJX0h2Slkn6wGpi2kXSTyXdK+lX5XusNt6YmJIIYlxI2grYF/jlCKsPK689gGcBU4GTRtnVN4CFVAng48ChHetOAw7uOOb2wBZUJ9w63gocDjwd2BAY9STYcYz1SqKZBixeTdFzyn6f37Hsn4HdgW8CH7D9x1GO8VTgRGAf2xsDLwcWDa0G/h14BvACYCuqpNPpDcDewPOA/agS8oeBAar/4+8dVn4P4LnAa4AjJe01QkxD9foJYDOquvqWpIEu8cYElEQQa+s8SfcCPwauAD41Qpm3ASfYvrl0m3wIeMvQr/UhkmYCLwE+avth21cC3+4ocgHwPEnPLZ8PAc6y/UjNWL9s+ze2/wqcDcxeTdlnlO91F3A0cIjtG1dT/o7yd7OhBbbvAZYAT6FKFKvzGPAiSRvZXmZ7SdnHTbYvKfWxAjgBeNWwbT9v+87SKvsRcJXtX9p+CDgX2GFY+WNtP2h7MfBlYM4I8RwMXGT7ItuP2b4EWECV7EeNNyamJIJYWwfYnmb7mbbfXU6ywz0DuLXj863ABsD0EcrdY/vBYWUBKCe2s4CDSxfMHOCrY4i18xf5X6haJqO5o3yvzWzPtn1ml31vUf7ePbRA0sHALOAHwHGjbVi+75uBdwHLShfYNmUf0yWdKel2SfcBX6Oju6y4s+P9X0f4PPx7/qHj/a1U9T7cM4GDSrfQvSUp7gbMWF28MTElEUQv3EF1YhkyE1jJE09YAMuATUvXQ2fZTqdRtTBeDfzF9s/GOdY1dSCwHLgRQNLTgc8A/0h14fhNkl4x2sa2v297b2AG8Gvg5LLqU4CB7WxvQvVLXWsZ61Yd72eyqjXT6Q/AV0syHHo91fa8LvHGBJREEL1wBvDP5ULwVKqT21m2V3YWsn0rVffDsZI2lLQbVZ93Z5mfUXVLHM/YWgONKL/Y30PVffQh24+VVScB59n+oe1lwL9SjTx60ij72L8kwIeBB6i+I8DG5fOfS7/9B8ch7I9KeoqkF1JdMzlrhDJfA/aT9FpJ60t6sqphtVt2iTcmoCSC6IVTqU7aVwK3AA8BR4xS9q3AS6m6WI4GTh+hzOnAdlQnq365V9KDVBeQ9wUOsn0qgKQDqLpR
Hj9p2/4S1S/vj42wr/WA/1XW3011DeCfyrpjqYbm/pnq4m23aw11XAHcBFwKfNr2xcML2P4DsD/VRecVVC2ED5ZYVxdvTEDKg2liopH0dmCu7d36HctEImkWVSKeMrw1Fu2WFkFMKJKeQjUef36/Y4mYLJIIYsKQ9Fqqboo7qe43iIhxkK6hiIiWS4sgIqLlNuhepP8233xzz5o1q99hRERMKAsXLrzL9kC3chMiEcyaNYsFCxb0O4yIiAlF0q3dS6VrKCKi9ZIIIiJaLokgIqLlkggiIlouiSAiouWSCCIiWq6xRFCmrf1FedbpEknHluVfKc84XVReq3tKVERENKzJ+wgeBva0/YCkKcCPJX23rPug7W82eOyIiKipsUTgahKjB8rHKeWViY0iItYxjd5ZLGl9YCHwHOALtq+S9E/AJyV9jOrBGEfZfniEbecCcwFmzhz+tMKY6GYddWGtckvnvb7hSCKi0YvFth+1PRvYEthZ0ouADwHbAC8BNgOOHGXb+bYHbQ8ODHSdKiMiItZQT0YN2b4X+CHwOtvLXHkY+DKwcy9iiIiIkTU5amhA0rTyfiNgb+DXkmaUZQIOAK5rKoaIiOiuyWsEM4DTynWC9YCzbX9H0mWSBgABi4B3NRhDRER00eSooWuBHUZYvmdTx4yIiLHLncURES2XRBAR0XJJBBERLZdEEBHRckkEEREtl0QQEdFySQQRES2XRBAR0XJJBBERLZdEEBHRckkEEREtl0QQEdFySQQRES2XRBAR0XJJBBERLZdEEBHRckkEEREt1+SjKqNlZh11Yb9DiIg1kBZBRETLJRFERLRcY4lA0pMl/ULSryQtkXRsWb61pKsk3STpLEkbNhVDRER012SL4GFgT9vbA7OB10naBTgO+Izt5wD3AO9oMIaIiOiisUTgygPl45TyMrAn8M2y/DTggKZiiIiI7hodNSRpfWAh8BzgC8DvgHttryxFbgO2GGXbucBcgJkzZzYZZqzD6o5EWjrv9Q1HEjF5NXqx2PajtmcDWwI7A9uMYdv5tgdtDw4MDDQWY0RE2/Vk1JDte4EfAi8DpkkaaolsCdzeixgiImJkTY4aGpA0rbzfCNgbuIEqIbyxFDsUOL+pGCIiorsmrxHMAE4r1wnWA862/R1J1wNnSvoE8EvglAZjiIiILhpLBLavBXYYYfnNVNcLIiJiHZC5hqKriTCHUEYXRay5TDEREdFySQQRES2XRBAR0XJJBBERLZdEEBHRchk1NAlNhFE+EbHuSIsgIqLlkggiIlouiSAiouWSCCIiWi6JICKi5ZIIIiJaLokgIqLlkggiIlouiSAiouWSCCIiWi6JICKi5TLXULRKnmQW8V+lRRAR0XJJBBERLddYIpC0laQfSrpe0hJJ7yvLj5F0u6RF5bVvUzFERER3TV4jWAn8i+1rJG0MLJR0SVn3GdufbvDYERFRU2OJwPYyYFl5f7+kG4AtmjpeRESsmZ5cI5A0C9gBuKoseo+kayWdKmnTUbaZK2mBpAUrVqzoRZgREa3UeCKQNBX4FvB+2/cBXwSeDcymajEcP9J2tufbHrQ9ODAw0HSYERGt1WgikDSFKgl83fY5ALbvtP2o7ceAk4Gdm4whIiJWr8lRQwJOAW6wfULH8hkdxQ4ErmsqhoiI6K7JUUO7AocAiyUtKss+DMyRNBswsBR4Z4MxREREF02OGvoxoBFWXdTUMSMiYuxyZ3FERMslEUREtFwSQUREyyURRES0XBJBRETLJRFERLRcEkFERMslEUREtFwSQUREy3VNBJKeLelJ5f3ukt4raVrzoUVERC/UaRF8C3hU0nOA+cBWwDcajSoiInqmTiJ4zPZKqplCP2/7g8CMLttERMQEUWfSub9JmgMcCuxXlk1pLqSI/pt11IW1yi2d9/qGI4loXp0WweHAy4BP2r5F0tbAV5sNKyIieqVri8D29ZKOBGaWz7cAxzUdWERE9EadUUP7AYuA75XPsyVd
0HRgERHRG3W6ho6heq7wvQC2FwHPajCmiIjooTqJ4G+2/zxs2WNNBBMREb1XZ9TQEklvBdaX9FzgvcBPmw0rIiJ6pU6L4AjghcDDwBnAfcD7mwwqIiJ6p86oob8A/7u8apO0FXA6MB0wMN/25yRtBpwFzAKWAm+yfc/Ywo6IiPEyaiKQ9G2qE/iIbP/3LvteCfyL7WskbQwslHQJcBhwqe15ko4CjgKOHHPkERExLlbXIvj02uzY9jJgWXl/v6QbgC2A/YHdS7HTgMtJIoiI6JtRE4HtK4beS9oQ2IaqhXCj7UfGchBJs4AdgKuA6SVJAPyRqutopG3mAnMBZs6cOZbDRUTEGNS5oez1wO+AE4GTgJsk7VP3AJKmUs1g+n7b93Wus21G6X6yPd/2oO3BgYGBuoeLiIgxqjN89HhgD9s3QfV8AuBC4LvdNpQ0hSoJfN32OWXxnZJm2F4maQawfM1Cj4iI8VBn+Oj9Q0mguBm4v9tGkgScAtxg+4SOVRdQzWRK+Xt+zVgjIqIBdVoECyRdBJxN1Y1zEHC1pL8H6PilP9yuwCHAYkmLyrIPA/OAsyW9A7gVeNNaxB8REWupTiJ4MnAn8KryeQWwEdWzCQyMmAhs/xjQKPt89djCjIiIptS5oezwXgQSERH90TURlAfRHEF1J/Dj5WvcUBYRERNAna6h86gu+n6bzDoaETHp1EkED9k+sfFIIiKiL+okgs9JOhq4mGoGUgBsX9NYVBER0TN1EsF2VMNA92RV15DL54iImODqJIKDgGeNdX6hiIiYGOrcWXwdMK3pQCIioj/qtAimAb+WdDVPvEaQ4aMREZNAnURwdONRRERE39S5s/iKbmUiImLiqvM8gl0kXS3pAUmPSHpU0n3dtouIiImhzsXik4A5wG+pJpv7B+ALTQYVERG9UycRUJ5HsL7tR21/GXhds2FFRESv1LlY/JfyzOJFkv6D6oH0tRJIRESs++qc0A8p5d4DPAhsBbyhyaAiIqJ36owaurW8fUjSicBWwx5dGRERE1idUUOXS9pE0mbANcDJkk7otl1EREwMdbqGnmb7PuDvgdNtvxTYq9mwIiKiV+okgg0kzaB6yPx3Go4nIiJ6rE4i+Dfg+8BNtq+W9CyqewoiImIS6JoIbP+n7Rfbfnf5fLPtrqOGJJ0qabmk6zqWHSPpdkmLymvftQs/IiLWVpP3A3yFkW88+4zt2eV1UYPHj4iIGhpLBLavBO5uav8RETE++nGH8HskXVu6jjYdrZCkuZIWSFqwYsWKXsYXEdEqXW8ok/QkqjuJZ3WWt/1va3C8LwIfp3rm8ceB44H/MVJB2/OB+QCDg4Neg2NFREQNdeYaOh/4M7CQjieUrQnbdw69l3QyGY4aEdF3dRLBlrbHZbZRSTNsLysfD6R6HnJERPRRnUTwU0nb2V48lh1LOgPYHdhc0m1Uj7zcXdJsqq6hpcA7xxZuRESMtzqJYDfgMEm3UHUNCbDtF69uI9tzRlh8ythDjIiIJtVJBPs0HkVERPTNWKahjoiISShPGouIaLkkgoiIlksiiIhouSSCiIiWSyKIiGi5JIKIiJZLIoiIaLkkgoiIlksiiIhouSSCiIiWqzPXUKyhWUddWKvc0nmvbziSiIjRpUUQEdFySQQRES2XRBAR0XJJBBERLZdEEBHRchk1FLEW+jkyLKPSYrykRRAR0XKNJQJJp0paLum6jmWbSbpE0m/L302bOn5ERNTTZIvgK8Drhi07CrjU9nOBS8vniIjoo8YSge0rgbuHLd4fOK28Pw04oKnjR0REPb2+RjDd9rLy/o/A9B4fPyIihunbqCHbluTR1kuaC8wFmDlzZs/i6qbuSI1+7zMioq5etwjulDQDoPxdPlpB2/NtD9oeHBgY6FmAERFt0+tEcAFwaHl/KHB+j48fERHDNDl89AzgZ8DzJd0m6R3APGBvSb8F9iqfIyKijxq7RmB7ziirXt3UMSMiYuxyZ3FERMtlrqGI
HhjLyLDMDRS9lhZBRETLJRFERLRcEkFERMslEUREtFwSQUREyyURRES0XBJBRETLJRFERLRcEkFERMslEUREtFymmIhYx4z3g4rq7q/u1Bbjvb/ov7QIIiJaLokgIqLlkggiIlouiSAiouWSCCIiWi6JICKi5ZIIIiJaLokgIqLl+nJDmaSlwP3Ao8BK24P9iCMiIvp7Z/Eetu/q4/EjIoJ0DUVEtF6/EoGBiyUtlDR3pAKS5kpaIGnBihUrehxeRER79CsR7GZ7R2Af4H9KeuXwArbn2x60PTgwMND7CCMiWqIvicD27eXvcuBcYOd+xBEREX1IBJKeKmnjoffAa4Dreh1HRERU+jFqaDpwrqSh43/D9vf6EEdERNCHRGD7ZmD7Xh83IiJGlieURUQjxvKktTzNrL9yH0FERMslEUREtFwSQUREyyURRES0XBJBRETLZdRQMZYRDhGT0WT6P1D3u2S0UiUtgoiIlksiiIhouSSCiIiWSyKIiGi5JIKIiJaT7X7H0NXg4KAXLFiwRttOppEQEdEfE3V0kaSFtge7lUuLICKi5ZIIIiJaLokgIqLlkggiIlouiSAiouUy11BExDhpYpRiL0YspUUQEdFySQQRES3Xl0Qg6XWSbpR0k6Sj+hFDRERUep4IJK0PfAHYB9gWmCNp217HERERlX60CHYGbrJ9s+1HgDOB/fsQR0RE0J9RQ1sAf+j4fBvw0uGFJM0F5paPD0i6sQexrY3Ngbv6HcQ6IPWwSupilQldFzpu3HY15npYy2M/s06hdXb4qO35wPx+x1GXpAV1Jnea7FIPq6QuVkldVNbVeuhH19DtwFYdn7csyyIiog/6kQiuBp4raWtJGwJvAS7oQxwREUEfuoZsr5T0HuD7wPrAqbaX9DqOBkyYbqyGpR5WSV2skrqorJP1MCEeTBMREc3JncURES2XRBAR0XJJBGMg6VRJyyVdN8r6p0n6tqRfSVoi6fBex9gLkraS9ENJ15fv+b4RykjSiWUakWsl7diPWJtWsy7eVupgsaSfStq+H7E2qU49dJR9iaSVkt7Yyxh7pW5dSNpd0qJS5opex/kEtvOq+QJeCewIXDfK+g8Dx5X3A8DdwIb9jruBepgB7Fjebwz8Bth2WJl9ge8CAnYBrup33H2si5cDm5b3+0zGuqhTD2Xd+sBlwEXAG/sddx//TUwDrgdmls9P72fMaRGMge0rqU7uoxYBNpYkYGopu7IXsfWS7WW2rynv7wduoLpjvNP+wOmu/ByYJmlGj0NtXJ26sP1T2/eUjz+nundmUqn5bwLgCOBbwPIehtdTNevircA5tn9fyvW1PpIIxtdJwAuAO4DFwPtsP9bfkJolaRawA3DVsFUjTSUy0olh0lhNXXR6B1VLadIarR4kbQEcCHyx91H1x2r+TTwP2FTS5ZIWSnp7r2PrtM5OMTFBvRZYBOwJPBu4RNKPbN/X37CaIWkq1a+790/W71hXnbqQtAdVItitl7H1Upd6+CxwpO3Hqkbz5NalLjYAdgJeDWwE/EzSz23/psdhPh5MjJ/DgXmuOv1uknQLsA3wi/6GNf4kTaH6R/512+eMUKQ1U4nUqAskvRj4ErCP7T/1Mr5eqVEPg8CZJQlsDuwraaXt83oYZk/UqIvbgD/ZfhB4UNKVwPZU1xN6Ll1D4+v3VBkeSdOB5wM39zWiBpRrIKcAN9g+YZRiFwBvL6OHdgH+bHtZz4LskTp1IWkmcA5wSL9+8TWtTj3Y3tr2LNuzgG8C756kSaDO/4/zgd0kbSDpKVQzMN/QqxiHS4tgDCSdAewObC7pNuBoYAqA7f8LfBz4iqTFVKNljrQ9YafeXY1dgUOAxZIWlWUfBmbC43VxEdXIoZuAv1C1liajOnXxMeDvgP9Tfg2v9Do4A+VaqlMPbdG1LmzfIOl7wLXAY8CXbI84LL0XMsVERETLpWsoIqLlkggiIlouiSAiouWSCCIiWi6JICKi5ZIIoqck/TdJ
Z0r6Xbm1/iJJz5M0S5IlHdFR9iRJh61mX4skndmTwNeSpMMkndTvOIaT9ED5O2u0WXVj8ksiiJ4pN9qcC1xu+9m2dwI+BEwvRZYD7yvPsu62rxdQzWT5CklPHaf4JsV9NZPle0TvJBFEL+0B/K3z5iLbv7L9o/JxBXApcGiNfc0BvgpcTDXTKZK2kfT4dB7lV+7i8n4nSVeUVsj3h2ZCLZN+fVbSAqoktJ+kqyT9UtIPyh3iSBqQdEmZO/5Lkm6VtHlZd7CkX5QWyv+TtH5Zfrik35SYdh3pS0jaTNJ5qp5X8HNJL5a0nqSlkqZ1lPutpOkljm9Jurq8di3rj5H0VUk/KfXSeYypki6VdI2qZyLsX6N+o0WSCKKXXgQs7FLmOOADQyfT1XgzcCZwBlVSwPavgQ0lbd1R5qwy78vnqea/3wk4Ffhkx742tD1o+3jgx8Autnco+//XUuZo4DLbL6SaHmEmPN4yeTOwq+3ZwKPA20qiOZYqAewGbDvK9zgW+KXtF1PdfXp6mbH2fKqZOpH0UuBW23cCnwM+Y/slwBuo5i8asi2wl+05w47xEHCg7R2pkvHxasOsb1FbmpCxTrF9s6SrqOZrH5GkQeAu27+XdDtwqqTNbN8NnE11Yp5X/r6Zas6nF1HNBgtVl1LnvEdndbzfkip5zAA2BG4py3ejnJhtf0/S0PMFXk01i+TVZd8bUXVxvZSqC2xFifksqqmHh9uN6oSO7csk/Z2kTUpMHwO+DLylI8a9gG07zuObqJrlEuAC238dqcqAT0l6JdV0BltQdcf9cYSy0UJJBNFLS4A6jyf8FNWv7tEe3zcH2EbS0vJ5E6qT6clUJ8z/lHQOYNu/lbQdsMT2y0bZ34Md7z8PnGD7Akm7A8d0iVXAabY/9ISF0gFdtuvmZ8BzJA0ABwCfKMvXo2qxPDTsePDE79HpbVRPzNvJ9t9KvT15LeOLSSRdQ9FLlwFPkjR3aEHpE39FZ6HSxXM9sN/wHUhaD3gTsF3HTJb7s6p76HdU3TMfZdWv6BuBAUkvK/uYIumFo8T4NFZNl915reIn5bhIeg2waVl+KfBGSU8v6zaT9EyqB5G8qvzCnwIcNMrxfkR1oqYknrts31emMj8XOIFqFsuhqasvpnrK11B9zB5lv8O/0/KSBPYAnlljm2iRJILomXJyOxDYqwwfXQL8OyN3UXySkR/p+Argdtt3dCy7kqq7ZOhRmGcBB1N1E2H7EaqWyHGSfkX18KCXjxLmMVQtioVA58yxxwKvKUMsDyox32/7euAjwMWSrgUuAWaUKbePofpl/xNGn2L4GGCnsu08nph8hr5HZ9fVe4HBcnH5euBdo+y309fLNouBtwO/rrFNtEhmH42oQdKTgEdtrywtiy+Wi8MRE16uEUTUMxM4u3RNPQL8Y5/jiRg3aRFERLRcrhFERLRcEkFERMslEUREtFwSQUREyyURRES03P8H9AxF0s+Zly8AAAAASUVORK5CYII=\n",
4697
      "text/plain": [
4698
       "<Figure size 432x288 with 1 Axes>"
4699
      ]
4700
     },
4701
     "metadata": {},
4702
     "output_type": "display_data"
4703
    }
4704
   ],
4705
   "source": [
4706
    "average_ploidies = pdx.apply(np.mean)\n",
4707
    "p = plt.hist(average_ploidies,bins=30)\n",
4708
    "plt.title(\"Ploidy in PDX samples\")\n",
4709
    "plt.ylabel(\"n samples\")\n",
4710
    "plt.xlabel(\"CN Averaged over all\")"
4711
   ]
4712
  },
4713
  {
4714
   "cell_type": "code",
4715
   "execution_count": 33,
4716
   "metadata": {},
4717
   "outputs": [
4718
    {
4719
     "data": {
4720
      "text/html": [
4721
       "<div>\n",
4722
       "<style scoped>\n",
4723
       "    .dataframe tbody tr th:only-of-type {\n",
4724
       "        vertical-align: middle;\n",
4725
       "    }\n",
4726
       "\n",
4727
       "    .dataframe tbody tr th {\n",
4728
       "        vertical-align: top;\n",
4729
       "    }\n",
4730
       "\n",
4731
       "    .dataframe thead th {\n",
4732
       "        text-align: right;\n",
4733
       "    }\n",
4734
       "</style>\n",
4735
       "<table border=\"1\" class=\"dataframe\">\n",
4736
       "  <thead>\n",
4737
       "    <tr style=\"text-align: right;\">\n",
4738
       "      <th></th>\n",
4739
       "      <th>X-1004</th>\n",
4740
       "      <th>X-1008</th>\n",
4741
       "      <th>X-1027</th>\n",
4742
       "      <th>X-1095</th>\n",
4743
       "      <th>X-1119</th>\n",
4744
       "      <th>X-1156</th>\n",
4745
       "      <th>X-1167</th>\n",
4746
       "      <th>X-1169</th>\n",
4747
       "      <th>X-1172</th>\n",
4748
       "      <th>X-1173</th>\n",
4749
       "      <th>...</th>\n",
4750
       "      <th>X-5694</th>\n",
4751
       "      <th>X-5696</th>\n",
4752
       "      <th>X-5713</th>\n",
4753
       "      <th>X-5717</th>\n",
4754
       "      <th>X-5727</th>\n",
4755
       "      <th>X-5739</th>\n",
4756
       "      <th>X-5808</th>\n",
4757
       "      <th>X-5959</th>\n",
4758
       "      <th>X-5975</th>\n",
4759
       "      <th>X-6047</th>\n",
4760
       "    </tr>\n",
4761
       "    <tr>\n",
4762
       "      <th>gene_id</th>\n",
4763
       "      <th></th>\n",
4764
       "      <th></th>\n",
4765
       "      <th></th>\n",
4766
       "      <th></th>\n",
4767
       "      <th></th>\n",
4768
       "      <th></th>\n",
4769
       "      <th></th>\n",
4770
       "      <th></th>\n",
4771
       "      <th></th>\n",
4772
       "      <th></th>\n",
4773
       "      <th></th>\n",
4774
       "      <th></th>\n",
4775
       "      <th></th>\n",
4776
       "      <th></th>\n",
4777
       "      <th></th>\n",
4778
       "      <th></th>\n",
4779
       "      <th></th>\n",
4780
       "      <th></th>\n",
4781
       "      <th></th>\n",
4782
       "      <th></th>\n",
4783
       "      <th></th>\n",
4784
       "    </tr>\n",
4785
       "  </thead>\n",
4786
       "  <tbody>\n",
4787
       "    <tr>\n",
4788
       "      <th>A1BG</th>\n",
4789
       "      <td>0.367371</td>\n",
4790
       "      <td>-0.321928</td>\n",
4791
       "      <td>0.117695</td>\n",
4792
       "      <td>0.056584</td>\n",
4793
       "      <td>0.000000</td>\n",
4794
       "      <td>0.978196</td>\n",
4795
       "      <td>0.028569</td>\n",
4796
       "      <td>2.509696</td>\n",
4797
       "      <td>0.117695</td>\n",
4798
       "      <td>0.007196</td>\n",
4799
       "      <td>...</td>\n",
4800
       "      <td>0.056584</td>\n",
4801
       "      <td>0.070389</td>\n",
4802
       "      <td>0.097611</td>\n",
4803
       "      <td>0.560715</td>\n",
4804
       "      <td>0.042644</td>\n",
4805
       "      <td>0.049631</td>\n",
4806
       "      <td>-0.007232</td>\n",
4807
       "      <td>0.049631</td>\n",
4808
       "      <td>-0.483985</td>\n",
4809
       "      <td>0.021480</td>\n",
4810
       "    </tr>\n",
4811
       "    <tr>\n",
4812
       "      <th>A1BG-AS1</th>\n",
4813
       "      <td>0.367371</td>\n",
4814
       "      <td>-0.321928</td>\n",
4815
       "      <td>0.117695</td>\n",
4816
       "      <td>0.056584</td>\n",
4817
       "      <td>0.000000</td>\n",
4818
       "      <td>0.978196</td>\n",
4819
       "      <td>0.028569</td>\n",
4820
       "      <td>2.509696</td>\n",
4821
       "      <td>0.117695</td>\n",
4822
       "      <td>0.007196</td>\n",
4823
       "      <td>...</td>\n",
4824
       "      <td>0.056584</td>\n",
4825
       "      <td>0.070389</td>\n",
4826
       "      <td>0.097611</td>\n",
4827
       "      <td>0.560715</td>\n",
4828
       "      <td>0.042644</td>\n",
4829
       "      <td>0.049631</td>\n",
4830
       "      <td>-0.007232</td>\n",
4831
       "      <td>0.049631</td>\n",
4832
       "      <td>-0.483985</td>\n",
4833
       "      <td>0.021480</td>\n",
4834
       "    </tr>\n",
4835
       "    <tr>\n",
4836
       "      <th>A1CF</th>\n",
4837
       "      <td>0.521051</td>\n",
4838
       "      <td>0.570463</td>\n",
4839
       "      <td>0.007196</td>\n",
4840
       "      <td>0.042644</td>\n",
4841
       "      <td>0.070389</td>\n",
4842
       "      <td>-0.340075</td>\n",
4843
       "      <td>0.007196</td>\n",
4844
       "      <td>-0.286304</td>\n",
4845
       "      <td>-0.081614</td>\n",
4846
       "      <td>-0.007232</td>\n",
4847
       "      <td>...</td>\n",
4848
       "      <td>0.028569</td>\n",
4849
       "      <td>-1.043943</td>\n",
4850
       "      <td>-0.340075</td>\n",
4851
       "      <td>0.056584</td>\n",
4852
       "      <td>-0.036526</td>\n",
4853
       "      <td>-0.058894</td>\n",
4854
       "      <td>-0.377070</td>\n",
4855
       "      <td>-0.643856</td>\n",
4856
       "      <td>-0.588574</td>\n",
4857
       "      <td>0.070389</td>\n",
4858
       "    </tr>\n",
4859
       "  </tbody>\n",
4860
       "</table>\n",
4861
       "<p>3 rows × 375 columns</p>\n",
4862
       "</div>"
4863
      ],
4864
      "text/plain": [
4865
       "            X-1004    X-1008    X-1027    X-1095    X-1119    X-1156  \\\n",
4866
       "gene_id                                                                \n",
4867
       "A1BG      0.367371 -0.321928  0.117695  0.056584  0.000000  0.978196   \n",
4868
       "A1BG-AS1  0.367371 -0.321928  0.117695  0.056584  0.000000  0.978196   \n",
4869
       "A1CF      0.521051  0.570463  0.007196  0.042644  0.070389 -0.340075   \n",
4870
       "\n",
4871
       "            X-1167    X-1169    X-1172    X-1173    ...       X-5694  \\\n",
4872
       "gene_id                                             ...                \n",
4873
       "A1BG      0.028569  2.509696  0.117695  0.007196    ...     0.056584   \n",
4874
       "A1BG-AS1  0.028569  2.509696  0.117695  0.007196    ...     0.056584   \n",
4875
       "A1CF      0.007196 -0.286304 -0.081614 -0.007232    ...     0.028569   \n",
4876
       "\n",
4877
       "            X-5696    X-5713    X-5717    X-5727    X-5739    X-5808  \\\n",
4878
       "gene_id                                                                \n",
4879
       "A1BG      0.070389  0.097611  0.560715  0.042644  0.049631 -0.007232   \n",
4880
       "A1BG-AS1  0.070389  0.097611  0.560715  0.042644  0.049631 -0.007232   \n",
4881
       "A1CF     -1.043943 -0.340075  0.056584 -0.036526 -0.058894 -0.377070   \n",
4882
       "\n",
4883
       "            X-5959    X-5975    X-6047  \n",
4884
       "gene_id                                 \n",
4885
       "A1BG      0.049631 -0.483985  0.021480  \n",
4886
       "A1BG-AS1  0.049631 -0.483985  0.021480  \n",
4887
       "A1CF     -0.643856 -0.588574  0.070389  \n",
4888
       "\n",
4889
       "[3 rows x 375 columns]"
4890
      ]
4891
     },
4892
     "execution_count": 33,
4893
     "metadata": {},
4894
     "output_type": "execute_result"
4895
    }
4896
   ],
4897
   "source": [
4898
    "pdx = pdx.applymap(lambda x: np.log2(x/2))\n",
4899
    "pdx.head(3)"
4900
   ]
4901
  },
4902
  {
4903
   "cell_type": "code",
4904
   "execution_count": 34,
4905
   "metadata": {},
4906
   "outputs": [
4907
    {
4908
     "data": {
4909
      "text/html": [
4910
       "<div>\n",
4911
       "<style scoped>\n",
4912
       "    .dataframe tbody tr th:only-of-type {\n",
4913
       "        vertical-align: middle;\n",
4914
       "    }\n",
4915
       "\n",
4916
       "    .dataframe tbody tr th {\n",
4917
       "        vertical-align: top;\n",
4918
       "    }\n",
4919
       "\n",
4920
       "    .dataframe thead th {\n",
4921
       "        text-align: right;\n",
4922
       "    }\n",
4923
       "</style>\n",
4924
       "<table border=\"1\" class=\"dataframe\">\n",
4925
       "  <thead>\n",
4926
       "    <tr style=\"text-align: right;\">\n",
4927
       "      <th></th>\n",
4928
       "      <th>X-1004</th>\n",
4929
       "      <th>X-1008</th>\n",
4930
       "      <th>X-1027</th>\n",
4931
       "      <th>X-1095</th>\n",
4932
       "      <th>X-1119</th>\n",
4933
       "      <th>X-1156</th>\n",
4934
       "      <th>X-1167</th>\n",
4935
       "      <th>X-1169</th>\n",
4936
       "      <th>X-1172</th>\n",
4937
       "      <th>X-1173</th>\n",
4938
       "      <th>...</th>\n",
4939
       "      <th>X-5694</th>\n",
4940
       "      <th>X-5696</th>\n",
4941
       "      <th>X-5713</th>\n",
4942
       "      <th>X-5717</th>\n",
4943
       "      <th>X-5727</th>\n",
4944
       "      <th>X-5739</th>\n",
4945
       "      <th>X-5808</th>\n",
4946
       "      <th>X-5959</th>\n",
4947
       "      <th>X-5975</th>\n",
4948
       "      <th>X-6047</th>\n",
4949
       "    </tr>\n",
4950
       "    <tr>\n",
4951
       "      <th>gene_id</th>\n",
4952
       "      <th></th>\n",
4953
       "      <th></th>\n",
4954
       "      <th></th>\n",
4955
       "      <th></th>\n",
4956
       "      <th></th>\n",
4957
       "      <th></th>\n",
4958
       "      <th></th>\n",
4959
       "      <th></th>\n",
4960
       "      <th></th>\n",
4961
       "      <th></th>\n",
4962
       "      <th></th>\n",
4963
       "      <th></th>\n",
4964
       "      <th></th>\n",
4965
       "      <th></th>\n",
4966
       "      <th></th>\n",
4967
       "      <th></th>\n",
4968
       "      <th></th>\n",
4969
       "      <th></th>\n",
4970
       "      <th></th>\n",
4971
       "      <th></th>\n",
4972
       "      <th></th>\n",
4973
       "    </tr>\n",
4974
       "  </thead>\n",
4975
       "  <tbody>\n",
4976
       "    <tr>\n",
4977
       "      <th>A1BG</th>\n",
4978
       "      <td>0.367371</td>\n",
4979
       "      <td>-0.321928</td>\n",
4980
       "      <td>0.0</td>\n",
4981
       "      <td>0.0</td>\n",
4982
       "      <td>0.0</td>\n",
4983
       "      <td>0.978196</td>\n",
4984
       "      <td>0.0</td>\n",
4985
       "      <td>2.509696</td>\n",
4986
       "      <td>0.0</td>\n",
4987
       "      <td>0.0</td>\n",
4988
       "      <td>...</td>\n",
4989
       "      <td>0.0</td>\n",
4990
       "      <td>0.000000</td>\n",
4991
       "      <td>0.000000</td>\n",
4992
       "      <td>0.560715</td>\n",
4993
       "      <td>0.0</td>\n",
4994
       "      <td>0.0</td>\n",
4995
       "      <td>0.00000</td>\n",
4996
       "      <td>0.000000</td>\n",
4997
       "      <td>-0.483985</td>\n",
4998
       "      <td>0.0</td>\n",
4999
       "    </tr>\n",
5000
       "    <tr>\n",
5001
       "      <th>A1BG-AS1</th>\n",
5002
       "      <td>0.367371</td>\n",
5003
       "      <td>-0.321928</td>\n",
5004
       "      <td>0.0</td>\n",
5005
       "      <td>0.0</td>\n",
5006
       "      <td>0.0</td>\n",
5007
       "      <td>0.978196</td>\n",
5008
       "      <td>0.0</td>\n",
5009
       "      <td>2.509696</td>\n",
5010
       "      <td>0.0</td>\n",
5011
       "      <td>0.0</td>\n",
5012
       "      <td>...</td>\n",
5013
       "      <td>0.0</td>\n",
5014
       "      <td>0.000000</td>\n",
5015
       "      <td>0.000000</td>\n",
5016
       "      <td>0.560715</td>\n",
5017
       "      <td>0.0</td>\n",
5018
       "      <td>0.0</td>\n",
5019
       "      <td>0.00000</td>\n",
5020
       "      <td>0.000000</td>\n",
5021
       "      <td>-0.483985</td>\n",
5022
       "      <td>0.0</td>\n",
5023
       "    </tr>\n",
5024
       "    <tr>\n",
5025
       "      <th>A1CF</th>\n",
5026
       "      <td>0.521051</td>\n",
5027
       "      <td>0.570463</td>\n",
5028
       "      <td>0.0</td>\n",
5029
       "      <td>0.0</td>\n",
5030
       "      <td>0.0</td>\n",
5031
       "      <td>-0.340075</td>\n",
5032
       "      <td>0.0</td>\n",
5033
       "      <td>-0.286304</td>\n",
5034
       "      <td>0.0</td>\n",
5035
       "      <td>0.0</td>\n",
5036
       "      <td>...</td>\n",
5037
       "      <td>0.0</td>\n",
5038
       "      <td>-1.043943</td>\n",
5039
       "      <td>-0.340075</td>\n",
5040
       "      <td>0.000000</td>\n",
5041
       "      <td>0.0</td>\n",
5042
       "      <td>0.0</td>\n",
5043
       "      <td>-0.37707</td>\n",
5044
       "      <td>-0.643856</td>\n",
5045
       "      <td>-0.588574</td>\n",
5046
       "      <td>0.0</td>\n",
5047
       "    </tr>\n",
5048
       "  </tbody>\n",
5049
       "</table>\n",
5050
       "<p>3 rows × 375 columns</p>\n",
5051
       "</div>"
5052
      ],
5053
      "text/plain": [
5054
       "            X-1004    X-1008  X-1027  X-1095  X-1119    X-1156  X-1167  \\\n",
5055
       "gene_id                                                                  \n",
5056
       "A1BG      0.367371 -0.321928     0.0     0.0     0.0  0.978196     0.0   \n",
5057
       "A1BG-AS1  0.367371 -0.321928     0.0     0.0     0.0  0.978196     0.0   \n",
5058
       "A1CF      0.521051  0.570463     0.0     0.0     0.0 -0.340075     0.0   \n",
5059
       "\n",
5060
       "            X-1169  X-1172  X-1173   ...    X-5694    X-5696    X-5713  \\\n",
5061
       "gene_id                              ...                                 \n",
5062
       "A1BG      2.509696     0.0     0.0   ...       0.0  0.000000  0.000000   \n",
5063
       "A1BG-AS1  2.509696     0.0     0.0   ...       0.0  0.000000  0.000000   \n",
5064
       "A1CF     -0.286304     0.0     0.0   ...       0.0 -1.043943 -0.340075   \n",
5065
       "\n",
5066
       "            X-5717  X-5727  X-5739   X-5808    X-5959    X-5975  X-6047  \n",
5067
       "gene_id                                                                  \n",
5068
       "A1BG      0.560715     0.0     0.0  0.00000  0.000000 -0.483985     0.0  \n",
5069
       "A1BG-AS1  0.560715     0.0     0.0  0.00000  0.000000 -0.483985     0.0  \n",
5070
       "A1CF      0.000000     0.0     0.0 -0.37707 -0.643856 -0.588574     0.0  \n",
5071
       "\n",
5072
       "[3 rows x 375 columns]"
5073
      ]
5074
     },
5075
     "execution_count": 34,
5076
     "metadata": {},
5077
     "output_type": "execute_result"
5078
    }
5079
   ],
5080
   "source": [
5081
    "pdx = pdx.applymap(lambda x :  clean_logR(x, pos_seg_mean_thr, neg_seg_mean_thr))\n",
5082
    "pdx.head(3)"
5083
   ]
5084
  },
5085
  {
5086
   "cell_type": "code",
5087
   "execution_count": 35,
5088
   "metadata": {},
5089
   "outputs": [
5090
    {
5091
     "name": "stdout",
5092
     "output_type": "stream",
5093
     "text": [
5094
      "Mapped: 23313 \n",
5095
      "\tdirectly via main_mapper 21188 \n",
5096
      "\tvia alternative mapper 466 \n",
5097
      "\tvia one of multiple synonyms in alternative mapper 926 \n",
5098
      "\tLOC 733 \n",
5099
      "Unmapped: 200 \n",
5100
      "\trecognized symbols without Entrez ID 0 \n",
5101
      "\tmultiple query_ids map to the same target_id 0 \n",
5102
      "\tquery_ids map to multiple target_ids in the main mapper 0 \n",
5103
      "\tquery_ids map to multiple target_ids in the alternative mapper 52 \n",
5104
      "\tLOC not found in Entrez 29 \n",
5105
      "\tNot found at all: 119\n",
5106
      "Warning: query IDs mapping to duplicated target IDs in mapping table: 77\n",
5107
      "Warning: query IDs not mapped to any target IDs excluded: 200\n"
5108
     ]
5109
    },
5110
    {
5111
     "name": "stderr",
5112
     "output_type": "stream",
5113
     "text": [
5114
      "IDs mapped to multiple target IDs are kept:\n",
5115
      " [143872, 286464, 51463, 642826, 653067, 399761, 647060, 284565, 84631, 161176, 341019, 83869, 9502, 83871, 728113, 729438, 4253, 645425, 26165, 6218, 728695, 100132948, 100134869, 84316, 200030, 642658, 100302179, 401508, 119016, 84458, 574445, 26095, 84968, 80759, 3192, 387707, 79741]\n"
5116
     ]
5117
    },
5118
    {
5119
     "data": {
5120
      "text/html": [
5121
       "<div>\n",
5122
       "<style scoped>\n",
5123
       "    .dataframe tbody tr th:only-of-type {\n",
5124
       "        vertical-align: middle;\n",
5125
       "    }\n",
5126
       "\n",
5127
       "    .dataframe tbody tr th {\n",
5128
       "        vertical-align: top;\n",
5129
       "    }\n",
5130
       "\n",
5131
       "    .dataframe thead th {\n",
5132
       "        text-align: right;\n",
5133
       "    }\n",
5134
       "</style>\n",
5135
       "<table border=\"1\" class=\"dataframe\">\n",
5136
       "  <thead>\n",
5137
       "    <tr style=\"text-align: right;\">\n",
5138
       "      <th></th>\n",
5139
       "      <th>X-1004</th>\n",
5140
       "      <th>X-1008</th>\n",
5141
       "      <th>X-1027</th>\n",
5142
       "      <th>X-1095</th>\n",
5143
       "      <th>X-1119</th>\n",
5144
       "      <th>X-1156</th>\n",
5145
       "      <th>X-1167</th>\n",
5146
       "      <th>X-1169</th>\n",
5147
       "      <th>X-1172</th>\n",
5148
       "      <th>X-1173</th>\n",
5149
       "      <th>...</th>\n",
5150
       "      <th>X-5694</th>\n",
5151
       "      <th>X-5696</th>\n",
5152
       "      <th>X-5713</th>\n",
5153
       "      <th>X-5717</th>\n",
5154
       "      <th>X-5727</th>\n",
5155
       "      <th>X-5739</th>\n",
5156
       "      <th>X-5808</th>\n",
5157
       "      <th>X-5959</th>\n",
5158
       "      <th>X-5975</th>\n",
5159
       "      <th>X-6047</th>\n",
5160
       "    </tr>\n",
5161
       "    <tr>\n",
5162
       "      <th>gene_id</th>\n",
5163
       "      <th></th>\n",
5164
       "      <th></th>\n",
5165
       "      <th></th>\n",
5166
       "      <th></th>\n",
5167
       "      <th></th>\n",
5168
       "      <th></th>\n",
5169
       "      <th></th>\n",
5170
       "      <th></th>\n",
5171
       "      <th></th>\n",
5172
       "      <th></th>\n",
5173
       "      <th></th>\n",
5174
       "      <th></th>\n",
5175
       "      <th></th>\n",
5176
       "      <th></th>\n",
5177
       "      <th></th>\n",
5178
       "      <th></th>\n",
5179
       "      <th></th>\n",
5180
       "      <th></th>\n",
5181
       "      <th></th>\n",
5182
       "      <th></th>\n",
5183
       "      <th></th>\n",
5184
       "    </tr>\n",
5185
       "  </thead>\n",
5186
       "  <tbody>\n",
5187
       "    <tr>\n",
5188
       "      <th>1</th>\n",
5189
       "      <td>0.367371</td>\n",
5190
       "      <td>-0.321928</td>\n",
5191
       "      <td>0.0</td>\n",
5192
       "      <td>0.000000</td>\n",
5193
       "      <td>0.0</td>\n",
5194
       "      <td>0.978196</td>\n",
5195
       "      <td>0.0</td>\n",
5196
       "      <td>2.509696</td>\n",
5197
       "      <td>0.000000</td>\n",
5198
       "      <td>0.0</td>\n",
5199
       "      <td>...</td>\n",
5200
       "      <td>0.0</td>\n",
5201
       "      <td>0.0</td>\n",
5202
       "      <td>0.0</td>\n",
5203
       "      <td>0.560715</td>\n",
5204
       "      <td>0.0</td>\n",
5205
       "      <td>0.000000</td>\n",
5206
       "      <td>0.000000</td>\n",
5207
       "      <td>0.000000</td>\n",
5208
       "      <td>-0.483985</td>\n",
5209
       "      <td>0.0</td>\n",
5210
       "    </tr>\n",
5211
       "    <tr>\n",
5212
       "      <th>2</th>\n",
5213
       "      <td>0.761285</td>\n",
5214
       "      <td>0.000000</td>\n",
5215
       "      <td>0.0</td>\n",
5216
       "      <td>0.500802</td>\n",
5217
       "      <td>0.0</td>\n",
5218
       "      <td>0.700440</td>\n",
5219
       "      <td>0.0</td>\n",
5220
       "      <td>0.000000</td>\n",
5221
       "      <td>0.201634</td>\n",
5222
       "      <td>0.0</td>\n",
5223
       "      <td>...</td>\n",
5224
       "      <td>0.0</td>\n",
5225
       "      <td>0.0</td>\n",
5226
       "      <td>0.0</td>\n",
5227
       "      <td>0.739848</td>\n",
5228
       "      <td>0.0</td>\n",
5229
       "      <td>0.739848</td>\n",
5230
       "      <td>0.327687</td>\n",
5231
       "      <td>-0.494109</td>\n",
5232
       "      <td>-0.535332</td>\n",
5233
       "      <td>0.0</td>\n",
5234
       "    </tr>\n",
5235
       "    <tr>\n",
5236
       "      <th>3</th>\n",
5237
       "      <td>0.761285</td>\n",
5238
       "      <td>0.000000</td>\n",
5239
       "      <td>0.0</td>\n",
5240
       "      <td>0.500802</td>\n",
5241
       "      <td>0.0</td>\n",
5242
       "      <td>0.700440</td>\n",
5243
       "      <td>0.0</td>\n",
5244
       "      <td>0.000000</td>\n",
5245
       "      <td>0.201634</td>\n",
5246
       "      <td>0.0</td>\n",
5247
       "      <td>...</td>\n",
5248
       "      <td>0.0</td>\n",
5249
       "      <td>0.0</td>\n",
5250
       "      <td>0.0</td>\n",
5251
       "      <td>0.739848</td>\n",
5252
       "      <td>0.0</td>\n",
5253
       "      <td>0.739848</td>\n",
5254
       "      <td>0.327687</td>\n",
5255
       "      <td>-0.494109</td>\n",
5256
       "      <td>-0.535332</td>\n",
5257
       "      <td>0.0</td>\n",
5258
       "    </tr>\n",
5259
       "  </tbody>\n",
5260
       "</table>\n",
5261
       "<p>3 rows × 375 columns</p>\n",
5262
       "</div>"
5263
      ],
5264
      "text/plain": [
5265
       "           X-1004    X-1008  X-1027    X-1095  X-1119    X-1156  X-1167  \\\n",
5266
       "gene_id                                                                   \n",
5267
       "1        0.367371 -0.321928     0.0  0.000000     0.0  0.978196     0.0   \n",
5268
       "2        0.761285  0.000000     0.0  0.500802     0.0  0.700440     0.0   \n",
5269
       "3        0.761285  0.000000     0.0  0.500802     0.0  0.700440     0.0   \n",
5270
       "\n",
5271
       "           X-1169    X-1172  X-1173   ...    X-5694  X-5696  X-5713    X-5717  \\\n",
5272
       "gene_id                               ...                                       \n",
5273
       "1        2.509696  0.000000     0.0   ...       0.0     0.0     0.0  0.560715   \n",
5274
       "2        0.000000  0.201634     0.0   ...       0.0     0.0     0.0  0.739848   \n",
5275
       "3        0.000000  0.201634     0.0   ...       0.0     0.0     0.0  0.739848   \n",
5276
       "\n",
5277
       "         X-5727    X-5739    X-5808    X-5959    X-5975  X-6047  \n",
5278
       "gene_id                                                          \n",
5279
       "1           0.0  0.000000  0.000000  0.000000 -0.483985     0.0  \n",
5280
       "2           0.0  0.739848  0.327687 -0.494109 -0.535332     0.0  \n",
5281
       "3           0.0  0.739848  0.327687 -0.494109 -0.535332     0.0  \n",
5282
       "\n",
5283
       "[3 rows x 375 columns]"
5284
      ]
5285
     },
5286
     "execution_count": 35,
5287
     "metadata": {},
5288
     "output_type": "execute_result"
5289
    }
5290
   ],
5291
   "source": [
5292
    "pdx,query2target,not_mapped = apply_mappers(pdx, ncbi_symbols, ncbi_synonyms, verbose = True,handle_duplicates = \"keep\")\n",
5293
    "pdx.head(3)"
5294
   ]
5295
  },
5296
  {
5297
   "cell_type": "code",
5298
   "execution_count": 36,
5299
   "metadata": {},
5300
   "outputs": [
5301
    {
5302
     "data": {
5303
      "text/html": [
5304
       "<div>\n",
5305
       "<style scoped>\n",
5306
       "    .dataframe tbody tr th:only-of-type {\n",
5307
       "        vertical-align: middle;\n",
5308
       "    }\n",
5309
       "\n",
5310
       "    .dataframe tbody tr th {\n",
5311
       "        vertical-align: top;\n",
5312
       "    }\n",
5313
       "\n",
5314
       "    .dataframe thead th {\n",
5315
       "        text-align: right;\n",
5316
       "    }\n",
5317
       "</style>\n",
5318
       "<table border=\"1\" class=\"dataframe\">\n",
5319
       "  <thead>\n",
5320
       "    <tr style=\"text-align: right;\">\n",
5321
       "      <th></th>\n",
5322
       "      <th>X-1004</th>\n",
5323
       "      <th>X-1008</th>\n",
5324
       "      <th>X-1027</th>\n",
5325
       "      <th>X-1095</th>\n",
5326
       "      <th>X-1119</th>\n",
5327
       "      <th>X-1156</th>\n",
5328
       "      <th>X-1167</th>\n",
5329
       "      <th>X-1169</th>\n",
5330
       "      <th>X-1172</th>\n",
5331
       "      <th>X-1173</th>\n",
5332
       "      <th>...</th>\n",
5333
       "      <th>X-5694</th>\n",
5334
       "      <th>X-5696</th>\n",
5335
       "      <th>X-5713</th>\n",
5336
       "      <th>X-5717</th>\n",
5337
       "      <th>X-5727</th>\n",
5338
       "      <th>X-5739</th>\n",
5339
       "      <th>X-5808</th>\n",
5340
       "      <th>X-5959</th>\n",
5341
       "      <th>X-5975</th>\n",
5342
       "      <th>X-6047</th>\n",
5343
       "    </tr>\n",
5344
       "    <tr>\n",
5345
       "      <th>gene_id</th>\n",
5346
       "      <th></th>\n",
5347
       "      <th></th>\n",
5348
       "      <th></th>\n",
5349
       "      <th></th>\n",
5350
       "      <th></th>\n",
5351
       "      <th></th>\n",
5352
       "      <th></th>\n",
5353
       "      <th></th>\n",
5354
       "      <th></th>\n",
5355
       "      <th></th>\n",
5356
       "      <th></th>\n",
5357
       "      <th></th>\n",
5358
       "      <th></th>\n",
5359
       "      <th></th>\n",
5360
       "      <th></th>\n",
5361
       "      <th></th>\n",
5362
       "      <th></th>\n",
5363
       "      <th></th>\n",
5364
       "      <th></th>\n",
5365
       "      <th></th>\n",
5366
       "      <th></th>\n",
5367
       "    </tr>\n",
5368
       "  </thead>\n",
5369
       "  <tbody>\n",
5370
       "    <tr>\n",
5371
       "      <th>143872</th>\n",
5372
       "      <td>0.000000</td>\n",
5373
       "      <td>0.560715</td>\n",
5374
       "      <td>0.000000</td>\n",
5375
       "      <td>0.000000</td>\n",
5376
       "      <td>0.000000</td>\n",
5377
       "      <td>-0.330973</td>\n",
5378
       "      <td>0.000000</td>\n",
5379
       "      <td>-0.367732</td>\n",
5380
       "      <td>0.350497</td>\n",
5381
       "      <td>0.000000</td>\n",
5382
       "      <td>...</td>\n",
5383
       "      <td>0.000000</td>\n",
5384
       "      <td>-0.349235</td>\n",
5385
       "      <td>0.000000</td>\n",
5386
       "      <td>-0.321928</td>\n",
5387
       "      <td>-1.014500</td>\n",
5388
       "      <td>-0.588574</td>\n",
5389
       "      <td>0.000000</td>\n",
5390
       "      <td>0.000000</td>\n",
5391
       "      <td>0.000000</td>\n",
5392
       "      <td>-0.514573</td>\n",
5393
       "    </tr>\n",
5394
       "    <tr>\n",
5395
       "      <th>143872</th>\n",
5396
       "      <td>0.000000</td>\n",
5397
       "      <td>0.560715</td>\n",
5398
       "      <td>0.000000</td>\n",
5399
       "      <td>0.000000</td>\n",
5400
       "      <td>0.000000</td>\n",
5401
       "      <td>-0.330973</td>\n",
5402
       "      <td>0.000000</td>\n",
5403
       "      <td>-0.367732</td>\n",
5404
       "      <td>0.350497</td>\n",
5405
       "      <td>0.000000</td>\n",
5406
       "      <td>...</td>\n",
5407
       "      <td>0.000000</td>\n",
5408
       "      <td>-0.349235</td>\n",
5409
       "      <td>0.000000</td>\n",
5410
       "      <td>-0.321928</td>\n",
5411
       "      <td>-1.014500</td>\n",
5412
       "      <td>-0.588574</td>\n",
5413
       "      <td>0.000000</td>\n",
5414
       "      <td>0.000000</td>\n",
5415
       "      <td>0.000000</td>\n",
5416
       "      <td>-0.514573</td>\n",
5417
       "    </tr>\n",
5418
       "    <tr>\n",
5419
       "      <th>286464</th>\n",
5420
       "      <td>0.000000</td>\n",
5421
       "      <td>0.378512</td>\n",
5422
       "      <td>0.000000</td>\n",
5423
       "      <td>0.550901</td>\n",
5424
       "      <td>0.000000</td>\n",
5425
       "      <td>-0.524915</td>\n",
5426
       "      <td>0.000000</td>\n",
5427
       "      <td>-0.902389</td>\n",
5428
       "      <td>-1.321928</td>\n",
5429
       "      <td>0.367371</td>\n",
5430
       "      <td>...</td>\n",
5431
       "      <td>0.000000</td>\n",
5432
       "      <td>-1.014500</td>\n",
5433
       "      <td>0.718088</td>\n",
5434
       "      <td>0.000000</td>\n",
5435
       "      <td>-1.014500</td>\n",
5436
       "      <td>0.000000</td>\n",
5437
       "      <td>0.000000</td>\n",
5438
       "      <td>0.000000</td>\n",
5439
       "      <td>-1.494109</td>\n",
5440
       "      <td>-1.494109</td>\n",
5441
       "    </tr>\n",
5442
       "    <tr>\n",
5443
       "      <th>286464</th>\n",
5444
       "      <td>0.000000</td>\n",
5445
       "      <td>0.378512</td>\n",
5446
       "      <td>0.000000</td>\n",
5447
       "      <td>0.550901</td>\n",
5448
       "      <td>0.000000</td>\n",
5449
       "      <td>-0.524915</td>\n",
5450
       "      <td>0.000000</td>\n",
5451
       "      <td>-0.902389</td>\n",
5452
       "      <td>-1.321928</td>\n",
5453
       "      <td>0.367371</td>\n",
5454
       "      <td>...</td>\n",
5455
       "      <td>0.000000</td>\n",
5456
       "      <td>-1.014500</td>\n",
5457
       "      <td>0.718088</td>\n",
5458
       "      <td>0.000000</td>\n",
5459
       "      <td>-1.014500</td>\n",
5460
       "      <td>0.000000</td>\n",
5461
       "      <td>0.000000</td>\n",
5462
       "      <td>0.000000</td>\n",
5463
       "      <td>-1.494109</td>\n",
5464
       "      <td>-1.494109</td>\n",
5465
       "    </tr>\n",
5466
       "    <tr>\n",
5467
       "      <th>286464</th>\n",
5468
       "      <td>0.000000</td>\n",
5469
       "      <td>0.378512</td>\n",
5470
       "      <td>0.000000</td>\n",
5471
       "      <td>0.550901</td>\n",
5472
       "      <td>0.000000</td>\n",
5473
       "      <td>-0.524915</td>\n",
5474
       "      <td>0.000000</td>\n",
5475
       "      <td>-0.902389</td>\n",
5476
       "      <td>-1.321928</td>\n",
5477
       "      <td>0.367371</td>\n",
5478
       "      <td>...</td>\n",
5479
       "      <td>0.000000</td>\n",
5480
       "      <td>-1.014500</td>\n",
5481
       "      <td>0.718088</td>\n",
5482
       "      <td>0.000000</td>\n",
5483
       "      <td>-1.014500</td>\n",
5484
       "      <td>0.000000</td>\n",
5485
       "      <td>0.000000</td>\n",
5486
       "      <td>0.000000</td>\n",
5487
       "      <td>-1.494109</td>\n",
5488
       "      <td>-1.494109</td>\n",
5489
       "    </tr>\n",
5490
       "    <tr>\n",
5491
       "      <th>51463</th>\n",
5492
       "      <td>1.238787</td>\n",
5493
       "      <td>1.090853</td>\n",
5494
       "      <td>0.000000</td>\n",
5495
       "      <td>0.000000</td>\n",
5496
       "      <td>0.000000</td>\n",
5497
       "      <td>1.839960</td>\n",
5498
       "      <td>0.000000</td>\n",
5499
       "      <td>0.448901</td>\n",
5500
       "      <td>0.000000</td>\n",
5501
       "      <td>0.000000</td>\n",
5502
       "      <td>...</td>\n",
5503
       "      <td>0.469886</td>\n",
5504
       "      <td>0.000000</td>\n",
5505
       "      <td>0.618239</td>\n",
5506
       "      <td>0.201634</td>\n",
5507
       "      <td>0.000000</td>\n",
5508
       "      <td>-0.260152</td>\n",
5509
       "      <td>-0.349235</td>\n",
5510
       "      <td>0.000000</td>\n",
5511
       "      <td>0.469886</td>\n",
5512
       "      <td>0.000000</td>\n",
5513
       "    </tr>\n",
5514
       "    <tr>\n",
5515
       "      <th>51463</th>\n",
5516
       "      <td>1.238787</td>\n",
5517
       "      <td>1.090853</td>\n",
5518
       "      <td>0.000000</td>\n",
5519
       "      <td>0.000000</td>\n",
5520
       "      <td>0.000000</td>\n",
5521
       "      <td>1.839960</td>\n",
5522
       "      <td>0.000000</td>\n",
5523
       "      <td>0.448901</td>\n",
5524
       "      <td>0.000000</td>\n",
5525
       "      <td>0.000000</td>\n",
5526
       "      <td>...</td>\n",
5527
       "      <td>0.438293</td>\n",
5528
       "      <td>0.000000</td>\n",
5529
       "      <td>0.618239</td>\n",
5530
       "      <td>0.201634</td>\n",
5531
       "      <td>0.000000</td>\n",
5532
       "      <td>-0.260152</td>\n",
5533
       "      <td>0.000000</td>\n",
5534
       "      <td>0.000000</td>\n",
5535
       "      <td>0.469886</td>\n",
5536
       "      <td>0.000000</td>\n",
5537
       "    </tr>\n",
5538
       "    <tr>\n",
5539
       "      <th>642826</th>\n",
5540
       "      <td>0.608809</td>\n",
5541
       "      <td>0.859970</td>\n",
5542
       "      <td>0.531069</td>\n",
5543
       "      <td>0.000000</td>\n",
5544
       "      <td>0.000000</td>\n",
5545
       "      <td>0.871844</td>\n",
5546
       "      <td>0.000000</td>\n",
5547
       "      <td>-0.286304</td>\n",
5548
       "      <td>0.000000</td>\n",
5549
       "      <td>0.000000</td>\n",
5550
       "      <td>...</td>\n",
5551
       "      <td>0.000000</td>\n",
5552
       "      <td>-1.014500</td>\n",
5553
       "      <td>-0.473931</td>\n",
5554
       "      <td>0.000000</td>\n",
5555
       "      <td>0.618239</td>\n",
5556
       "      <td>0.000000</td>\n",
5557
       "      <td>-0.386468</td>\n",
5558
       "      <td>-1.494109</td>\n",
5559
       "      <td>-0.312939</td>\n",
5560
       "      <td>0.000000</td>\n",
5561
       "    </tr>\n",
5562
       "    <tr>\n",
5563
       "      <th>642826</th>\n",
5564
       "      <td>0.608809</td>\n",
5565
       "      <td>0.859970</td>\n",
5566
       "      <td>0.531069</td>\n",
5567
       "      <td>0.000000</td>\n",
5568
       "      <td>0.000000</td>\n",
5569
       "      <td>0.871844</td>\n",
5570
       "      <td>0.000000</td>\n",
5571
       "      <td>-0.286304</td>\n",
5572
       "      <td>0.000000</td>\n",
5573
       "      <td>0.000000</td>\n",
5574
       "      <td>...</td>\n",
5575
       "      <td>0.000000</td>\n",
5576
       "      <td>-1.014500</td>\n",
5577
       "      <td>-0.473931</td>\n",
5578
       "      <td>0.000000</td>\n",
5579
       "      <td>0.618239</td>\n",
5580
       "      <td>0.000000</td>\n",
5581
       "      <td>-0.386468</td>\n",
5582
       "      <td>-1.494109</td>\n",
5583
       "      <td>-0.312939</td>\n",
5584
       "      <td>0.000000</td>\n",
5585
       "    </tr>\n",
5586
       "    <tr>\n",
5587
       "      <th>653067</th>\n",
5588
       "      <td>0.000000</td>\n",
5589
       "      <td>0.378512</td>\n",
5590
       "      <td>0.000000</td>\n",
5591
       "      <td>0.550901</td>\n",
5592
       "      <td>0.000000</td>\n",
5593
       "      <td>-0.434403</td>\n",
5594
       "      <td>0.000000</td>\n",
5595
       "      <td>-0.902389</td>\n",
5596
       "      <td>1.220330</td>\n",
5597
       "      <td>0.367371</td>\n",
5598
       "      <td>...</td>\n",
5599
       "      <td>0.000000</td>\n",
5600
       "      <td>-1.014500</td>\n",
5601
       "      <td>0.580145</td>\n",
5602
       "      <td>0.000000</td>\n",
5603
       "      <td>-1.014500</td>\n",
5604
       "      <td>0.000000</td>\n",
5605
       "      <td>0.000000</td>\n",
5606
       "      <td>-0.577767</td>\n",
5607
       "      <td>-1.494109</td>\n",
5608
       "      <td>-1.494109</td>\n",
5609
       "    </tr>\n",
5610
       "    <tr>\n",
5611
       "      <th>653067</th>\n",
5612
       "      <td>0.000000</td>\n",
5613
       "      <td>0.378512</td>\n",
5614
       "      <td>0.000000</td>\n",
5615
       "      <td>0.550901</td>\n",
5616
       "      <td>0.000000</td>\n",
5617
       "      <td>-0.434403</td>\n",
5618
       "      <td>0.000000</td>\n",
5619
       "      <td>-0.902389</td>\n",
5620
       "      <td>1.220330</td>\n",
5621
       "      <td>0.367371</td>\n",
5622
       "      <td>...</td>\n",
5623
       "      <td>0.000000</td>\n",
5624
       "      <td>-1.014500</td>\n",
5625
       "      <td>0.580145</td>\n",
5626
       "      <td>0.000000</td>\n",
5627
       "      <td>-1.014500</td>\n",
5628
       "      <td>0.000000</td>\n",
5629
       "      <td>0.000000</td>\n",
5630
       "      <td>-0.577767</td>\n",
5631
       "      <td>-1.494109</td>\n",
5632
       "      <td>-1.494109</td>\n",
5633
       "    </tr>\n",
5634
       "    <tr>\n",
5635
       "      <th>653067</th>\n",
5636
       "      <td>0.000000</td>\n",
5637
       "      <td>0.378512</td>\n",
5638
       "      <td>0.000000</td>\n",
5639
       "      <td>0.550901</td>\n",
5640
       "      <td>0.000000</td>\n",
5641
       "      <td>-0.434403</td>\n",
5642
       "      <td>0.000000</td>\n",
5643
       "      <td>-0.902389</td>\n",
5644
       "      <td>1.220330</td>\n",
5645
       "      <td>0.367371</td>\n",
5646
       "      <td>...</td>\n",
5647
       "      <td>0.000000</td>\n",
5648
       "      <td>-1.014500</td>\n",
5649
       "      <td>0.580145</td>\n",
5650
       "      <td>0.000000</td>\n",
5651
       "      <td>-1.014500</td>\n",
5652
       "      <td>0.000000</td>\n",
5653
       "      <td>0.000000</td>\n",
5654
       "      <td>-0.577767</td>\n",
5655
       "      <td>-1.494109</td>\n",
5656
       "      <td>-1.494109</td>\n",
5657
       "    </tr>\n",
5658
       "    <tr>\n",
5659
       "      <th>653067</th>\n",
5660
       "      <td>0.000000</td>\n",
5661
       "      <td>0.378512</td>\n",
5662
       "      <td>0.000000</td>\n",
5663
       "      <td>0.550901</td>\n",
5664
       "      <td>0.000000</td>\n",
5665
       "      <td>-0.434403</td>\n",
5666
       "      <td>0.000000</td>\n",
5667
       "      <td>-0.902389</td>\n",
5668
       "      <td>1.220330</td>\n",
5669
       "      <td>0.367371</td>\n",
5670
       "      <td>...</td>\n",
5671
       "      <td>0.000000</td>\n",
5672
       "      <td>-1.014500</td>\n",
5673
       "      <td>0.580145</td>\n",
5674
       "      <td>0.000000</td>\n",
5675
       "      <td>-1.014500</td>\n",
5676
       "      <td>0.000000</td>\n",
5677
       "      <td>0.000000</td>\n",
5678
       "      <td>-0.577767</td>\n",
5679
       "      <td>-1.494109</td>\n",
5680
       "      <td>-1.494109</td>\n",
5681
       "    </tr>\n",
5682
       "    <tr>\n",
5683
       "      <th>399761</th>\n",
5684
       "      <td>0.531069</td>\n",
5685
       "      <td>0.718088</td>\n",
5686
       "      <td>0.000000</td>\n",
5687
       "      <td>0.000000</td>\n",
5688
       "      <td>0.000000</td>\n",
5689
       "      <td>-0.251539</td>\n",
5690
       "      <td>0.000000</td>\n",
5691
       "      <td>-0.286304</td>\n",
5692
       "      <td>0.000000</td>\n",
5693
       "      <td>0.790772</td>\n",
5694
       "      <td>...</td>\n",
5695
       "      <td>0.367371</td>\n",
5696
       "      <td>-1.535332</td>\n",
5697
       "      <td>0.000000</td>\n",
5698
       "      <td>0.000000</td>\n",
5699
       "      <td>0.000000</td>\n",
5700
       "      <td>0.000000</td>\n",
5701
       "      <td>-0.599462</td>\n",
5702
       "      <td>-0.588574</td>\n",
5703
       "      <td>0.000000</td>\n",
5704
       "      <td>0.000000</td>\n",
5705
       "    </tr>\n",
5706
       "    <tr>\n",
5707
       "      <th>399761</th>\n",
5708
       "      <td>0.531069</td>\n",
5709
       "      <td>0.718088</td>\n",
5710
       "      <td>0.000000</td>\n",
5711
       "      <td>0.000000</td>\n",
5712
       "      <td>0.000000</td>\n",
5713
       "      <td>-0.251539</td>\n",
5714
       "      <td>0.000000</td>\n",
5715
       "      <td>-0.286304</td>\n",
5716
       "      <td>0.000000</td>\n",
5717
       "      <td>0.790772</td>\n",
5718
       "      <td>...</td>\n",
5719
       "      <td>0.367371</td>\n",
5720
       "      <td>-1.535332</td>\n",
5721
       "      <td>0.000000</td>\n",
5722
       "      <td>0.000000</td>\n",
5723
       "      <td>0.000000</td>\n",
5724
       "      <td>0.000000</td>\n",
5725
       "      <td>-0.599462</td>\n",
5726
       "      <td>-0.588574</td>\n",
5727
       "      <td>0.000000</td>\n",
5728
       "      <td>0.000000</td>\n",
5729
       "    </tr>\n",
5730
       "    <tr>\n",
5731
       "      <th>647060</th>\n",
5732
       "      <td>0.000000</td>\n",
5733
       "      <td>0.000000</td>\n",
5734
       "      <td>0.000000</td>\n",
5735
       "      <td>0.000000</td>\n",
5736
       "      <td>0.000000</td>\n",
5737
       "      <td>-0.312939</td>\n",
5738
       "      <td>0.000000</td>\n",
5739
       "      <td>-0.321928</td>\n",
5740
       "      <td>0.000000</td>\n",
5741
       "      <td>0.618239</td>\n",
5742
       "      <td>...</td>\n",
5743
       "      <td>0.000000</td>\n",
5744
       "      <td>-0.621488</td>\n",
5745
       "      <td>0.000000</td>\n",
5746
       "      <td>-0.377070</td>\n",
5747
       "      <td>-0.749038</td>\n",
5748
       "      <td>-0.405451</td>\n",
5749
       "      <td>0.000000</td>\n",
5750
       "      <td>-0.395929</td>\n",
5751
       "      <td>0.000000</td>\n",
5752
       "      <td>0.000000</td>\n",
5753
       "    </tr>\n",
5754
       "    <tr>\n",
5755
       "      <th>647060</th>\n",
5756
       "      <td>0.000000</td>\n",
5757
       "      <td>0.000000</td>\n",
5758
       "      <td>0.000000</td>\n",
5759
       "      <td>0.000000</td>\n",
5760
       "      <td>0.000000</td>\n",
5761
       "      <td>-0.312939</td>\n",
5762
       "      <td>0.000000</td>\n",
5763
       "      <td>-0.321928</td>\n",
5764
       "      <td>0.000000</td>\n",
5765
       "      <td>0.618239</td>\n",
5766
       "      <td>...</td>\n",
5767
       "      <td>0.000000</td>\n",
5768
       "      <td>-0.621488</td>\n",
5769
       "      <td>0.000000</td>\n",
5770
       "      <td>-0.377070</td>\n",
5771
       "      <td>-0.749038</td>\n",
5772
       "      <td>-0.405451</td>\n",
5773
       "      <td>0.000000</td>\n",
5774
       "      <td>-0.395929</td>\n",
5775
       "      <td>0.000000</td>\n",
5776
       "      <td>0.000000</td>\n",
5777
       "    </tr>\n",
5778
       "    <tr>\n",
5779
       "      <th>284565</th>\n",
5780
       "      <td>1.238787</td>\n",
5781
       "      <td>1.090853</td>\n",
5782
       "      <td>0.000000</td>\n",
5783
       "      <td>0.000000</td>\n",
5784
       "      <td>0.000000</td>\n",
5785
       "      <td>1.570463</td>\n",
5786
       "      <td>0.000000</td>\n",
5787
       "      <td>0.448901</td>\n",
5788
       "      <td>0.411426</td>\n",
5789
       "      <td>0.448901</td>\n",
5790
       "      <td>...</td>\n",
5791
       "      <td>0.469886</td>\n",
5792
       "      <td>-0.504305</td>\n",
5793
       "      <td>0.618239</td>\n",
5794
       "      <td>0.201634</td>\n",
5795
       "      <td>0.000000</td>\n",
5796
       "      <td>-0.260152</td>\n",
5797
       "      <td>0.000000</td>\n",
5798
       "      <td>0.000000</td>\n",
5799
       "      <td>0.469886</td>\n",
5800
       "      <td>0.000000</td>\n",
5801
       "    </tr>\n",
5802
       "    <tr>\n",
5803
       "      <th>284565</th>\n",
5804
       "      <td>1.238787</td>\n",
5805
       "      <td>1.090853</td>\n",
5806
       "      <td>0.000000</td>\n",
5807
       "      <td>0.000000</td>\n",
5808
       "      <td>0.000000</td>\n",
5809
       "      <td>1.310340</td>\n",
5810
       "      <td>0.000000</td>\n",
5811
       "      <td>0.448901</td>\n",
5812
       "      <td>0.411426</td>\n",
5813
       "      <td>0.448901</td>\n",
5814
       "      <td>...</td>\n",
5815
       "      <td>0.469886</td>\n",
5816
       "      <td>-0.524915</td>\n",
5817
       "      <td>0.618239</td>\n",
5818
       "      <td>0.201634</td>\n",
5819
       "      <td>0.000000</td>\n",
5820
       "      <td>-0.260152</td>\n",
5821
       "      <td>0.000000</td>\n",
5822
       "      <td>0.000000</td>\n",
5823
       "      <td>0.469886</td>\n",
5824
       "      <td>0.000000</td>\n",
5825
       "    </tr>\n",
5826
       "    <tr>\n",
5827
       "      <th>84631</th>\n",
5828
       "      <td>-0.823677</td>\n",
5829
       "      <td>-0.524915</td>\n",
5830
       "      <td>0.000000</td>\n",
5831
       "      <td>0.632268</td>\n",
5832
       "      <td>0.000000</td>\n",
5833
       "      <td>-1.074001</td>\n",
5834
       "      <td>0.000000</td>\n",
5835
       "      <td>-0.875672</td>\n",
5836
       "      <td>-0.678072</td>\n",
5837
       "      <td>0.378512</td>\n",
5838
       "      <td>...</td>\n",
5839
       "      <td>0.000000</td>\n",
5840
       "      <td>-1.014500</td>\n",
5841
       "      <td>0.500802</td>\n",
5842
       "      <td>0.000000</td>\n",
5843
       "      <td>0.000000</td>\n",
5844
       "      <td>0.000000</td>\n",
5845
       "      <td>0.000000</td>\n",
5846
       "      <td>-0.545824</td>\n",
5847
       "      <td>-0.545824</td>\n",
5848
       "      <td>-0.535332</td>\n",
5849
       "    </tr>\n",
5850
       "    <tr>\n",
5851
       "      <th>84631</th>\n",
5852
       "      <td>-0.823677</td>\n",
5853
       "      <td>-0.524915</td>\n",
5854
       "      <td>0.000000</td>\n",
5855
       "      <td>0.632268</td>\n",
5856
       "      <td>0.000000</td>\n",
5857
       "      <td>-1.074001</td>\n",
5858
       "      <td>0.000000</td>\n",
5859
       "      <td>-0.875672</td>\n",
5860
       "      <td>-0.678072</td>\n",
5861
       "      <td>0.378512</td>\n",
5862
       "      <td>...</td>\n",
5863
       "      <td>0.000000</td>\n",
5864
       "      <td>-1.014500</td>\n",
5865
       "      <td>0.500802</td>\n",
5866
       "      <td>0.000000</td>\n",
5867
       "      <td>0.000000</td>\n",
5868
       "      <td>0.000000</td>\n",
5869
       "      <td>0.000000</td>\n",
5870
       "      <td>-0.545824</td>\n",
5871
       "      <td>-0.545824</td>\n",
5872
       "      <td>-0.535332</td>\n",
5873
       "    </tr>\n",
5874
       "    <tr>\n",
5875
       "      <th>161176</th>\n",
5876
       "      <td>0.000000</td>\n",
5877
       "      <td>0.000000</td>\n",
5878
       "      <td>0.000000</td>\n",
5879
       "      <td>0.000000</td>\n",
5880
       "      <td>0.000000</td>\n",
5881
       "      <td>0.000000</td>\n",
5882
       "      <td>0.000000</td>\n",
5883
       "      <td>-0.358454</td>\n",
5884
       "      <td>0.250962</td>\n",
5885
       "      <td>0.339137</td>\n",
5886
       "      <td>...</td>\n",
5887
       "      <td>-0.875672</td>\n",
5888
       "      <td>0.000000</td>\n",
5889
       "      <td>-0.483985</td>\n",
5890
       "      <td>0.000000</td>\n",
5891
       "      <td>0.000000</td>\n",
5892
       "      <td>-0.610433</td>\n",
5893
       "      <td>0.000000</td>\n",
5894
       "      <td>0.411426</td>\n",
5895
       "      <td>0.000000</td>\n",
5896
       "      <td>0.000000</td>\n",
5897
       "    </tr>\n",
5898
       "    <tr>\n",
5899
       "      <th>161176</th>\n",
5900
       "      <td>0.000000</td>\n",
5901
       "      <td>0.000000</td>\n",
5902
       "      <td>0.000000</td>\n",
5903
       "      <td>0.000000</td>\n",
5904
       "      <td>0.000000</td>\n",
5905
       "      <td>0.000000</td>\n",
5906
       "      <td>0.000000</td>\n",
5907
       "      <td>-0.358454</td>\n",
5908
       "      <td>0.250962</td>\n",
5909
       "      <td>0.339137</td>\n",
5910
       "      <td>...</td>\n",
5911
       "      <td>-0.875672</td>\n",
5912
       "      <td>0.000000</td>\n",
5913
       "      <td>-0.483985</td>\n",
5914
       "      <td>0.000000</td>\n",
5915
       "      <td>0.000000</td>\n",
5916
       "      <td>-0.610433</td>\n",
5917
       "      <td>0.000000</td>\n",
5918
       "      <td>0.411426</td>\n",
5919
       "      <td>0.000000</td>\n",
5920
       "      <td>0.000000</td>\n",
5921
       "    </tr>\n",
5922
       "    <tr>\n",
5923
       "      <th>341019</th>\n",
5924
       "      <td>0.959770</td>\n",
5925
       "      <td>0.000000</td>\n",
5926
       "      <td>0.000000</td>\n",
5927
       "      <td>0.000000</td>\n",
5928
       "      <td>0.000000</td>\n",
5929
       "      <td>-0.304006</td>\n",
5930
       "      <td>0.000000</td>\n",
5931
       "      <td>-0.349235</td>\n",
5932
       "      <td>0.000000</td>\n",
5933
       "      <td>0.000000</td>\n",
5934
       "      <td>...</td>\n",
5935
       "      <td>0.000000</td>\n",
5936
       "      <td>-0.666576</td>\n",
5937
       "      <td>0.000000</td>\n",
5938
       "      <td>-0.304006</td>\n",
5939
       "      <td>-1.000000</td>\n",
5940
       "      <td>-0.610433</td>\n",
5941
       "      <td>0.000000</td>\n",
5942
       "      <td>0.000000</td>\n",
5943
       "      <td>-0.444184</td>\n",
5944
       "      <td>0.000000</td>\n",
5945
       "    </tr>\n",
5946
       "    <tr>\n",
5947
       "      <th>341019</th>\n",
5948
       "      <td>0.959770</td>\n",
5949
       "      <td>0.000000</td>\n",
5950
       "      <td>0.000000</td>\n",
5951
       "      <td>0.000000</td>\n",
5952
       "      <td>0.000000</td>\n",
5953
       "      <td>-0.304006</td>\n",
5954
       "      <td>0.000000</td>\n",
5955
       "      <td>-0.349235</td>\n",
5956
       "      <td>0.000000</td>\n",
5957
       "      <td>0.000000</td>\n",
5958
       "      <td>...</td>\n",
5959
       "      <td>0.000000</td>\n",
5960
       "      <td>-0.312939</td>\n",
5961
       "      <td>0.000000</td>\n",
5962
       "      <td>-0.304006</td>\n",
5963
       "      <td>-1.000000</td>\n",
5964
       "      <td>-0.610433</td>\n",
5965
       "      <td>0.000000</td>\n",
5966
       "      <td>0.000000</td>\n",
5967
       "      <td>-0.444184</td>\n",
5968
       "      <td>0.000000</td>\n",
5969
       "    </tr>\n",
5970
       "    <tr>\n",
5971
       "      <th>83869</th>\n",
5972
       "      <td>-3.000000</td>\n",
5973
       "      <td>-3.184425</td>\n",
5974
       "      <td>-3.321928</td>\n",
5975
       "      <td>-3.000000</td>\n",
5976
       "      <td>-2.599462</td>\n",
5977
       "      <td>-3.556393</td>\n",
5978
       "      <td>-2.785875</td>\n",
5979
       "      <td>-0.875672</td>\n",
5980
       "      <td>-2.736966</td>\n",
5981
       "      <td>-2.514573</td>\n",
5982
       "      <td>...</td>\n",
5983
       "      <td>-2.556393</td>\n",
5984
       "      <td>-3.943416</td>\n",
5985
       "      <td>-2.785875</td>\n",
5986
       "      <td>-2.152003</td>\n",
5987
       "      <td>-3.556393</td>\n",
5988
       "      <td>-3.251539</td>\n",
5989
       "      <td>-2.736966</td>\n",
5990
       "      <td>-3.473931</td>\n",
5991
       "      <td>-3.120294</td>\n",
5992
       "      <td>-3.643856</td>\n",
5993
       "    </tr>\n",
5994
       "    <tr>\n",
5995
       "      <th>83869</th>\n",
5996
       "      <td>-3.000000</td>\n",
5997
       "      <td>-3.184425</td>\n",
5998
       "      <td>-3.321928</td>\n",
5999
       "      <td>-3.000000</td>\n",
6000
       "      <td>-2.599462</td>\n",
6001
       "      <td>-3.556393</td>\n",
6002
       "      <td>-2.785875</td>\n",
6003
       "      <td>-0.875672</td>\n",
6004
       "      <td>-2.736966</td>\n",
6005
       "      <td>-2.514573</td>\n",
6006
       "      <td>...</td>\n",
6007
       "      <td>-2.556393</td>\n",
6008
       "      <td>-3.943416</td>\n",
6009
       "      <td>-2.785875</td>\n",
6010
       "      <td>-2.152003</td>\n",
6011
       "      <td>-3.556393</td>\n",
6012
       "      <td>-3.251539</td>\n",
6013
       "      <td>-2.736966</td>\n",
6014
       "      <td>-3.473931</td>\n",
6015
       "      <td>-3.120294</td>\n",
6016
       "      <td>-3.643856</td>\n",
6017
       "    </tr>\n",
6018
       "    <tr>\n",
6019
       "      <th>9502</th>\n",
6020
       "      <td>0.000000</td>\n",
6021
       "      <td>0.378512</td>\n",
6022
       "      <td>0.000000</td>\n",
6023
       "      <td>0.550901</td>\n",
6024
       "      <td>0.000000</td>\n",
6025
       "      <td>-0.434403</td>\n",
6026
       "      <td>0.000000</td>\n",
6027
       "      <td>-0.902389</td>\n",
6028
       "      <td>1.220330</td>\n",
6029
       "      <td>0.367371</td>\n",
6030
       "      <td>...</td>\n",
6031
       "      <td>0.000000</td>\n",
6032
       "      <td>-1.014500</td>\n",
6033
       "      <td>0.580145</td>\n",
6034
       "      <td>0.000000</td>\n",
6035
       "      <td>-1.014500</td>\n",
6036
       "      <td>0.000000</td>\n",
6037
       "      <td>0.000000</td>\n",
6038
       "      <td>-0.577767</td>\n",
6039
       "      <td>-1.494109</td>\n",
6040
       "      <td>-1.494109</td>\n",
6041
       "    </tr>\n",
6042
       "    <tr>\n",
6043
       "      <th>9502</th>\n",
6044
       "      <td>0.000000</td>\n",
6045
       "      <td>0.378512</td>\n",
6046
       "      <td>0.000000</td>\n",
6047
       "      <td>0.550901</td>\n",
6048
       "      <td>0.000000</td>\n",
6049
       "      <td>-0.434403</td>\n",
6050
       "      <td>0.000000</td>\n",
6051
       "      <td>-0.902389</td>\n",
6052
       "      <td>1.220330</td>\n",
6053
       "      <td>0.367371</td>\n",
6054
       "      <td>...</td>\n",
6055
       "      <td>0.000000</td>\n",
6056
       "      <td>-1.014500</td>\n",
6057
       "      <td>0.580145</td>\n",
6058
       "      <td>0.000000</td>\n",
6059
       "      <td>-1.014500</td>\n",
6060
       "      <td>0.000000</td>\n",
6061
       "      <td>0.000000</td>\n",
6062
       "      <td>-0.577767</td>\n",
6063
       "      <td>-1.494109</td>\n",
6064
       "      <td>-1.494109</td>\n",
6065
       "    </tr>\n",
6066
       "    <tr>\n",
6067
       "      <th>83871</th>\n",
6068
       "      <td>0.000000</td>\n",
6069
       "      <td>0.000000</td>\n",
6070
       "      <td>0.000000</td>\n",
6071
       "      <td>0.500802</td>\n",
6072
       "      <td>0.000000</td>\n",
6073
       "      <td>-0.304006</td>\n",
6074
       "      <td>0.000000</td>\n",
6075
       "      <td>-0.349235</td>\n",
6076
       "      <td>0.327687</td>\n",
6077
       "      <td>0.000000</td>\n",
6078
       "      <td>...</td>\n",
6079
       "      <td>0.000000</td>\n",
6080
       "      <td>0.000000</td>\n",
6081
       "      <td>-0.463947</td>\n",
6082
       "      <td>-0.358454</td>\n",
6083
       "      <td>0.000000</td>\n",
6084
       "      <td>0.000000</td>\n",
6085
       "      <td>0.000000</td>\n",
6086
       "      <td>0.448901</td>\n",
6087
       "      <td>-0.535332</td>\n",
6088
       "      <td>0.000000</td>\n",
6089
       "    </tr>\n",
6090
       "    <tr>\n",
6091
       "      <th>...</th>\n",
6092
       "      <td>...</td>\n",
6093
       "      <td>...</td>\n",
6094
       "      <td>...</td>\n",
6095
       "      <td>...</td>\n",
6096
       "      <td>...</td>\n",
6097
       "      <td>...</td>\n",
6098
       "      <td>...</td>\n",
6099
       "      <td>...</td>\n",
6100
       "      <td>...</td>\n",
6101
       "      <td>...</td>\n",
6102
       "      <td>...</td>\n",
6103
       "      <td>...</td>\n",
6104
       "      <td>...</td>\n",
6105
       "      <td>...</td>\n",
6106
       "      <td>...</td>\n",
6107
       "      <td>...</td>\n",
6108
       "      <td>...</td>\n",
6109
       "      <td>...</td>\n",
6110
       "      <td>...</td>\n",
6111
       "      <td>...</td>\n",
6112
       "      <td>...</td>\n",
6113
       "    </tr>\n",
6114
       "    <tr>\n",
6115
       "      <th>100134869</th>\n",
6116
       "      <td>0.000000</td>\n",
6117
       "      <td>0.207893</td>\n",
6118
       "      <td>0.000000</td>\n",
6119
       "      <td>0.000000</td>\n",
6120
       "      <td>0.000000</td>\n",
6121
       "      <td>0.000000</td>\n",
6122
       "      <td>0.000000</td>\n",
6123
       "      <td>0.000000</td>\n",
6124
       "      <td>0.232661</td>\n",
6125
       "      <td>0.000000</td>\n",
6126
       "      <td>...</td>\n",
6127
       "      <td>0.000000</td>\n",
6128
       "      <td>0.389567</td>\n",
6129
       "      <td>0.000000</td>\n",
6130
       "      <td>0.000000</td>\n",
6131
       "      <td>-0.463947</td>\n",
6132
       "      <td>0.000000</td>\n",
6133
       "      <td>-0.367732</td>\n",
6134
       "      <td>0.790772</td>\n",
6135
       "      <td>0.000000</td>\n",
6136
       "      <td>-0.535332</td>\n",
6137
       "    </tr>\n",
6138
       "    <tr>\n",
6139
       "      <th>100134869</th>\n",
6140
       "      <td>0.000000</td>\n",
6141
       "      <td>0.207893</td>\n",
6142
       "      <td>0.000000</td>\n",
6143
       "      <td>0.000000</td>\n",
6144
       "      <td>0.000000</td>\n",
6145
       "      <td>0.000000</td>\n",
6146
       "      <td>0.000000</td>\n",
6147
       "      <td>0.000000</td>\n",
6148
       "      <td>0.232661</td>\n",
6149
       "      <td>0.000000</td>\n",
6150
       "      <td>...</td>\n",
6151
       "      <td>0.000000</td>\n",
6152
       "      <td>0.389567</td>\n",
6153
       "      <td>0.000000</td>\n",
6154
       "      <td>0.000000</td>\n",
6155
       "      <td>-0.463947</td>\n",
6156
       "      <td>0.000000</td>\n",
6157
       "      <td>-0.367732</td>\n",
6158
       "      <td>0.790772</td>\n",
6159
       "      <td>0.000000</td>\n",
6160
       "      <td>-0.535332</td>\n",
6161
       "    </tr>\n",
6162
       "    <tr>\n",
6163
       "      <th>84316</th>\n",
6164
       "      <td>-0.251539</td>\n",
6165
       "      <td>0.000000</td>\n",
6166
       "      <td>0.000000</td>\n",
6167
       "      <td>0.000000</td>\n",
6168
       "      <td>0.000000</td>\n",
6169
       "      <td>-0.340075</td>\n",
6170
       "      <td>0.000000</td>\n",
6171
       "      <td>-0.349235</td>\n",
6172
       "      <td>-0.588574</td>\n",
6173
       "      <td>0.000000</td>\n",
6174
       "      <td>...</td>\n",
6175
       "      <td>0.000000</td>\n",
6176
       "      <td>-0.985645</td>\n",
6177
       "      <td>-0.358454</td>\n",
6178
       "      <td>-0.340075</td>\n",
6179
       "      <td>-0.971431</td>\n",
6180
       "      <td>-0.545824</td>\n",
6181
       "      <td>0.310340</td>\n",
6182
       "      <td>0.438293</td>\n",
6183
       "      <td>-0.473931</td>\n",
6184
       "      <td>0.000000</td>\n",
6185
       "    </tr>\n",
6186
       "    <tr>\n",
6187
       "      <th>84316</th>\n",
6188
       "      <td>-0.810966</td>\n",
6189
       "      <td>-0.689660</td>\n",
6190
       "      <td>0.000000</td>\n",
6191
       "      <td>0.000000</td>\n",
6192
       "      <td>0.000000</td>\n",
6193
       "      <td>0.718088</td>\n",
6194
       "      <td>0.000000</td>\n",
6195
       "      <td>0.000000</td>\n",
6196
       "      <td>0.731183</td>\n",
6197
       "      <td>0.367371</td>\n",
6198
       "      <td>...</td>\n",
6199
       "      <td>0.000000</td>\n",
6200
       "      <td>0.000000</td>\n",
6201
       "      <td>0.000000</td>\n",
6202
       "      <td>-0.434403</td>\n",
6203
       "      <td>1.060047</td>\n",
6204
       "      <td>0.490570</td>\n",
6205
       "      <td>0.000000</td>\n",
6206
       "      <td>-0.632629</td>\n",
6207
       "      <td>-0.655172</td>\n",
6208
       "      <td>0.000000</td>\n",
6209
       "    </tr>\n",
6210
       "    <tr>\n",
6211
       "      <th>200030</th>\n",
6212
       "      <td>1.238787</td>\n",
6213
       "      <td>1.090853</td>\n",
6214
       "      <td>0.000000</td>\n",
6215
       "      <td>0.000000</td>\n",
6216
       "      <td>0.000000</td>\n",
6217
       "      <td>1.839960</td>\n",
6218
       "      <td>0.000000</td>\n",
6219
       "      <td>0.448901</td>\n",
6220
       "      <td>0.000000</td>\n",
6221
       "      <td>0.000000</td>\n",
6222
       "      <td>...</td>\n",
6223
       "      <td>0.438293</td>\n",
6224
       "      <td>0.000000</td>\n",
6225
       "      <td>0.618239</td>\n",
6226
       "      <td>0.201634</td>\n",
6227
       "      <td>0.000000</td>\n",
6228
       "      <td>-0.260152</td>\n",
6229
       "      <td>0.000000</td>\n",
6230
       "      <td>0.000000</td>\n",
6231
       "      <td>0.469886</td>\n",
6232
       "      <td>0.000000</td>\n",
6233
       "    </tr>\n",
6234
       "    <tr>\n",
6235
       "      <th>200030</th>\n",
6236
       "      <td>1.238787</td>\n",
6237
       "      <td>1.090853</td>\n",
6238
       "      <td>0.000000</td>\n",
6239
       "      <td>0.000000</td>\n",
6240
       "      <td>0.000000</td>\n",
6241
       "      <td>1.839960</td>\n",
6242
       "      <td>0.000000</td>\n",
6243
       "      <td>0.448901</td>\n",
6244
       "      <td>0.000000</td>\n",
6245
       "      <td>0.000000</td>\n",
6246
       "      <td>...</td>\n",
6247
       "      <td>0.438293</td>\n",
6248
       "      <td>0.000000</td>\n",
6249
       "      <td>0.618239</td>\n",
6250
       "      <td>0.201634</td>\n",
6251
       "      <td>0.000000</td>\n",
6252
       "      <td>-0.260152</td>\n",
6253
       "      <td>0.000000</td>\n",
6254
       "      <td>0.000000</td>\n",
6255
       "      <td>0.469886</td>\n",
6256
       "      <td>0.000000</td>\n",
6257
       "    </tr>\n",
6258
       "    <tr>\n",
6259
       "      <th>642658</th>\n",
6260
       "      <td>1.049631</td>\n",
6261
       "      <td>1.358959</td>\n",
6262
       "      <td>0.000000</td>\n",
6263
       "      <td>0.599318</td>\n",
6264
       "      <td>0.000000</td>\n",
6265
       "      <td>0.000000</td>\n",
6266
       "      <td>0.000000</td>\n",
6267
       "      <td>0.321928</td>\n",
6268
       "      <td>0.700440</td>\n",
6269
       "      <td>0.298658</td>\n",
6270
       "      <td>...</td>\n",
6271
       "      <td>0.632268</td>\n",
6272
       "      <td>0.000000</td>\n",
6273
       "      <td>0.769772</td>\n",
6274
       "      <td>0.232661</td>\n",
6275
       "      <td>1.121015</td>\n",
6276
       "      <td>0.831877</td>\n",
6277
       "      <td>-0.367732</td>\n",
6278
       "      <td>0.000000</td>\n",
6279
       "      <td>1.121015</td>\n",
6280
       "      <td>0.459432</td>\n",
6281
       "    </tr>\n",
6282
       "    <tr>\n",
6283
       "      <th>642658</th>\n",
6284
       "      <td>1.049631</td>\n",
6285
       "      <td>1.358959</td>\n",
6286
       "      <td>0.000000</td>\n",
6287
       "      <td>0.599318</td>\n",
6288
       "      <td>0.000000</td>\n",
6289
       "      <td>0.000000</td>\n",
6290
       "      <td>0.000000</td>\n",
6291
       "      <td>0.321928</td>\n",
6292
       "      <td>0.700440</td>\n",
6293
       "      <td>0.298658</td>\n",
6294
       "      <td>...</td>\n",
6295
       "      <td>0.632268</td>\n",
6296
       "      <td>0.000000</td>\n",
6297
       "      <td>0.769772</td>\n",
6298
       "      <td>0.232661</td>\n",
6299
       "      <td>1.121015</td>\n",
6300
       "      <td>0.831877</td>\n",
6301
       "      <td>-0.367732</td>\n",
6302
       "      <td>0.000000</td>\n",
6303
       "      <td>1.121015</td>\n",
6304
       "      <td>0.459432</td>\n",
6305
       "    </tr>\n",
6306
       "    <tr>\n",
6307
       "      <th>100302179</th>\n",
6308
       "      <td>0.778209</td>\n",
6309
       "      <td>0.000000</td>\n",
6310
       "      <td>0.000000</td>\n",
6311
       "      <td>0.000000</td>\n",
6312
       "      <td>0.000000</td>\n",
6313
       "      <td>-0.340075</td>\n",
6314
       "      <td>0.000000</td>\n",
6315
       "      <td>0.000000</td>\n",
6316
       "      <td>0.000000</td>\n",
6317
       "      <td>0.000000</td>\n",
6318
       "      <td>...</td>\n",
6319
       "      <td>0.000000</td>\n",
6320
       "      <td>0.000000</td>\n",
6321
       "      <td>0.201634</td>\n",
6322
       "      <td>-0.251539</td>\n",
6323
       "      <td>0.000000</td>\n",
6324
       "      <td>-0.463947</td>\n",
6325
       "      <td>0.000000</td>\n",
6326
       "      <td>0.000000</td>\n",
6327
       "      <td>0.000000</td>\n",
6328
       "      <td>0.000000</td>\n",
6329
       "    </tr>\n",
6330
       "    <tr>\n",
6331
       "      <th>100302179</th>\n",
6332
       "      <td>0.778209</td>\n",
6333
       "      <td>0.000000</td>\n",
6334
       "      <td>0.000000</td>\n",
6335
       "      <td>0.000000</td>\n",
6336
       "      <td>0.000000</td>\n",
6337
       "      <td>-0.340075</td>\n",
6338
       "      <td>0.000000</td>\n",
6339
       "      <td>0.000000</td>\n",
6340
       "      <td>0.000000</td>\n",
6341
       "      <td>0.000000</td>\n",
6342
       "      <td>...</td>\n",
6343
       "      <td>0.000000</td>\n",
6344
       "      <td>0.000000</td>\n",
6345
       "      <td>0.201634</td>\n",
6346
       "      <td>-0.251539</td>\n",
6347
       "      <td>0.000000</td>\n",
6348
       "      <td>-0.463947</td>\n",
6349
       "      <td>0.000000</td>\n",
6350
       "      <td>0.000000</td>\n",
6351
       "      <td>0.000000</td>\n",
6352
       "      <td>0.000000</td>\n",
6353
       "    </tr>\n",
6354
       "    <tr>\n",
6355
       "      <th>401508</th>\n",
6356
       "      <td>0.232661</td>\n",
6357
       "      <td>0.000000</td>\n",
6358
       "      <td>0.000000</td>\n",
6359
       "      <td>0.000000</td>\n",
6360
       "      <td>0.250962</td>\n",
6361
       "      <td>0.000000</td>\n",
6362
       "      <td>0.207893</td>\n",
6363
       "      <td>0.000000</td>\n",
6364
       "      <td>-0.666576</td>\n",
6365
       "      <td>1.629939</td>\n",
6366
       "      <td>...</td>\n",
6367
       "      <td>0.000000</td>\n",
6368
       "      <td>-2.321928</td>\n",
6369
       "      <td>0.000000</td>\n",
6370
       "      <td>0.000000</td>\n",
6371
       "      <td>-0.260152</td>\n",
6372
       "      <td>0.000000</td>\n",
6373
       "      <td>-0.545824</td>\n",
6374
       "      <td>-0.330973</td>\n",
6375
       "      <td>0.000000</td>\n",
6376
       "      <td>0.000000</td>\n",
6377
       "    </tr>\n",
6378
       "    <tr>\n",
6379
       "      <th>401508</th>\n",
6380
       "      <td>0.232661</td>\n",
6381
       "      <td>0.000000</td>\n",
6382
       "      <td>0.000000</td>\n",
6383
       "      <td>0.000000</td>\n",
6384
       "      <td>0.250962</td>\n",
6385
       "      <td>0.000000</td>\n",
6386
       "      <td>0.207893</td>\n",
6387
       "      <td>0.000000</td>\n",
6388
       "      <td>-0.666576</td>\n",
6389
       "      <td>1.629939</td>\n",
6390
       "      <td>...</td>\n",
6391
       "      <td>0.000000</td>\n",
6392
       "      <td>-2.321928</td>\n",
6393
       "      <td>0.000000</td>\n",
6394
       "      <td>0.000000</td>\n",
6395
       "      <td>-0.260152</td>\n",
6396
       "      <td>0.000000</td>\n",
6397
       "      <td>-0.545824</td>\n",
6398
       "      <td>-0.330973</td>\n",
6399
       "      <td>0.000000</td>\n",
6400
       "      <td>0.000000</td>\n",
6401
       "    </tr>\n",
6402
       "    <tr>\n",
6403
       "      <th>119016</th>\n",
6404
       "      <td>0.378512</td>\n",
6405
       "      <td>0.570463</td>\n",
6406
       "      <td>0.000000</td>\n",
6407
       "      <td>0.000000</td>\n",
6408
       "      <td>0.000000</td>\n",
6409
       "      <td>-0.251539</td>\n",
6410
       "      <td>0.000000</td>\n",
6411
       "      <td>-0.286304</td>\n",
6412
       "      <td>0.000000</td>\n",
6413
       "      <td>0.000000</td>\n",
6414
       "      <td>...</td>\n",
6415
       "      <td>0.000000</td>\n",
6416
       "      <td>-1.043943</td>\n",
6417
       "      <td>-0.524915</td>\n",
6418
       "      <td>0.000000</td>\n",
6419
       "      <td>0.000000</td>\n",
6420
       "      <td>0.000000</td>\n",
6421
       "      <td>-0.377070</td>\n",
6422
       "      <td>-0.643856</td>\n",
6423
       "      <td>0.000000</td>\n",
6424
       "      <td>0.000000</td>\n",
6425
       "    </tr>\n",
6426
       "    <tr>\n",
6427
       "      <th>119016</th>\n",
6428
       "      <td>0.448901</td>\n",
6429
       "      <td>-0.286304</td>\n",
6430
       "      <td>0.000000</td>\n",
6431
       "      <td>0.000000</td>\n",
6432
       "      <td>0.000000</td>\n",
6433
       "      <td>-0.242977</td>\n",
6434
       "      <td>0.000000</td>\n",
6435
       "      <td>-0.286304</td>\n",
6436
       "      <td>0.000000</td>\n",
6437
       "      <td>0.000000</td>\n",
6438
       "      <td>...</td>\n",
6439
       "      <td>0.000000</td>\n",
6440
       "      <td>-1.014500</td>\n",
6441
       "      <td>-0.473931</td>\n",
6442
       "      <td>0.000000</td>\n",
6443
       "      <td>0.000000</td>\n",
6444
       "      <td>0.000000</td>\n",
6445
       "      <td>-0.367732</td>\n",
6446
       "      <td>-0.577767</td>\n",
6447
       "      <td>0.000000</td>\n",
6448
       "      <td>0.000000</td>\n",
6449
       "    </tr>\n",
6450
       "    <tr>\n",
6451
       "      <th>84458</th>\n",
6452
       "      <td>0.389567</td>\n",
6453
       "      <td>-0.666576</td>\n",
6454
       "      <td>0.000000</td>\n",
6455
       "      <td>0.000000</td>\n",
6456
       "      <td>0.000000</td>\n",
6457
       "      <td>-0.340075</td>\n",
6458
       "      <td>0.000000</td>\n",
6459
       "      <td>-0.367732</td>\n",
6460
       "      <td>0.000000</td>\n",
6461
       "      <td>0.000000</td>\n",
6462
       "      <td>...</td>\n",
6463
       "      <td>0.000000</td>\n",
6464
       "      <td>-1.014500</td>\n",
6465
       "      <td>-0.473931</td>\n",
6466
       "      <td>0.000000</td>\n",
6467
       "      <td>0.000000</td>\n",
6468
       "      <td>0.000000</td>\n",
6469
       "      <td>-0.386468</td>\n",
6470
       "      <td>-0.610433</td>\n",
6471
       "      <td>-0.577767</td>\n",
6472
       "      <td>0.000000</td>\n",
6473
       "    </tr>\n",
6474
       "    <tr>\n",
6475
       "      <th>84458</th>\n",
6476
       "      <td>0.389567</td>\n",
6477
       "      <td>-0.666576</td>\n",
6478
       "      <td>0.000000</td>\n",
6479
       "      <td>0.000000</td>\n",
6480
       "      <td>0.000000</td>\n",
6481
       "      <td>-0.340075</td>\n",
6482
       "      <td>0.000000</td>\n",
6483
       "      <td>-0.367732</td>\n",
6484
       "      <td>0.000000</td>\n",
6485
       "      <td>0.000000</td>\n",
6486
       "      <td>...</td>\n",
6487
       "      <td>0.000000</td>\n",
6488
       "      <td>-1.014500</td>\n",
6489
       "      <td>-0.473931</td>\n",
6490
       "      <td>0.000000</td>\n",
6491
       "      <td>0.000000</td>\n",
6492
       "      <td>0.000000</td>\n",
6493
       "      <td>-0.386468</td>\n",
6494
       "      <td>-0.610433</td>\n",
6495
       "      <td>-0.577767</td>\n",
6496
       "      <td>0.000000</td>\n",
6497
       "    </tr>\n",
6498
       "    <tr>\n",
6499
       "      <th>574445</th>\n",
6500
       "      <td>0.490570</td>\n",
6501
       "      <td>0.000000</td>\n",
6502
       "      <td>-0.286304</td>\n",
6503
       "      <td>0.000000</td>\n",
6504
       "      <td>0.000000</td>\n",
6505
       "      <td>-1.074001</td>\n",
6506
       "      <td>0.000000</td>\n",
6507
       "      <td>0.238787</td>\n",
6508
       "      <td>0.000000</td>\n",
6509
       "      <td>0.000000</td>\n",
6510
       "      <td>...</td>\n",
6511
       "      <td>-0.823677</td>\n",
6512
       "      <td>-0.957356</td>\n",
6513
       "      <td>-0.321928</td>\n",
6514
       "      <td>0.000000</td>\n",
6515
       "      <td>-0.985645</td>\n",
6516
       "      <td>-0.454032</td>\n",
6517
       "      <td>0.000000</td>\n",
6518
       "      <td>-0.599462</td>\n",
6519
       "      <td>-0.577767</td>\n",
6520
       "      <td>0.000000</td>\n",
6521
       "    </tr>\n",
6522
       "    <tr>\n",
6523
       "      <th>574445</th>\n",
6524
       "      <td>0.490570</td>\n",
6525
       "      <td>0.000000</td>\n",
6526
       "      <td>-0.286304</td>\n",
6527
       "      <td>0.000000</td>\n",
6528
       "      <td>0.000000</td>\n",
6529
       "      <td>-1.074001</td>\n",
6530
       "      <td>0.000000</td>\n",
6531
       "      <td>0.238787</td>\n",
6532
       "      <td>0.000000</td>\n",
6533
       "      <td>0.000000</td>\n",
6534
       "      <td>...</td>\n",
6535
       "      <td>-0.823677</td>\n",
6536
       "      <td>-0.957356</td>\n",
6537
       "      <td>-0.321928</td>\n",
6538
       "      <td>0.000000</td>\n",
6539
       "      <td>-0.985645</td>\n",
6540
       "      <td>-0.454032</td>\n",
6541
       "      <td>0.000000</td>\n",
6542
       "      <td>-0.599462</td>\n",
6543
       "      <td>-0.577767</td>\n",
6544
       "      <td>0.000000</td>\n",
6545
       "    </tr>\n",
6546
       "    <tr>\n",
6547
       "      <th>26095</th>\n",
6548
       "      <td>0.531069</td>\n",
6549
       "      <td>0.000000</td>\n",
6550
       "      <td>0.000000</td>\n",
6551
       "      <td>0.000000</td>\n",
6552
       "      <td>0.000000</td>\n",
6553
       "      <td>-0.251539</td>\n",
6554
       "      <td>0.000000</td>\n",
6555
       "      <td>-0.286304</td>\n",
6556
       "      <td>0.000000</td>\n",
6557
       "      <td>0.790772</td>\n",
6558
       "      <td>...</td>\n",
6559
       "      <td>0.269033</td>\n",
6560
       "      <td>-1.043943</td>\n",
6561
       "      <td>0.000000</td>\n",
6562
       "      <td>0.000000</td>\n",
6563
       "      <td>0.000000</td>\n",
6564
       "      <td>0.000000</td>\n",
6565
       "      <td>0.000000</td>\n",
6566
       "      <td>-0.588574</td>\n",
6567
       "      <td>0.000000</td>\n",
6568
       "      <td>0.000000</td>\n",
6569
       "    </tr>\n",
6570
       "    <tr>\n",
6571
       "      <th>26095</th>\n",
6572
       "      <td>0.531069</td>\n",
6573
       "      <td>0.000000</td>\n",
6574
       "      <td>0.000000</td>\n",
6575
       "      <td>0.000000</td>\n",
6576
       "      <td>0.000000</td>\n",
6577
       "      <td>-0.251539</td>\n",
6578
       "      <td>0.000000</td>\n",
6579
       "      <td>-0.286304</td>\n",
6580
       "      <td>0.000000</td>\n",
6581
       "      <td>0.790772</td>\n",
6582
       "      <td>...</td>\n",
6583
       "      <td>0.269033</td>\n",
6584
       "      <td>-1.043943</td>\n",
6585
       "      <td>0.000000</td>\n",
6586
       "      <td>0.000000</td>\n",
6587
       "      <td>0.000000</td>\n",
6588
       "      <td>0.000000</td>\n",
6589
       "      <td>0.000000</td>\n",
6590
       "      <td>-0.588574</td>\n",
6591
       "      <td>0.000000</td>\n",
6592
       "      <td>0.000000</td>\n",
6593
       "    </tr>\n",
6594
       "    <tr>\n",
6595
       "      <th>84968</th>\n",
6596
       "      <td>-0.736966</td>\n",
6597
       "      <td>-0.321928</td>\n",
6598
       "      <td>0.000000</td>\n",
6599
       "      <td>0.570463</td>\n",
6600
       "      <td>0.000000</td>\n",
6601
       "      <td>0.000000</td>\n",
6602
       "      <td>0.000000</td>\n",
6603
       "      <td>-0.875672</td>\n",
6604
       "      <td>-0.678072</td>\n",
6605
       "      <td>0.378512</td>\n",
6606
       "      <td>...</td>\n",
6607
       "      <td>0.000000</td>\n",
6608
       "      <td>-1.000000</td>\n",
6609
       "      <td>0.250962</td>\n",
6610
       "      <td>0.000000</td>\n",
6611
       "      <td>0.000000</td>\n",
6612
       "      <td>0.000000</td>\n",
6613
       "      <td>0.000000</td>\n",
6614
       "      <td>-0.545824</td>\n",
6615
       "      <td>-0.545824</td>\n",
6616
       "      <td>-0.535332</td>\n",
6617
       "    </tr>\n",
6618
       "    <tr>\n",
6619
       "      <th>84968</th>\n",
6620
       "      <td>-0.736966</td>\n",
6621
       "      <td>-0.321928</td>\n",
6622
       "      <td>0.000000</td>\n",
6623
       "      <td>0.570463</td>\n",
6624
       "      <td>0.000000</td>\n",
6625
       "      <td>0.000000</td>\n",
6626
       "      <td>0.000000</td>\n",
6627
       "      <td>-0.875672</td>\n",
6628
       "      <td>-0.678072</td>\n",
6629
       "      <td>0.378512</td>\n",
6630
       "      <td>...</td>\n",
6631
       "      <td>0.000000</td>\n",
6632
       "      <td>-1.000000</td>\n",
6633
       "      <td>0.250962</td>\n",
6634
       "      <td>0.000000</td>\n",
6635
       "      <td>0.000000</td>\n",
6636
       "      <td>0.000000</td>\n",
6637
       "      <td>0.000000</td>\n",
6638
       "      <td>-0.545824</td>\n",
6639
       "      <td>-0.545824</td>\n",
6640
       "      <td>-0.535332</td>\n",
6641
       "    </tr>\n",
6642
       "    <tr>\n",
6643
       "      <th>80759</th>\n",
6644
       "      <td>-0.556393</td>\n",
6645
       "      <td>0.599318</td>\n",
6646
       "      <td>0.000000</td>\n",
6647
       "      <td>0.000000</td>\n",
6648
       "      <td>0.000000</td>\n",
6649
       "      <td>0.250962</td>\n",
6650
       "      <td>0.000000</td>\n",
6651
       "      <td>-0.386468</td>\n",
6652
       "      <td>-0.567041</td>\n",
6653
       "      <td>-0.556393</td>\n",
6654
       "      <td>...</td>\n",
6655
       "      <td>0.000000</td>\n",
6656
       "      <td>0.000000</td>\n",
6657
       "      <td>-0.621488</td>\n",
6658
       "      <td>-0.454032</td>\n",
6659
       "      <td>0.000000</td>\n",
6660
       "      <td>0.000000</td>\n",
6661
       "      <td>-0.386468</td>\n",
6662
       "      <td>-0.621488</td>\n",
6663
       "      <td>0.761285</td>\n",
6664
       "      <td>0.000000</td>\n",
6665
       "    </tr>\n",
6666
       "    <tr>\n",
6667
       "      <th>80759</th>\n",
6668
       "      <td>-0.556393</td>\n",
6669
       "      <td>0.599318</td>\n",
6670
       "      <td>0.000000</td>\n",
6671
       "      <td>0.000000</td>\n",
6672
       "      <td>0.000000</td>\n",
6673
       "      <td>0.250962</td>\n",
6674
       "      <td>0.000000</td>\n",
6675
       "      <td>-0.386468</td>\n",
6676
       "      <td>-0.567041</td>\n",
6677
       "      <td>-0.556393</td>\n",
6678
       "      <td>...</td>\n",
6679
       "      <td>0.000000</td>\n",
6680
       "      <td>0.000000</td>\n",
6681
       "      <td>-0.621488</td>\n",
6682
       "      <td>-0.454032</td>\n",
6683
       "      <td>0.000000</td>\n",
6684
       "      <td>0.000000</td>\n",
6685
       "      <td>-0.386468</td>\n",
6686
       "      <td>-0.621488</td>\n",
6687
       "      <td>0.761285</td>\n",
6688
       "      <td>0.000000</td>\n",
6689
       "    </tr>\n",
6690
       "    <tr>\n",
6691
       "      <th>3192</th>\n",
6692
       "      <td>1.350497</td>\n",
6693
       "      <td>0.632268</td>\n",
6694
       "      <td>0.000000</td>\n",
6695
       "      <td>0.000000</td>\n",
6696
       "      <td>0.000000</td>\n",
6697
       "      <td>0.220330</td>\n",
6698
       "      <td>0.000000</td>\n",
6699
       "      <td>0.448901</td>\n",
6700
       "      <td>-0.268817</td>\n",
6701
       "      <td>0.000000</td>\n",
6702
       "      <td>...</td>\n",
6703
       "      <td>0.480265</td>\n",
6704
       "      <td>0.000000</td>\n",
6705
       "      <td>0.207893</td>\n",
6706
       "      <td>0.000000</td>\n",
6707
       "      <td>0.000000</td>\n",
6708
       "      <td>-0.330973</td>\n",
6709
       "      <td>0.250962</td>\n",
6710
       "      <td>0.000000</td>\n",
6711
       "      <td>1.021480</td>\n",
6712
       "      <td>0.599318</td>\n",
6713
       "    </tr>\n",
6714
       "    <tr>\n",
6715
       "      <th>3192</th>\n",
6716
       "      <td>1.350497</td>\n",
6717
       "      <td>0.632268</td>\n",
6718
       "      <td>0.000000</td>\n",
6719
       "      <td>0.000000</td>\n",
6720
       "      <td>0.000000</td>\n",
6721
       "      <td>0.220330</td>\n",
6722
       "      <td>0.000000</td>\n",
6723
       "      <td>0.448901</td>\n",
6724
       "      <td>-0.268817</td>\n",
6725
       "      <td>0.000000</td>\n",
6726
       "      <td>...</td>\n",
6727
       "      <td>0.480265</td>\n",
6728
       "      <td>0.000000</td>\n",
6729
       "      <td>0.207893</td>\n",
6730
       "      <td>0.000000</td>\n",
6731
       "      <td>0.000000</td>\n",
6732
       "      <td>-0.330973</td>\n",
6733
       "      <td>0.250962</td>\n",
6734
       "      <td>0.000000</td>\n",
6735
       "      <td>1.021480</td>\n",
6736
       "      <td>0.599318</td>\n",
6737
       "    </tr>\n",
6738
       "    <tr>\n",
6739
       "      <th>387707</th>\n",
6740
       "      <td>0.389567</td>\n",
6741
       "      <td>-0.666576</td>\n",
6742
       "      <td>0.000000</td>\n",
6743
       "      <td>0.000000</td>\n",
6744
       "      <td>0.000000</td>\n",
6745
       "      <td>-0.340075</td>\n",
6746
       "      <td>0.000000</td>\n",
6747
       "      <td>-0.367732</td>\n",
6748
       "      <td>0.000000</td>\n",
6749
       "      <td>0.000000</td>\n",
6750
       "      <td>...</td>\n",
6751
       "      <td>0.000000</td>\n",
6752
       "      <td>-1.014500</td>\n",
6753
       "      <td>-0.473931</td>\n",
6754
       "      <td>0.000000</td>\n",
6755
       "      <td>0.000000</td>\n",
6756
       "      <td>0.000000</td>\n",
6757
       "      <td>-0.386468</td>\n",
6758
       "      <td>-0.610433</td>\n",
6759
       "      <td>-0.577767</td>\n",
6760
       "      <td>0.000000</td>\n",
6761
       "    </tr>\n",
6762
       "    <tr>\n",
6763
       "      <th>387707</th>\n",
6764
       "      <td>0.389567</td>\n",
6765
       "      <td>-0.666576</td>\n",
6766
       "      <td>0.000000</td>\n",
6767
       "      <td>0.000000</td>\n",
6768
       "      <td>0.000000</td>\n",
6769
       "      <td>-0.340075</td>\n",
6770
       "      <td>0.000000</td>\n",
6771
       "      <td>-0.367732</td>\n",
6772
       "      <td>0.000000</td>\n",
6773
       "      <td>0.000000</td>\n",
6774
       "      <td>...</td>\n",
6775
       "      <td>0.000000</td>\n",
6776
       "      <td>-1.014500</td>\n",
6777
       "      <td>-0.473931</td>\n",
6778
       "      <td>0.000000</td>\n",
6779
       "      <td>0.000000</td>\n",
6780
       "      <td>0.000000</td>\n",
6781
       "      <td>-0.386468</td>\n",
6782
       "      <td>-0.610433</td>\n",
6783
       "      <td>-0.577767</td>\n",
6784
       "      <td>0.000000</td>\n",
6785
       "    </tr>\n",
6786
       "    <tr>\n",
6787
       "      <th>79741</th>\n",
6788
       "      <td>0.000000</td>\n",
6789
       "      <td>0.589763</td>\n",
6790
       "      <td>0.000000</td>\n",
6791
       "      <td>0.000000</td>\n",
6792
       "      <td>0.000000</td>\n",
6793
       "      <td>-0.242977</td>\n",
6794
       "      <td>0.000000</td>\n",
6795
       "      <td>0.207893</td>\n",
6796
       "      <td>0.000000</td>\n",
6797
       "      <td>0.000000</td>\n",
6798
       "      <td>...</td>\n",
6799
       "      <td>-0.798366</td>\n",
6800
       "      <td>-0.985645</td>\n",
6801
       "      <td>-0.358454</td>\n",
6802
       "      <td>0.000000</td>\n",
6803
       "      <td>0.700440</td>\n",
6804
       "      <td>0.000000</td>\n",
6805
       "      <td>-0.367732</td>\n",
6806
       "      <td>-0.588574</td>\n",
6807
       "      <td>0.459432</td>\n",
6808
       "      <td>0.000000</td>\n",
6809
       "    </tr>\n",
6810
       "    <tr>\n",
6811
       "      <th>79741</th>\n",
6812
       "      <td>0.000000</td>\n",
6813
       "      <td>0.589763</td>\n",
6814
       "      <td>0.000000</td>\n",
6815
       "      <td>0.000000</td>\n",
6816
       "      <td>0.000000</td>\n",
6817
       "      <td>-0.242977</td>\n",
6818
       "      <td>0.000000</td>\n",
6819
       "      <td>0.207893</td>\n",
6820
       "      <td>0.000000</td>\n",
6821
       "      <td>0.220330</td>\n",
6822
       "      <td>...</td>\n",
6823
       "      <td>-0.798366</td>\n",
6824
       "      <td>-0.985645</td>\n",
6825
       "      <td>0.000000</td>\n",
6826
       "      <td>0.000000</td>\n",
6827
       "      <td>0.700440</td>\n",
6828
       "      <td>0.000000</td>\n",
6829
       "      <td>-0.367732</td>\n",
6830
       "      <td>-0.588574</td>\n",
6831
       "      <td>0.459432</td>\n",
6832
       "      <td>0.269033</td>\n",
6833
       "    </tr>\n",
6834
       "  </tbody>\n",
6835
       "</table>\n",
6836
       "<p>77 rows × 375 columns</p>\n",
6837
       "</div>"
6838
      ],
6839
      "text/plain": [
6840
       "             X-1004    X-1008    X-1027    X-1095    X-1119    X-1156  \\\n",
6841
       "gene_id                                                                 \n",
6842
       "143872     0.000000  0.560715  0.000000  0.000000  0.000000 -0.330973   \n",
6843
       "143872     0.000000  0.560715  0.000000  0.000000  0.000000 -0.330973   \n",
6844
       "286464     0.000000  0.378512  0.000000  0.550901  0.000000 -0.524915   \n",
6845
       "286464     0.000000  0.378512  0.000000  0.550901  0.000000 -0.524915   \n",
6846
       "286464     0.000000  0.378512  0.000000  0.550901  0.000000 -0.524915   \n",
6847
       "51463      1.238787  1.090853  0.000000  0.000000  0.000000  1.839960   \n",
6848
       "51463      1.238787  1.090853  0.000000  0.000000  0.000000  1.839960   \n",
6849
       "642826     0.608809  0.859970  0.531069  0.000000  0.000000  0.871844   \n",
6850
       "642826     0.608809  0.859970  0.531069  0.000000  0.000000  0.871844   \n",
6851
       "653067     0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
6852
       "653067     0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
6853
       "653067     0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
6854
       "653067     0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
6855
       "399761     0.531069  0.718088  0.000000  0.000000  0.000000 -0.251539   \n",
6856
       "399761     0.531069  0.718088  0.000000  0.000000  0.000000 -0.251539   \n",
6857
       "647060     0.000000  0.000000  0.000000  0.000000  0.000000 -0.312939   \n",
6858
       "647060     0.000000  0.000000  0.000000  0.000000  0.000000 -0.312939   \n",
6859
       "284565     1.238787  1.090853  0.000000  0.000000  0.000000  1.570463   \n",
6860
       "284565     1.238787  1.090853  0.000000  0.000000  0.000000  1.310340   \n",
6861
       "84631     -0.823677 -0.524915  0.000000  0.632268  0.000000 -1.074001   \n",
6862
       "84631     -0.823677 -0.524915  0.000000  0.632268  0.000000 -1.074001   \n",
6863
       "161176     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   \n",
6864
       "161176     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   \n",
6865
       "341019     0.959770  0.000000  0.000000  0.000000  0.000000 -0.304006   \n",
6866
       "341019     0.959770  0.000000  0.000000  0.000000  0.000000 -0.304006   \n",
6867
       "83869     -3.000000 -3.184425 -3.321928 -3.000000 -2.599462 -3.556393   \n",
6868
       "83869     -3.000000 -3.184425 -3.321928 -3.000000 -2.599462 -3.556393   \n",
6869
       "9502       0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
6870
       "9502       0.000000  0.378512  0.000000  0.550901  0.000000 -0.434403   \n",
6871
       "83871      0.000000  0.000000  0.000000  0.500802  0.000000 -0.304006   \n",
6872
       "...             ...       ...       ...       ...       ...       ...   \n",
6873
       "100134869  0.000000  0.207893  0.000000  0.000000  0.000000  0.000000   \n",
6874
       "100134869  0.000000  0.207893  0.000000  0.000000  0.000000  0.000000   \n",
6875
       "84316     -0.251539  0.000000  0.000000  0.000000  0.000000 -0.340075   \n",
6876
       "84316     -0.810966 -0.689660  0.000000  0.000000  0.000000  0.718088   \n",
6877
       "200030     1.238787  1.090853  0.000000  0.000000  0.000000  1.839960   \n",
6878
       "200030     1.238787  1.090853  0.000000  0.000000  0.000000  1.839960   \n",
6879
       "642658     1.049631  1.358959  0.000000  0.599318  0.000000  0.000000   \n",
6880
       "642658     1.049631  1.358959  0.000000  0.599318  0.000000  0.000000   \n",
6881
       "100302179  0.778209  0.000000  0.000000  0.000000  0.000000 -0.340075   \n",
6882
       "100302179  0.778209  0.000000  0.000000  0.000000  0.000000 -0.340075   \n",
6883
       "401508     0.232661  0.000000  0.000000  0.000000  0.250962  0.000000   \n",
6884
       "401508     0.232661  0.000000  0.000000  0.000000  0.250962  0.000000   \n",
6885
       "119016     0.378512  0.570463  0.000000  0.000000  0.000000 -0.251539   \n",
6886
       "119016     0.448901 -0.286304  0.000000  0.000000  0.000000 -0.242977   \n",
6887
       "84458      0.389567 -0.666576  0.000000  0.000000  0.000000 -0.340075   \n",
6888
       "84458      0.389567 -0.666576  0.000000  0.000000  0.000000 -0.340075   \n",
6889
       "574445     0.490570  0.000000 -0.286304  0.000000  0.000000 -1.074001   \n",
6890
       "574445     0.490570  0.000000 -0.286304  0.000000  0.000000 -1.074001   \n",
6891
       "26095      0.531069  0.000000  0.000000  0.000000  0.000000 -0.251539   \n",
6892
       "26095      0.531069  0.000000  0.000000  0.000000  0.000000 -0.251539   \n",
6893
       "84968     -0.736966 -0.321928  0.000000  0.570463  0.000000  0.000000   \n",
6894
       "84968     -0.736966 -0.321928  0.000000  0.570463  0.000000  0.000000   \n",
6895
       "80759     -0.556393  0.599318  0.000000  0.000000  0.000000  0.250962   \n",
6896
       "80759     -0.556393  0.599318  0.000000  0.000000  0.000000  0.250962   \n",
6897
       "3192       1.350497  0.632268  0.000000  0.000000  0.000000  0.220330   \n",
6898
       "3192       1.350497  0.632268  0.000000  0.000000  0.000000  0.220330   \n",
6899
       "387707     0.389567 -0.666576  0.000000  0.000000  0.000000 -0.340075   \n",
6900
       "387707     0.389567 -0.666576  0.000000  0.000000  0.000000 -0.340075   \n",
6901
       "79741      0.000000  0.589763  0.000000  0.000000  0.000000 -0.242977   \n",
6902
       "79741      0.000000  0.589763  0.000000  0.000000  0.000000 -0.242977   \n",
6903
       "\n",
6904
       "             X-1167    X-1169    X-1172    X-1173    ...       X-5694  \\\n",
6905
       "gene_id                                              ...                \n",
6906
       "143872     0.000000 -0.367732  0.350497  0.000000    ...     0.000000   \n",
6907
       "143872     0.000000 -0.367732  0.350497  0.000000    ...     0.000000   \n",
6908
       "286464     0.000000 -0.902389 -1.321928  0.367371    ...     0.000000   \n",
6909
       "286464     0.000000 -0.902389 -1.321928  0.367371    ...     0.000000   \n",
6910
       "286464     0.000000 -0.902389 -1.321928  0.367371    ...     0.000000   \n",
6911
       "51463      0.000000  0.448901  0.000000  0.000000    ...     0.469886   \n",
6912
       "51463      0.000000  0.448901  0.000000  0.000000    ...     0.438293   \n",
6913
       "642826     0.000000 -0.286304  0.000000  0.000000    ...     0.000000   \n",
6914
       "642826     0.000000 -0.286304  0.000000  0.000000    ...     0.000000   \n",
6915
       "653067     0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
6916
       "653067     0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
6917
       "653067     0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
6918
       "653067     0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
6919
       "399761     0.000000 -0.286304  0.000000  0.790772    ...     0.367371   \n",
6920
       "399761     0.000000 -0.286304  0.000000  0.790772    ...     0.367371   \n",
6921
       "647060     0.000000 -0.321928  0.000000  0.618239    ...     0.000000   \n",
6922
       "647060     0.000000 -0.321928  0.000000  0.618239    ...     0.000000   \n",
6923
       "284565     0.000000  0.448901  0.411426  0.448901    ...     0.469886   \n",
6924
       "284565     0.000000  0.448901  0.411426  0.448901    ...     0.469886   \n",
6925
       "84631      0.000000 -0.875672 -0.678072  0.378512    ...     0.000000   \n",
6926
       "84631      0.000000 -0.875672 -0.678072  0.378512    ...     0.000000   \n",
6927
       "161176     0.000000 -0.358454  0.250962  0.339137    ...    -0.875672   \n",
6928
       "161176     0.000000 -0.358454  0.250962  0.339137    ...    -0.875672   \n",
6929
       "341019     0.000000 -0.349235  0.000000  0.000000    ...     0.000000   \n",
6930
       "341019     0.000000 -0.349235  0.000000  0.000000    ...     0.000000   \n",
6931
       "83869     -2.785875 -0.875672 -2.736966 -2.514573    ...    -2.556393   \n",
6932
       "83869     -2.785875 -0.875672 -2.736966 -2.514573    ...    -2.556393   \n",
6933
       "9502       0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
6934
       "9502       0.000000 -0.902389  1.220330  0.367371    ...     0.000000   \n",
6935
       "83871      0.000000 -0.349235  0.327687  0.000000    ...     0.000000   \n",
6936
       "...             ...       ...       ...       ...    ...          ...   \n",
6937
       "100134869  0.000000  0.000000  0.232661  0.000000    ...     0.000000   \n",
6938
       "100134869  0.000000  0.000000  0.232661  0.000000    ...     0.000000   \n",
6939
       "84316      0.000000 -0.349235 -0.588574  0.000000    ...     0.000000   \n",
6940
       "84316      0.000000  0.000000  0.731183  0.367371    ...     0.000000   \n",
6941
       "200030     0.000000  0.448901  0.000000  0.000000    ...     0.438293   \n",
6942
       "200030     0.000000  0.448901  0.000000  0.000000    ...     0.438293   \n",
6943
       "642658     0.000000  0.321928  0.700440  0.298658    ...     0.632268   \n",
6944
       "642658     0.000000  0.321928  0.700440  0.298658    ...     0.632268   \n",
6945
       "100302179  0.000000  0.000000  0.000000  0.000000    ...     0.000000   \n",
6946
       "100302179  0.000000  0.000000  0.000000  0.000000    ...     0.000000   \n",
6947
       "401508     0.207893  0.000000 -0.666576  1.629939    ...     0.000000   \n",
6948
       "401508     0.207893  0.000000 -0.666576  1.629939    ...     0.000000   \n",
6949
       "119016     0.000000 -0.286304  0.000000  0.000000    ...     0.000000   \n",
6950
       "119016     0.000000 -0.286304  0.000000  0.000000    ...     0.000000   \n",
6951
       "84458      0.000000 -0.367732  0.000000  0.000000    ...     0.000000   \n",
6952
       "84458      0.000000 -0.367732  0.000000  0.000000    ...     0.000000   \n",
6953
       "574445     0.000000  0.238787  0.000000  0.000000    ...    -0.823677   \n",
6954
       "574445     0.000000  0.238787  0.000000  0.000000    ...    -0.823677   \n",
6955
       "26095      0.000000 -0.286304  0.000000  0.790772    ...     0.269033   \n",
6956
       "26095      0.000000 -0.286304  0.000000  0.790772    ...     0.269033   \n",
6957
       "84968      0.000000 -0.875672 -0.678072  0.378512    ...     0.000000   \n",
6958
       "84968      0.000000 -0.875672 -0.678072  0.378512    ...     0.000000   \n",
6959
       "80759      0.000000 -0.386468 -0.567041 -0.556393    ...     0.000000   \n",
6960
       "80759      0.000000 -0.386468 -0.567041 -0.556393    ...     0.000000   \n",
6961
       "3192       0.000000  0.448901 -0.268817  0.000000    ...     0.480265   \n",
6962
       "3192       0.000000  0.448901 -0.268817  0.000000    ...     0.480265   \n",
6963
       "387707     0.000000 -0.367732  0.000000  0.000000    ...     0.000000   \n",
6964
       "387707     0.000000 -0.367732  0.000000  0.000000    ...     0.000000   \n",
6965
       "79741      0.000000  0.207893  0.000000  0.000000    ...    -0.798366   \n",
6966
       "79741      0.000000  0.207893  0.000000  0.220330    ...    -0.798366   \n",
6967
       "\n",
6968
       "             X-5696    X-5713    X-5717    X-5727    X-5739    X-5808  \\\n",
6969
       "gene_id                                                                 \n",
6970
       "143872    -0.349235  0.000000 -0.321928 -1.014500 -0.588574  0.000000   \n",
6971
       "143872    -0.349235  0.000000 -0.321928 -1.014500 -0.588574  0.000000   \n",
6972
       "286464    -1.014500  0.718088  0.000000 -1.014500  0.000000  0.000000   \n",
6973
       "286464    -1.014500  0.718088  0.000000 -1.014500  0.000000  0.000000   \n",
6974
       "286464    -1.014500  0.718088  0.000000 -1.014500  0.000000  0.000000   \n",
6975
       "51463      0.000000  0.618239  0.201634  0.000000 -0.260152 -0.349235   \n",
6976
       "51463      0.000000  0.618239  0.201634  0.000000 -0.260152  0.000000   \n",
6977
       "642826    -1.014500 -0.473931  0.000000  0.618239  0.000000 -0.386468   \n",
6978
       "642826    -1.014500 -0.473931  0.000000  0.618239  0.000000 -0.386468   \n",
6979
       "653067    -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
6980
       "653067    -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
6981
       "653067    -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
6982
       "653067    -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
6983
       "399761    -1.535332  0.000000  0.000000  0.000000  0.000000 -0.599462   \n",
6984
       "399761    -1.535332  0.000000  0.000000  0.000000  0.000000 -0.599462   \n",
6985
       "647060    -0.621488  0.000000 -0.377070 -0.749038 -0.405451  0.000000   \n",
6986
       "647060    -0.621488  0.000000 -0.377070 -0.749038 -0.405451  0.000000   \n",
6987
       "284565    -0.504305  0.618239  0.201634  0.000000 -0.260152  0.000000   \n",
6988
       "284565    -0.524915  0.618239  0.201634  0.000000 -0.260152  0.000000   \n",
6989
       "84631     -1.014500  0.500802  0.000000  0.000000  0.000000  0.000000   \n",
6990
       "84631     -1.014500  0.500802  0.000000  0.000000  0.000000  0.000000   \n",
6991
       "161176     0.000000 -0.483985  0.000000  0.000000 -0.610433  0.000000   \n",
6992
       "161176     0.000000 -0.483985  0.000000  0.000000 -0.610433  0.000000   \n",
6993
       "341019    -0.666576  0.000000 -0.304006 -1.000000 -0.610433  0.000000   \n",
6994
       "341019    -0.312939  0.000000 -0.304006 -1.000000 -0.610433  0.000000   \n",
6995
       "83869     -3.943416 -2.785875 -2.152003 -3.556393 -3.251539 -2.736966   \n",
6996
       "83869     -3.943416 -2.785875 -2.152003 -3.556393 -3.251539 -2.736966   \n",
6997
       "9502      -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
6998
       "9502      -1.014500  0.580145  0.000000 -1.014500  0.000000  0.000000   \n",
6999
       "83871      0.000000 -0.463947 -0.358454  0.000000  0.000000  0.000000   \n",
7000
       "...             ...       ...       ...       ...       ...       ...   \n",
7001
       "100134869  0.389567  0.000000  0.000000 -0.463947  0.000000 -0.367732   \n",
7002
       "100134869  0.389567  0.000000  0.000000 -0.463947  0.000000 -0.367732   \n",
7003
       "84316     -0.985645 -0.358454 -0.340075 -0.971431 -0.545824  0.310340   \n",
7004
       "84316      0.000000  0.000000 -0.434403  1.060047  0.490570  0.000000   \n",
7005
       "200030     0.000000  0.618239  0.201634  0.000000 -0.260152  0.000000   \n",
7006
       "200030     0.000000  0.618239  0.201634  0.000000 -0.260152  0.000000   \n",
7007
       "642658     0.000000  0.769772  0.232661  1.121015  0.831877 -0.367732   \n",
7008
       "642658     0.000000  0.769772  0.232661  1.121015  0.831877 -0.367732   \n",
7009
       "100302179  0.000000  0.201634 -0.251539  0.000000 -0.463947  0.000000   \n",
7010
       "100302179  0.000000  0.201634 -0.251539  0.000000 -0.463947  0.000000   \n",
7011
       "401508    -2.321928  0.000000  0.000000 -0.260152  0.000000 -0.545824   \n",
7012
       "401508    -2.321928  0.000000  0.000000 -0.260152  0.000000 -0.545824   \n",
7013
       "119016    -1.043943 -0.524915  0.000000  0.000000  0.000000 -0.377070   \n",
7014
       "119016    -1.014500 -0.473931  0.000000  0.000000  0.000000 -0.367732   \n",
7015
       "84458     -1.014500 -0.473931  0.000000  0.000000  0.000000 -0.386468   \n",
7016
       "84458     -1.014500 -0.473931  0.000000  0.000000  0.000000 -0.386468   \n",
7017
       "574445    -0.957356 -0.321928  0.000000 -0.985645 -0.454032  0.000000   \n",
7018
       "574445    -0.957356 -0.321928  0.000000 -0.985645 -0.454032  0.000000   \n",
7019
       "26095     -1.043943  0.000000  0.000000  0.000000  0.000000  0.000000   \n",
7020
       "26095     -1.043943  0.000000  0.000000  0.000000  0.000000  0.000000   \n",
7021
       "84968     -1.000000  0.250962  0.000000  0.000000  0.000000  0.000000   \n",
7022
       "84968     -1.000000  0.250962  0.000000  0.000000  0.000000  0.000000   \n",
7023
       "80759      0.000000 -0.621488 -0.454032  0.000000  0.000000 -0.386468   \n",
7024
       "80759      0.000000 -0.621488 -0.454032  0.000000  0.000000 -0.386468   \n",
7025
       "3192       0.000000  0.207893  0.000000  0.000000 -0.330973  0.250962   \n",
7026
       "3192       0.000000  0.207893  0.000000  0.000000 -0.330973  0.250962   \n",
7027
       "387707    -1.014500 -0.473931  0.000000  0.000000  0.000000 -0.386468   \n",
7028
       "387707    -1.014500 -0.473931  0.000000  0.000000  0.000000 -0.386468   \n",
7029
       "79741     -0.985645 -0.358454  0.000000  0.700440  0.000000 -0.367732   \n",
7030
       "79741     -0.985645  0.000000  0.000000  0.700440  0.000000 -0.367732   \n",
7031
       "\n",
7032
       "             X-5959    X-5975    X-6047  \n",
7033
       "gene_id                                  \n",
7034
       "143872     0.000000  0.000000 -0.514573  \n",
7035
       "143872     0.000000  0.000000 -0.514573  \n",
7036
       "286464     0.000000 -1.494109 -1.494109  \n",
7037
       "286464     0.000000 -1.494109 -1.494109  \n",
7038
       "286464     0.000000 -1.494109 -1.494109  \n",
7039
       "51463      0.000000  0.469886  0.000000  \n",
7040
       "51463      0.000000  0.469886  0.000000  \n",
7041
       "642826    -1.494109 -0.312939  0.000000  \n",
7042
       "642826    -1.494109 -0.312939  0.000000  \n",
7043
       "653067    -0.577767 -1.494109 -1.494109  \n",
7044
       "653067    -0.577767 -1.494109 -1.494109  \n",
7045
       "653067    -0.577767 -1.494109 -1.494109  \n",
7046
       "653067    -0.577767 -1.494109 -1.494109  \n",
7047
       "399761    -0.588574  0.000000  0.000000  \n",
7048
       "399761    -0.588574  0.000000  0.000000  \n",
7049
       "647060    -0.395929  0.000000  0.000000  \n",
7050
       "647060    -0.395929  0.000000  0.000000  \n",
7051
       "284565     0.000000  0.469886  0.000000  \n",
7052
       "284565     0.000000  0.469886  0.000000  \n",
7053
       "84631     -0.545824 -0.545824 -0.535332  \n",
7054
       "84631     -0.545824 -0.545824 -0.535332  \n",
7055
       "161176     0.411426  0.000000  0.000000  \n",
7056
       "161176     0.411426  0.000000  0.000000  \n",
7057
       "341019     0.000000 -0.444184  0.000000  \n",
7058
       "341019     0.000000 -0.444184  0.000000  \n",
7059
       "83869     -3.473931 -3.120294 -3.643856  \n",
7060
       "83869     -3.473931 -3.120294 -3.643856  \n",
7061
       "9502      -0.577767 -1.494109 -1.494109  \n",
7062
       "9502      -0.577767 -1.494109 -1.494109  \n",
7063
       "83871      0.448901 -0.535332  0.000000  \n",
7064
       "...             ...       ...       ...  \n",
7065
       "100134869  0.790772  0.000000 -0.535332  \n",
7066
       "100134869  0.790772  0.000000 -0.535332  \n",
7067
       "84316      0.438293 -0.473931  0.000000  \n",
7068
       "84316     -0.632629 -0.655172  0.000000  \n",
7069
       "200030     0.000000  0.469886  0.000000  \n",
7070
       "200030     0.000000  0.469886  0.000000  \n",
7071
       "642658     0.000000  1.121015  0.459432  \n",
7072
       "642658     0.000000  1.121015  0.459432  \n",
7073
       "100302179  0.000000  0.000000  0.000000  \n",
7074
       "100302179  0.000000  0.000000  0.000000  \n",
7075
       "401508    -0.330973  0.000000  0.000000  \n",
7076
       "401508    -0.330973  0.000000  0.000000  \n",
7077
       "119016    -0.643856  0.000000  0.000000  \n",
7078
       "119016    -0.577767  0.000000  0.000000  \n",
7079
       "84458     -0.610433 -0.577767  0.000000  \n",
7080
       "84458     -0.610433 -0.577767  0.000000  \n",
7081
       "574445    -0.599462 -0.577767  0.000000  \n",
7082
       "574445    -0.599462 -0.577767  0.000000  \n",
7083
       "26095     -0.588574  0.000000  0.000000  \n",
7084
       "26095     -0.588574  0.000000  0.000000  \n",
7085
       "84968     -0.545824 -0.545824 -0.535332  \n",
7086
       "84968     -0.545824 -0.545824 -0.535332  \n",
7087
       "80759     -0.621488  0.761285  0.000000  \n",
7088
       "80759     -0.621488  0.761285  0.000000  \n",
7089
       "3192       0.000000  1.021480  0.599318  \n",
7090
       "3192       0.000000  1.021480  0.599318  \n",
7091
       "387707    -0.610433 -0.577767  0.000000  \n",
7092
       "387707    -0.610433 -0.577767  0.000000  \n",
7093
       "79741     -0.588574  0.459432  0.000000  \n",
7094
       "79741     -0.588574  0.459432  0.269033  \n",
7095
       "\n",
7096
       "[77 rows x 375 columns]"
7097
      ]
7098
     },
7099
     "execution_count": 36,
7100
     "metadata": {},
7101
     "output_type": "execute_result"
7102
    }
7103
   ],
7104
   "source": [
7105
    "dups = list(set(pdx[pdx.index.duplicated(keep=False)].index.values))\n",
7106
    "pdx.loc[dups,:]"
7107
   ]
7108
  },
7109
  {
7110
   "cell_type": "code",
7111
   "execution_count": 37,
7112
   "metadata": {},
7113
   "outputs": [
7114
    {
7115
     "name": "stdout",
7116
     "output_type": "stream",
7117
     "text": [
7118
      "37 duplicated IDs in 77 rows found.\n",
7119
      "duplicate rows removed due to low correlation of duplicated profiles 4\n",
7120
      "Merged  73 duplicated rows into 35 rows\n"
7121
     ]
7122
    }
7123
   ],
7124
   "source": [
7125
    "# most of these duplicates correspond to genes merged in the current assembly, e.g. gene - gene-AS\n",
7126
    "pdx = handle_dups(pdx,corr_thr = 0.75)"
7127
   ]
7128
  },
7129
  {
7130
   "cell_type": "code",
7131
   "execution_count": 38,
7132
   "metadata": {},
7133
   "outputs": [],
7134
   "source": [
7135
    "pdx = pdx.T.sort_index().T\n",
7136
    "pdx.to_csv(preprocessed_dir+\"/\"+\"PDX\"+\".Segment_Mean.CNA.tsv\",\n",
7137
    "                 sep = \"\\t\",header=True,index=True)"
7138
   ]
7139
  },
7140
  {
7141
   "cell_type": "markdown",
7142
   "metadata": {},
7143
   "source": [
7144
    "### Evaluation of the results\n",
7145
    "1). How many genes are shared between the four datasets?\n",
7146
    "\n",
7147
    "2). Do CNA profiles of the same cell line from GDSC and CCLE correlate?\n",
7148
    "\n",
7149
    "3). Do CNA profiles of the same cancer type from TCGA and PDX look similar?\n",
7150
    "\n"
7151
   ]
7152
  },
7153
  {
7154
   "cell_type": "code",
7155
   "execution_count": null,
7156
   "metadata": {},
7157
   "outputs": [],
7158
   "source": []
7159
  },
7160
  {
7161
   "cell_type": "code",
7162
   "execution_count": 39,
7163
   "metadata": {},
7164
   "outputs": [
7165
    {
7166
     "ename": "IOError",
7167
     "evalue": "File /home/olya/SFU/Hossein/v1/preprocessed/CNA/BRCA.Segment_Mean.CNA.tsv does not exist",
7168
     "output_type": "error",
7169
     "traceback": [
7170
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
7171
      "\u001b[0;31mIOError\u001b[0m                                   Traceback (most recent call last)",
7172
      "\u001b[0;32m<ipython-input-39-1f476096b0ec>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m### 1). How many common genes between four datasets?\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;31m# we take BRCA from TCGA because\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtcga\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreprocessed_dir\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"BRCA\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\".Segment_Mean.CNA.tsv\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\\t\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m \u001b[0;31m#print(tcga.head(3))\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mgdsc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreprocessed_dir\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"GDSC\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\".Segment_Mean.CNA.tsv\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\\t\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
7173
      "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[1;32m    676\u001b[0m                     skip_blank_lines=skip_blank_lines)\n\u001b[1;32m    677\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 678\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    679\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    680\u001b[0m     \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
7174
      "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    438\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    439\u001b[0m     \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 440\u001b[0;31m     \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    441\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    442\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
7175
      "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m    785\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    786\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 787\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    788\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    789\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
7176
      "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m   1012\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1013\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1014\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1015\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1016\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'python'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
7177
      "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m   1706\u001b[0m         \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'usecols'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0musecols\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1707\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1708\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1709\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1710\u001b[0m         \u001b[0mpassed_names\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnames\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
7178
      "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[0;34m()\u001b[0m\n",
7179
      "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[0;34m()\u001b[0m\n",
7180
      "\u001b[0;31mIOError\u001b[0m: File /home/olya/SFU/Hossein/v1/preprocessed/CNA/BRCA.Segment_Mean.CNA.tsv does not exist"
7181
     ]
7182
    }
7183
   ],
7184
   "source": [
7185
    "### 1). How many common genes between four datasets?\n",
7186
    "# we take BRCA as the representative TCGA cohort (TODO: state the reason for this choice)\n",
7187
    "tcga = pd.read_csv(preprocessed_dir+\"BRCA\"+\".Segment_Mean.CNA.tsv\",sep = \"\\t\", index_col=0)\n",
7188
    "#print(tcga.head(3))\n",
7189
    "gdsc = pd.read_csv(preprocessed_dir+\"GDSC\"+\".Segment_Mean.CNA.tsv\",sep = \"\\t\", index_col=0)\n",
7190
    "#print(gdsc.head(3))"
7191
   ]
7192
  },
7193
  {
7194
   "cell_type": "code",
7195
   "execution_count": null,
7196
   "metadata": {},
7197
   "outputs": [],
7198
   "source": [
7199
    "### distribution of logR values in GDSC and CCLE \n",
7200
    "cn_values_gdsc  = []\n",
7201
    "for row in df.iterrows():\n",
7202
    "    cn_values_gdsc += list(row[1].values)\n",
7203
    "cn_values_ccle = []\n",
7204
    "for row in cna_table.iterrows():\n",
7205
    "    cn_values_ccle+=  list(row[1].values)\n",
7206
    "\n",
7207
    "cn_values_gdsc = sorted (cn_values_gdsc)\n",
7208
    "cn_values_ccle = sorted (cn_values_ccle)\n",
7209
    "plt.figure(figsize=(20,5))\n",
7210
    "plt.subplot(121)\n",
7211
    "tmp = plt.hist(cn_values_gdsc,bins=100,density = True,range=(-5,4))\n",
7212
    "plt.title(\"GDSC\")\n",
7213
    "plt.subplot(122)\n",
7214
    "tmp = plt.hist(cn_values_ccle,bins=100,density = True, range=(-5,4))\n",
7215
    "plt.title(\"CCLE\")"
7216
   ]
7217
  }
7218
 ],
7219
 "metadata": {
7220
  "kernelspec": {
7221
   "display_name": "Python 2",
7222
   "language": "python",
7223
   "name": "python2"
7224
  },
7225
  "language_info": {
7226
   "codemirror_mode": {
7227
    "name": "ipython",
7228
    "version": 2
7229
   },
7230
   "file_extension": ".py",
7231
   "mimetype": "text/x-python",
7232
   "name": "python",
7233
   "nbconvert_exporter": "python",
7234
   "pygments_lexer": "ipython2",
7235
   "version": "2.7.15"
7236
  }
7237
 },
7238
 "nbformat": 4,
7239
 "nbformat_minor": 2
7240
}