--- a +++ b/preprocessing_scr/CNA.ipynb @@ -0,0 +1,7240 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import print_function\n", + "import pandas as pd\n", + "import os,sys\n", + "import pybedtools as pbt\n", + "from StringIO import StringIO\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import time\n", + "from mapper import expand, parse_mapping_table, apply_mappers\n", + "%matplotlib inline\n", + "\n", + "\n", + "chr_dict = dict(zip(range(1,22),map(str,range(1,22))))\n", + "chr_dict.update({22: 'X', 23: \"Y\"})\n", + "\n", + "root_dir = \"/home/olya/SFU/Hossein/v2/\"\n", + "gene_coords_file = root_dir + \"ref_GRCh37.p5_top_level.gff3.bed\" # must contain chromosome, start, end and Entrez Gene ID for hg19" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TCGA \n", + "\n", + "Assume that segmentation files from GDAC : http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/*/*snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt are dowmnoaded\n", + "\n", + "1) Filtering segments:\n", + " - segments containing less than 5 probes removed\n", + " - keep only segments with segment mean below -0.23 or above 0.2. This means that one copy gains and losses are detectable when their CCF (canncer cell fraction) is 0.3 or higher. \n", + " \n", + "TODO: remove segements overlapping with germline CNA forund in normals (add this as the first step)\n", + "2). For each samples aggregte to gene-level:\n", + " - rename chromosomes 22 and 23 to X and Y\n", + " - overpal segemntation file with Entrez gene coordinates for hg19\n", + " - if a gene overlaps by multiple segments, keep the one with most extreme values" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "num_marker_thr = 5\n", + "# to detect 1 copy gains or losses presenting at CCF >= 0.3\n", + "pos_seg_mean_thr = 0.20\n", + "neg_seg_mean_thr = -0.23 \n", + "\n", + "preprocessed_dir = root_dir+\"preprocessed/CNA/\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "### functions for TCGA and CCLE #################################\n", + "def filter_lowconf_segments(df,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr ):\n", + " # filter low-confidence segments with too few probes\n", + " df = df[df[\"Num_Probes\"] >= num_marker_thr ]\n", + " # filter low-confidence segments with Segment_Mean too close to zero:\n", + " df = df[ (df[\"Segment_Mean\"] >= pos_seg_mean_thr) | (df[\"Segment_Mean\"] <= neg_seg_mean_thr)]\n", + " return df\n", + "\n", + "def sample_type(barcode):\n", + " if barcode[13:16] in [\"10A\",\"10B\",\"11A\",\"11B\",\"10C\",\"11C\"]:\n", + " return \"Normal\"\n", + " else:\n", + " return \"Tumor\"\n", + "\n", + "def find_matching_normal(tumor_barcode,barcodes_list):\n", + " patient_id = tumor_barcode[:12]\n", + " normal_barcodes = []\n", + " for barcode in barcodes_list:\n", + " if barcode.startswith(patient_id) and sample_type(barcode) == \"Normal\":\n", + " normal_barcodes.append(barcode)\n", + " return normal_barcodes\n", + "\n", + "def cnv2bed(seg):\n", + " #cnv_bed = seg[[\"Chromosome\",\"Start\",\"End\",\"Segment_Mean\"]]\n", + " #cnv_bed.columns = [\"chrom\",\"start\",\"stop\",\"Segment_Mean\"]\n", + " cnv_bed = seg.rename({\"Chromosome\":\"chrom\",\"Start\":\"start\",\n", + " \"End\":\"stop\"},axis=\"columns\")\n", + " cnv_bed = cnv_bed.loc[:,[\"chrom\",\"start\",\"stop\",\"Segment_Mean\",\"Sample\",\"Num_Probes\"]]\n", + " return pbt.BedTool.from_dataframe(cnv_bed)\n", + "\n", + "def bed2cnv(cnv_bed):\n", + " cnv_bed = str(cnv_bed)\n", + " if len(cnv_bed) > 0:\n", + " seg = pd.read_csv(StringIO(cnv_bed),sep = \"\\t\",header=None)\n", + " seg.columns = [\"Chromosome\",\"Start\",\"End\",\"Segment_Mean\",\"Sample\",\"Num_Probes\"]\n", + " seg = seg.loc[:,[\"Sample\",\"Chromosome\",\"Start\",\"End\",\"Num_Probes\",\"Segment_Mean\"]]\n", + " else:\n", + " seg = pd.DataFrame(columns=[\"Sample\",\"Chromosome\",\"Start\",\"End\",\"Num_Probes\",\"Segment_Mean\"])\n", + " return seg\n", + "def remove_ovelapping_segments(tumor, normal,sample_name):\n", + " tumor_bed = cnv2bed(tumor)\n", + " normal_bed = cnv2bed(normal)\n", + " tumor_wo_germline = tumor_bed.subtract(normal_bed,r=True,f=0.8,A=True)\n", + " tumor_wo_germline = bed2cnv(tumor_wo_germline)\n", + " n_segs_removed = tumor.shape[0] - tumor_wo_germline.shape[0]\n", + " if n_segs_removed*1.0/tumor.shape[0] > 0.5 and n_segs_removed>5 :\n", + " print(n_segs_removed,\"of\",tumor.shape[0],\"segments removed in\",sample_name,\"due to overlap with normal\",file = sys.stderr)\n", + " return tumor_wo_germline\n", + "\n", + "def cnv2genelevel(cnv_bed,gene_intervals_bed,sample_name,verbose = True,sorted_index = \"\"):\n", + " \n", + " # intersect \n", + " cnv2gene = str(gene_intervals_bed.intersect(cnv_bed,wb = True,wa=True))\n", + " if len(cnv2gene)==0: # if no intersection, return all zeroes\n", + " print(sample_name,\"has no genes with altered CN\",file = sys.stderr)\n", + " return pd.DataFrame(columns=[sample])\n", + " cnv2gene = pd.read_csv(StringIO(cnv2gene),sep = \"\\t\",header=None)\n", + " cnv2gene = cnv2gene[[3,7]].copy()\n", + " cnv2gene.columns = [\"gene\",\"Segment_Mean\"] \n", + " \n", + " # find genes overlapping with more than one segment:\n", + " # take the most exterme segement_mean value\n", + " \n", + " dups = cnv2gene.loc[cnv2gene.duplicated(subset=[\"gene\"],keep=False),]\n", + " if dups.shape[0] > 0:\n", + " cnv2gene = cnv2gene.drop_duplicates(subset=[\"gene\"],keep=False)\n", + " dups[\"abs_seg_mean\"] = abs(dups[\"Segment_Mean\"])\n", + " if verbose:\n", + " print(sample_name,\"contain \",len(set(dups[\"gene\"].values)),\"genes overalpped with more than one segment\",file=sys.stderr)\n", + " #print(dups.head(10),file=sys.stderr)\n", + " dups = dups.groupby(['gene'], group_keys=False).apply(lambda row: row.loc[row['abs_seg_mean'].idxmax()])\n", + " cnv2gene = pd.concat([cnv2gene,dups],sort=False)\n", + "\n", + " cnv2gene = cnv2gene[[\"gene\",\"Segment_Mean\"]]\n", + " cnv2gene.set_index(\"gene\",inplace=True,drop=True)\n", + " cnv2gene.rename(int,axis=0,inplace=True)\n", + " # add copy-neutral genes with 0s\n", + " \n", + " cnv2gene = cnv2gene.loc[sorted_index,:]\n", + " cnv2gene.columns = [sample_name]\n", + " return cnv2gene\n", + "\n", + "\n", + "### functions for GDSC and PDX #################################\n", + "\n", + "def CN2log2R(col, median_ploidy=2 ):\n", + " # this is fr GDSC only\n", + " lRs = []\n", + " genes = col.index.values\n", + " for code in col.values:\n", + " if not code == \"-1,-1,-,-\":\n", + " [max_cn,min_cn,zygosity,disruption] = code.split(\",\")\n", + " if int(max_cn) == 0:\n", + " lRs.append(-4.32) # CN=0 with 95% purity\n", + " else:\n", + " max_lR = np.log2(float(max_cn)/median_ploidy)\n", + " if not disruption == \"D\":\n", + " lRs.append(max_lR)\n", + " else:\n", + " if int(min_cn) == 0:\n", + " min_lR = -4.32\n", + " else:\n", + " min_lR = np.log2(float(min_cn)/median_ploidy)\n", + " if abs(min_lR) > abs(max_lR):\n", + " lRs.append(min_lR)\n", + " else:\n", + " lRs.append(max_lR)\n", + " \n", + " else:\n", + " lRs.append(np.NaN)\n", + " return pd.Series(dict(zip(genes, lRs)))\n", + "\n", + "def define_avg_ploidy(col):\n", + " n,pl = 0,0\n", + " CN_non_disrupted = []\n", + " for code in col.values:\n", + " if not code == \"-1,-1,-,-\":\n", + " [max_cn,min_cn,zygosity,disruption] = code.split(\",\")\n", + " n+=1\n", + " cn = (int(max_cn)+int(min_cn))*0.5\n", + " pl += cn\n", + " if not disruption == \"D\":\n", + " CN_non_disrupted.append((cn))\n", + " return pd.Series({\"avg_pl\":pl/n , \"median_pl\":np.median(CN_non_disrupted)})\n", + "\n", + "def clean_logR(logR_value, pos_seg_mean_thr, neg_seg_mean_thr):\n", + " if logR_value >= pos_seg_mean_thr:\n", + " return logR_value \n", + " elif logR_value <= neg_seg_mean_thr:\n", + " return logR_value \n", + " else:\n", + " return 0\n", + " \n", + "def handle_dups(df,corr_thr = 0.75):\n", + " '''Detect dupliated row IDs. Merge 2 or more rows with the same ID, \n", + " if averaged correlation in all pairvise comparision is >= corr_thhr;\\n\n", + " otherwise drop all duplicates. Keeps abs. max value (negative preferred).'''\n", + " dups = df.index\n", + " dups = list(set(dups[dups.duplicated()]))\n", + " if len(dups)==0:\n", + " print(\"No duplicated row IDs. Do nothing.\")\n", + " return df\n", + " print(len(dups), \"duplicated IDs in\",df.loc[dups,:].shape[0],\"rows found.\")\n", + " dups_merge = [] # if corr > corr_thr\n", + " dups_remove = [] # corr < \n", + " for dup in dups:\n", + " r = df.loc[dup,:].T.corr()\n", + " n_dups = df.loc[dup,:].shape[0]\n", + " r_avg = []\n", + " for i in range(0,n_dups):\n", + " for j in range(i+1,n_dups):\n", + " r_avg.append(r.iloc[i,j])\n", + " if np.average(r_avg) < corr_thr :\n", + " #print(dup,r_avg, n_dups)\n", + " dups_remove.append(dup)\n", + " else:\n", + " dups_merge.append(dup)\n", + " \n", + " # remove not similar duplicates\n", + " df_size = df.shape[0]\n", + " df = df.loc[~df.index.isin(dups_remove),:]\n", + " print(\"duplicate rows removed due to low correlation of duplicated profiles\",df_size -df.shape[0] )\n", + " df_size = df.shape[0]\n", + " \n", + " # merge simialr duplicates\n", + " d1 = df.loc[~df.index.isin(dups_merge),:]\n", + " d2 = df.loc[dups_merge,:]\n", + " d2 = d2.groupby(d2.index).agg(lambda x: -max(-x.max(),-x.min(),key= abs))\n", + " df = pd.concat([d1,d2])\n", + " df.sort_index(inplace=True)\n", + " print(\"Merged \",df_size-df.shape[0]+len(dups_merge),\"duplicated rows into\",len(dups_merge),\"rows\")\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### next few tabs demonstrate necessity of removing low-confidence and germline segments: \n", + "\n", + "(e.g. fragment 11:126596926-127130276 presents in both tumor and normal\n", + "therefore, it is germline; see chr11:126596926-12713027 in UCSC browser - it covers part of KIRELL3)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "segemtns in tumor 204 segemtns in normal 121\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Sample</th>\n", + " <th>Chromosome</th>\n", + " <th>Start</th>\n", + " <th>End</th>\n", + " <th>Num_Probes</th>\n", + " <th>Segment_Mean</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>57803</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>456120</td>\n", + " <td>8896255</td>\n", + " <td>4489.0</td>\n", + " <td>-0.0113</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57804</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>8899400</td>\n", + " <td>8899668</td>\n", + " <td>3.0</td>\n", + " <td>-1.3344</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57805</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>8900394</td>\n", + " <td>126596817</td>\n", + " <td>67487.0</td>\n", + " <td>0.0010</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57806</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>126596926</td>\n", + " <td>127130276</td>\n", + " <td>453.0</td>\n", + " <td>-1.0306</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57807</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>127132920</td>\n", + " <td>128342803</td>\n", + " <td>864.0</td>\n", + " <td>-0.0031</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57808</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>128342819</td>\n", + " <td>128350888</td>\n", + " <td>44.0</td>\n", + " <td>0.2824</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57809</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>128353007</td>\n", + " <td>134142530</td>\n", + " <td>3708.0</td>\n", + " <td>0.0082</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Sample Chromosome Start End \\\n", + "57803 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 456120 8896255 \n", + "57804 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 8899400 8899668 \n", + "57805 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 8900394 126596817 \n", + "57806 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 126596926 127130276 \n", + "57807 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 127132920 128342803 \n", + "57808 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 128342819 128350888 \n", + "57809 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 128353007 134142530 \n", + "\n", + " Num_Probes Segment_Mean \n", + "57803 4489.0 -0.0113 \n", + "57804 3.0 -1.3344 \n", + "57805 67487.0 0.0010 \n", + "57806 453.0 -1.0306 \n", + "57807 864.0 -0.0031 \n", + "57808 44.0 0.2824 \n", + "57809 3708.0 0.0082 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#file_path = \"../../TCGA/CNA/data/gdac.broadinstitute.org_CESC.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015082100.0.0/CESC.snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt\"\n", + "file_path = \"../../TCGA/CNA/data__2016_01_28/gdac.broadinstitute.org_CESC.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2016012800.0.0/CESC.snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt\"\n", + "df = pd.read_csv(file_path, sep = \"\\t\")\n", + "tumor_barcode = \"TCGA-ZJ-AAXJ-01A-11D-A42N-01\"\n", + "t = df.loc[df[\"Sample\"]==tumor_barcode,:]\n", + "t_shape = t.shape[0]\n", + "n = find_matching_normal(tumor_barcode,list(set(df[\"Sample\"].values)))\n", + "n = df.loc[df[\"Sample\"]==n[0],:]\n", + "print(\"segemtns in tumor\",t.shape[0],\"segemtns in normal\",n.shape[0])\n", + "\n", + "n.loc[n['Chromosome']==11,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Sample</th>\n", + " <th>Chromosome</th>\n", + " <th>Start</th>\n", + " <th>End</th>\n", + " <th>Num_Probes</th>\n", + " <th>Segment_Mean</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>57960</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>456120</td>\n", + " <td>64200041</td>\n", + " <td>34710.0</td>\n", + " <td>0.0054</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57961</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>64208988</td>\n", + " <td>64319750</td>\n", + " <td>61.0</td>\n", + " <td>-0.6748</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57962</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>64325209</td>\n", + " <td>126596817</td>\n", + " <td>37207.0</td>\n", + " <td>0.0571</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57963</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>126596926</td>\n", + " <td>127130276</td>\n", + " <td>454.0</td>\n", + " <td>-1.0760</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57964</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>127132920</td>\n", + " <td>132080656</td>\n", + " <td>3591.0</td>\n", + " <td>0.0449</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57965</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>132080885</td>\n", + " <td>132099465</td>\n", + " <td>15.0</td>\n", + " <td>-0.6123</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57966</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>132099856</td>\n", + " <td>134142530</td>\n", + " <td>1010.0</td>\n", + " <td>0.0483</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Sample Chromosome Start End \\\n", + "57960 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 456120 64200041 \n", + "57961 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 64208988 64319750 \n", + "57962 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 64325209 126596817 \n", + "57963 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 126596926 127130276 \n", + "57964 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 127132920 132080656 \n", + "57965 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 132080885 132099465 \n", + "57966 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 132099856 134142530 \n", + "\n", + " Num_Probes Segment_Mean \n", + "57960 34710.0 0.0054 \n", + "57961 61.0 -0.6748 \n", + "57962 37207.0 0.0571 \n", + "57963 454.0 -1.0760 \n", + "57964 3591.0 0.0449 \n", + "57965 15.0 -0.6123 \n", + "57966 1010.0 0.0483 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t.loc[t[\"Chromosome\"] ==11,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "segemtns in normal after dropping low.conf.: 38\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Sample</th>\n", + " <th>Chromosome</th>\n", + " <th>Start</th>\n", + " <th>End</th>\n", + " <th>Num_Probes</th>\n", + " <th>Segment_Mean</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>57804</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>8899400</td>\n", + " <td>8899668</td>\n", + " <td>3.0</td>\n", + " <td>-1.3344</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57806</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>126596926</td>\n", + " <td>127130276</td>\n", + " <td>453.0</td>\n", + " <td>-1.0306</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Sample Chromosome Start End \\\n", + "57804 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 8899400 8899668 \n", + "57806 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 126596926 127130276 \n", + "\n", + " Num_Probes Segment_Mean \n", + "57804 3.0 -1.3344 \n", + "57806 453.0 -1.0306 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n = filter_lowconf_segments(n,0,0.46, -0.68 )\n", + "print(\"segemtns in normal after dropping low.conf.:\",n.shape[0])\n", + "n.loc[n['Chromosome']==11,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "segemtns in tumor after removing germlines: 194\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Sample</th>\n", + " <th>Chromosome</th>\n", + " <th>Start</th>\n", + " <th>End</th>\n", + " <th>Num_Probes</th>\n", + " <th>Segment_Mean</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>96</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>456120</td>\n", + " <td>64200041</td>\n", + " <td>34710.0</td>\n", + " <td>0.0054</td>\n", + " </tr>\n", + " <tr>\n", + " <th>97</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>64208988</td>\n", + " <td>64319750</td>\n", + " <td>61.0</td>\n", + " <td>-0.6748</td>\n", + " </tr>\n", + " <tr>\n", + " <th>98</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>64325209</td>\n", + " <td>126596817</td>\n", + " <td>37207.0</td>\n", + " <td>0.0571</td>\n", + " </tr>\n", + " <tr>\n", + " <th>99</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>127132920</td>\n", + " <td>132080656</td>\n", + " <td>3591.0</td>\n", + " <td>0.0449</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>132080885</td>\n", + " <td>132099465</td>\n", + " <td>15.0</td>\n", + " <td>-0.6123</td>\n", + " </tr>\n", + " <tr>\n", + " <th>101</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>132099856</td>\n", + " <td>134142530</td>\n", + " <td>1010.0</td>\n", + " <td>0.0483</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Sample Chromosome Start End \\\n", + "96 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 456120 64200041 \n", + "97 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 64208988 64319750 \n", + "98 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 64325209 126596817 \n", + "99 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 127132920 132080656 \n", + "100 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 132080885 132099465 \n", + "101 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 132099856 134142530 \n", + "\n", + " Num_Probes Segment_Mean \n", + "96 34710.0 0.0054 \n", + "97 61.0 -0.6748 \n", + "98 37207.0 0.0571 \n", + "99 3591.0 0.0449 \n", + "100 15.0 -0.6123 \n", + "101 1010.0 0.0483 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "t = remove_ovelapping_segments(t, n,tumor_barcode)\n", + "print(\"segemtns in tumor after removing germlines:\",t.shape[0])\n", + "t.loc[t[\"Chromosome\"] ==11,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "segemtns in tumor after dropping low.conf.: 101\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Sample</th>\n", + " <th>Chromosome</th>\n", + " <th>Start</th>\n", + " <th>End</th>\n", + " <th>Num_Probes</th>\n", + " <th>Segment_Mean</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>97</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>64208988</td>\n", + " <td>64319750</td>\n", + " <td>61.0</td>\n", + " <td>-0.6748</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>132080885</td>\n", + " <td>132099465</td>\n", + " <td>15.0</td>\n", + " <td>-0.6123</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Sample Chromosome Start End \\\n", + "97 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 64208988 64319750 \n", + "100 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 132080885 132099465 \n", + "\n", + " Num_Probes Segment_Mean \n", + "97 61.0 -0.6748 \n", + "100 15.0 -0.6123 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t = filter_lowconf_segments(t,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n", + "print(\"segemtns in tumor after dropping low.conf.:\",t.shape[0])\n", + "t.loc[t[\"Chromosome\"] ==11,:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TCGA " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HNSC samples: 1089 CNA events per sample on avg.: 101.275482094\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1089 tumors: 530 normals: 559\n", + "\ttumors without matched normal 28\n", + "\ttumors with at least one sCNA 497\n", + "\ttumors without any somatic CNA 5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "HNSC samples: 525 Segments per sample on avg.: 60.6876190476\n", + "ESCA samples: 373 CNA events per sample on avg.: 163.010723861\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 373 tumors: 185 normals: 188\n", + "\ttumors without matched normal 3\n", + "\ttumors with at least one sCNA 181\n", + "\ttumors without any somatic CNA 1\n", + "total samples: 248 tumors: 125 normals: 123\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "ESCA samples: 184 Segments per sample on avg.: 141.836956522\n", + "THYM samples: 248 CNA events per sample on avg.: 62.7862903226\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 5\n", + "\ttumors with at least one sCNA 95\n", + "\ttumors without any somatic CNA 25\n", + "total samples: 132 tumors: 66 normals: 66\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "THYM samples: 100 Segments per sample on avg.: 9.41\n", + "KICH samples: 132 CNA events per sample on avg.: 77.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 0\n", + "\ttumors with at least one sCNA 65\n", + "\ttumors without any somatic CNA 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "KICH samples: 65 Segments per sample on avg.: 51.4923076923\n", + "LUSC samples: 1032 CNA events per sample on avg.: 130.682170543\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1032 tumors: 501 normals: 531\n", + "\ttumors without matched normal 23\n", + "\ttumors with at least one sCNA 476\n", + "\ttumors without any somatic CNA 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "LUSC samples: 499 Segments per sample on avg.: 94.6533066132\n", + "BLCA samples: 797 CNA events per sample on avg.: 130.927227102\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 797 tumors: 414 normals: 383\n", + "\ttumors without matched normal 46\n", + "\ttumors with at least one sCNA 366\n", + "\ttumors without any somatic CNA 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "BLCA samples: 412 Segments per sample on avg.: 94.8859223301\n", + "GBM samples: 1104 CNA events per sample on avg.: 133.018115942\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1104 tumors: 590 normals: 514\n", + "\ttumors without matched normal 78\n", + "\ttumors with at least one sCNA 511\n", + "\ttumors without any somatic CNA 1\n", + "total samples: 85 tumors: 36 normals: 49\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "GBM samples: 589 Segments per sample on avg.: 70.2139219015\n", + "CHOL samples: 85 CNA events per sample on avg.: 89.0588235294\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 0\n", + "\ttumors with at least one sCNA 36\n", + "\ttumors without any somatic CNA 0\n", + "total samples: 111 tumors: 56 normals: 55\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "CHOL samples: 36 Segments per sample on avg.: 56.6944444444\n", + "UCS samples: 111 CNA events per sample on avg.: 173.855855856\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 2\n", + "\ttumors with at least one sCNA 54\n", + "\ttumors without any somatic CNA 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "UCS samples: 56 Segments per sample on avg.: 179.125\n", + "LGG samples: 1015 CNA events per sample on avg.: 78.6118226601\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1015 tumors: 530 normals: 485\n", + "\ttumors without matched normal 33\n", + "\ttumors with at least one sCNA 494\n", + "\ttumors without any somatic CNA 3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "LGG samples: 527 Segments per sample on avg.: 29.1157495256\n", + "THCA samples: 1013 CNA events per sample on avg.: 54.4096742349\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1013 tumors: 506 normals: 507\n", + "\ttumors without matched normal 15\n", + "\ttumors with at least one sCNA 367\n", + "\ttumors without any somatic CNA 124\n", + "total samples: 365 tumors: 185 normals: 180\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "THCA samples: 382 Segments per sample on avg.: 3.8219895288\n", + "PAAD samples: 365 CNA events per sample on avg.: 95.3643835616\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 10\n", + "\ttumors with at least one sCNA 161\n", + "\ttumors without any somatic CNA 14\n", + "total samples: 1059 tumors: 529 normals: 530\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "PAAD samples: 171 Segments per sample on avg.: 32.4093567251\n", + "KIRC samples: 1059 CNA events per sample on avg.: 80.298394712\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 22\n", + "\ttumors with at least one sCNA 501\n", + "\ttumors without any somatic CNA 6\n", + "total samples: 160 tumors: 80 normals: 80\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "KIRC samples: 523 Segments per sample on avg.: 20.5009560229\n", + "UVM samples: 160 CNA events per sample on avg.: 81.08125\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 0\n", + "\ttumors with at least one sCNA 80\n", + "\ttumors without any somatic CNA 0\n", + "total samples: 586 tumors: 297 normals: 289\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "UVM samples: 80 Segments per sample on avg.: 38.425\n", + "CESC samples: 586 CNA events per sample on avg.: 101.450511945\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 16\n", + "\ttumors with at least one sCNA 280\n", + "\ttumors without any somatic CNA 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "CESC samples: 296 Segments per sample on avg.: 58.1351351351\n", + "LUAD samples: 1095 CNA events per sample on avg.: 105.78630137\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1095 tumors: 518 normals: 577\n", + "\ttumors without matched normal 19\n", + "\ttumors with at least one sCNA 494\n", + "\ttumors without any somatic CNA 5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "LUAD samples: 513 Segments per sample on avg.: 70.469785575\n", + "STAD samples: 904 CNA events per sample on avg.: 130.961283186\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 904 tumors: 442 normals: 462\n", + "\ttumors without matched normal 26\n", + "\ttumors with at least one sCNA 410\n", + "\ttumors without any somatic CNA 6\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "STAD samples: 436 Segments per sample on avg.: 96.4220183486\n", + "UCEC samples: 1089 CNA events per sample on avg.: 116.707070707\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1089 tumors: 540 normals: 549\n", + "\ttumors without matched normal 23\n", + "\ttumors with at least one sCNA 504\n", + "\ttumors without any somatic CNA 13\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "UCEC samples: 527 Segments per sample on avg.: 78.89943074\n", + "SKCM samples: 937 CNA events per sample on avg.: 115.351120598\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 937 tumors: 472 normals: 465\n", + "\ttumors without matched normal 7\n", + "\ttumors with at least one sCNA 463\n", + "\ttumors without any somatic CNA 2\n", + "total samples: 172 tumors: 87 normals: 85\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "SKCM samples: 470 Segments per sample on avg.: 82.9957446809\n", + "MESO samples: 172 CNA events per sample on avg.: 106.598837209\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 2\n", + "\ttumors with at least one sCNA 82\n", + "\ttumors without any somatic CNA 3\n", + "total samples: 346 tumors: 168 normals: 178\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "MESO samples: 84 Segments per sample on avg.: 60.8333333333\n", + "PCPG samples: 346 CNA events per sample on avg.: 90.3352601156\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 6\n", + "\ttumors with at least one sCNA 159\n", + "\ttumors without any somatic CNA 3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "PCPG samples: 165 Segments per sample on avg.: 43.5878787879\n", + "STES samples: 1277 CNA events per sample on avg.: 140.322631167\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1277 tumors: 627 normals: 650\n", + "\ttumors without matched normal 29\n", + "\ttumors with at least one sCNA 591\n", + "\ttumors without any somatic CNA 7\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "STES samples: 620 Segments per sample on avg.: 109.9\n", + "SARC samples: 513 CNA events per sample on avg.: 208.068226121\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 513 tumors: 263 normals: 250\n", + "\ttumors without matched normal 17\n", + "\ttumors with at least one sCNA 245\n", + "\ttumors without any somatic CNA 1\n", + "total samples: 380 tumors: 191 normals: 189\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "SARC samples: 262 Segments per sample on avg.: 187.057251908\n", + "LAML samples: 380 CNA events per sample on avg.: 74.5368421053\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 3\n", + "\ttumors with at least one sCNA 167\n", + "\ttumors without any somatic CNA 21\n", + "total samples: 590 tumors: 288 normals: 302\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "LAML samples: 170 Segments per sample on avg.: 7.18823529412\n", + "KIRP samples: 590 CNA events per sample on avg.: 79.5152542373\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 15\n", + "\ttumors with at least one sCNA 271\n", + "\ttumors without any somatic CNA 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "KIRP samples: 286 Segments per sample on avg.: 21.8846153846\n", + "LIHC samples: 760 CNA events per sample on avg.: 122.8\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 760 tumors: 373 normals: 387\n", + "\ttumors without matched normal 21\n", + "\ttumors with at least one sCNA 348\n", + "\ttumors without any somatic CNA 4\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "LIHC samples: 369 Segments per sample on avg.: 81.1327913279\n", + "OV samples: 1168 CNA events per sample on avg.: 224.04109589\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1168 tumors: 597 normals: 571\n", + "\ttumors without matched normal 26\n", + "\ttumors with at least one sCNA 571\n", + "\ttumors without any somatic CNA 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "OV samples: 597 Segments per sample on avg.: 207.924623116\n", + "TGCT samples: 304 CNA events per sample on avg.: 83.8125\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 304 tumors: 156 normals: 148\n", + "\ttumors without matched normal 2\n", + "\ttumors with at least one sCNA 154\n", + "\ttumors without any somatic CNA 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "TGCT samples: 156 Segments per sample on avg.: 37.7820512821\n", + "COAD samples: 918 CNA events per sample on avg.: 98.6209150327\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 918 tumors: 453 normals: 465\n", + "\ttumors without matched normal 44\n", + "\ttumors with at least one sCNA 406\n", + "\ttumors without any somatic CNA 3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "COAD samples: 450 Segments per sample on avg.: 48.4755555556\n", + "BRCA samples: 2199 CNA events per sample on avg.: 129.35788995\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 2199 tumors: 1088 normals: 1111\n", + "\ttumors without matched normal 35\n", + "\ttumors with at least one sCNA 1046\n", + "\ttumors without any somatic CNA 7\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "BRCA samples: 1081 Segments per sample on avg.: 102.808510638\n", + "PRAD samples: 1023 CNA events per sample on avg.: 114.706744868\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1023 tumors: 493 normals: 530\n", + "\ttumors without matched normal 17\n", + "\ttumors with at least one sCNA 458\n", + "\ttumors without any somatic CNA 18\n", + "total samples: 96 tumors: 52 normals: 44\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "PRAD samples: 475 Segments per sample on avg.: 60.3831578947\n", + "DLBC samples: 96 CNA events per sample on avg.: 97.3229166667\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 10\n", + "\ttumors with at least one sCNA 40\n", + "\ttumors without any somatic CNA 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "DLBC samples: 50 Segments per sample on avg.: 44.44\n", + "READ samples: 316 CNA events per sample on avg.: 113.180379747\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 316 tumors: 166 normals: 150\n", + "\ttumors without matched normal 23\n", + "\ttumors with at least one sCNA 141\n", + "\ttumors without any somatic CNA 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "READ samples: 164 Segments per sample on avg.: 70.012195122\n", + "ACC samples: 180 CNA events per sample on avg.: 116.955555556\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 180 tumors: 90 normals: 90\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "ACC samples: 89 Segments per sample on avg.: 107.449438202\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 2\n", + "\ttumors with at least one sCNA 87\n", + "\ttumors without any somatic CNA 1\n" + ] + } + ], + "source": [ + "\n", + "data_dir = \"../../TCGA/CNA/data__2016_01_28//\"\n", + "\n", + "dfs = {}\n", + "dfs_normals = {}\n", + "tumors_without_CNA = {}\n", + "for f in os.listdir(data_dir):\n", + " if f.endswith(\"tar.gz\"):\n", + " fp = f.replace(\".tar.gz\",\"\")\n", + " cohort = fp.split(\".\")[2].replace(\"org_\",\"\")\n", + " file_path = fp+\"/\"+cohort+\".\"+fp.split(\".\")[3].replace(\"Merge_\",\"\")+\".seg.txt\"\n", + " df = pd.read_csv(data_dir+file_path, sep = \"\\t\")\n", + " \n", + " df[\"Chromosome\"] = df[\"Chromosome\"].map(chr_dict)\n", + " print(cohort,\"samples:\",len(set(df[\"Sample\"].values)),\n", + " \"CNA events per sample on avg.:\",float(df.shape[0])/len(set(df[\"Sample\"].values)))\n", + " \n", + " #### remove segments overlapping with segemnts in normals by 80% or more reciprocally ####\n", + " df[\"type\"] = df[\"Sample\"].apply(sample_type)\n", + " df_normals = df.loc[df[\"type\"]== \"Normal\",[\"Sample\",\"Chromosome\",\"Start\",\"End\",\"Num_Probes\",\"Segment_Mean\"]]\n", + " df_tumors = df.loc[df[\"type\"]== \"Tumor\",[\"Sample\",\"Chromosome\",\"Start\",\"End\",\"Num_Probes\",\"Segment_Mean\"]]\n", + " normal_samples = list(set(df_normals[\"Sample\"].values))\n", + " tumor_samples = list(set(df_tumors[\"Sample\"].values))\n", + " print(\"total samples:\", len(set(df[\"Sample\"].values)),\n", + " \"tumors:\",len(tumor_samples),\"normals:\",len(normal_samples),file= sys.stderr)\n", + " \n", + " tumors_without_somatic_CNA = []\n", + " tumors_germline_removed = []\n", + " tumors_without_matching_normal = []\n", + " filtered_normals = []\n", + " for tumor_sample in tumor_samples:\n", + " #print(sample, find_matching_normal(sample,list(set(d[\"Sample\"]))))\n", + " tumor = df_tumors.loc[df_tumors [\"Sample\"]== tumor_sample,:]\n", + " matching_normals = find_matching_normal(tumor_sample,normal_samples)\n", + " if len(matching_normals) >0:\n", + " n_segs = tumor.shape[0]\n", + " for normal_sample in matching_normals:\n", + " normal = df_normals.loc[df_normals[\"Sample\"]== normal_sample,:]\n", + " # thresholds for +1 and -1 copy in 75% of normal cell;\n", + " # this is to retain segments appeared due to slight tumor contamination\n", + " normal = filter_lowconf_segments(normal,0,0.46, -0.68 )\n", + " filtered_normals.append(normal)\n", + " tumor = remove_ovelapping_segments(tumor, normal,tumor_sample)\n", + " #if n_segs > tumor.shape[0]:\n", + " # print(n_segs - tumor.shape[0],\"segments removed in sample\",tumor_sample,\n", + " # tumor.shape[0],\"remained\",file= sys.stderr)\n", + " tumor = filter_lowconf_segments(tumor,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n", + " if tumor.shape[0] == 0:\n", + " tumors_without_somatic_CNA.append(tumor_sample)\n", + " else:\n", + " tumors_germline_removed.append(tumor)\n", + " else:\n", + " tumor = filter_lowconf_segments(tumor,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n", + " if tumor.shape[0] == 0:\n", + " tumors_without_somatic_CNA.append(tumor_sample)\n", + " else:\n", + " tumors_without_matching_normal.append(tumor)\n", + "\n", + " print(\"\\ttumors without matched normal\",len(tumors_without_matching_normal),file= sys.stderr)\n", + " print(\"\\ttumors with at least one sCNA\",len(tumors_germline_removed),file= sys.stderr)\n", + " print(\"\\ttumors without any somatic CNA\",len(tumors_without_somatic_CNA),file= sys.stderr)\n", + " #dfs[cohort] = df\n", + " filtered_tumors = pd.concat(tumors_germline_removed+tumors_without_matching_normal)\n", + " dfs[cohort] = filtered_tumors\n", + " filtered_normals = pd.concat(filtered_normals)\n", + " dfs_normals[cohort] = filtered_normals\n", + " tumors_without_CNA[cohort] = tumors_without_somatic_CNA\n", + " print(\"after filtering\")\n", + " print(cohort,\"samples:\",len(set(filtered_tumors[\"Sample\"].values)),\n", + " \"Segments per sample on avg.:\",float(filtered_tumors.shape[0])/len(set(filtered_tumors[\"Sample\"].values)))\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aggregating to gene-level\n", + "\n", + "Gene annotation must be:\n", + " - with Entrez gene IDs \n", + " - in hg19 coordinates\n", + " - with columns \"chrom\",\"start\",\"stop\",\"gene\" (this is foru-column bed format)\n", + " \n", + "wget ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/ARCHIVE/BUILD.37.3/GFF/ref_GRCh37.p5_top_level.gff3.gz\n", + "\n", + "echo -e \"chrom\\tstart\\tstop\\tgene\\tname\" > ef_GRCh37.p5_top_level.gff3.bed;\n", + "zcat ref_GRCh37.p5_top_level.gff3.gz | awk '$3==\"gene\"' | cut -f 1,4,5,9| sed -e 's/;/\\t/g'| cut -f 1-3,5,6 | grep GeneID | sed -re 's/(Dbxref=GeneID:[0-9]*),.*/\\1/' | sed -e 's/Name=//' -e 's/Dbxref=GeneID://' | awk '{print $1\"\\t\"$2\"\\t\"$3\"\\t\"$5\"\\t\"$4}' >> \n", + "ref_GRCh37.p5_top_level.gff3.bed\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "rename_chroms = {\"NC_000001.10\":1,\"NC_000002.11\":2,\"NC_000003.11\":3,\"NC_000004.11\":4,\n", + " \"NC_000005.9\":5,\"NC_000006.11\":6,\"NC_000007.13\":7,\"NC_000008.10\":8,\n", + " \"NC_000009.11\":9,\"NC_000010.10\":10,\"NC_000011.9\":11,\"NC_000012.11\":12,\"NC_000013.10\":13,\n", + " \"NC_000014.8\":14,\"NC_000015.9\":15,\"NC_000016.9\":16,\"NC_000017.10\":17,\n", + " \"NC_000018.9\":18,\"NC_000019.9\":19,\"NC_000020.10\":20,\"NC_000021.8\":21,\n", + " \"NC_000022.10\":22,\"NC_000023.10\":23,\"NC_000024.9\":24}" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(36019, 5)\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>chrom</th>\n", + " <th>start</th>\n", + " <th>stop</th>\n", + " <th>gene</th>\n", + " <th>name</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>10954</td>\n", + " <td>11507</td>\n", + " <td>100506145</td>\n", + " <td>LOC100506145</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>12190</td>\n", + " <td>13639</td>\n", + " <td>100652771</td>\n", + " <td>LOC100652771</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>14362</td>\n", + " <td>29370</td>\n", + " <td>653635</td>\n", + " <td>WASH7P</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>30366</td>\n", + " <td>30503</td>\n", + " <td>100302278</td>\n", + " <td>MIR1302-2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " chrom start stop gene name\n", + "0 1 10954 11507 100506145 LOC100506145\n", + "1 1 12190 13639 100652771 LOC100652771\n", + "2 1 14362 29370 653635 WASH7P\n", + "3 1 30366 30503 100302278 MIR1302-2" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gene_intervals = pd.read_csv(gene_coords_file, sep = \"\\t\")\n", + "gene_intervals = gene_intervals.loc[gene_intervals[\"chrom\"].isin(rename_chroms.keys()),:]\n", + "gene_intervals[\"chrom\"] = gene_intervals[\"chrom\"].apply(lambda x : rename_chroms[x])\n", + "#print(\"chromosomes:\",list(set(gene_intervals[\"chrom\"].values)))\n", + "gene_intervals = gene_intervals.sort_values(by=[\"chrom\",\"start\",\"stop\"],ascending=True)\n", + "gene_intervals.to_csv(\"/home/olya/SFU/Hossein/v1/ref_GRCh37.p5_top_level.gff3.chroms_renamed.bed\",sep = \"\\t\",index=False)\n", + "print(gene_intervals.shape)\n", + "gene_intervals.head(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "gene_intervals_bed = pbt.BedTool.from_dataframe(gene_intervals[[\"chrom\",\"start\",\"stop\",\"gene\"]])\n", + "# prepare copy-neutral table\n", + "cnv_baseline = gene_intervals.copy()\n", + "cnv_baseline[\"Segment_Mean\"] = [0]*cnv_baseline.shape[0]\n", + "cnv_baseline = cnv_baseline[[\"gene\",\"Segment_Mean\"]]\n", + "cnv_baseline.set_index(\"gene\",inplace=True,drop=True)\n", + "cnv_baseline.sort_index(inplace=True)\n", + "sorted_index = list(cnv_baseline.index.values)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ESCA\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "ESCA (36019, 185)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DLBC\n", + "TCGA-G8-6914-14A-01D-2209-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLBC (36019, 52)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "READ\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "READ (36019, 166)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GBM\n", + "TCGA-06-0165-01A-01D-0236-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-06-0119-01A-08D-0214-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n", + "... 400 processed.\n", + "... 500 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-06-5410-01A-01D-1694-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GBM (36019, 590)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "STES\n", + "TCGA-MX-A5UG-01A-21D-A31K-01 has no genes with altered CN\n", + "TCGA-RD-A8NB-01A-12D-A396-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BR-7957-01A-11D-2200-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n", + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BR-6563-01A-13D-2052-01 has no genes with altered CN\n", + "TCGA-D7-6522-01A-11D-1799-01 has no genes with altered CN\n", + "TCGA-BR-7196-01A-11D-2052-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-D7-A6ET-01A-32D-A32M-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 500 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-HU-A4GJ-01A-11D-A253-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 600 processed.\n", + "STES (36019, 627)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "BLCA\n", + "TCGA-YC-A8S6-01A-31D-A38F-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-DK-A3WY-01A-11D-A22Y-01 has no genes with altered CN\n", + "TCGA-XF-A9SL-01A-11D-A390-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-E7-A7XN-01A-11D-A34T-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n", + "... 400 processed.\n", + "BLCA (36019, 414)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UCEC\n", + "TCGA-D1-A16Y-01A-31D-A12G-01 has no genes with altered CN\n", + "TCGA-BK-A6W4-01A-12D-A34P-01 has no genes with altered CN\n", + "TCGA-BS-A0V7-01A-21D-A120-01 has no genes with altered CN\n", + "TCGA-B5-A11Y-01A-21D-A10L-01 has no genes with altered CN\n", + "TCGA-D1-A17F-01A-11D-A12G-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AX-A062-01A-11D-A00X-01 has no genes with altered CN\n", + "TCGA-D1-A16D-01A-11D-A12G-01 has no genes with altered CN\n", + "TCGA-BG-A0VZ-01A-11D-A107-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AJ-A2QL-01A-11D-A18N-01 has no genes with altered CN\n", + "TCGA-BS-A0UA-01A-11D-A120-01 has no genes with altered CN\n", + "TCGA-B5-A11U-01A-11D-A120-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-EO-A3AU-01A-21D-A19X-01 has no genes with altered CN\n", + "TCGA-QF-A5YS-01A-11D-A31T-01 has no genes with altered CN\n", + "TCGA-D1-A0ZV-01A-11D-A10L-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-QS-A5YR-01A-31D-A31T-01 has no genes with altered CN\n", + "TCGA-DI-A1BU-01A-11D-A134-01 has no genes with altered CN\n", + "TCGA-AP-A0LG-01A-11D-A042-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 500 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-D1-A0ZS-01A-11D-A120-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UCEC (36019, 540)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "PCPG\n", + "TCGA-RW-A7CZ-01A-11D-A35C-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-WB-A817-01A-11D-A35H-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PCPG (36019, 168)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "STAD\n", + "TCGA-MX-A5UG-01A-21D-A31K-01 has no genes with altered CN\n", + "TCGA-RD-A8NB-01A-12D-A396-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BR-7957-01A-11D-2200-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BR-6563-01A-13D-2052-01 has no genes with altered CN\n", + "TCGA-D7-6522-01A-11D-1799-01 has no genes with altered CN\n", + "TCGA-BR-7196-01A-11D-2052-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-D7-A6ET-01A-32D-A32M-01 has no genes with altered CN\n", + "TCGA-HU-A4GJ-01A-11D-A253-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n", + "STAD (36019, 442)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CESC\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n", + "CESC (36019, 297)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UCS\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UCS (36019, 56)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TGCT\n", + "TCGA-YU-A90S-01A-11D-A434-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "TGCT (36019, 156)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "THCA\n", + "TCGA-EL-A4JZ-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-DJ-A13X-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-EL-A3ZT-01A-12D-A23L-01 has no genes with altered CN\n", + "TCGA-DE-A0XZ-01A-11D-A17S-01 has no genes with altered CN\n", + "TCGA-DJ-A2PP-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-KS-A4I5-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-DJ-A2PS-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-EL-A3GW-01A-11D-A201-01 has no genes with altered CN\n", + "TCGA-BJ-A0ZG-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-J8-A3O2-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-FY-A3RA-01A-11D-A21Y-01 has no genes with altered CN\n", + "TCGA-CE-A483-01A-11D-A23T-01 has no genes with altered CN\n", + "TCGA-EM-A1CW-01A-21D-A13V-01 has no genes with altered CN\n", + "TCGA-DJ-A4V4-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-E3-A3E1-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-ET-A2MZ-01A-12D-A19I-01 has no genes with altered CN\n", + "TCGA-E8-A414-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-EL-A3T6-01A-11D-A21Y-01 has no genes with altered CN\n", + "TCGA-DJ-A4V5-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-DJ-A3UY-01A-21D-A22C-01 has no genes with altered CN\n", + "TCGA-EL-A3D4-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-FY-A76V-01A-11D-A396-01 has no genes with altered CN\n", + "TCGA-FY-A4B3-01A-11D-A23T-01 has no genes with altered CN\n", + "TCGA-DJ-A3UO-01A-11D-A22C-01 has no genes with altered CN\n", + "TCGA-EL-A4K7-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-DJ-A1QI-01A-11D-A14V-01 has no genes with altered CN\n", + "TCGA-EL-A3N2-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-E3-A3E5-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-EM-A1YD-01A-11D-A14V-01 has no genes with altered CN\n", + "TCGA-GE-A2C6-01A-11D-A16M-01 has no genes with altered CN\n", + "TCGA-DJ-A2Q5-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-ET-A3DP-01A-11D-A219-01 has no genes with altered CN\n", + "TCGA-DJ-A4UT-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-DJ-A2PT-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-DJ-A4V2-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-L6-A4ET-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-BJ-A0ZJ-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-DE-A4M9-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-EL-A4KD-01A-11D-A256-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-QD-A8IV-01A-11D-A396-01 has no genes with altered CN\n", + "TCGA-ET-A3DV-01A-12D-A201-01 has no genes with altered CN\n", + "TCGA-EM-A22K-01A-11D-A17S-01 has no genes with altered CN\n", + "TCGA-DJ-A3VE-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-EL-A3D1-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-BJ-A2P4-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-CE-A3ME-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-E8-A417-01A-21D-A23L-01 has no genes with altered CN\n", + "TCGA-KS-A41I-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-FK-A3SB-01A-11D-A22C-01 has no genes with altered CN\n", + "TCGA-BJ-A28S-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-MK-A4N9-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-E8-A437-01A-12D-A23T-01 has no genes with altered CN\n", + "TCGA-EM-A3AP-01A-12D-A20A-01 has no genes with altered CN\n", + "TCGA-EL-A3TA-01A-12D-A22C-01 has no genes with altered CN\n", + "TCGA-IM-A41Z-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-EM-A2CQ-01A-11D-A17S-01 has no genes with altered CN\n", + "TCGA-EM-A3O7-01A-11D-A21Y-01 has no genes with altered CN\n", + "TCGA-FE-A3PC-01A-11D-A21Y-01 has no genes with altered CN\n", + "TCGA-DJ-A2PY-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-EM-A4FQ-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-EM-A3FO-01A-11D-A219-01 has no genes with altered CN\n", + "TCGA-BJ-A0Z9-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-EM-A3FK-01A-11D-A219-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-ET-A3BU-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-BJ-A0Z5-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-EL-A3MY-01A-11D-A219-01 has no genes with altered CN\n", + "TCGA-ET-A39L-01A-12D-A19I-01 has no genes with altered CN\n", + "TCGA-E8-A415-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-ET-A40Q-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-KS-A4I7-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-MK-A4N7-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-L6-A4EQ-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-FY-A3TY-01A-11D-A22Y-01 has no genes with altered CN\n", + "TCGA-ET-A2N1-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-DJ-A2PO-01A-21D-A19I-01 has no genes with altered CN\n", + "TCGA-J8-A3O2-06A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-CE-A485-01A-11D-A23T-01 has no genes with altered CN\n", + "TCGA-ET-A3BX-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-DJ-A3VK-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-DE-A4M8-01A-21D-A256-01 has no genes with altered CN\n", + "TCGA-ET-A40T-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-BJ-A18Z-01A-21D-A13V-01 has no genes with altered CN\n", + "TCGA-DJ-A3UT-01A-11D-A22C-01 has no genes with altered CN\n", + "TCGA-DJ-A2Q2-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-BJ-A18Y-01A-11D-A13V-01 has no genes with altered CN\n", + "TCGA-ET-A39T-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-EL-A3CL-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-DJ-A4V0-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-EL-A3H8-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-ET-A39J-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-FY-A3I4-01A-11D-A219-01 has no genes with altered CN\n", + "TCGA-EM-A2CU-01A-12D-A17S-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-EM-A3FM-01A-11D-A219-01 has no genes with altered CN\n", + "TCGA-EM-A4FF-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-EL-A3GX-01A-11D-A201-01 has no genes with altered CN\n", + "TCGA-DJ-A3UN-01A-11D-A22C-01 has no genes with altered CN\n", + "TCGA-EM-A4FO-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-EL-A3TB-01A-11D-A22C-01 has no genes with altered CN\n", + "TCGA-ET-A25N-01A-11D-A16M-01 has no genes with altered CN\n", + "TCGA-ET-A39M-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-DE-A4MA-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-ET-A39O-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-DE-A0Y2-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-FY-A3R8-01A-11D-A21Y-01 has no genes with altered CN\n", + "TCGA-EM-A3AL-01A-11D-A201-01 has no genes with altered CN\n", + "TCGA-EM-A2CN-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-FY-A3BL-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-EM-A1CS-01A-11D-A13V-01 has no genes with altered CN\n", + "TCGA-EL-A3D0-01A-12D-A201-01 has no genes with altered CN\n", + "TCGA-E3-A3DZ-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-DJ-A1QF-01A-12D-A14V-01 has no genes with altered CN\n", + "TCGA-J8-A3YH-01A-11D-A22Y-01 has no genes with altered CN\n", + "TCGA-EL-A4K1-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-EM-A3O8-01A-11D-A21Y-01 has no genes with altered CN\n", + "TCGA-DJ-A3VJ-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-BJ-A45D-01A-11D-A23T-01 has no genes with altered CN\n", + "TCGA-FY-A4B4-01A-11D-A23T-01 has no genes with altered CN\n", + "TCGA-EM-A1CU-01A-11D-A13V-01 has no genes with altered CN\n", + "TCGA-EL-A3CX-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-ET-A25O-01A-11D-A16M-01 has no genes with altered CN\n", + "TCGA-E8-A433-01A-11D-A23L-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "THCA (36019, 506)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CHOL\n", + "TCGA-W5-AA2H-01A-31D-A416-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CHOL (36019, 36)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "HNSC\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n", + "... 300 processed.\n", + "... 400 processed.\n", + "... 500 processed.\n", + "HNSC (36019, 530)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UVM\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UVM (36019, 80)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "SKCM\n", + "TCGA-ER-A19A-06A-21D-A191-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n", + "... 300 processed.\n", + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-EB-A4OZ-01A-12D-A25P-01 has no genes with altered CN\n", + "TCGA-EE-A2GK-06A-11D-A194-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SKCM (36019, 472)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "COAD\n", + "TCGA-G4-6302-01A-11D-1717-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AA-A03F-01A-11D-A080-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n", + "... 300 processed.\n", + "... 400 processed.\n", + "COAD (36019, 453)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ACC\n", + "TCGA-OR-A5KQ-01A-11D-A309-01 has no genes with altered CN\n", + "TCGA-OR-A5KV-01A-11D-A29H-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ACC (36019, 90)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "PAAD\n", + "TCGA-IB-AAUR-01A-21D-A38F-01 has no genes with altered CN\n", + "TCGA-HZ-8002-01A-11D-2200-01 has no genes with altered CN\n", + "TCGA-XD-AAUG-01A-61D-A40V-01 has no genes with altered CN\n", + "TCGA-Z5-AAPL-01A-12D-A40V-01 has no genes with altered CN\n", + "TCGA-IB-A5SQ-01A-11D-A32M-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-IB-AAUS-01A-12D-A38F-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PAAD (36019, 185)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "THYM\n", + "TCGA-4V-A9QW-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-ZB-A96B-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-X7-A8DB-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-X7-A8M4-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-X7-A8D8-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-3S-AAYX-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-YT-A95E-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-X7-A8M8-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-ZT-A8OM-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-ZB-A96E-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-3Q-A9WF-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-X7-A8M1-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-ZB-A96A-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-ZB-A96R-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-ZB-A963-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-ZC-AAAA-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-XM-A8RB-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-ZB-A96G-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-X7-A8M7-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-XU-AAXZ-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-XH-A853-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-XM-AAZ3-01A-11D-A422-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "THYM (36019, 125)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LUSC\n", + "TCGA-56-8623-01A-11D-2391-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n", + "... 300 processed.\n", + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-98-A53H-01A-12D-A25M-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LUSC (36019, 501)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "MESO\n", + "TCGA-TS-A8AS-01A-21D-A39Q-01 has no genes with altered CN\n", + "TCGA-TS-A7P8-01A-11D-A34B-01 has no genes with altered CN\n", + "TCGA-TS-A8AV-01A-12D-A39Q-01 has no genes with altered CN\n", + "TCGA-3H-AB3O-01A-11D-A39Q-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MESO (36019, 87)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OV\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n", + "... 300 processed.\n", + "... 400 processed.\n", + "... 500 processed.\n", + "OV (36019, 597)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "SARC\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-WK-A8Y0-10D-01D-A419-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-WK-A8XS-10E-01D-A37E-01 has no genes with altered CN\n", + "TCGA-QQ-A5V2-01A-11D-A32H-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SARC (36019, 263)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "KIRP\n", + "TCGA-Y8-A8S1-01A-11D-A36W-01 has no genes with altered CN\n", + "TCGA-GL-A4EM-01A-11D-A253-01 has no genes with altered CN\n", + "TCGA-4A-A93Y-01A-11D-A36W-01 has no genes with altered CN\n", + "TCGA-AL-3467-01A-02D-1348-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-A4-7828-01A-11D-2135-01 has no genes with altered CN\n", + "TCGA-DW-7838-01A-11D-2135-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n", + "KIRP (36019, 288)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LGG\n", + "TCGA-HT-8106-01A-11D-2391-01 has no genes with altered CN\n", + "TCGA-S9-A6WI-01A-21D-A33S-01 has no genes with altered CN\n", + "TCGA-HT-7602-01A-21D-2085-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-DU-7011-01A-11D-2023-01 has no genes with altered CN\n", + "TCGA-TM-A84B-12A-01D-A366-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-FG-8181-01A-11D-2252-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-FG-8189-01B-11D-A288-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-DU-5872-02A-21D-A36N-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 500 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-HT-7680-01A-11D-2252-01 has no genes with altered CN\n", + "TCGA-P5-A5EY-01A-11D-A27J-01 has no genes with altered CN\n", + "TCGA-CS-6669-01A-11D-1892-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LGG (36019, 530)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LAML\n", + "TCGA-AB-2884-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2932-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2842-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2969-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2826-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2836-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2871-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2845-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2840-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2837-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2844-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2854-03A-01D-0756-21 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AB-3006-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2931-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2851-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2978-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2880-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2922-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2947-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2998-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2824-03A-01D-0756-21 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LAML (36019, 191)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LIHC\n", + "TCGA-2V-A95S-10D-01D-A36Z-01 has no genes with altered CN\n", + "TCGA-UB-AA0V-01A-11D-A381-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-G3-A25V-01A-11D-A16U-01 has no genes with altered CN\n", + "TCGA-DD-A3A6-01A-11D-A22E-01 has no genes with altered CN\n", + "TCGA-DD-A4NL-01A-11D-A28W-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-ED-A5KG-01A-11D-A27H-01 has no genes with altered CN\n", + "TCGA-CC-A9FV-01A-11D-A36W-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-MR-A520-01A-11D-A25U-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LIHC (36019, 373)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "PRAD\n", + "TCGA-J9-A52C-01A-11D-A26L-01 has no genes with altered CN\n", + "TCGA-V1-A8MJ-01A-11D-A363-01 has no genes with altered CN\n", + "TCGA-XJ-A9DQ-01A-11D-A376-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-J4-A6G1-01A-11D-A30W-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-J4-A67R-01A-21D-A30D-01 has no genes with altered CN\n", + "TCGA-EJ-A7NJ-01A-22D-A34T-01 has no genes with altered CN\n", + "TCGA-EJ-7791-01A-11D-2112-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-EJ-A8FU-01A-11D-A363-01 has no genes with altered CN\n", + "TCGA-EJ-A6RC-01A-11D-A32A-01 has no genes with altered CN\n", + "TCGA-HC-7740-01A-11D-2112-01 has no genes with altered CN\n", + "TCGA-EJ-A65B-01A-12D-A30D-01 has no genes with altered CN\n", + "TCGA-HC-8260-01A-11D-2259-01 has no genes with altered CN\n", + "TCGA-FC-A8O0-01A-41D-A376-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-VN-A88I-01A-11D-A34T-01 has no genes with altered CN\n", + "TCGA-EJ-A7NK-01A-12D-A34T-01 has no genes with altered CN\n", + "TCGA-CH-5743-01A-21D-1574-01 has no genes with altered CN\n", + "TCGA-G9-6367-01A-11D-1785-01 has no genes with altered CN\n", + "TCGA-KC-A4BO-01A-61D-A256-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PRAD (36019, 493)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LUAD\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-L4-A4E6-01A-11D-A24C-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-44-3398-01A-01D-1877-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-55-8619-01A-11D-2389-01 has no genes with altered CN\n", + "TCGA-86-A4P8-01A-11D-A24O-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 500 processed.\n", + "LUAD (36019, 518)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "BRCA\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AO-A0JC-01A-11D-A059-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BH-A0H5-01A-21D-A111-01 has no genes with altered CN\n", + "TCGA-A2-A0CR-01A-11D-A227-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BH-A1FE-06A-11D-A20R-01 has no genes with altered CN\n", + "TCGA-AN-A0FN-01A-11D-A036-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n", + "... 500 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-PL-A8LY-01A-11D-A41E-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 600 processed.\n", + "... 700 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-GM-A3XG-01A-31D-A242-01 has no genes with altered CN\n", + "TCGA-LD-A74U-01A-13D-A33D-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 800 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-GM-A2DO-10D-01D-A18N-01 has no genes with altered CN\n", + "TCGA-A2-A0EP-01A-52D-A22W-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 900 processed.\n", + "... 1000 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AO-A1KO-01A-31D-A13J-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BRCA (36019, 1088)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "KIRC\n", + "TCGA-B4-5378-01A-01D-1499-01 has no genes with altered CN\n", + "TCGA-B0-5400-01A-01D-1499-01 has no genes with altered CN\n", + "TCGA-CJ-4890-01A-01D-1302-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-A3-A8OX-01A-11D-A36W-01 has no genes with altered CN\n", + "TCGA-B0-4817-01A-01D-1274-01 has no genes with altered CN\n", + "TCGA-B0-5080-01A-01D-1499-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-DV-A4VZ-01A-11D-A25U-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-CJ-4891-01A-01D-1302-01 has no genes with altered CN\n", + "TCGA-CJ-4889-01A-01D-1302-01 has no genes with altered CN\n", + "TCGA-BP-4769-01A-01D-1283-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BP-4760-01A-02D-1417-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 500 processed.\n", + "KIRC (36019, 529)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "KICH\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KICH (36019, 66)\n" + ] + } + ], + "source": [ + "for cohort in dfs.keys():\n", + " print(cohort, file=sys.stderr)\n", + " df = dfs[cohort]\n", + " cna_table = []\n", + " n_samples = 0\n", + " for sample in list(set(df.Sample.values)):\n", + " n_samples +=1\n", + " cnv2gene = cnv2genelevel(cnv2bed(df[df.Sample == sample]),gene_intervals_bed,sample,\n", + " verbose = False,sorted_index = sorted_index)\n", + " cna_table.append(cnv2gene)\n", + " if n_samples % 100 == 0:\n", + " print(\"...\",n_samples, \"processed.\")\n", + " cna_table = pd.concat(cna_table,axis =1)\n", + " \n", + "\n", + " for sample in tumors_without_CNA[cohort]:\n", + " cna_table[sample] = 0\n", + " \n", + " cna_table.fillna(0, inplace = True)\n", + " cna_table.to_csv(preprocessed_dir+\"/TCGA-\"+cohort+\".Segment_Mean.CNA.tsv\",\n", + " sep = \"\\t\",header=True,index=True)\n", + " print(cohort,cna_table.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'t = time.time()\\ncnv2gene = cnv2genelevel(cnv2bed(df[df.Sample == sample]),gene_intervals_bed,sample,\\n verbose = False,sorted_index = sorted_index)\\nprint( time.time() - t)\\ncnv2gene'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"t = time.time()\n", + "cnv2gene = cnv2genelevel(cnv2bed(df[df.Sample == sample]),gene_intervals_bed,sample,\n", + " verbose = False,sorted_index = sorted_index)\n", + "print( time.time() - t)\n", + "cnv2gene\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CCLE \n", + "\n", + "the same pipeline as for TCGA except filtering out germline CNA (because no )\n", + "\n", + "wget https://data.broadinstitute.org/ccle_legacy_data/dna_copy_number/CCLE_copynumber_2013-12-03.seg.txt\n", + "\n", + "? should we use a stronger segment_mean threshold because this data are for cell lines and purity must be 100%" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "47 duplicated IDs in 94 rows found.\n", + "duplicate rows removed due to low correlation of duplicated profiles 0\n", + "Merged 94 duplicated rows into 47 rows\n", + "CCLE: genes: 35972 samples 1043\n" + ] + } + ], + "source": [ + "df = pd.read_csv(\"../../CCLE/CCLE_copynumber_2013-12-03.seg.txt\",sep = \"\\t\")\n", + "df.rename({\"CCLE_name\":\"Sample\"},inplace=True, axis=\"columns\")\n", + "df[\"End\"] = df[\"End\"].apply(int)\n", + "ccle = []\n", + "for sample_name in list(set(df[\"Sample\"].values)):\n", + " cl = df.loc[df[\"Sample\"]==sample_name, :]\n", + " # keep high-conf segments \n", + " cl_filtered = filter_lowconf_segments(cl,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n", + " #print(sample_name, cl.shape[0], \"after filtration\",cl_filtered.shape[0])\n", + " # map to genes \n", + " cnv2gene = cnv2genelevel(cnv2bed(cl_filtered),gene_intervals_bed,sample_name,\n", + " verbose = False,sorted_index = sorted_index)\n", + " ccle.append(cnv2gene)\n", + " \n", + "ccle = pd.concat(ccle,axis =1)\n", + "ccle.fillna(0, inplace = True)\n", + "ccle = handle_dups(ccle)\n", + "ccle.to_csv(preprocessed_dir+\"/\"+\"CCLE\"+\".Segment_Mean.CNA.tsv\",\n", + " sep = \"\\t\",header=True,index=True)\n", + "print(\"CCLE:\",\"genes:\",ccle.shape[0],\"samples\",ccle.shape[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GDSC\n", + "Assume that supplementary file with gene-level CN is downloaded :\n", + "\n", + "wget \n", + "\n", + "GDSC provides gene-level integer estimated CN, max. and min. CN over all segments covering a gene. In order to make it comparable with TCGA and CCLE, we divide estimated CN by CN of copy-neutral state and log2-transform it. \n", + "\n", + "1) Copy-neutral state was defined from average ploiy, as median of integer CN values in non-disrupted genes.\n", + "\n", + "2) Compute log2(CN/neutral-CN) for min and max CN; keep the value with most extreme estimate\n", + "\n", + "3) Replace estimates below thresholds with zeroes. \n", + "\n", + "\n", + "DGSC uses 4 comma-separated values for gene-level CN (max_cn,min_cn,zygosity,disruption): e.g. (from \"legend\" tab)\n", + "\n", + "2,2,H,-\tGene resides on a single genomic segment in a diploid region of the genome.\n", + "2,0,L,D\tGene spans multiple segments, higest copy number is 2 but part of the coding sequence is homozygously deleted, the gene is disrupted.\n", + "13,13,H,-\tGene resides on a single genomic segment of copy number 13 in a heterozygous part of the genome (amplification).\n", + "14,12,L,D\tGene spans multiple genomic segments all of which are amplified to 12 or more copies, some or all segments have LOH, the gene is disrupted.\n", + "0,0,0,-\tComplete gene sequence falls within a homozygous deletion.\n", + "-1,-1,-,- gene level CN not assigned\n", + "\n", + "* min and max CN are integers \n", + "* zygosity - can be L (LOH in any overlapping segment) or H (heterozygous) or 0 (homozygous deleteion of the whole gene) or - (undefined)\n", + "* disruption - D (if disrupted) or \"-\" (not disrupted) \n", + "\n", + "Average ploidies of cell lines were downloaded from COSMIC:\n", + "\n", + "wget https://cog.sanger.ac.uk/cosmic/GRCh37/cell_lines/v86/PICNIC_average_ploidies.tsv?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1540792525&Signature=mcSB6oFv%2BXCF4%2Fezm4a3Ds1JXo4%3D\n", + "\n", + "wget ftp:// ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-7.0/Gene_level_CN.xlsx\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>gene</th>\n", + " <th>chr</th>\n", + " <th>start</th>\n", + " <th>stop</th>\n", + " <th>201T</th>\n", + " <th>22RV1</th>\n", + " <th>23132-87</th>\n", + " <th>42-MG-BA</th>\n", + " <th>451Lu</th>\n", + " <th>5637</th>\n", + " <th>...</th>\n", + " <th>WSU-NHL</th>\n", + " <th>YAPC</th>\n", + " <th>YH-13</th>\n", + " <th>YKG-1</th>\n", + " <th>YMB-1-E</th>\n", + " <th>YT</th>\n", + " <th>ZR-75-30</th>\n", + " <th>huH-1</th>\n", + " <th>no-10</th>\n", + " <th>no-11</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1287381</td>\n", + " <td>924100</td>\n", + " <td>910924</td>\n", + " <td>687561</td>\n", + " <td>1287706</td>\n", + " <td>687452</td>\n", + " <td>...</td>\n", + " <td>909785</td>\n", + " <td>909904</td>\n", + " <td>909905</td>\n", + " <td>687592</td>\n", + " <td>1303911</td>\n", + " <td>946358</td>\n", + " <td>909907</td>\n", + " <td>1298146</td>\n", + " <td>908452</td>\n", + " <td>908450</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>DDX11L1</td>\n", + " <td>1</td>\n", + " <td>11869.0</td>\n", + " <td>14412.0</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>...</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>WASH7P</td>\n", + " <td>1</td>\n", + " <td>14363.0</td>\n", + " <td>29806.0</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>...</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 1000 columns</p>\n", + "</div>" + ], + "text/plain": [ + " gene chr start stop 201T 22RV1 23132-87 42-MG-BA \\\n", + "0 NaN NaN NaN NaN 1287381 924100 910924 687561 \n", + "1 DDX11L1 1 11869.0 14412.0 -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "2 WASH7P 1 14363.0 29806.0 -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "\n", + " 451Lu 5637 ... WSU-NHL YAPC YH-13 \\\n", + "0 1287706 687452 ... 909785 909904 909905 \n", + "1 -1,-1,-,- -1,-1,-,- ... -1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "2 -1,-1,-,- -1,-1,-,- ... -1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "\n", + " YKG-1 YMB-1-E YT ZR-75-30 huH-1 no-10 no-11 \n", + "0 687592 1303911 946358 909907 1298146 908452 908450 \n", + "1 -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "2 -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "\n", + "[3 rows x 1000 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "GDSC_CNA = \"/home/olya/SFU/Hossein/GDSC/Gene_level_CN.xlsx\"\n", + "\n", + "gdsc = pd.read_excel(GDSC_CNA,\"Gene_level_CN\")\n", + "gdsc.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "25 gene IDs excluded due to string to datetime conversion in Excel.\n", + "Strings containing duplicated gene IDs: 0\n" + ] + } + ], + "source": [ + "gdsc.set_index(\"gene\",inplace = True)\n", + "gdsc.drop([\"chr\",\"start\",\"stop\"],inplace=True,axis=1)\n", + "gdsc.columns = gdsc.iloc[0,:]\n", + "gdsc = gdsc.iloc[1:,:]\n", + "gdsc.columns.name = None\n", + "# replace 2001-12-01 with DEC1 and get remove gene names converted to datetimes\n", + "gdsc.index.values[37778] = \"DEC1\"\n", + "df_size = gdsc.shape[0]\n", + "ndxs=pd.Series(gdsc.index).apply(lambda x : type(x) == unicode or type(x) == str)\n", + "gdsc = gdsc.loc[gdsc.index.values[ndxs[ndxs].index],:]\n", + "print(df_size - gdsc.shape[0],\"gene IDs excluded due to string to datetime conversion in Excel.\")\n", + "\n", + "gdsc.index.name = \"gene_id\"\n", + "ids = gdsc.index\n", + "ids = list(set(ids[ids.duplicated()]))\n", + "print(\"Strings containing duplicated gene IDs:\",gdsc.loc[ids,:].shape[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### distribution of averaged ploidies in GDSC\n", + "\n", + "we compared average ploidies reported in PICNIC_average_ploidies.tsv provided by COSMIC with " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1020\n", + "1016\n" + ] + } + ], + "source": [ + "GDSC_Ploidies = \"/home/olya/SFU/Hossein/GDSC/PICNIC_average_ploidies.tsv\"\n", + "GDSC_Ploidies = pd.read_csv(GDSC_Ploidies,sep = \"\\t\")\n", + "GDSC_Ploidies.drop(\"#sample_name\",axis = 1, inplace= True)\n", + "GDSC_Ploidies.set_index(\"sample_id\",inplace=True)\n", + "print(GDSC_Ploidies.shape[0])\n", + "GDSC_Ploidies.dropna(inplace=True)\n", + "print(GDSC_Ploidies.shape[0])\n", + "\n", + "est_ploidies = gdsc.apply(define_avg_ploidy).T\n", + "df_ploidies = pd.DataFrame.from_dict({\"est. avg. ploidy from CN profile\":est_ploidies[\"avg_pl\"],\"PICNIC avg. pl.\":GDSC_Ploidies[\"average_ploidy\"],\n", + " \"est. median. ploidy\":est_ploidies[\"median_pl\"]})\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABIgAAAE/CAYAAAAt2/ipAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xu8pWVd///XO/CUoihMiBwcD2ip1agTah4i0UJA0VKEDNGo0cKy7Psr1L6imd8fVkaWqV8UBEoQBElU6it5zBJtUEIO+hVw/DE4MiPIQVEL+Pz+uK8Na/Zh9prZex1m7tfz8diPfa/rvtZan33PrOu61+e+rutOVSFJkiRJkqT++rFJByBJkiRJkqTJMkEkSZIkSZLUcyaIJEmSJEmSes4EkSRJkiRJUs+ZIJIkSZIkSeo5E0SSJEmSJEk9Z4JIUy3JAUnWD1n33Un+5xb2V5JHbkMM90nykSQ3J/ng1j6/D9J5X5LvJvlikqcn+drA/nVJnjXJGCVpR5fk00l+c9JxSFJfJHlZks8NPP5ekodPII6hvuckeUmSj29hv/1Iz5kg2kElWdkaip0nHcu4VNUrq+rNI3jpFwJ7ALtV1YtG8PqLSvKoJB9M8p2WqLo0yWuS7DTwb33BrOf8Q5I3jinEpwHPBvauqv2r6l+r6tFjem9JO4CWSP5BO7m+PsmpSe7X9m12wprk/kn+Osn/1+pf3R7vPvBaG5Pcd+A5v5nk0wOPNzuZ3lI7O5YDIEla1LR/x6mq+1XVNZOOYyFV9f6q+qVJx6HpZYJIWtxDgf9bVbfPt3PUHVSSRwBfAK4FfrqqHgC8CFgN7DJQ9UlJfn4E758ki7UVDwXWVdX3l/v9JfXKc6vqfsAT6Nq4P5ldIck9gU8AjwUOAu4PPAW4Adh/oOpOwKuHedOtaGclSZJ2WCaItgNJHpLk3CSbknwjye8N7Ns/ydokt7Qrrn/Vdn22/b6pXV19yhDv8/IkVya5Nck1SV4xsO/KJIcOPN65xfOE9vilSb6Z5IYk/3NrphS1uq9NckWbovS+JPdeoO5PtSvJNyW5PMnzBvadmuTPBh7/P0k2JPlWkt8YKP+5dqx2Gij7lST/Oc/7vQl4A/DidhyPaUNJ/y3JiUluAN6Y5MeS/Ek7BhuTnJ7kAe01Zq50vDzJte1vfGWL49L2t7xjC4foTcC/V9VrqmoDQFV9rap+rapuGqj358BbFjncM3/XzN/wjnal/KtJDhzY/+kkb0nyb8BtwMPb/8Pzk9yY5Kokv9XqHgO8F3hKO0ZvyhamBrZjdVy6K/43JDk7yYOGiVtSP1TVdcA/AY+bZ/dLgX2BF1TVFVV1Z1VtrKo3V9XgSMq/AP5Hkl2HeMth21kAkjwwyUdbP/jdtr132/fiJGtn1f+DJOe37d3STVu+Jcl/JPmzDExP2JLF2m5J2p6M8TvOp1tb++/tOR9pbfH7B9rilQP1fzLJhe2c92tJDh/Yt1s7H74lyReBR8x6r7tGpyY5JMmXW91rMzCyf+D7wdHpRsN+J8nrt/A3nJpuOY0L031X+0yShy5Q9wHtu8im9t3kT9Iu9mbulLhnt77k5vZ9JK38nu3v/+mBuj+R5LYkKxY75tp+mSCacu3D/BHgP4G9gAOB30/yy63K24G3V9X96Rqos1v5M9rvXdtQx88P8XYbgUPprsa+HDgxLQEEnAkcOVD3l4HvVNWXkjwGeCfwEmBP4AEt1q3xkvaajwAexfxXje9Bdyw+DvwE8LvA+5PMmcqU5CDgf9BNe9oPuCtZVVX/QXeleXB45VHA6bNfp6qOB/4XcFY7jie3XU8CrqGbevYW4GXt5xeBhwP3A2YnfZ7UYnkx8NfA61tcjwUOT/IL8xwXWp1zFtg36J3AozL8Wj9PAq4GdgeOBz40K1FzFLCG7ur5N4EPAOuBh9BNu/tfSZ7Zjskrgc+3Y3T8Iu/7u8DzgV9or/Vd4O+GjFlSDyTZBzgY+PI8u58F/HNVfW+Rl1kLfJquL1jMsO3sjB8D3kc3enJf4Afc3eZ/BHh0kv0G6v8acEbb/jvg+8CDgaPbz9ZYrO2WpKk35u84AEfQndvu1V7v83Tt+IOAK+naU9JNTb6Qrs3+ifa8d7bvO9C14T+k+87zG+1nId+nu6ixK3AI8NtJnj+rztOAR7e//w1JfmoLr/cS4M107f8lwPsXqPe3dN/HHk53vv1Suu92m0k3LftDdN+7dqfrW54KUFX/RXfu/+sDTzkS+ERVbdpCjNrOmSCafj8HrKiqP62q/2pzWt9D11gB/DfwyCS7V9X3quqibX2jqvpYVV1dnc/QJWKe3nafATwvyY+3x79GlzSCLlnwkar6XGtM3gDUVr79O6rq2qq6kS7hcuQ8dZ5Ml3g5oR2LTwIfXaDu4cD7quqyNu3pjbP2n0Zr8NqJ9S9z98n7ML5VVX9bVbdX1Q/oGuy/qqpr2peW1wJHZPPpZ2+uqh9W1cfpOowz21Xv64B/BR6/wHvtBmwYIqYf0B27P1usYrMR+Ouq+u+qOgv4Gl3nNePUqrq8Ta17MF2H8cftb7iEbtTQS4d8r0GvBF5fVeur6kd0/zYvzJTOJZc0Vv+Y5Cbgc8Bn6BL0sw3bJkLXH/3uEFc7t+Y1qaobqurcqrqtqm6la3t/oe27DfgwrW9qiaKfBM5PN3L1V4Hj23OvoOuPtsZibbckbQ/G9h2neV/7nnMz3QjVq6vqX9p57ge5+zz8ULplE97XzvO/DJwLvGigDX9DVX2/qi5jC214VX26qr7SRrpeSvfdafYF4TdV1Q+q6j/pkmU/u4W/4WNV9dl2/vx6utH7+wxWaDEeAby2qm6tqnXA2+iSY7MdDFxeVedU1X/TXcD+9sD+04Ajk6Q9Pgr4+y3Epx2ACaLp91DgIemmId3UTpxfRzdyBeAYuhE3X23DIw9d6IUWk+Q5SS5qwwlvoms0dgeoqqvosuvPbUmi53F3QuUhdOs20OreRjdCZ2tcO7D9zfaasz0EuLaq7pxVd77RSpvF1OoN+ge6v+W+dMmkf52ZVrAN8c683+B7fBPYmbv/nQCuH9j+wTyP77fAe91Ad5ViGO8F9kjy3CHqXldVg4m82cd98G98CHBj+yI0WH9rR4pB93/6vIH/z1cCd7D5sZLUT8+vql2r6qFV9TstAT/b0G1iO3n/KHDcIlW3pp0lyY8n+d9t6P4tdFMeds3dU5fP4O6LF78G/GPrG1fQ9Q2D7evs/mQxi7XdkrQ9GNt3nGbY8/CH0q3rORjXS+guls7Xhs/+jnGXJE9K8qk21etmuouku8+qNpiQuY2Fvw/A5t+3vgfcyNz2f3fgHsz9XrLo96XWtww+/kKL6YAkPwk8Ejh/C/FpB2CCaPpdC3yjnTDP/OxSVQcDVNXXq+pIuiGQbwXOaUmPrRrBk+RedNnxvwT2qKpdgQto81CbmWlmhwFXtKQRdFdd9x54rfvQXY3dGoPZ732Bb81T51vAPtl8weR9gevmqbthnte8Sxu183ngV9i2bPjs4/stug5l8P1uZ/POZ1v9C93VisWD6kZwvYlu+GkWqb7XwBUBmHvcB//GbwEPSrLLrPrzHfvFXAs8Z9b/6Xu3fxNJWsy/AL+cgTuULeJ44LfYckJ76Ha2+UO6KQFPatMfZqY8zLSpFwIrkqyi6zdnLqhsousb9h54rc2u/g5hsbZbkrYHY/mOs41xfWZWXPerqt/m7jZ8we8Ys5xBl1DZp7qbH7ybxc/Pt+Su9013l88HMbf9/w7d6KvZ30sW/b7U+pbZfdLMrIujgHOq6ofbGry2DyaIpt8XgVuT/HGS+6S7rfnjkvwcQJJfT7KijaqZWUjzTroG7E66uafDuCdwr/a825M8h83X6IFuHuovAb/N5tOxzqEbjfPz6e4u80a2vvE7NsnebbrX64Gz5qkzk8X+oyT3SHIA8NwW12xnAy9L8pg24mm+dXFOB/4I+Gm6+bdLcSbwB0ke1hrsmXWL5r3z2VY6Hvj5JH+R5MEASR6Z7jb28y2++vfAvenu7rMlPwH8XjuWLwJ+ii4pOEdVXQv8O/D/Jrl3kp+hu7LzD9vw97wbeEvawnpJViQ5bBteR1I//T3dCfy56RYS/bF0i4a+LsnBsyu3ixlnAb83e9+ArW1nd6G74nxT67c262PaUP0P0i2U/SC6hBFVdQddf/PGNgrpJ9n6qbpDt92SNMXG9R1na32Ubk3Po1o7e490N5b5qXna8Mew5XXkdqEbgf/DJPvTjShdioOTPK1933ozcFE7R79Li/FsunPtXdr59muY/5z9Y8Bj092sZ2e6fvLBs+r8A/ACuiTRnPVateMxQTTl2of8UGAV8A26rPB76RYegy4JcHmS79Et5nZEm8d6G92aCP/Whkc+OcnTW7353udWukbhbLpFg3+NWUMI2xSszwM/z0ACp6oup1t4+AN0mejv0a2R8COAJC9Jcvkif+oZdGseXUO3QNqcdXTa6JjnAs9px+GdwEur6qvz1P0nunm0nwSuar9nO4823akdr6U4he5Ly2fp/p1+SHdMlqyqrqa7hfNKun/rm+lGe60Fbp2n/h10624stmjpF+gWzf4O3f+VF1bVlqYGHtli+BbdsTu+qv5la/6W5u10/7c+nuRW4CK6RVclaVFt7YVnAV+lS7zcQvdFY3e6dm0+fwosOOJoa9tZuv7lPnTt50XAP89T54wW5wdnXSx4FV0f/m26fuNMWn8JkO4OnS9ZKFaGbLu31OdL0qSN6zvONsR1K90F8SPoznm/TTeC6V6tyqvopoF9GziVbqHrhfwO8KftfPcN3L3Q9rY6g+6CxI3AE9l8AelBv0u33uk1dGv6nUH3XWUzVfUd4EXACXRTrfcD/m1WnWuBL9GN3PrXJcav7UA2n8YuLV0bQXMTsF9VfWOI+uuA39zGZMOSJLkaeMUk3nuSkryM7pg/bdKxSFKfJXkr8OCqWvRuZrbdktRPSU4F1lfVnDs9j+G9T6G7Qc/Y31vj5wgiLYskz21DLe9Lt47RV4B1k41qy5L8Kl02fL7RRZIkLbs2Le5n0tmfbrrueZOOS5Kk2ZKspFuz9eTJRqJxMUGk5XIY3TDMb9ENTzyipnh4WpJPA+8Cjp11VzRJkkZpF7o1LL5PN137bcCHJxqRJEmzJHkzcBnwF8PMCtGOwSlmkiRJkiRJPecIIkmSJEmSpJ4zQSRJkiRJktRzO086AIDdd9+9Vq5cOekwJGkqXXzxxd+pqhWTjmOS7CckaX72ER37CUma39b0E1ORIFq5ciVr166ddBiSNJWSfHPSMUya/YQkzc8+omM/IUnz25p+wilmkiRJkiRJPWeCSJIkSZIkqedMEEmSJEmSJPWcCSJJkiRJkqSeM0EkSZIkSZLUcyaIJEmSJEmSes4EkSRJkiRJUs+ZIJIkSZIkSeo5E0SSJEmSJEk9Z4JIkiRJkiSp50wQSZIkSZIk9dzOkw5g2qw87mND1Vt3wiEjjkSSNI3sJyRpx2K7LkkdRxBJkpYkySlJNia5bKDsrCSXtJ91SS5p5SuT/GBg37snF7kkSZKkGY4gkiQt1anAO4DTZwqq6sUz20neBtw8UP/qqlo1tugkSZIkLcoEkSRpSarqs0lWzrcvSYDDgWeOMyZJkiRJW8cpZpKkUXo6cH1VfX2g7GFJvpzkM0mevtATk6xJsjbJ2k2bNo0+UkmSJKnHTBBJkkbpSODMgccbgH2r6vHAa4Azktx/vidW1UlVtbqqVq9YsWIMoUqSJEn9ZYJIkjQSSXYGfgU4a6asqn5UVTe07YuBq4FHTSZCSZIkSTNMEEmSRuVZwFerav1MQZIVSXZq2w8H9gOumVB8kiRJkhoTRJKkJUlyJvB54NFJ1ic5pu06gs2nlwE8A7i03fb+HOCVVXXj+KKVJEmSNB/vYiZJWpKqOnKB8pfNU3YucO6oY5IkSZK0dRYdQZTklCQbk1w2UHZWkkvaz7p2JZgkK5P8YGDfu0cZvCRJkiRJkpZumBFEpwLvAE6fKaiqF89sJ3kbcPNA/auratVyBShJkiRJkqTRWjRBVFWfTbJyvn1JAhwOPHN5w5IkSZIkSdK4LHWR6qcD11fV1wfKHpbky0k+k+TpS3x9SZIkSZIkjdhSF6k+ks3vULMB2LeqbkjyROAfkzy2qm6Z/cQka4A1APvuu+8Sw5AkSZIkSdK22uYRREl2Bn4FOGumrKp+VFU3tO2LgauBR833/Ko6qapWV9XqFStWbGsYkiRJkiRJWqKlTDF7FvDVqlo/U5BkRZKd2vbDgf2Aa5YWoiRJkiRJkkZpmNvcnwl8Hnh0kvVJjmm7jmDz6WUAzwAubbe9Pwd4ZVXduJwBS5IkSZIkaXkNcxezIxcof9k8ZecC5y49LEmSJEk7kjbTYC1wXVUdmuRhwAeA3YCLgaOq6r+S3As4HXgicAPw4qpaN6GwJak3lnoXM0mSJEkaxquBKwcevxU4saoeCXwXmJmpcAzw3VZ+YqsnSRoxE0SSJEmSRirJ3sAhwHvb4wDPpFuWAuA04Plt+7D2mLb/wFZfkjRCJogkSZIkjdpfA38E3Nke7wbcVFW3t8frgb3a9l7AtQBt/82tviRphEwQSZIkSRqZJIcCG6vq4mV+3TVJ1iZZu2nTpuV8aUnqJRNEkiRJkkbpqcDzkqyjW5T6mcDbgV2TzNw0Z2/gurZ9HbAPQNv/ALrFqjdTVSdV1eqqWr1ixYrR/gWS1AMmiCRJkiSNTFW9tqr2rqqVwBHAJ6vqJcCngBe2akcDH27b57fHtP2frKoaY8iS1EsmiCRJkiRNwh8Dr0lyFd0aQye38pOB3Vr5a4DjJhSfJPXKzotXkSRJkqSlq6pPA59u29cA+89T54fAi8YamCTJEUSSJEmSJEl9Z4JIkiRJkiSp50wQSZIkSZIk9ZwJIkmSJEmSpJ4zQSRJkiRJktRzJogkSZIkSZJ6zgSRJEmSJElSz+086QAkSZIkSdNl5XEfG6reuhMOGXEkksbFEUSSJEmSJEk9Z4JIkiRJkiSp50wQSZKWJMkpSTYmuWyg7I1JrktySfs5eGDfa5NcleRrSX55MlFLkiRJGmSCSJK0VKcCB81TfmJVrWo/FwAkeQxwBPDY9px3JtlpbJFKkiRJmpcJIknSklTVZ4Ebh6x+GPCBqvpRVX0DuArYf2TBSZIkSRqKCSJJ0qi8KsmlbQraA1vZXsC1A3XWtzJJkiRJE2SCSJI0Cu8CHgGsAjYAb9vaF0iyJsnaJGs3bdq03PFJkiRJGmCCSJK07Krq+qq6o6ruBN7D3dPIrgP2Gai6dyub7zVOqqrVVbV6xYoVow1YkiRJ6jkTRJKkZZdkz4GHLwBm7nB2PnBEknsleRiwH/DFcccnSZIkaXM7TzoASdL2LcmZwAHA7knWA8cDByRZBRSwDngFQFVdnuRs4ArgduDYqrpjEnFLkiRJutuiCaIkpwCHAhur6nGt7I3AbwEzi0K8buAWxq8FjgHuAH6vqv7PCOKWJE2JqjpynuKTt1D/LcBbRheRJEmSpK01zBSzU4GD5ik/sapWtZ+Z5NBjgCOAx7bnvDPJTssVrCRJkiRJkpbfogmiqvoscOOQr3cY8IGq+lFVfQO4irsXJpUkSZIkSdIUWsoi1a9KcmmSU5I8sJXtBVw7UGd9K5MkSZIkSdKU2tYE0buARwCrgA3A27b2BZKsSbI2ydpNmzYt/gRJkiRJkiSNxDYliKrq+qq6o6ruBN7D3dPIrgP2Gai6dyub7zVOqqrVVbV6xYoV2xKGJEmSJEmSlsE2JYiS7Dnw8AXAZW37fOCIJPdK8jBgP+CLSwtRkiRJkiRJozTMbe7PBA4Adk+yHjgeOCDJKqCAdcArAKrq8iRnA1cAtwPHVtUdowldkiRJkiRJy2HRBFFVHTlP8clbqP8W4C1LCUqSJEmSJEnjs5S7mEmSJEmSJGkHYIJIkiRJkiSp50wQSZIkSZIk9ZwJIkmSJEmSpJ4zQSRJkiRJktRzJogkSZIkSZJ6zgSRJEmSJElSz5kgkiRJkiRJ6jkTRJIkSZIkST1ngkiSJEmSJKnnTBBJkiRJkiT1nAkiSZIkSZKknjNBJEmSJEmS1HMmiCRJkiRJknrOBJEkSZIkSVLPmSCSJEmSJEnqORNEkiRJkiRJPWeCSJIkSZIkqedMEEmSJEmSJPWcCSJJ0pIkOSXJxiSXDZT9RZKvJrk0yXlJdm3lK5P8IMkl7efdk4tckiRJ0gwTRJKkpToVOGhW2YXA46rqZ4D/C7x2YN/VVbWq/bxyTDFKkiRJ2gITRJKkJamqzwI3zir7eFXd3h5eBOw99sAkSZIkDc0EkSRp1H4D+KeBxw9L8uUkn0ny9EkFJUmSJOluJogkSSOT5PXA7cD7W9EGYN+qejzwGuCMJPdf4LlrkqxNsnbTpk3jCViStOyS3DvJF5P8Z5LLk7yplT8syReSXJXkrCT3bOX3ao+vavtXTjJ+SeoLE0SSpJFI8jLgUOAlVVUAVfWjqrqhbV8MXA08ar7nV9VJVbW6qlavWLFiTFFLkkbgR8Azq+pngVXAQUmeDLwVOLGqHgl8Fzim1T8G+G4rP7HVkySNmAkiSdKyS3IQ8EfA86rqtoHyFUl2atsPB/YDrplMlJKkcajO99rDe7SfAp4JnNPKTwOe37YPa49p+w9MkjGFK0m9tWiCyNsXS5K2JMmZwOeBRydZn+QY4B3ALsCFs/qDZwCXJrmE7qT/lVV147wvLEnaYSTZqbX9G+nudHk1cNPADQ3WA3u17b2AawHa/puB3cYbsST1z85D1DmV7kT/9IGyC4HXVtXtSd5Kd/viP277rq6qVcsapSRpalXVkfMUn7xA3XOBc0cbkSRp2lTVHcCqdmH5POAnl/qaSdYAawD23Xffpb6cJPXeoiOIvH2xJEmSpOVQVTcBnwKeAuyaZOaC9d7AdW37OmAfgLb/AcAN87yWa9VJ0jJajjWIvH2xJEmSpHm19edmlqS4D/Bs4Eq6RNELW7WjgQ+37fPbY9r+T87c7ECSNDrDTDFb0BZuX3xDkicC/5jksVV1yzzPdUioJEmStOPbEzit3aTgx4Czq+qjSa4APpDkz4Avc/f05JOBv09yFd1MhiMmEbQk9c02J4gGbl984ODti+luY0lVXZxk5vbFa2c/v6pOAk4CWL16tVcEJEmSpB1QVV0KPH6e8muA/ecp/yHwojGEJkkasE1TzLx9sSRJkiRJ0o5j0RFE7fbFBwC7J1kPHE9317J70d2+GOCiqnol3e2L/zTJfwN34u2LJUmSJEmSpt6iCSJvXyxJkiRJkrRjW467mEmSJEmSJGk7ZoJIkiRJkiSp50wQSZIkSZIk9ZwJIkmSJEmSpJ4zQSRJkiRJktRzJogkSZIkSZJ6zgSRJEmSJElSz5kgkiRJkiRJ6jkTRJIkSZIkST1ngkiSJEmSJKnnTBBJkiRJkiT1nAkiSZIkSZKknjNBJEmSJEmS1HMmiCRJkiRJknrOBJEkSZIkSVLPmSCSJEmSJEnqORNEkiRJkiRJPWeCSJIkSZIkqedMEEmSJEmSJPWcCSJJkiRJkqSeM0EkSZIkSZLUcyaIJElLkuSUJBuTXDZQ9qAkFyb5evv9wFaeJH+T5KoklyZ5wuQilyRJkjTDBJEkaalOBQ6aVXYc8Imq2g/4RHsM8Bxgv/azBnjXmGKUJEmStAUmiCRJS1JVnwVunFV8GHBa2z4NeP5A+enVuQjYNcme44lUkiRJ0kJMEEmSRmGPqtrQtr8N7NG29wKuHai3vpXNkWRNkrVJ1m7atGl0kUqSJEkyQSRJGq2qKqC24XknVdXqqlq9YsWKEUQmSZIkacZQCSIXIJUkbaXrZ6aOtd8bW/l1wD4D9fZuZZIkSZImaNgRRKfiAqSSpOGdDxzdto8GPjxQ/tJ2MeHJwM0DU9EkSZIkTchQCSIXIJUkLSTJmcDngUcnWZ/kGOAE4NlJvg48qz0GuAC4BrgKeA/wOxMIWZIkSdIsOy/huVu7AKlXiCVpB1RVRy6w68B56hZw7GgjkiRJkrS1lmWR6m1ZgNS700iSJEmSJE2HpSSIlrQAqXenkSRJkiRJmg5LSRC5AKkkSZIkSdIOYKg1iNoCpAcAuydZDxxPt+Do2W0x0m8Ch7fqFwAH0y1Aehvw8mWOWZIkSZIkSctoqASRC5BKkiRJkiTtuJZlkWpJkiRJkiRtv0wQSZIkSZIk9ZwJIkmSJEmSpJ4zQSRJkiRJktRzQy1SLUmSts7K4z42VL11Jxwy4kgkSZKkxTmCSJIkSZIkqedMEEmSJEmSJPWcCSJJkiRJkqSeM0EkSZIkSZLUcyaIJEmSJEmSes4EkSRJkiRJUs+ZIJIkSZIkSeo5E0SSJEmSJEk9Z4JIkiRJ0sgk2SfJp5JckeTyJK9u5Q9KcmGSr7ffD2zlSfI3Sa5KcmmSJ0z2L5CkfjBBJEmSJGmUbgf+sKoeAzwZODbJY4DjgE9U1X7AJ9pjgOcA+7WfNcC7xh+yJPWPCSJJkiRJI1NVG6rqS237VuBKYC/gMOC0Vu004Plt+zDg9OpcBOyaZM8xhy1JvWOCSJIkSdJYJFkJPB74ArBHVW1ou74N7NG29wKuHXja+lY2+7XWJFmbZO2mTZtGFrMk9cXOkw5AkiRt/1Ye97Gh6q074ZARRyJpWiW5H3Au8PtVdUuSu/ZVVSWprXm9qjoJOAlg9erVW/VcSdJcjiCSJEmSNFJJ7kGXHHp/VX2oFV8/M3Ws/d7Yyq8D9hl4+t6tTJI0QiaIJEmSJI1MuqFCJwNXVtVfDew6Hzi6bR8NfHig/KXtbmZPBm4emIomSRoRp5hJkiRJGqWnAkcBX0lySSt7HXACcHaSY4BvAoe3fRcABwNXAbcBLx9vuJLUTyaIJEmSJI1MVX0OyAK7D5ynfgHHjjQoSdIcJogkSSOR5NHAWQNFDwfeAOwK/BYwc8uZ11XVBWMOT5IkSdIAE0SSpJGoqq8BqwCS7ES3wOh5dFMFTqyqv5xgeJIkSZIGuEi1JGkcDgSurqpvTjoQSZIkSXOZIJIkjcMRwJkDj1+V5NIkpyR54KSCkiRJktTZ5gRRkkcnuWTg55Ykv5/kjUm+scBIAAAQ5klEQVSuGyg/eDkDliRtX5LcE3ge8MFW9C7gEXTTzzYAb1vgeWuSrE2ydtOmTfNVkSRJkrRMtjlBVFVfq6pVVbUKeCLdLSjPa7tPnNnnwqOS1HvPAb5UVdcDVNX1VXVHVd0JvAfYf74nVdVJVbW6qlavWLFijOFKkiRJ/bNcU8xcW0KStJAjGZhelmTPgX0vAC4be0SSJEmSNrNcCSLXlpAkzZHkvsCzgQ8NFP95kq8kuRT4ReAPJhKcJEmSpLssOUHk2hKSpIVU1ferarequnmg7Kiq+umq+pmqel5VbZhkjJIkSZKWZwSRa0tIkiRJkiRtx3ZehteYs7bEwNVg15aQJGkKrTzuY0PVW3fCISOORJIkSdNgSQmigbUlXjFQ/OdJVgEFrJu1T9spv0hIkiRJkrTjWlKCqKq+D+w2q+yoJUUkSZKmxrAXCCRJkrR9W667mEmSJEmSJGk7tRxrEEmSJEmSJFyeQ9svE0Q7KBslSZIkSZI0LKeYSZIkSZIk9ZwJIkmSJEmSpJ4zQSRJkiRJktRzJogkSZIkSZJ6zgSRJEmSJElSz5kgkiRJkiRJ6jlvc99zK4/72KRDkCRJkiRJE+YIIkmSJEmSpJ4zQSRJkiRJktRzTjHbRsNOzVp3wiEjjkSSJEmSJGlpTBBpIkywSZIkSZI0PUwQSZKksfECgSRJ0nRyDSJJkiRJkqSeM0EkSZIkSZLUc04xkyRpBzLsFK5pN8zf4TQ0SZKk5eMIIkmSJEmSpJ7rzQiiHeWKqiRJkiRJ0nLrTYJoR2GiS5IkSZIkLTenmEmSJEmSJPWcCSJJkiRJkqSec4qZlpVT4CQNSrIOuBW4A7i9qlYneRBwFrASWAccXlXfnVSMkiRJkhxBJEkavV+sqlVVtbo9Pg74RFXtB3yiPZYkSZI0QSaIJEnjdhhwWts+DXj+BGORJEmSxDJMMXP6gCRpCwr4eJIC/ndVnQTsUVUb2v5vA3tMLLrtiFN4JUmSNErLNYLI6QOSpPk8raqeADwHODbJMwZ3VlXRJZHmSLImydokazdt2jSGUCVJkqT+GtUUM6cPSJKoquva743AecD+wPVJ9gRovzcu8NyTqmp1Va1esWLFuEKWJEmSemk5EkQz0wcuTrKmlTl9QJJ6Lsl9k+wysw38EnAZcD5wdKt2NPDhyUQoSZIkacZy3Ob+aVV1XZKfAC5M8tXBnVVVbe2JzbRk0hqAfffddxnCkCRNmT2A85JA19+cUVX/nOQ/gLOTHAN8Ezh8gjFKkiRJYhkSRIPTB5JsNn2gqjYsNH2gLVR6EsDq1avnXX+iT1x8VNKOpqquAX52nvIbgAPHH5EkSZKkhSxpipnTByRJkiRJkrZ/Sx1B5PQBSZIkSZKk7dySEkROH5AkSTuKYad7rzvhkBFHIkmSNH6jus29JEmSJEmSthMmiCRJkiRJknpuOW5zL0nSds+7SUrSaCQ5BTgU2FhVj2tlDwLOAlYC64DDq+q76RY3fTtwMHAb8LKq+tIk4pakvnEEkSRJkqRROhU4aFbZccAnqmo/4BPtMcBzgP3azxrgXWOKUZJ6zwSRJEmSpJGpqs8CN84qPgw4rW2fBjx/oPz06lwE7Jpkz/FEKkn9ZoJIkiRJ0rjtUVUb2va3gT3a9l7AtQP11rcySdKImSCSJEmSNDFVVUBt7fOSrEmyNsnaTZs2jSAySeoXE0SSJEmSxu36malj7ffGVn4dsM9Avb1b2RxVdVJVra6q1StWrBhpsJLUByaIJEmSJI3b+cDRbfto4MMD5S9N58nAzQNT0SRJI+Rt7jXVhr3t9LoTDhnra0mSJGk4Sc4EDgB2T7IeOB44ATg7yTHAN4HDW/UL6G5xfxXdbe5fPvaAJamnTBBJkqTtkol/aftQVUcusOvAeeoWcOxoI5IkzccEkSRJ2qENm0iSJEnqM9cgkiRJkiRJ6jkTRJIkSZIkST3nFDPtEJw+IEmSJEnStnMEkSRJkiRJUs+ZIJIkSZIkSeo5E0SSJEmSJEk95xpEkiRNkGuoSZIkaRqYIBoxT/wlSZIkSdK0M0EkSZK0FYa9+LPuhENGHIkkSdLyMUEkzeKJvyRJkiSpb1ykWpIkSZIkqedMEEmSJEmSJPWcCSJJkiRJkqSeM0EkSRqJJPsk+VSSK5JcnuTVrfyNSa5Lckn7OXjSsUqSJEl9t80JIk/8JUmLuB34w6p6DPBk4Ngkj2n7TqyqVe3ngsmFKEmSJAmWdhezmRP/LyXZBbg4yYVt34lV9ZdLD0+StL2qqg3AhrZ9a5Irgb0mG5UkSZKk+WzzCKKq2lBVX2rbtwKe+EuS5pVkJfB44Aut6FVJLk1ySpIHLvCcNUnWJlm7adOmMUUqSZIk9dOyrEG0LSf+kqR+SHI/4Fzg96vqFuBdwCOAVXQjjN423/Oq6qSqWl1Vq1esWDG2eCVJkqQ+WnKCaFtP/L0yLEk7viT3oOsj3l9VHwKoquur6o6quhN4D7D/JGOUJEmStMQE0VJO/L0yLEk7tiQBTgaurKq/Gijfc6DaC4DLxh2bJEmSpM1t8yLVWzrxbwuTgif+ktRnTwWOAr6S5JJW9jrgyCSrgALWAa+YTHiSJEmSZizlLmae+EuSFlRVnwMyzy5vay9JkiRNmW1OEHniL0mSJEmStGNYlruYSZIkSZIkaftlgkiSJEmSJKnnlrIGkdRrK4/72FD11p1wyIgjkSRJkiRpaRxBJEmSJEmS1HOOIJIkSRoBR5pKkqTtiSOIJEmSJEmSem67H0E07NU5SZIkSZIkzc8RRJIkSZIkST233Y8gkiRpSxxpKkmSJC3OBJEkSdJ2wEWvJUnSKDnFTJIkSZIkqedMEEmSJEmSJPWcCSJJkiRJkqSeM0EkSZIkSZLUcyaIJEmSJEmSes4EkSRJkiRJUs95m3tpSnj7YkmSJEnSpJggkrYzJpIkSZIkScvNKWaSJEmSJEk9Z4JIkiRJkiSp50wQSZIkSZIk9ZxrEEkjNuyaQZKkflrufmK516pz7TtJkvrBEUSSJEmSJEk95wgiSZIkSZI0Uo5InX4miKQdlFMMJEmSJEnDMkEkSZLUQ66RJ0mSBo0sQZTkIODtwE7Ae6vqhFG9l6Rt5xcETYr9hCRpIfYRkjR+I0kQJdkJ+Dvg2cB64D+SnF9VV4zi/SRtf3aUKWs7yt8xbvYT0o5nuS847Cjtpv3E1rOPkKTJGNUIov2Bq6rqGoAkHwAOA2zUpR3ctI9I8kR9athPSNqiSfUntv9TwT5CUi9M23eTUd3mfi/g2oHH61uZJElgPyFJWph9hCRNwMQWqU6yBljTHn4vydcmFcsidge+M+kgppDHZS6PyfyWdFzy1mWMZILvO+v1tvaYPHRZg9lObEf9xCDbgbk8JvPzuMw1FcdkyvqdYY5JL/sIWNZ+Yqj/e2P+vzEVn4d5zIlrUp+ZAdN4rKbx/xRsJ8fK/1Nz5a1LimnofmJUCaLrgH0GHu/dyu5SVScBJ43o/ZdNkrVVtXrScUwbj8tcHpP5eVzm8pgAO1A/Mch/27k8JvPzuMzlMZmrx8dk0T4Clq+fmMbjPI0xwXTGZUzDm8a4jGk444ppVFPM/gPYL8nDktwTOAI4f0TvJUna/thPSJIWYh8hSRMwkhFEVXV7klcB/4fu1pSnVNXlo3gvSdL2x35CkrQQ+whJmoyRrUFUVRcAF4zq9cdou5reMEYel7k8JvPzuMzlMWGH6icG+W87l8dkfh6XuTwmc/X2mIy5j5jG4zyNMcF0xmVMw5vGuIxpOGOJKVU1jveRJEmSJEnSlBrVGkSSJEmSJEnaTpggWkCSfZJ8KskVSS5P8upJxzRpSe6d5ItJ/rMdkzdNOqZpkmSnJF9O8tFJxzINkqxL8pUklyRZO+l4pkWSXZOck+SrSa5M8pRJx6Sls8+Yyz5jYfYXc9lnzGV/sbyGaafT+ZskVyW5NMkTpiCmA5Lc3D4blyR5w4hjWrTtTnKvJGe14/SFJCtHGdNWxPWyJJsGjtVvjjqu9r4LtumTOFZDxDSp47TFdn7cn78hYxrr56+95xbb/lEfp5GtQbQDuB34w6r6UpJdgIuTXFhVV0w6sAn6EfDMqvpeknsAn0vyT1V10aQDmxKvBq4E7j/pQKbIL1bVdyYdxJR5O/DPVfXCdHdm+fFJB6RlYZ8xl33Gwuwv5mefsTn7i+U1TDv9HGC/9vMk4F3t9yRjAvjXqjp0hHEMGqbtPgb4blU9MskRwFuBF09BXABnVdWrRhzLbFtq0ydxrBaLCSZznGDL7fy4P3/DxATj/fzB4m3/SI+TI4gWUFUbqupLbftWug/YXpONarKq87328B7tx0WsgCR7A4cA7510LJpeSR4APAM4GaCq/quqbppsVFoO9hlz2WfMz/5Cw7C/WH5DttOHAae39usiYNcke044prEasu0+DDitbZ8DHJgkUxDX2A3Rpo/9WG3H/cxYP3/TaMi2f6THyQTRENpQwMcDX5hsJJPXhiteAmwELqyq3h+T5q+BPwLunHQgU6SAjye5OMmaSQczJR4GbALe14b9vjfJfScdlJaXfcbd7DPmZX8xP/uMzdlfjNAW2um9gGsHHq9nTAmbRfqOp7SpVf+U5LFjiGWxtvuu41RVtwM3A7tNQVwAv9qm3ZyTZJ9Rx8TibfokjtUw/cy4jxMs3s5P4vM3TN8zzs/fMG3/SI+TCaJFJLkfcC7w+1V1y6TjmbSquqOqVgF7A/snedykY5q0JIcCG6vq4knHMmWeVlVPoBsGeWySZ0w6oCmwM/AE4F1V9Xjg+8Bxkw1Jy8k+Y3P2GZuzv9gi+4zN2V+MyDS204vE9CXgoVX1s8DfAv846nimte0eIq6PACur6meAC7l75M5ITGObPmRMYz1OA6axnV8spnF//ibe9psg2oI2v/Vc4P1V9aFJxzNN2lC3TwEHTTqWKfBU4HlJ1gEfAJ6Z5B8mG9LkVdV17fdG4Dxg/8lGNBXWA+sHrnidQ9cJaAdgn7Ew+4y72F8swD5jDvuLERiinb4OGBxNsXcrm1hMVXXLzNSqqroAuEeS3UcZ08B7L9R233WckuwMPAC4YRwxbSmuqrqhqn7UHr4XeOKIQxmmTR/3sVo0pgkcp5n3XaydH/vnb7GYJvD5G6btH+lxMkG0gDY39GTgyqr6q0nHMw2SrEiya9u+D/Bs4KuTjWryquq1VbV3Va0EjgA+WVW/PuGwJirJfdtii7Rhkb8EXDbZqCavqr4NXJvk0a3oQKDPixjvMOwz5rLPmMv+Yn72GXPZXyy/Idvp84GXpvNk4Oaq2jDJmJI8eGbNmiT7031/G1mCYci2+3zg6Lb9Qrq2bKTrAQ0T16x1WJ5Ht6bTyAzZpo/1WA0T07iPU3vPYdr5cX/+Fo1p3J+/Idv+kR4n72K2sKcCRwFfaXNdAV7XMod9tSdwWpKd6D4cZ1eVt+jVfPYAzmvt6c7AGVX1z5MNaWr8LvD+dHcluAZ4+YTj0fKwz5jLPkPDss+Yn/3F8pq3nQb2BaiqdwMXAAcDVwG3MfpjPkxMLwR+O8ntwA+AI0acjJm37U7yp8DaqjqfLqn190muAm6kS0SM2jBx/V6S59HdHe5G4GVjiGuOKThWi8U0ieM0bzuf5JUwsc/fMDGN+/MH87T94zxOGf3fJ0mSJEmSpGnmFDNJkiRJkqSeM0EkSZIkSZLUcyaIJEmSJEmSes4EkSRJkiRJUs+ZIJIkSZIkSeo5E0SSJEmSJEk9Z4JIkiRJkiSp50wQSZIkSZIk9dz/D24mQN1DAO6HAAAAAElFTkSuQmCC\n", + "text/plain": [ + "<Figure size 1440x360 with 3 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEKCAYAAAARnO4WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3X2clXWd//HX55wzDMi9gBjgCC4/1wABa5RclDbNtryhWslKy1W3tX67P1ZNozVLsx7tbppbamVrWmpLKUKKlFmaN0mJCTogN+5KKQoq6iwio3iGmfn8/riug2eGM2fO3TXnnLnez8djHpxznevmMwf4nO/5Xt/v52vujoiIDHyJagcgIiL9QwlfRCQmlPBFRGJCCV9EJCaU8EVEYkIJX0QkJpTwRURiQglfRCQmlPBFRGIiVe0Aso0dO9YnT55c7TBEROrGmjVrXnX3cYXsW1MJf/LkyaxevbraYYiI1A0z21LovurSERGJiUgTvpmNMrOlZvaUmW0ys6OjvJ6IiPQu6i6dq4F73H2BmQ0C9ov4eiIi0ovIEr6ZjQTmAWcBuHs70B7V9UREJL8ou3SmAK8APzazJ8zsBjMbGuH1REQkjygTfgp4F3Cdux8BvAH8S8+dzOxcM1ttZqtfeeWVCMMREYm3KBP+VmCruz8aPl9K8AHQjbtf7+7N7t48blxBQ0lFROpCa1uatc+/Rmtbuqx9KiWyPnx3f8nMnjezv3T3/waOBzZGdT0RkVqyvGUbX1y2joZEgj1dXVxx6kzmz55Y9D6VFPU4/IXAYjNbB8wG/jXi64mIVF1rW5ovLlvHW3u62JXu4K09XSxatq5bK76QfSot0mGZ7t4CNEd5DRGRWrN1x24aEgneomvvtoZEgq07djNmWGPB+1SaZtqKiFTYpNFD2NPV1W3bnq4uJo0eUtQ+laaELyJSYWOGNXLFqTMZ3JBgeGOKwQ0Jrjh1ZreWeyH7VJq5e2QnL1Zzc7OreJqIDBStbWm27tjNpNFDek3kheyTj5mtcfeCus5rqlqmiMhAMmZYY59JvJB9KkVdOiIiMaGELyISE0r4IiIxoYQvIhITSvgiIjGhhC8iEhNK+CIiMaGELyISE0r4IiIxoYQvIhITSvgiIjGhhC8iEhNK+CIiof5cX7YaVC1TRIT+X1+2GtTCF5HYq8b6stWghC8isZdZXzZbZn3ZgUQJX0Rirxrry1aDEr6IxF411petBt20FREB5s+eyNypY8taX7bWKeGLiIT6c33ZalCXjohITCjhi4jEhBK+iNSNgT4TNmrqwxeRuhCHmbBRUwtfRGpeXGbCRk0JX0RqXlxmwkZNCV9Eal5cZsJGTQlfRGpeXGbCRk03bUWkLsRhJmzUlPBFpG4M9JmwUVOXjohITCjhi0jN04SrylCXjojUNE24qpxIW/hm9qyZPWlmLWa2OspricjAowlXldUfLfz3ufur/XAdEakTrW3pgkbbZCZcvcXbY/AzE65087Z46tIRkX5VTBeNJlxVVtQ3bR34jZmtMbNzI76WiNSozE3Xzdt3FdVFowlXlRV1C/8Yd99mZgcA95rZU+7+u+wdwg+CcwGampoiDkdE+lt2iz7d2YW5d3u9ry4aTbiqnEhb+O6+LfzzZeAO4Kgc+1zv7s3u3jxu3LgowxGRftbzpmt7Rxfpzu4Jv5AumjHDGpl10Cgl+zJFlvDNbKiZDc88Bj4ArI/qeiJSe3JVuRzckGBQ0tRFUwVRdumMB+4ws8x1furu90R4PRGpMbluugLc/c/H8kZ7p7po+llkCd/d/wzMiur8IlL7MjddF/UYlTN1/PBqhxZLGpYpIpHSTdfaoYQvIpFTlcvaoOJpIlKSzdt3sXT182zevqvaoUiB1MIXkaJdeueT3LLqub3PT5pxIF/7yAy14mucWvgiUpTN23d1S/YAv1z/Ekf/233c1bKtSlFJIZTwRaQoLc+/lnN7eyeqZFnjlPBFpFe5Fh6ZfdCoXvfPlEmQ2qQ+fBHJqbeqllPHD+fMo5u45ZHn9jlGlSxrm1r4IjFTyHKBfS088rUPH859F8zjyINHdzvutOZJunFbw5TwRWJkecs25n7zfj51w6PM/eb9vd5kzVUDp2d3zaPP/C+PbdnRbZ8lq7eqD7+GKeGLxESuVvtFt6/NOY6+r4VHWtvSXP6LjfsclzRTH34NU8IXiYlcrfb2TufEa1fu09Lva+GRrTt2Myhp+1xjT6f68GuZbtqKxERvlSvbO7q4aOk6pr1jRLeiZvlq4EwaPYSOLt/nXJedMl19+DVMLXyRmMi02nO1zNs7ujjxmoe5q2Vbt5u6vS08kv0NYGhjkkGpBN/46AzOeM/B/fXrSAnMfd9P6Wppbm721atXVzsMkQFt8/ZdnHjtSto79m3tpxKQTCQYlOx7gXEI+vJVBbO6zGyNuzcXsq9a+CIxM3X8cL61YCaDUvv+9+/ognRHYQuMg5YerDclJXwzO7DSgYhI/5k/eyJ3LzwmZ/dONs2cHVhKbeHfWNEoRKTfTR0/nG99bNbekTiNKaOhxweAZs4OLCWN0nH3kyodiIhUTq6+9Vzbeo7E+f3mV/dZjlDdNQNHrwnfzPbPd6C7/2/lwxGRcrS2pVn86HN874GnGZRM7k3aDjnr4kD31ai0HOHA1usoHTN7BnAgVyefu/shlQ5Go3RESre8ZRtfuH0t7Z3d/083phKAk+54e/vghgS//+JxSugDQDGjdHpt4bv7lMqFJCJRaW1Ls+GFnVxwawv7DrSEZMLADejcuy1zM1YJP14K6sM3s78FjiFo8T/s7ndGGpWIFCRTwriry3Mme4DOLif4r/s23YyNpz4Tvpl9H5gK/Czc9DkzO8Hd/ynSyEQkr+xiaL1pSMKVC2YC6GasFNTCPw54p4ed/WZ2M7Ah0qhEpE+ZYmhv9dq2h5995j00TxkDoJuxUlDC3ww0AVvC5weF20QkYvlKF0waPYS3Ojp7OTIok9CQSu59nj0aR+KpkIQ/HNhkZn8k6Ag8ClhtZncBuPv8COMTia3Fq7Zw+YoNNCSNPV1w2SnTOGPOwXs/BIYOSpKvFlYyYeqnl24KSfiXRh6FiHSzeNUWLrlzPcDeYZaX3LGetc+/xl1rX6AhkSDd2UXSjA5yJ/1LT1apYumuz4Tv7g/1RyAiEti8fRdfCZN9T0tWbwXI228PMLQxyYyJIysem9Q3VcsUqSHLW7Zx4jUr+0jnbxvckNin/g0EQzHVnSM9KeGL1IjMMMv2zkLTfeBX/3wsF55wKI0py7kcoUiGljgUiVAxC4RseOF1EjkrmeTWkDSuOHUmU8cPZ+H44Zw+p0nDLiWvkhK+mX3V3b9a4VhEBpTMLNhcBcty7bto6TrSOVah6k3CgrH1GRp2KX0ptUtnTUWjEBlgsmfB9rV6VGbfYpI9wKBkUouTSFFKrYe/otKBiAwkuWbB9lawbOuO3eRaeGpQ0vjpZ+aw/oXXaUwl+OqKDd0qXqoejhSrkFo61+TYvBNY7e7LKx+SSP2bNHoIe7q6t9h7S9Drt+3kjfZ9W/cNqQQNqSRnzQ0K1w5tTKkejpSlkBb+YOAw4Pbw+anAM8AsM3ufu58fVXAi9WrMsEauOHVmnwm6tS3N13+5Mec5eg6t1OIkUq5CEv5MYK67dwKY2XXAwwTlkp/s62AzSwKrgW3ufnIZsYrUvOxROfNnT2TaO0awcvMrjB02mKP/Ysw++/dWAG1QKvfQSt2YlXIUkvBHA8MIunEAhgL7u3unme17B2pf5wGbgBGlhShSH4KRNmtJWoKOrk7mHXoADzz1MpkFqAz4/AmHcvqcpr1JO1fXz6CkcffCY5g6fng//wYy0BUySucKoMXMfmxmNwFPAFea2VDgvnwHmtkk4CTghnIDFallrW1pLlzSQrrDeXNPJ+2dcN+mt5M9BJUHr7r3f/irf7+fu1q2AW93/QxuSOydNPWtj81SspdIFFJL50Yzu5ugSibAl9z9hfDxF/o4/DvAIoKKmzmZ2bnAuQBNTU19BixSix75UyuFjqpMdwRDNOdOHcuYYY3qm5d+02cL38xWAH8N3Ofuy7OSfV/HnQy87O55x+y7+/Xu3uzuzePGjSvk1CI1ZfGqLZx/W0tRx2SGaGaMGdbIrINGKdlLpArp0vkWcCyw0cyWmtkCMxtcwHFzgflm9ixwK3Ccmf1X6aGK1J5MGeOOrt7r0ufyVkenxtBLv+sz4bv7Q+7+j8AhwH8CpwEvF3Dcxe4+yd0nA58A7nf3T5UZr0hVtbalWfv8a7S2pWltS/daxrgv+RYuEYlKQTNtzWwIcArwceBdwM1RBiVSizK1cZJm7Ons4vSjmgouY9xTYyqZc9atSJQKmWm7hOCG7T3Ad4GH3L2of+fu/iDwYAnxiVTV5u27aHn+NSaP2Y9FS9d2K21w0yNb8hyZ355OlUWQ/ldIC/9G4JOZiVcicXHpnU9yy6rnIjn3Zado+UHpf4UMy/y1mc0ws2kEZRYy22+JNDKRKtq8fVfJyd6AOVNGs+qZHd227deYZE+n712MXKS/FdKlcxnBsMxpwN3Ah4CVgBK+DFgtz79W8rE3n3Mk8w49YG930OyDRjF66CCNs5eqK6RLZwEwC3jC3c82s/GAhlfKgHbvppdKPnbCyKBvfur44d1mzCrRS7UVkvB3u3uXmXWY2QiCIZkHRRyXSEUUs8RgZt9frt3Grzf0OfI4p8ENCd5o1+0uqU2FJPzVZjYK+CHBSldtwCORRiVSAcUsMXj9Q3/iil8/BQ4dZQ6R1+gbqVWF3LT9x/DhD8zsHmCEu6+LNiyR8mQvMZgpPbxo2TqmvWMEb7R3dmvxL1q6liWrt5Z0ndmTRvLU9l1alETqQlFLHLr7sxHFIVJRvdWZP/Gah2lMJWnv7OKkww9k8pj9Sk72SYMbzzpy7/V0Q1ZqXUlr2orUulx15t/aEzxv7+wA4OdPFFQHMK/fb36V+bMnKtFLXSikeJpI3elZZ35QKpFzofBydHrQTdTaVsg6QCLV12sL38yOBMa6+696bD8R2N5X2WORasuuM3//pu1cff/mil8jU+ZYLXypB/la+N8Ecq2uvAG4MppwRCprzLBGJo0ewnW/+3NFztfzS8KeLtXEkfqRrw9/uLvvUx3K3beY2dgIYxIpWa5x98ENXKO9iPMMShpmhjucPucgJo8ZyjFTx7LxxddZ1GOop1r3Ui/yJfzReV7br9KBiJSjtS3Ntb99mp+s2kJjKkEX8JWTpjFj4kiGDkqSLnT9QeCaT8zudcnBqeOHazlCqVv5Ev59ZvYN4MsertZgZgZcDtzfH8GJFGJ5yzYuuK2FzKJTb4ajcS65cz3JhOHuFLog1dLPvofmKWOA3kshjBnWqEQvdSlfwr8QuAHYbGaZBTtnAauBz0QdmEghWtvSfOH2tb0m9M4ilh4c3JCgIZWsUGQitafXhO/ubwCfNLNDgOnh5g3uXpm7XyIVsPjR52jvrNxygboBKwNZvmGZ78p6ui38c1Rmu7s/HmVgIn1pbUtz9X3/U/Z59mtI0oXrBqwMePm6dK7K85oDx1U4FpGCtbalWbH2Bcpt3DemEvzg0+9m+oQRSvYy4OXr0nlffwYi0pfMkMtVf27lqnv/h47OUpcQh8akYQnjilNnMu/QcRWMUqR25evSmZfnOHf3hyOIRyiuhntcZEodu3u3hcQLlQDOPPpgjnvneCaMHLxPxUyROMjXpfOFHNscmEmwAIqGM0SgmBrucZFd6rgUZxzVxOc/cKiSu8Revi6dU7Kfm9lc4MvAS8DCiOOKpd5quM+dOjaWySrzTef5/32zqOGV2RoSKNmLhApZxPx44CsErft/dfd7I48qpnLVcI9Lca6e3ViLV23h8hUb6Op0Oso472nNTQP+vRMpVL4+/JOAS4CdBLNtV/ZbVDGVq4Z7HIpzZXdjtXd2ctSU/Xn46daKnPvsuZMrch6RgSBftcwVwCSgA1hkZndl//RPePHSs4b74IbEgB8bnt2NtSvdQbrDK5bszzy6ianjh1fkXCIDQb4uHQ3LrILsGu5xGEXS21KEpRiUSnDd6Uew4809zD5olJK9SA/5Ev5GYJy7d6uJb2bTgFcijSrm4lSca9LoIexuL6eXHoY2JunsCmbKHj/twApFJjLw5Ev41wLfz7F9DMFondMjiUhi5aIlaylhWD1DUoab7S2BHIdvQyLlypfwp7r773pudPeHzey6CGOSGFj9TCt/f9Nj7Ex3Fn1syuDrHzmc9x12gJK8SBHyrniV57WGSgciA0/PoZatbWke+VMr/373RrbuLH3h70TClOxFSpAv4W82sxPd/e7sjWb2IUAlkiWvzFDLpBntnV28Z8r+/H5zawVuzcLC4/6Pkr1ICfIl/POBX5rZacCacFszcDRwctSBSf3KVQrh4c2VGWrZmEpw+pymipxLJG7ylVZ42swOJ7g5OyPc/BDwWXd/qz+Ck/qS6cLZuXtP3gkepdhvUIIuZ8DPSxCJUt7SCu6eBn7cT7FIHcueLftme0fZdeozGlMJLj1lGjMmaCSOSLnylVbYRVA/Z5+XCMojj4gsKqkbrW1pNrywk0VL15Lu8IpMoDKgIQkLjzuU0+eoFo5IpeTr0ilrmqKZDQZ+BzSG11nq7peVc06pLZlWvUFJNepzGZSEG/7uSKZPGKlEL1Jh+Vr4g4HPAVOBdcCP3L2YKZFp4Dh3bzOzBmClmf3K3VeVFbHUhHJr1PfUfQWqAypyThHpLl8f/s3AHuBh4ERgOnBeoSd2dwfawqcN4U+Fenalv/UcU7/hhddLrlGfbVDS+Oln5tCQSqqPXiRi+RL+NHc/HMDMbgT+WOzJzSxJMKRzKvA9d3+0pCilqnquwjV/1gR+/vg2OspI+I2pBGbBqJvmKWMqGK2I9CZfwt+TeeDuHWZW9MndvROYbWajgDvMbIa7r8/ex8zOBc4FaGrS+Opak2sVriWrt5Z8PgMuPvEw5kwZoxa9SD/Ll/Bnmdnr4WMDhoTPix6l4+6vmdkDwAeB9T1eux64HqC5uVldPjXmNxteorNCYywTBr85f57KFotUSb5ROmUtUm5m44A9YbIfApwAfLOcc0r0svvqz7v1CVZWYIZsKgHJhHHlgllK9iJV1OeatmV4B3Bz2I+fAJa4+y8ivJ6UaXnLNhYtDYZZdnR5WX30GY2pBD88s5npE0ao+0akyiJL+O6+DjgiqvNL5QSTp17n87etpdMr16vWmEpw5YKZzDt0XMXOKSKli7KFL3UgMwIH94ol+1QCzjtes2RFao0SfoxVevKUAZed8k5OmTVRiV6kBinhx0j2DVmAB556mVSi+OG2vWlsSCjZi9QwJfyYyJ489VZHJ52dTkPSSFdgyOV+g5J0uat0sUiNU8KPgVyTp4Cyk31jyrj05OlaRFykTijhx8DWHbuDln1FFhgM+upvPkcVLUXqjRJ+DEwaPYR0R/nJPgE0pIIJVKpoKVJ/lPAHuNa2NF+580naO0tP+Of8VRMnHj5BFS1F6pwS/gC2eNUWvrpiA3vK6Ks/duoYLp1/eAWjEpFqUcIfgFrb0lz726e56ZEtJZ9j7iH7c8EJh6p0scgAooQ/wCxetYVL79pQ8uIkSYOvfWQGZ8w5uMKRiUi1KeHXuezJVLf84Vmuvn9zyecy4MdnH6XaNyIDlBJ+HcueTPVmewflzqFqbEgwfULByxyISJ1Rwq9TvU2mKlVjKqGZsiIDnBJ+HWptS/PAUy/TUebY+iSQTMLC41TZUiQOlPDrzOJVW7h8xQbay+i/MeDiDx3GnEO0rqxInCjh15HFq7ZwyZ3r+94xh3OPncLIIQ0cPGYoR//FGCV5kRhSwq8Tm7fv4svLS0v273/nAXzppGkVjkhE6o0Sfg3KLDkIzoSRQ/jJqi3cXOIkqlTC+OapMysboIjUJSX8GrO8ZRsXLmmhArXOaEgaV31slrpvRARQwq8prW1pFi1dW1ayTxl88UOH8ZcHDlf5YhHpRgm/hmzdsZt0R3mzp8zgb981SYleRPaRqHYAEti8fRcLf7qm7PMMSiXZumN3BSISkYFGLfwqya6B87UVG1i+9sWiz5Ew6FkjrdN97yLlIiLZlPCrIFMDJ2nGG+2dRR9/2AFD+dLJ05g+YST3rH+Jy1dsoCGZoFMLiYtIHkr4/Sy7Bk6xPjh9PJ85Zkq3GvVnvOdgPjjjwL3fFpTsRaQ3Svj9bMMLr5dUA6cxZXzjo4fnTOhjhjUq0YtIn5Tw+0EwkWonv930Mj95ZEvRtS2TBlcu0Hh6ESmPEn7Elrds46Lb15a8rmzC4Nfnz2Pq+OEVjkxE4kYJP0KtbWkuXLKWjhKWGxzSkKDL4coFM5XsRaQilPAjsnn7Li6+48mik30qYVz+4enMmDBSN2FFpKKU8CustS3Nebe2sHLzq0UdZ8C1nzxCpYtFJDJK+GXI3IwFY7+GBD977HmWPb6t6PMkgO98YjYnz5pQ8RhFRDKU8EtUiaqWCeCz7z2Ezxx7iFr1IhI5JfwStLal+fxtLZSxyiDJhPHr847VDVkR6TcqnlaCnz++taxk35hK8O3TZinZi0i/Ugu/CK1taU77zz/wp1feLPkc/1ddOCJSJZElfDM7CLgFGA84cL27Xx3V9aK0+plWfvjwM/x64/aSz5FMGF/78HTOmHNwBSMTESlclC38DuBCd3/czIYDa8zsXnffGOE1K6q1Lc05Nz3G2q07Sz7Hv310BhNHD9HqUyJSdZElfHd/EXgxfLzLzDYBE4G6SPjLW7Zx4W0tlLoAVdLg2x+fzfzZEysbmIhIifqlD9/MJgNHAI/2x/XKlZk8VaqTDz+Qyz88Qy16EakpkSd8MxsGLAPOd/fXc7x+LnAuQFNTU9Th5JWZSHX2jx4r+RwNSVOyF5GaFGnCN7MGgmS/2N1/nmsfd78euB6gubm5vBW8y7C8ZRufX7KWzhIKnQEMThmOceUCrTglIrUpylE6BtwIbHL3/4jqOpXQ2pbm/FtbKCXVn/e+Qzhz7iFacUpEal6ULfy5wKeBJ80s0yH+JXe/O8JrFq21Lc3ffPuhopO9GVyddVNWiV5Eal2Uo3RWEhSBrDmtbWm27tjN+m07+cry9RTTi5MA/vWjMzhh+oFK8iJSV2I303bxqi1cdtd63Cm6PMKsicNZvnBeNIGJiEQsVgl/8aotXHLn+pKObUjAj86eU+GIRET6TywSfmtbmkf+9GrJyT6VgKtOm60uHBGpawM+4S9etYWv3LmeYsrWD07CiTMncOTk/Zk4ej+mTxihZC8idW9AJ/xSu3B+f/H7leBFZMAZkAk/6MJpLSnZf+MjmiUrIgPTgEv4y1u2cdHta9lTxBAcAxpSCS47ZZrKF4vIgDVgEn5QB+d1LlrSwp4i15m9+ZwjVb5YRAa8AZHwl7ds44vL1vFWEZm+IVzc8arTZjPv0AMiikxEpHbUfcJvbUuzaOk60h2FJfvGVIJLT57GjIkjVftGRGKl7hP+4kefKyjZJ4ALTjiU0+c0KcmLSCzVdcJvbUvzvQeezrtPAvisFg4XEanvhL91x24GJZOkOzpyvj570khuPOtIJXoREeo84U8aPYQ9Xd27c1IGZx8zmb+ZdiDNU8ZUKTIRkdqTqHYA5RgzrJErTp3J4IYEwxtTDG5I8B8fn80lJ01XshcR6aGuW/gA82dPZO7UsVpxSkSkD3Wf8CFo6SvRi4jkV9ddOiIiUjglfBGRmFDCFxGJCSV8EZGYUMIXEYkJJXwRkZgw98IXComamb0CbKl2HHmMBV6tdhA1SO9L7/Te9E7vTW7Fvi8Hu/u4QnasqYRf68xstbs3VzuOWqP3pXd6b3qn9ya3KN8XdemIiMSEEr6ISEwo4Rfn+moHUKP0vvRO703v9N7kFtn7oj58EZGYUAtfRCQmlPD7YGYHmdkDZrbRzDaY2XnVjqlWmNlgM/ujma0N35vLqx1TLTGzpJk9YWa/qHYstcTMnjWzJ82sxcxWVzueWmJmo8xsqZk9ZWabzOzoSp5/QJRHjlgHcKG7P25mw4E1Znavu2+sdmA1IA0c5+5tZtYArDSzX7n7qmoHViPOAzYBI6odSA16n7trDP6+rgbucfcFZjYI2K+SJ1cLvw/u/qK7Px4+3kXwH3hidaOqDR5oC582hD+6KQSY2STgJOCGasci9cHMRgLzgBsB3L3d3V+r5DWU8ItgZpOBI4BHqxtJ7Qi7LVqAl4F73V3vTeA7wCKgq68dY8iB35jZGjM7t9rB1JApwCvAj8OuwBvMbGglL6CEXyAzGwYsA85399erHU+tcPdOd58NTAKOMrMZ1Y6p2szsZOBld19T7Vhq1DHu/i7gQ8A/mdm8agdUI1LAu4Dr3P0I4A3gXyp5ASX8AoT908uAxe7+82rHU4vCr54PAB+sdiw1YC4w38yeBW4FjjOz/6puSLXD3beFf74M3AEcVd2IasZWYGvWt+SlBB8AFaOE3wczM4I+tU3u/h/VjqeWmNk4MxsVPh4CnAA8Vd2oqs/dL3b3Se4+GfgEcL+7f6rKYdUEMxsaDn4g7K74ALC+ulHVBnd/CXjezP4y3HQ8UNHBIRql07e5wKeBJ8O+aoAvufvdVYypVrwDuNnMkgSNhyXuriGIks944I6gHUUK+Km731PdkGrKQmBxOELnz8DZlTy5ZtqKiMSEunRERGJCCV9EJCaU8EVEYkIJX0QkJpTwRURiQglf8jKzs8xsQrXjyCeM8bt97PM5Mzszx/bJZlbUOHAz+5mZrTOzC4qNtcjrNJjZv5vZ02b2uJk9YmYfCl971syWZe27wMxuiiiOj4WVGx8ws2Yzuybc3uf7LrVF4/ClL2cRTIx5ocpxlMXdf1CJ85jZgcCR7j41x2spd++oxHVCXyeY6zDD3dNmNh54b9br7zazaeVWbg0nF5q791b35++Bf3D3leFzlTSuU2rhx5CZfSqsY99iZv8ZFkBLmtlNZrY+rFV+gZktAJoJJoK0hLNpezvnUWEL9Akz+0NmtqCZrTKz6Vn7PRi2EseZ2b1hHf0bzGyLmY3tI+4HzezqMJb1ZrbPlPywxX5/2AL/rZk1hdu/amYXhY/fHdbwXwv8U9axvzOz2VnQD22HAAAEa0lEQVTPV5rZrB6X+A0wMYzh2DCm74R13c/Lc/2bzOy68P34s5n9tZn9KGw535Tj99gP+AdgobunAdx9u7svydrtKuCSPt6zs8xseRjn02Z2Wdb79N9mdgvBB/pBZvbJ8O9+vZl9M9zvUuAY4EYzuzKMe5/JdeHf5zIzeyz8mZsvLqkSd9dPjH6AdwIrgIbw+feBM4F3E1S7zOw3KvzzQaC5gPOOAFLh4/cDy8LHFwCXh4/fAfx3+Pi7wMXh4w8SVFAc28c1HgR+GD6eB6wPH58FfDd8vAL4u/DxOcCd4eOvAheFj9cB88LHV2ad5++A74SPDwVW54hhcmb/rJi+n/W8t+vfRFBXx4APA68DhxM0utYAs3tcZybwRJ734lmCWaubgKnAAuCmHPudBbwIjAGGECT35vD36ALeE+43AXgOGEfwzf9+4CM9/w0Afw38Isf7/lOComgATQSlSKr+710/3X/Uwo+f4wmS+2NhqYjjgUMIpnEfYmbXmtkHCRJSMUYCt4f94d8GMq36JQTJCOA0goJQELQabwXwYGr9jgKv87PwmN8BIyys5ZPlaILkA/CT8Dp7hfuPCo/P7JNxO3CyBcXyziFI0oW4rcDrr/AgIz4JbHf3Jz3oRtlAkICL1UnwgXVxH/vd6+6t7r4b+HlWTFv87cVqjgQedPdXPOiWWkzwoVqo9wPfDf9N3UXwdzOsiOOlH6gPP34MuNnd90kSYffF3wCfI0jO5xRx3q8DD7j7Ry1YN+BBCCojmlmrmc0EPh6euxw9a4FUrDaIu79pZvcStMBPI/hgLMQbBe6XDv/synqced7z/+JmoMnMRnj+ctw/IUj4+W489/aeFRp3IRIE3xbequA5pcLUwo+f3wILzOwAADPb38wODvvPE+6+DPgyb5dl3QUML+C8I4Ft4eOzerx2G8FiICPdfV247fcESRUz+wAwusD4Px4ecwyw09139nj9DwQVKgHOAB7OftGDMs6vhcdn9sl2A3AN8Ji7F/qto+DrF8rd3ySo0nq1BYW0Mv3kH+ux3x6Cb1T5RgydEP49DwE+QvDe9/RH4L1mNtaCYnifBB4qIuTfEBT+Iox1dp59pUqU8GPGgxEdXyZYcWgdcC9B3/pE4MHwK/l/8XY3wU3ADzI3bc3sa2Y2P8eprwD+zcyeYN/W6lKCJJh9w/Fy4ANhF9DHgJcIPlwws7ut96Ggb4XX+AHB6JGeFgJnh7/bpwnWle3pbOB74e9q2S94sGjJ68CPe7l+Xwq5fqG+TLAC0sbwffoFubvabiT/t/U/EqznsI7g3so+o2zc/UWCxTYeANYCa9x9eRGx/jPQHN6s3kj53+QkAqqWKVVhZo1Ap7t3mNnRBKv85G0VmtmDBDdeIxsWGH7QPAgc5r0PU6wbZnYWwQ3X/1ftWKT61Icv1dIELDGzBNBOMASxqiyYmPUN4PMDIdmL9KQWvohITKgPX0QkJpTwRURiQglfRCQmlPBFRGJCCV9EJCaU8EVEYuL/AwAdTVsfmGjjAAAAAElFTkSuQmCC\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(20,5))\n", + "plt.subplot(131)\n", + "tmp = plt.hist(est_ploidies[\"avg_pl\"],bins=30)\n", + "plt.title(\"est. avg. ploidy from CN profile\")\n", + "plt.subplot(132)\n", + "tmp = plt.hist(sorted(list(GDSC_Ploidies[\"average_ploidy\"].values)),bins=30)\n", + "plt.title(\"PICNIC avg. pl.\")\n", + "plt.subplot(133)\n", + "tmp = plt.hist(est_ploidies[\"median_pl\"],bins=30)\n", + "plt.title(\"est. median ploidy\")\n", + "\n", + "tmp = df_ploidies.plot.scatter(x = \"est. avg. ploidy from CN profile\",y=\"PICNIC avg. pl.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAEcCAYAAADN+K/qAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3X+cXFV9//HXmyQmQGJQwCAQiT+oBoP8SL5gMUKWoFKgylexNv7EpqbxYbdY6ENi0wqkphKrVgtKFSMJgonID8FQ8gXLrjRYwUSCCEuLhYBAEEEICVBM4uf7xzkbJrOzO7OZ2Z3ZO+/n4zGPnZl75tzPnJ35zL3nnnuuIgIzMyue3ZodgJmZDQ0neDOzgnKCNzMrKCd4M7OCcoI3MysoJ3gzs4Jygh8hJC2T9Nlmx9FsA7WDpNMlrRnumFqVpFmSHi55fLekWU0MCUlTJIWk0TWU/VtJ3xxg+QZJJzQ2wmKp2si2M0kbgEnAdmAr8GNgfkT8qplxlZIUwMER8ctmx9JuWrntI+KNzY5hMCLiH5sdw0jnLfhd88cRMR54JfBr4IImxzNklPhzYjYC+Ytbh4j4X+BK4JDe5yRNlHSppN9IelDS3/UmSEkXSbqqpOwSSf+ek+gsSQ/n3dIn8u7nB/pbt6SPSfqlpN9Kuk7S/vn5W3KROyVtkfS+Cq8dJemLeT0PSPrL0t1mSd2SFku6FXgOeI2k/fN6fpvX+7GS+nbqNqnQNbBB0qcl3SPpKUmXSBpXsvwUSeslPS3px5LeVLLsCEk/k7RZ0neBHa/rv2l0oaRNku6VNDs/+V5J68oKninp2n4qmShpqaSNkh6R9FlJo/Ky10n6UV7HEzmumtq+wnqWSfqapBvya26VtJ+kL+e2ulfSESXl95d0Vf58PSDpr0qW7Z7re0rSPcD/KVvXji4NSUdJ+s/c5htzm72kpGxImi/pvlzmq5JU7f3k13ZL+pyk2yU9I+laSS/vp+xAn6tzJV1W8vhDSt+pJyUtLHl+P0nPSdq75LkjcxuNqSXmwooI3wZxAzYAJ+T7ewDLgUtLll8KXAtMAKYA/w3MLSn/38DpwFuBJ4AD87JZwDbgS8BY4DjgWeD1efky4LP5/vH5tUfmshcAt5TEEMDrBngP84F7gAOBlwE/zK8ZnZd3Aw8BbyR1440BbgG+RkqwhwO/AY4vj63kvTxc1ma/ACYDLwduLXkvRwCPA0cDo4CP5PJjgZcADwJ/nWM4jdQt9tl+3tfpuQ17y78P2JTXORb4LTC1pPwdwHv6qesa4OvAnsArgNuBv8jLVgALSRtI44CZtbZ9hfUsy//L6bmum4EHgA/n9vgs0JXL7gasAz6T2+Y1wP3AO/Ly84H/yO93cm7z8v9D72d3OvDm/P+dAvQAnyx7H6uAvYBX5f/3iTW+p27gEWBabr+rgMvysins/Fkb6HN1bsnrDgG2AMfm/+WX8v+69/38G/Dxkhj+Gbig2fmi2bemBzDSbvlLsgV4OiebR4FD87JRwO+AQ0rK/wXQXfL46JxoHgTmlDw/K39g9yx57grg7/P9ZbyYFJcCny8pNz7HMiU/rpbgbyYnq/z4BPom+EUlyyeTjjlMKHnuc8Cy8thK3kt5Yplf8vgk4H/y/YuAfyiL779IP3DH5vZVybIfM3CCLy9/O/ChknUtzvffCDwFjK1QzyTgBWD3kufm8GKivRT4BvnHuey1u5LgLy553An0lDw+FHi65LPzUNnrPw1cku/fT0kSBuZV+D+c0E8cnwSuKXsfpT9cVwALanxP3cD5JY8PIX0vRlGS4Gv4XJ3Liwn+M8DKknJ75jp7E/z7gFtLvoePAUfV810vws1dNLvm1IjYi7TV8ZfAjyTtB+xD2nJ8sKTsg8ABvQ8i4jbSF1GkL02ppyLi2bLX7l9h/fuXriMitgBPlq6niv2B0oPClQ4Qlz63P/DbiNhcFlut6yuvr/R9HQSclbsBnpb0NOmLv3++PRL5W1vy2oFUKt+7ruXA+3NXw4eAKyLihQp1HET6P24sienrpC15gE+R/n+3K41M+bMqMVXz65L7z1d4PL4krv3L2upvST9I0Pf/2m9bSfoDSaskPSbpGeAfSZ/fUo+V3H+uJI5alMcxpkL9g/lc7fTe8vfkyZLl1wKHSHo18DZgU0TcPoh4C8kJvg4RsT0iriZthcwk7WpvJX0Re72KtLsKgKRPkHYxHyUlilIvk7Rn2WsfrbDqR0vXkV+zd+l6qthI6p7pNblCmdIk+SjwckkTymLrXd+zpO6nXvtVqK90HaXv61ekreq9Sm57RMSKHOcBZX2/r+rvTWWVyj8KEBE/IW31vRV4P/Dtfur4FWkLfp+SmF4aeRRKRDwWER+LiP1Je2hfk/S6KnENmqRu4L1lcT1Q1lYTIuKkvHwjfdu5PxcB95JG/LyU9ENRUx97jcrj2Er6fpSq9rkqtdN7k7QH6TMP7DgedgXwQdKPd3//27biBF8HJe8i9WP3RMR20odssaQJkg4CzgQuy+X/gNSn2vsh/JSkw8uqPU/SSyS9FTgF+F6FVa8APirpcEljSVtft0XEhrz816T+2f5cAZwh6QBJewFnD/Q+Iw0B/THwOUnjlA6Czu19X8B64CRJL897Mp8E9pP0vKQtpC/m5/JW48uBlaTkAnAx8HFJKyU9lA80bpT0VdLxim3AE5IelzQHOCq35Z/nBEh+HKQt7FcAfyXpEEn/STrQ+FVJP5d0JumLfyGwNSIqjpmPiI3AjcAXJb1U0m6SXivpuLyu90rq/YF8ivRj+Pv8uFrb1+N2YLOks/MB1VGSpknqPZh6BfBpSS/L8XUOUNcE4Blgi6Q3AB9vcKwfzP+DPYBFwJX5+7FDDZ+rUlcCp0iamQ8GL6Jv/rqU1E33TpzgASf4XfWDnLieARYDH4mIu/OyTtIW7f3AGuA7wLeURqhcBiyJiDsj4j7SVtO3c5KGtEv8FGnL5nJSv3VvItwhIn4I/D3p4NVG4LXAn5YUORdYnnfj/6RC/BeTEtjPSQca/42USLdXKNtrDqn/9FHSAchzchyQvkx3kvp4bwS+m5/vHU76KGmL+Me5XZ4Hrs9lfp7b649IB/Q2Az8lHac4Ang3qWtgX1J/89UDxAhwG+kA4t3AG4D35y3U9wIzgO+TDv5VSiKlPkw6kHkP6X9yJWlYLKQfjdvyZ+A64IyIuD8vO5eStpf0qvyjVW3Po6qcIE8hHYx8gLRF/E1gYi5yHqmL4wHS/2GgJPc3pL2YzaTPw3cHKLsTSW/N730g3yYdX3iM1JX5V/2UG+hztUP+fn2C9H3aSPqfPFxW5lbSD+3PIqJaV157aPZBAN/SjbIDk8O87j8CHmxwnRt48QDYBtLW5ar8uBv483z/z0lbveOr1LWAlPT3Knldd0mZHQc3Scn7+n7q2p2U1A4uee5lpBEjvyEljlW8OLrpfcDasjr+Grgu398b+AHpx/6npD20NTW20emkEUUXkkb73AvMLlm+o51G0q2ZcZMGEIy4Nhuqm7fg21DevT9J0mhJBwDnkLaehsooUtfKHRWWnQCsjnSgeCBrSYnjb2pY3wmkLe5KPg78NNIeVK/dgEtIxzVeRdrDuDAv+wHwekkHl5R/P2lLEuCrpD2Q/UhDPD9SQ3yljgb+h3QA8hzgavUzZtwGlruqjmQQeyNF5wTfnkTanX+KlHR7SMPQGu37eaTHK0ldMZVOPd+btMtdi88AnZL2rVKuYp1K00ycAZxV+nxEPBkRV0XEc5FGdCwmDdMkIp4jjdCYk+s4mNT1c53SiU/vIXUrPBcR95BG6gzG48CXI2JrRHyXNET05EHW0fYkLSedz/HJ2HlUTlvzXDQtIiK62Xlky1Cu6znKznIcIqdGhf7UMk/yYt/2gCLiF5JWkbpregZbZ0RMqVQ4Hwj8Z+BEUncNwARJoyL1e38H+CLpwN77ge9HxHP5gPJoqg85HchAwzpHpIiY1YR1DnbPqS14C96a7YfAO8qGhw7kHOBjDDwG/4ekLetanQW8Hjg60gHZY/PzvcMGbwL2zSOe5vBi98xvSAenqw05HUi/wzrN6uUEb832bdJW71WS3pCHJO6tNCfPSeWFI83S+F36H5UB6UfgGEn/lLeye+ePuSwPCy03gdTv/nTu/z6nbJ1bScNV/4k0DcBN+fntpFE950raIw83/PCg3v2LwzrHSHovMJU0qsmsbk7w1lSRziQ9gTSC5CbSaJTbSQcdb+vnZYtIp6r3V+f/AH9IGn53t6RNpCGla0kjaMp9mTS65gngJ8DqCmW+k+P8XkRsK3n+L0nDFB8j/VitIA0JBXbMwd7vpHGk93hwXvdi4LSIeLK8UI1DE812op27/8ysHpKWAPvV0ics6XTSkL6ZQx6YtSVvwZvVIXcrvSmf1XwU6UzMoRxyalYzj6Ixq88EUrfM/qQTtr5IGlZp1nTuojEzKyh30ZiZFZQTvJlZQQ1JH/w+++wTU6ZMGYqqB+XZZ59lzz1rPX+mPbhN+nKb9OU2qaxV2mXdunVPRES1KTuGJsFPmTKFtWvXDkXVg9Ld3c2sWbOaHUZLcZv05Tbpy21SWau0i6SapkOuqYtG0l6SrlS6wnuPpD+sLzwzMxtqtW7Bf4U0petp+Woqe1R7gZmZNVfVBC9pImnypdMBIuJ3pOtamplZC6uli+bVpFnzLpF0h6RvDmLmPzMza5KqJzpJmkGagOktEXGbpK8Az0TE35eVmwfMA5g0adL0lStXDlHItduyZQvjx49vdhgtxW3Sl9ukL7dJZa3SLh0dHesiYka1crUk+P2An/ReLEHSW4EFEdHvVWdmzJgRHkXTmtwmfblN+nKbVNYq7SKppgRftYsmIh4DfiXp9fmp2aQrzZuZWQurdRRNJ3B5HkFzP/DRoQvJzMwaoaYEHxHrgaq7A2ZmrejQ5Yc2rrLBXla9grs+clf9ldTA0wWbWeFt7jmfDef3e9iwZo3og5+y4Pq646iVJxszMysoJ3gzs4JygjczKygneDOzgnKCNzMrKCd4M7OCcoI3MysoJ3gzs4LyiU5m1hYadoLR6vrqmbj7mMbEUQMneDMrvEacxQrpR6JRdQ0Hd9GYmRWUE7yZWUE5wZuZFZQTvJlZQTnBm5kVlBO8mVlBOcGbmRWUE7yZWUH5RCczM0BSbeWWDLw8IhoQTWN4C97MjJSYq926urqqlmklTvBmZgXlBG9mVlBO8GZmBeUEb2ZWUE7wZmYF5QRvZlZQTvBmZgXlBG9mVlA+k9WsDdV61mY1rXZij+3MW/Bmbaja2ZgHnb2qpjM7rbU5wZuZFZQTvJlZQTnBm5kVlBO8mVlBOcGbmRWUE7yZWUHVNA5e0gZgM7Ad2BYRM4YyKDMzq99gTnTqiIgnhiwSMzNrKHfRmJkVlGo5G03SA8BTQABfj4hvVCgzD5gHMGnSpOkrV65scKiDt2XLFsaPH9/sMFqK26Qvt0lfp69+lmUn7tnsMFpOq3xWOjo61tXSVV5rF83MiHhE0iuAmyTdGxG3lBbISf8bADNmzIhZs2YNNuaG6+7uphXiaCVuk77cJhWsvt5tUsFI+6zUlOAj4pH893FJ1wBHAbcM/Coza4bDzruRTc9vrbueKQuur7uOibuP4c5z3l53PbZrqiZ4SXsCu0XE5nz/7cCiIY/MzHbJpue3suH8k+uqo1Fbqo34kbBdV8sW/CTgmjy96GjgOxGxekijMjOzulVN8BFxP3DYMMRiZmYN5GGSZmYF5QRvZlZQTvDWtlasWMG0adOYPXs206ZNY8WKFc0OyayhfE1Wa0srVqxg4cKFLF26lO3btzNq1Cjmzp0LwJw5c5ocnVljeAve2tLixYtZunQpHR0djB49mo6ODpYuXcrixYubHZpZwzjBW1vq6elh5syZOz03c+ZMenp6mhSRWeO5i8ba0tSpU1mzZg0dHR07nluzZg1Tp05tYlSNMWHqAg5dvqD+ipY3IhaA+k66sl3nBG9taeHChcydO3dHH3xXVxdz584tRBfN5p7zfSarAU7w1qbmzJnDsmXLmD17NhGBJN72trf5AKsVivvgrS11dnZy880384UvfIEbbriBL3zhC9x88810dnY2OzSzhnGCt7Z08cUXs2TJEs4880zGjRvHmWeeyZIlS7j44oubHZpZw7iLxtrSCy+8wPz583d6bv78+Zx11llNiqixGtL3vbox0wVb8zjBW1saO3Ys8+bNY/369fT09DB16lQOP/xwxo4d2+zQ6lbvAVZIPxCNqMeay1001paOO+44Lr/8co499liuvfZajj32WC6//HKOO+64Zodm1jDegre29Mgjj3DqqafyrW99i4suuoixY8dy6qmnct999zU7NLOGcYK3ttTT08Mdd9zBmDFjdoz53rp1K+PGjWt2aGYN4y4aa0u9Z7KWKsqZrGa9nOCtLfWeydrV1cW2bdt2nMm6cOHCZodm1jDuorG21HvGamdn545RNIsXL/aZrFYoTvDWtubMmcOcOXMaNu+KWatxgrfCk9SQeiKiIfWYDRf3wVvhRcSAt4POXlW1TNGSu6QBbw8uOaVqmUb9cNrQcYI3a0PVfsy6urra7keviJzgzcwKygnezKygnODNzArKCd7MrKCc4M3MCsoJ3sysoJzgzcwKygnezKygnODNzArKCd7MrKCc4M3MCsoJ3sysoJzgzcwKquYEL2mUpDskrRrKgMzMrDEGswV/BtAzVIGYmVlj1ZTgJR0InAx8c2jDMTOzRql1C/7LwKeA3w9hLGZm1kBVr8kq6RTg8YhYJ2nWAOXmAfMAJk2aRHd3d6Ni3GVbtmxpiThaidukMrfJzvw5qWyktYuqXXZL0ueADwHbgHHAS4GrI+KD/b1mxowZsXbt2kbGuUu6u7uZNWtWs8NoKW6TvqYsuJ4N55/c7DBaij8nlbVKu0haFxEzqpWr2kUTEZ+OiAMjYgrwp8DNAyV3MzNrDVW7aMxa2WHn3cim57fWXc+UBdfXXcfE3cdw5zlvr7ses0YZVIKPiG6ge0giMdsFm57fWnf3SqN2uxvxI2HWSD6T1cysoJzgzcwKygnezKygnODNzArKCd7MrKCc4M3MCsoJ3sysoHyik41oE6Yu4NDlC+qvaHkjYoE06apZa3CCLxhJDamn2hxFrWJzz/k+0cmsH+6iKZiIGPB20NmrqpYZKcndzAbmBG9mVlBO8GZmBeUEb2ZWUE7wZmYF5VE0NuI1ZPTK6sbMB2/WSpzgbURrxKX2fMk+Kyp30ZiZFZQTvJlZQTnBm5kVlBO8mVlBOcGbmRVUIRP8ihUrmDZtGrNnz2batGmsWLGi2SGZmQ27wg2TXLFiBQsXLmTp0qVs376dUaNGMXfuXADmzJnT5OjMzIZP4bbgFy9ezNKlS+no6GD06NF0dHSwdOlSFi9e3OzQzMyGVeESfE9PDzNnztzpuZkzZ9LT09OkiMzMmqNwCX7q1KmsWbNmp+fWrFnD1KlTmxSRmVlzFC7BL1y4kLlz59LV1cW2bdvo6upi7ty5LFy4sNmhmZkNq8IdZO09kNrZ2UlPTw9Tp05l8eLFPsBqZm2ncAkeUpKfM2dOw661aWY2EhWui8bMzBIneDOzgnKCNzMrKCd4M7OCKuRB1qI67Lwb2fT81rrracQl7ibuPoY7z3l73fUMB0nVyyypXk9ENCAas+HjBD+CbHp+a92XlmvUyKKGXAd1mFRLzB5tZUXlLhozs4JygjczK6iqCV7SOEm3S7pT0t2SzhuOwMzMrD619MG/ABwfEVskjQHWSLohIn4yxLGZmVkdqib4SEeotuSHY/LNwwnMzFpcTX3wkkZJWg88DtwUEbcNbVhmZlYvDWZsr6S9gGuAzoj4RdmyecA8gEmTJk1fuXJlI+PcJVu2bGH8+PHNDqNhTl/9LMtO3LOuOhrVJo2IpVUU7XPSCG6TylqlXTo6OtZFxIyqBSNiUDfgM8DfDFRm+vTp0Qq6urqaHUJDHXT2qrrraFSbNCKWVlG0z0kjuE0qa5V2AdZGDfm6ah+8pH2BrRHxtKTdgbcBNZz3Z402YeoCDl2+oP6KljciFoD6Troys6FVyyiaVwLLJY0i9dlfERGrhjYsq2Rzz/k+k9XMalbLKJqfA0cMQyxmZtZAPpPVzKygnODNzArKCd7MrKCc4M3MCsoJ3sysoHzBjxGmIcMTVzfmik5m1tqc4EeQesfAQ/qBaEQ9Ztb63EVjZlZQhUzwnZ2djBs3jo6ODsaNG0dnZ2ezQzIzG3aF66Lp7Ozkwgsv3PH4hRde2PH4ggsuaFZYZmbDrnBb8L3J/JhjjuF73/sexxxzzE7Pm5m1i8IleICjjz6aW2+9lX322Ydbb72Vo48+utkhmZkNu0Im+He84x0DPjYzawcjtg9eUr/LFi1axKJFi2p+TQziqlZmZiPFiN2C7+8KJpMnT65YfvLkyQNdpcrMrHBGbILvz0MPPdQnyU+ePJmHHnqoSRGZmTVH4RI8pCQfERx09ioiwsndzNpSIRO8mZk5wZuZFZYTvJlZQTnBm5kVlBO8mVlBjdgTnayygU4A21FmSfV6fH6A2cjnLfiC6e9krt5bV1dX1TJO7mbF4ARvZlZQTvBmZgXlBG9mVlBO8GZmBeUEb2ZWUE7wZmYF1bLj4A8770Y2Pb+17nqmLLi+rtdP3H0Md57z9rrjMDMbbi2b4Dc9v5UN559cVx3d3d3MmjWrrjrq/YEwM2sWd9GYmRWUE7yZWUE5wZuZFZQTvJlZQTnBm5kVVNUEL2mypC5J90i6W9IZwxGYmZnVp5ZhktuAsyLiZ5ImAOsk3RQR9wxxbGZmVoeqCT4iNgIb8/3NknqAA4AhTfATpi7g0OUL6q9oeb1xANQ3Ht/MrBk0mIs7SJoC3AJMi4hnypbNA+YBTJo0afrKlSvrCuz01c+y7MQ966pjy5YtjB8/vulxtJJGtEnRuE36cptU1irt0tHRsS4iZlQtWMvVffKPwHhgHfDuamWnT58e9Tro7FV119HV1dUScbSSRrRJ0bhN+nKbVNYq7QKsjRrydk1TFUgaA1wFXB4RV+/6787gNGSagNX1z0VjZjYSVU3wSldxXgr0RMSXhj6kpN55aCD9QDSiHjOzkaiWcfBvAT4EHC9pfb6dNMRxmZlZnWoZRbMG0DDEYmZmDeQzWc3MCsoJ3sysoJzgzcwKygnezKygnODNzArKCd7MrKCc4M3MCsoJ3sysoJzgzcwKygnezKygnODNzArKCd7MrKBqmg++FaVZjGsot6R6mRjEVa3MzEaKEbsFX8vVTLq6umq9WpWZWeGM2ARvZmYDc4I3MysoJ3gzs4JygjczKygneDOzgnKCNzMrKCd4M7OCcoI3MysoDcWJPpJ+AzzY8IoHbx/giWYH0WLcJn25Tfpym1TWKu1yUETsW63QkCT4ViFpbUTMaHYcrcRt0pfbpC+3SWUjrV3cRWNmVlBO8GZmBVX0BP+NZgfQgtwmfblN+nKbVDai2qXQffBmZu2s6FvwZmZta8QneEmTJXVJukfS3ZLOqFBGkv5F0i8l/VzSkc2IdbhIGifpdkl35jY5r0KZsZK+m9vkNklThj/S4SdplKQ7JK2qsKzt2kTSBkl3SVovaW2F5W313QGQtJekKyXdK6lH0h+WLR8xbTLiEzywDTgrIg4B3gx8QtIhZWX+CDg43+YBFw1viMPuBeD4iDgMOBw4UdKby8rMBZ6KiNcB/wzUcO2rQjgD6OlnWbu2SUdEHN7P8L92++4AfAVYHRFvAA6j7+dlxLTJiE/wEbExIn6W728m/TMOKCv2LuDSSH4C7CXplcMc6rDJ73NLfjgm38oPtrwLWJ7vXwnMVq3XQRyhJB0InAx8s58ibdcmNWir746kicCxwFKAiPhdRDxdVmzEtMmIT/Cl8i71EcBtZYsOAH5V8vhh+v4IFEruilgPPA7cFBH9tklEbAM2AXsPb5TD7svAp4Df97O8HdskgBslrZM0r8LydvvuvBr4DXBJ7sr7pqQ9y8qMmDYpTIKXNB64CvhkRDzT7HiaLSK2R8ThwIHAUZKmNTumZpJ0CvB4RKxrdiwtZmZEHEnqdviEpGObHVCTjQaOBC6KiCOAZ4EFzQ1p1xUiwUsaQ0rul0fE1RWKPAJMLnl8YH6u8PLuZRdwYtmiHW0iaTQwEXhyeKMbVm8B3ilpA7ASOF7SZWVl2q1NiIhH8t/HgWuAo8qKtNt352Hg4ZI93itJCb/UiGmTEZ/gcx/pUqAnIr7UT7HrgA/no99vBjZFxMZhC3KYSdpX0l75/u7A24B7y4pdB3wk3z8NuDkKfFJERHw6Ig6MiCnAn5Le7wfLirVVm0jaU9KE3vvA24FflBVrq+9ORDwG/ErS6/NTs4F7yoqNmDYZ3ewAGuAtwIeAu3KfM8DfAq8CiIh/Bf4NOAn4JfAc8NEmxDmcXgkslzSK9CN+RUSskrQIWBsR15F+FL8t6ZfAb0lJr+20eZtMAq7Jx5FHA9+JiNWS5kPbfncAOoHLJb0EuB/46EhtE5/JamZWUCO+i8bMzCpzgjczKygneDOzgnKCNzMrKCd4M7OCcoK3ppB0uqT9h3mdUyT9It+fIelfhnHdsyrNYFlW5p2SKp41KWlLpefNBlKEcfA2Mp1OOqnm0WasPCLWAn2mx22mPBb/umbHYcXhLXhrGEkfzPPQr5f09Tzh2ShJyyT9Is87/teSTgNmkE4mWZ/Ptu2vznMlLZf0H5IelPRuSZ/Pda3O01QgabqkH+VJs/5f7+x++fk7Jd0JfKKk3h1b1JKOkvSfeXKpH/eexZj3Mq7O67lP0udraINlkv5V0lpJ/53nwCkv83JJ31eaS/wnkt5Usr4L8/1X55jukvTZktdeKunUkseXS3pXtbisPTnBW0NImgq8D3hLnuRsO/AB0nz0B0TEtIg4FLgkIq4kbT1/IM9D/nyV6l8LHA+8E7gM6Mp1PQ+cnJP8BcBpETEd+BawOL/2EqAzz43fn3uBt+bJpT4D/GPJssPz+zoUeJ+kyRVeX24KaU6Xk4F/lTSubPl5wB0R8SbSWdeXVqjjK6QJrw4FSk+DX0ra++md2vYY4PoaYrI25C4aa5TZwHTgp/l4vMN0AAACI0lEQVTU991JUxX/AHiNpAtIiejGXaj7hojYKukuYBSwOj9/FymZvh6YBtyU1z0K2Jjn49krIm7J5b9NmjWx3ETS1A4Hk6bPHVOy7N8jYhOApHuAg9h5qthKroiI3wP3SbofeEPZ8pnAewAi4mZJe0t6aVmZt/SWyXEvyeV/JOlrkvbNy6/KUxub9eEEb40iYHlEfLrPAukw4B3AfOBPgD8bZN0vAETE7yVtLZkA7Pekz7CAuyOi/NJqe9VY/z+Q9gr+r9I1BbrL151tp7bvTPn8H7s6H0h/r7sU+CBprpyWnQfFms9dNNYo/w6cJukVsKOf+SBJ+wC7RcRVwN/x4tSrm4EJDVr3fwH7Kl87U9IYSW/MUyU/LWlmLveBfl4/kRenez29AfG8V9Jukl4LvCbHV+o/emORNAt4osI1DG7lxcnOyuNeBnwSICLKZzo028EJ3hoiJ5q/I10d6OfATaRZLQ8AuvNMn5cBvVv4y0j90+sl7S5pkaR37uK6f0ea3ndJPpi6ntQ3DWkL96t5/f1dfu/zwOck3UGNe7VKV/qpdA1TgIeA24EbgPkR8b9ly88Fpud2Op8XpygudQbpAhx3UXa1oIj4NenSlJfUEqu1L88madZAkpYBq/KB5KFaxx6k4w9H9h4fMKvEW/BmI4ikE0hb7xc4uVs13oI3Mysob8GbmRWUE7yZWUE5wZuZFZQTvJlZQTnBm5kVlBO8mVlB/X+KIRNPxs+29gAAAABJRU5ErkJggg==\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# PICNIC average ploidy vs estimated copy-neutral \n", + "tmp = df_ploidies.boxplot(column=\"PICNIC avg. pl.\", by = \"est. median. ploidy\" )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Convert gene-level integer CN into log2R-like format in order to make it compatible with TCGA and CCLE\n", + "\n", + "1) Copy-neutral state was defined from average ploiy, as median of integer CN values in non-disrupted genes.\n", + "\n", + "2) Compute log2(CN/neutral-CN) for min and max CN; keep the value with most extreme estimate\n", + "\n", + "3) Replace estimates below thresholds with zeroes. " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.0" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "estimated_CN = est_ploidies[\"median_pl\"].to_dict()\n", + "estimated_CN[1287381]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>1287381</th>\n", + " <th>924100</th>\n", + " <th>910924</th>\n", + " <th>687561</th>\n", + " <th>1287706</th>\n", + " <th>687452</th>\n", + " <th>906798</th>\n", + " <th>906797</th>\n", + " <th>906800</th>\n", + " <th>910922</th>\n", + " <th>...</th>\n", + " <th>909785</th>\n", + " <th>909904</th>\n", + " <th>909905</th>\n", + " <th>687592</th>\n", + " <th>1303911</th>\n", + " <th>946358</th>\n", + " <th>909907</th>\n", + " <th>1298146</th>\n", + " <th>908452</th>\n", + " <th>908450</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>5S_rRNA</th>\n", + " <td>-4.320000</td>\n", + " <td>1.807355</td>\n", + " <td>2.0</td>\n", + " <td>-0.415037</td>\n", + " <td>-4.32</td>\n", + " <td>-4.320000</td>\n", + " <td>-1.00</td>\n", + " <td>2.807355</td>\n", + " <td>2.169925</td>\n", + " <td>-1.00</td>\n", + " <td>...</td>\n", + " <td>-4.32</td>\n", + " <td>2.222392</td>\n", + " <td>-4.32</td>\n", + " <td>-4.320000</td>\n", + " <td>-4.32</td>\n", + " <td>1.807355</td>\n", + " <td>-4.320000</td>\n", + " <td>1.736966</td>\n", + " <td>2.169925</td>\n", + " <td>-1.584963</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5_8S_rRNA</th>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.00</td>\n", + " <td>-0.584963</td>\n", + " <td>0.00</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.00</td>\n", + " <td>...</td>\n", + " <td>0.00</td>\n", + " <td>0.415037</td>\n", + " <td>0.00</td>\n", + " <td>-0.584963</td>\n", + " <td>-4.32</td>\n", + " <td>0.000000</td>\n", + " <td>-0.415037</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.584963</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7SK</th>\n", + " <td>1.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>-4.320000</td>\n", + " <td>-4.32</td>\n", + " <td>-0.584963</td>\n", + " <td>-4.32</td>\n", + " <td>0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>-4.32</td>\n", + " <td>...</td>\n", + " <td>-4.32</td>\n", + " <td>0.000000</td>\n", + " <td>-4.32</td>\n", + " <td>-4.320000</td>\n", + " <td>-4.32</td>\n", + " <td>-0.415037</td>\n", + " <td>-4.320000</td>\n", + " <td>-4.320000</td>\n", + " <td>0.000000</td>\n", + " <td>-4.320000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 996 columns</p>\n", + "</div>" + ], + "text/plain": [ + " 1287381 924100 910924 687561 1287706 687452 906798 \\\n", + "5S_rRNA -4.320000 1.807355 2.0 -0.415037 -4.32 -4.320000 -1.00 \n", + "5_8S_rRNA -0.584963 0.000000 0.0 -0.415037 0.00 -0.584963 0.00 \n", + "7SK 1.000000 0.000000 0.0 -4.320000 -4.32 -0.584963 -4.32 \n", + "\n", + " 906797 906800 910922 ... 909785 909904 909905 \\\n", + "5S_rRNA 2.807355 2.169925 -1.00 ... -4.32 2.222392 -4.32 \n", + "5_8S_rRNA 0.000000 0.000000 0.00 ... 0.00 0.415037 0.00 \n", + "7SK 0.584963 0.000000 -4.32 ... -4.32 0.000000 -4.32 \n", + "\n", + " 687592 1303911 946358 909907 1298146 908452 908450 \n", + "5S_rRNA -4.320000 -4.32 1.807355 -4.320000 1.736966 2.169925 -1.584963 \n", + "5_8S_rRNA -0.584963 -4.32 0.000000 -0.415037 0.000000 0.000000 -0.584963 \n", + "7SK -4.320000 -4.32 -0.415037 -4.320000 -4.320000 0.000000 -4.320000 \n", + "\n", + "[3 rows x 996 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gdsc = gdsc.apply(lambda x : CN2log2R(x,estimated_CN[x.name] ))\n", + "# drop genes without any determined value\n", + "gdsc = gdsc.dropna(axis=0,how=\"all\")\n", + "# fill with zeroes the remaining ones\n", + "gdsc.fillna(0,inplace=True)\n", + "gdsc.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "gdsc = gdsc.applymap(lambda x : clean_logR(x, pos_seg_mean_thr, neg_seg_mean_thr))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ok: no empty rows detected\n", + "Ok: no duplicated pairs detected\n", + "Ok: All Symbol rows are not empty.\n", + "Ok: All Symbol are mapped to GeneID\n", + "16 Symbol mapped to multiple GeneID\n", + "Ok: All GeneID are unique\n", + "59266 Symbol can be mapped directly to GeneID\n" + ] + } + ], + "source": [ + "NCBI = pd.read_csv(root_dir+\"Homo_sapiens.gene_info\",sep = \"\\t\")\n", + "NCBI = NCBI[[\"#tax_id\",\"GeneID\",\"Symbol\",\"Synonyms\",\"type_of_gene\"]]\n", + "NCBI = NCBI.loc[NCBI[\"#tax_id\"] == 9606]\n", + "NCBI = NCBI.loc[NCBI[\"type_of_gene\"] != \"unknown\"]\n", + "ncbi_symbols = parse_mapping_table(NCBI, \"Symbol\",\"GeneID\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ok: no empty rows detected\n", + "Ok: no duplicated pairs detected\n", + "Ok: All Synonyms rows are not empty.\n", + "Ok: All Synonyms are mapped to GeneID\n", + "3145 Synonyms mapped to multiple GeneID\n", + "49179 different Synonyms mapped to the same GeneID\n", + "10839 Synonyms can be mapped directly to GeneID\n" + ] + } + ], + "source": [ + "ncbi_synonyms = expand(NCBI[[\"Synonyms\",\"GeneID\"]],column=\"Synonyms\",sep=\"|\") \n", + "ncbi_synonyms = parse_mapping_table(ncbi_synonyms, \"Synonyms\",\"GeneID\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mapped: 24545 \n", + "\tdirectly via main_mapper 22363 \n", + "\tvia alternative mapper 766 \n", + "\tvia one of multiple synonyms in alternative mapper 1416 \n", + "\tLOC 0 \n", + "Unmapped: 21587 \n", + "\trecognized symbols without Entrez ID 0 \n", + "\tmultiple query_ids map to the same target_id 0 \n", + "\tquery_ids map to multiple target_ids in the main mapper 0 \n", + "\tquery_ids map to multiple target_ids in the alternative mapper 76 \n", + "\tLOC not found in Entrez 0 \n", + "\tNot found at all: 21511\n", + "Warning: query IDs mapping to duplicated target IDs in mapping table: 156\n", + "Warning: query IDs not mapped to any target IDs excluded: 21587\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/olya/miniconda2/lib/python2.7/site-packages/pandas/core/frame.py:3781: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " return super(DataFrame, self).rename(**kwargs)\n", + "IDs mapped to multiple target IDs are kept:\n", + " [143872, 286464, 140290, 414212, 414213, 51463, 642826, 84631, 574445, 399761, 100132115, 647060, 284565, 6551, 161176, 341019, 4253, 9502, 442416, 51236, 643749, 54438, 728113, 100302179, 414761, 29099, 729438, 256815, 10160, 645425, 653234, 644019, 26165, 3255, 644509, 2749, 653505, 653067, 643479, 100462820, 100418977, 26824, 79817, 6218, 728695, 100034743, 221262, 647507, 677844, 728917, 26583, 100289124, 84316, 200030, 768096, 642658, 23523, 401508, 23334, 119016, 106478953, 84458, 1517, 246126, 26095, 100033392, 92017, 374, 26871, 100132948, 125050, 387707, 653308, 79741, 728798]\n", + "mapper.py:204: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " df.sort_index(inplace=True)\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>1287381</th>\n", + " <th>924100</th>\n", + " <th>910924</th>\n", + " <th>687561</th>\n", + " <th>1287706</th>\n", + " <th>687452</th>\n", + " <th>906798</th>\n", + " <th>906797</th>\n", + " <th>906800</th>\n", + " <th>910922</th>\n", + " <th>...</th>\n", + " <th>909785</th>\n", + " <th>909904</th>\n", + " <th>909905</th>\n", + " <th>687592</th>\n", + " <th>1303911</th>\n", + " <th>946358</th>\n", + " <th>909907</th>\n", + " <th>1298146</th>\n", + " <th>908452</th>\n", + " <th>908450</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.584963</td>\n", + " <td>0.415037</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>0.415037</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.736966</td>\n", + " <td>0.321928</td>\n", + " <td>0.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.736966</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>-0.584963</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>-0.415037</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.415037</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 996 columns</p>\n", + "</div>" + ], + "text/plain": [ + " 1287381 924100 910924 687561 1287706 687452 906798 \\\n", + "1 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 \n", + "2 0.000000 0.584963 0.0 -0.415037 0.321928 0.000000 0.584963 \n", + "9 -0.584963 0.584963 0.0 -0.415037 -0.415037 -0.584963 0.000000 \n", + "\n", + " 906797 906800 910922 ... 909785 909904 909905 \\\n", + "1 0.584963 0.0 0.0 ... 0.0 -0.584963 0.584963 \n", + "2 0.584963 0.0 0.0 ... 0.0 -0.584963 0.000000 \n", + "9 0.000000 0.0 0.0 ... 0.0 -0.584963 0.000000 \n", + "\n", + " 687592 1303911 946358 909907 1298146 908452 908450 \n", + "1 0.415037 0.000000 0.000000 0.0 0.000000 0.584963 0.415037 \n", + "2 0.000000 0.736966 0.321928 0.0 -0.584963 0.000000 0.736966 \n", + "9 0.415037 0.000000 0.000000 -1.0 -0.584963 0.000000 0.000000 \n", + "\n", + "[3 rows x 996 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gdsc,query2target,not_mapped = apply_mappers(gdsc, ncbi_symbols, ncbi_synonyms, verbose = True,handle_duplicates = \"keep\")\n", + "gdsc.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "75 duplicated IDs in 156 rows found.\n", + "duplicate rows removed due to low correlation of duplicated profiles 25\n", + "Merged 131 duplicated rows into 63 rows\n" + ] + } + ], + "source": [ + "gdsc = handle_dups(gdsc,corr_thr = 0.75)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>683665</th>\n", + " <th>683667</th>\n", + " <th>684052</th>\n", + " <th>684055</th>\n", + " <th>684057</th>\n", + " <th>684059</th>\n", + " <th>684062</th>\n", + " <th>684072</th>\n", + " <th>684681</th>\n", + " <th>687448</th>\n", + " <th>...</th>\n", + " <th>1659818</th>\n", + " <th>1659819</th>\n", + " <th>1659823</th>\n", + " <th>1659928</th>\n", + " <th>1659929</th>\n", + " <th>1660034</th>\n", + " <th>1660035</th>\n", + " <th>1660036</th>\n", + " <th>1674021</th>\n", + " <th>1789883</th>\n", + " </tr>\n", + " <tr>\n", + " <th>gene_id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>-0.415037</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.000000</td>\n", + " <td>0.415037</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.321928</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>-0.415037</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>0.0</td>\n", + " <td>0.321928</td>\n", + " <td>0.0</td>\n", + " <td>0.584963</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>-0.584963</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>-1.0</td>\n", + " <td>-1.584963</td>\n", + " <td>0.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>0.0</td>\n", + " <td>0.321928</td>\n", + " <td>0.0</td>\n", + " <td>0.584963</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>-0.584963</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>-1.0</td>\n", + " <td>-1.584963</td>\n", + " <td>0.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>0.0</td>\n", + " <td>-1.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>-1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.321928</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.0</td>\n", + " <td>0.415037</td>\n", + " <td>0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 996 columns</p>\n", + "</div>" + ], + "text/plain": [ + " 683665 683667 684052 684055 684057 684059 684062 \\\n", + "gene_id \n", + "1 0.0 0.000000 0.0 0.000000 -0.415037 0.0 -0.415037 \n", + "2 0.0 0.000000 0.0 0.584963 0.000000 0.0 0.000000 \n", + "9 0.0 0.321928 0.0 0.584963 0.584963 0.0 0.321928 \n", + "10 0.0 0.321928 0.0 0.584963 0.584963 0.0 0.321928 \n", + "12 0.0 -1.000000 0.0 0.000000 -1.000000 0.0 -0.415037 \n", + "\n", + " 684072 684681 687448 ... 1659818 1659819 1659823 \\\n", + "gene_id ... \n", + "1 0.000000 0.415037 0.0 ... 0.0 0.0 0.000000 \n", + "2 0.584963 0.000000 0.0 ... 0.0 0.0 0.000000 \n", + "9 0.000000 -0.584963 0.0 ... 0.0 0.0 -0.415037 \n", + "10 0.000000 -0.584963 0.0 ... 0.0 0.0 -0.415037 \n", + "12 0.000000 0.000000 0.0 ... -1.0 0.0 0.321928 \n", + "\n", + " 1659928 1659929 1660034 1660035 1660036 1674021 1789883 \n", + "gene_id \n", + "1 0.0 -0.415037 0.0 -0.584963 0.000000 0.000000 0.321928 \n", + "2 0.0 0.000000 0.0 0.000000 0.000000 0.584963 -0.415037 \n", + "9 0.0 0.000000 -1.0 -1.584963 0.000000 -1.000000 -1.000000 \n", + "10 0.0 0.000000 -1.0 -1.584963 0.000000 -1.000000 -1.000000 \n", + "12 0.0 -0.415037 0.0 0.415037 0.584963 0.000000 0.000000 \n", + "\n", + "[5 rows x 996 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gdsc.index.name = \"gene_id\"\n", + "gdsc = gdsc.T.sort_index().T\n", + "gdsc.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "gdsc.to_csv(preprocessed_dir+\"/\"+\"GDSC\"+\".Segment_Mean.CNA.tsv\",\n", + " sep = \"\\t\",header=True,index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PDX \n", + "\n", + "For PDX dataset only gene-level estimated copy-number (non-integer) reported. \n", + "From ploidy distributions, calculated as average over all genes we concluded that CN estimates were called under assumption that copy-neutral state of each xenograft corresponds CN = 2.\n", + "\n", + "\n", + "For gene ID conversion we used the same approach as for RNA-seq." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(23852, 375)\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>X-1004</th>\n", + " <th>X-1008</th>\n", + " <th>X-1027</th>\n", + " <th>X-1095</th>\n", + " <th>X-1119</th>\n", + " <th>X-1156</th>\n", + " <th>X-1167</th>\n", + " <th>X-1169</th>\n", + " <th>X-1172</th>\n", + " <th>X-1173</th>\n", + " <th>...</th>\n", + " <th>X-5694</th>\n", + " <th>X-5696</th>\n", + " <th>X-5713</th>\n", + " <th>X-5717</th>\n", + " <th>X-5727</th>\n", + " <th>X-5739</th>\n", + " <th>X-5808</th>\n", + " <th>X-5959</th>\n", + " <th>X-5975</th>\n", + " <th>X-6047</th>\n", + " </tr>\n", + " <tr>\n", + " <th>Sample</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>A1BG</th>\n", + " <td>2.58</td>\n", + " <td>1.60</td>\n", + " <td>2.17</td>\n", + " <td>2.08</td>\n", + " <td>2.00</td>\n", + " <td>3.94</td>\n", + " <td>2.04</td>\n", + " <td>11.39</td>\n", + " <td>2.17</td>\n", + " <td>2.01</td>\n", + " <td>...</td>\n", + " <td>2.08</td>\n", + " <td>2.10</td>\n", + " <td>2.14</td>\n", + " <td>2.95</td>\n", + " <td>2.06</td>\n", + " <td>2.07</td>\n", + " <td>1.99</td>\n", + " <td>2.07</td>\n", + " <td>1.43</td>\n", + " <td>2.03</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1BG-AS1</th>\n", + " <td>2.58</td>\n", + " <td>1.60</td>\n", + " <td>2.17</td>\n", + " <td>2.08</td>\n", + " <td>2.00</td>\n", + " <td>3.94</td>\n", + " <td>2.04</td>\n", + " <td>11.39</td>\n", + " <td>2.17</td>\n", + " <td>2.01</td>\n", + " <td>...</td>\n", + " <td>2.08</td>\n", + " <td>2.10</td>\n", + " <td>2.14</td>\n", + " <td>2.95</td>\n", + " <td>2.06</td>\n", + " <td>2.07</td>\n", + " <td>1.99</td>\n", + " <td>2.07</td>\n", + " <td>1.43</td>\n", + " <td>2.03</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1CF</th>\n", + " <td>2.87</td>\n", + " <td>2.97</td>\n", + " <td>2.01</td>\n", + " <td>2.06</td>\n", + " <td>2.10</td>\n", + " <td>1.58</td>\n", + " <td>2.01</td>\n", + " <td>1.64</td>\n", + " <td>1.89</td>\n", + " <td>1.99</td>\n", + " <td>...</td>\n", + " <td>2.04</td>\n", + " <td>0.97</td>\n", + " <td>1.58</td>\n", + " <td>2.08</td>\n", + " <td>1.95</td>\n", + " <td>1.92</td>\n", + " <td>1.54</td>\n", + " <td>1.28</td>\n", + " <td>1.33</td>\n", + " <td>2.10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A2LD1</th>\n", + " <td>5.74</td>\n", + " <td>1.64</td>\n", + " <td>2.06</td>\n", + " <td>2.01</td>\n", + " <td>2.07</td>\n", + " <td>1.74</td>\n", + " <td>2.06</td>\n", + " <td>1.59</td>\n", + " <td>1.40</td>\n", + " <td>2.53</td>\n", + " <td>...</td>\n", + " <td>2.03</td>\n", + " <td>2.07</td>\n", + " <td>2.25</td>\n", + " <td>2.00</td>\n", + " <td>1.01</td>\n", + " <td>2.00</td>\n", + " <td>1.08</td>\n", + " <td>1.85</td>\n", + " <td>1.93</td>\n", + " <td>1.45</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>4 rows × 375 columns</p>\n", + "</div>" + ], + "text/plain": [ + " X-1004 X-1008 X-1027 X-1095 X-1119 X-1156 X-1167 X-1169 \\\n", + "Sample \n", + "A1BG 2.58 1.60 2.17 2.08 2.00 3.94 2.04 11.39 \n", + "A1BG-AS1 2.58 1.60 2.17 2.08 2.00 3.94 2.04 11.39 \n", + "A1CF 2.87 2.97 2.01 2.06 2.10 1.58 2.01 1.64 \n", + "A2LD1 5.74 1.64 2.06 2.01 2.07 1.74 2.06 1.59 \n", + "\n", + " X-1172 X-1173 ... X-5694 X-5696 X-5713 X-5717 X-5727 \\\n", + "Sample ... \n", + "A1BG 2.17 2.01 ... 2.08 2.10 2.14 2.95 2.06 \n", + "A1BG-AS1 2.17 2.01 ... 2.08 2.10 2.14 2.95 2.06 \n", + "A1CF 1.89 1.99 ... 2.04 0.97 1.58 2.08 1.95 \n", + "A2LD1 1.40 2.53 ... 2.03 2.07 2.25 2.00 1.01 \n", + "\n", + " X-5739 X-5808 X-5959 X-5975 X-6047 \n", + "Sample \n", + "A1BG 2.07 1.99 2.07 1.43 2.03 \n", + "A1BG-AS1 2.07 1.99 2.07 1.43 2.03 \n", + "A1CF 1.92 1.54 1.28 1.33 2.10 \n", + "A2LD1 2.00 1.08 1.85 1.93 1.45 \n", + "\n", + "[4 rows x 375 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "PDX_xls = \"/home/olya/SFU/Hossein/PDX/nm.3954-S2.xlsx\"\n", + "pdx = pd.read_excel(PDX_xls,\"copy number\")\n", + "pdx.set_index(\"Sample\",drop=True,inplace=True)\n", + "focal = pdx.T[\"FocalCNScore\"]\n", + "pdx.drop([\"ArmLevelCNScore\",\"FocalCNScore\"],inplace = True)\n", + "print(pdx.shape)\n", + "pdx.head(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Strings containing duplicated gene IDs: 544\n", + "268 duplicated IDs in 544 rows found.\n", + "duplicate rows removed due to low correlation of duplicated profiles 134\n", + "Merged 410 duplicated rows into 205 rows\n" + ] + } + ], + "source": [ + "pdx.index.name = \"gene_id\"\n", + "ids = pdx.index\n", + "ids = list(set(ids[ids.duplicated()]))\n", + "print(\"Strings containing duplicated gene IDs:\",pdx.loc[ids,:].shape[0])\n", + "pdx = handle_dups(pdx,corr_thr = 0.75)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5,0,'CN Averaged over all')" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAGyJJREFUeJzt3XuYHVWd7vHvCwRFAwaGNicCMXhFFAnQIgoqIKjgwwFGUaMgcJyJjkfUOaMDelRgvEyYR1ARj+cEQcELl1FuCioIAl6RBCMhIIoQFIgkCMhFAQPv+aNWk01Pd3Z10rV3uuv9PM9+eu+qVVW/vZ6kfnutWrVKtomIiPZar98BREREfyURRES0XBJBRETLJRFERLRcEkFERMslEUREtFwSQTRC0uWS/qFGuZmSHpC0/ijrj5H0tTU4/mr3G//VmtZ1THxJBLHGJC2V9Ndywr1T0lckTR3LPmz/3vZU24+OZ2xrs19Ju0t6rHyv+yXdKOnwsm6WJJd1Q9/7O5L27th+aqmbt3Us21jS7yW9cXy+YcT4SSKItbWf7anAjsAg8JE+xzNe7ijfaxPgSOBkSdt2rJ9W1m8PXAKcK+kwANsPAO8EPitpoJT/D2CB7W/26gtE1JVEEOPC9u3Ad4EXDV8naT1JH5F0q6Tlkk6X9LSybugX9gbl89aSrii/xC8BNu/Yz4WSjhi272slHTjCMYfv93JJH5f0k7LviyVtPny7Eb6XbZ8H3ANsO8L6P9r+HHAMcJyk9cry7wMXAidK2h14E/Du0Y4j6TBJN5fYbhlqTUh6tqTLJP1J0l2Svi5pWsd2SyV9sNTDg5JOkTRd0nfLvn4gadNhdTJX0h2Slkn6wGpi2kXSTyXdK+lX5XusNt6YmJIIYlxI2grYF/jlCKsPK689gGcBU4GTRtnVN4CFVAng48ChHetOAw7uOOb2wBZUJ9w63gocDjwd2BAY9STYcYz1SqKZBixeTdFzyn6f37Hsn4HdgW8CH7D9x1GO8VTgRGAf2xsDLwcWDa0G/h14BvACYCuqpNPpDcDewPOA/agS8oeBAar/4+8dVn4P4LnAa4AjJe01QkxD9foJYDOquvqWpIEu8cYElEQQa+s8SfcCPwauAD41Qpm3ASfYvrl0m3wIeMvQr/UhkmYCLwE+avth21cC3+4ocgHwPEnPLZ8PAc6y/UjNWL9s+ze2/wqcDcxeTdlnlO91F3A0cIjtG1dT/o7yd7OhBbbvAZYAT6FKFKvzGPAiSRvZXmZ7SdnHTbYvKfWxAjgBeNWwbT9v+87SKvsRcJXtX9p+CDgX2GFY+WNtP2h7MfBlYM4I8RwMXGT7ItuP2b4EWECV7EeNNyamJIJYWwfYnmb7mbbfXU6ywz0DuLXj863ABsD0EcrdY/vBYWUBKCe2s4CDSxfMHOCrY4i18xf5X6haJqO5o3yvzWzPtn1ml31vUf7ePbRA0sHALOAHwHGjbVi+75uBdwHLShfYNmUf0yWdKel2SfcBX6Oju6y4s+P9X0f4PPx7/qHj/a1U9T7cM4GDSrfQvSUp7gbMWF28MTElEUQv3EF1YhkyE1jJE09YAMuATUvXQ2fZTqdRtTBeDfzF9s/GOdY1dSCwHLgRQNLTgc8A/0h14fhNkl4x2sa2v297b2AG8Gvg5LLqU4CB7WxvQvVLXWsZ61Yd72eyqjXT6Q/AV0syHHo91fa8LvHGBJREEL1wBvDP5ULwVKqT21m2V3YWsn0rVffDsZI2lLQbVZ93Z5mfUXVLHM/YWgONKL/Y30PVffQh24+VVScB59n+oe1lwL9SjTx60ij72L8kwIeBB6i+I8DG5fOfS7/9B8ch7I9KeoqkF1JdMzlrhDJfA/aT9FpJ60t6sqphtVt2iTcmoCSC6IVTqU7aVwK3AA8BR4xS9q3AS6m6WI4GTh+hzOnAdlQnq365V9KDVBeQ9wUOsn0qgKQDqLpRHj9p2/4S1S/vj42wr/WA/1XW3011DeCfyrpjqYbm/pnq4m23aw11XAHcBFwKfNr2xcML2P4DsD/VRecVVC2ED5ZYVxdvTEDKg2liopH0dmCu7d36HctEImkWVSKeMrw1Fu2WFkFMKJKeQjUef36/Y4mYLJIIYsKQ9Fqqboo7qe43iIhxkK6hiIiWS4sgIqLlNuhepP8233xzz5o1q99hRERMKAsXLrzL9kC3chMiEcyaNYsFCxb0O4yIiAlF0q3dS6VrKCKi9ZIIIiJaLokgIqLlkggiIlouiSAiouWSCCIiWq6xRFCmrf1FedbpEknHluVfKc84XVReq3tKVERENKzJ+wgeBva0/YCkKcCPJX23rPug7W82eOyIiKipsUTgahKjB8rHKeWViY0iItYxjd5ZLGl9YCHwHOALtq+S9E/AJyV9jOrBGEfZfniEbecCcwFmzhz+tMKY6GYddWGtckvnvb7hSCKi0YvFth+1PRvYEthZ0ouADwHbAC8BNgOOHGXb+bYHbQ8ODHSdKiMiItZQT0YN2b4X+CHwOtvLXHkY+DKwcy9iiIiIkTU5amhA0rTyfiNgb+DXkmaUZQIOAK5rKoaIiOiuyWsEM4DTynWC9YCzbX9H0mWSBgABi4B3NRhDRER00eSooWuBHUZYvmdTx4yIiLHLncURES2XRBAR0XJJBBERLZdEEBHRckkEEREtl0QQEdFySQQRES2XRBAR0XJJBBERLZdEEBHRckkEEREtl0QQEdFySQQRES2XRBAR0XJJBBERLZdEEBHRckkEEREt1+SjKqNlZh11Yb9DiIg1kBZBRETLJRFERLRcY4lA0pMl/ULSryQtkXRsWb61pKsk3STpLEkbNhVDRER012SL4GFgT9vbA7OB10naBTgO+Izt5wD3AO9oMIaIiOiisUTgygPl45TyMrAn8M2y/DTggKZiiIiI7hodNSRpfWAh8BzgC8DvgHttryxFbgO2GGXbucBcgJkzZzYZZqzD6o5EWjrv9Q1HEjF5NXqx2PajtmcDWwI7A9uMYdv5tgdtDw4MDDQWY0RE2/Vk1JDte4EfAi8DpkkaaolsCdzeixgiImJkTY4aGpA0rbzfCNgbuIEqIbyxFDsUOL+pGCIiorsmrxHMAE4r1wnWA862/R1J1wNnSvoE8EvglAZjiIiILhpLBLavBXYYYfnNVNcLIiJiHZC5hqKriTCHUEYXRay5TDEREdFySQQRES2XRBAR0XJJBBERLZdEEBHRchk1NAlNhFE+EbHuSIsgIqLlkggiIlouiSAiouWSCCIiWi6JICKi5ZIIIiJaLokgIqLlkggiIlouiSAiouWSCCIiWi6JICKi5TLXULRKnmQW8V+lRRAR0XJJBBERLddYIpC0laQfSrpe0hJJ7yvLj5F0u6RF5bVvUzFERER3TV4jWAn8i+1rJG0MLJR0SVn3GdufbvDYERFRU2OJwPYyYFl5f7+kG4AtmjpeRESsmZ5cI5A0C9gBuKoseo+kayWdKmnTUbaZK2mBpAUrVqzoRZgREa3UeCKQNBX4FvB+2/cBXwSeDcymajEcP9J2tufbHrQ9ODAw0HSYERGt1WgikDSFKgl83fY5ALbvtP2o7ceAk4Gdm4whIiJWr8lRQwJOAW6wfULH8hkdxQ4ErmsqhoiI6K7JUUO7AocAiyUtKss+DMyRNBswsBR4Z4MxREREF02OGvoxoBFWXdTUMSMiYuxyZ3FERMslEUREtFwSQUREyyURRES0XBJBRETLJRFERLRcEkFERMslEUREtFwSQUREy3VNBJKeLelJ5f3ukt4raVrzoUVERC/UaRF8C3hU0nOA+cBWwDcajSoiInqmTiJ4zPZKqplCP2/7g8CMLttERMQEUWfSub9JmgMcCuxXlk1pLqSI/pt11IW1yi2d9/qGI4loXp0WweHAy4BP2r5F0tbAV5sNKyIieqVri8D29ZKOBGaWz7cAxzUdWERE9EadUUP7AYuA75XPsyVd0HRgERHRG3W6ho6heq7wvQC2FwHPajCmiIjooTqJ4G+2/zxs2WNNBBMREb1XZ9TQEklvBdaX9FzgvcBPmw0rIiJ6pU6L4AjghcDDwBnAfcD7mwwqIiJ6p86oob8A/7u8apO0FXA6MB0wMN/25yRtBpwFzAKWAm+yfc/Ywo6IiPEyaiKQ9G2qE/iIbP/3LvteCfyL7WskbQwslHQJcBhwqe15ko4CjgKOHHPkERExLlbXIvj02uzY9jJgWXl/v6QbgC2A/YHdS7HTgMtJIoiI6JtRE4HtK4beS9oQ2IaqhXCj7UfGchBJs4AdgKuA6SVJAPyRqutopG3mAnMBZs6cOZbDRUTEGNS5oez1wO+AE4GTgJsk7VP3AJKmUs1g+n7b93Wus21G6X6yPd/2oO3BgYGBuoeLiIgxqjN89HhgD9s3QfV8AuBC4LvdNpQ0hSoJfN32OWXxnZJm2F4maQawfM1Cj4iI8VBn+Oj9Q0mguBm4v9tGkgScAtxg+4SOVRdQzWRK+Xt+zVgjIqIBdVoECyRdBJxN1Y1zEHC1pL8H6PilP9yuwCHAYkmLyrIPA/OAsyW9A7gVeNNaxB8REWupTiJ4MnAn8KryeQWwEdWzCQyMmAhs/xjQKPt89djCjIiIptS5oezwXgQSERH90TURlAfRHEF1J/Dj5WvcUBYRERNAna6h86gu+n6bzDoaETHp1EkED9k+sfFIIiKiL+okgs9JOhq4mGoGUgBsX9NYVBER0TN1EsF2VMNA92RV15DL54iImODqJIKDgGeNdX6hiIiYGOrcWXwdMK3pQCIioj/qtAimAb+WdDVPvEaQ4aMREZNAnURwdONRRERE39S5s/iKbmUiImLiqvM8gl0kXS3pAUmPSHpU0n3dtouIiImhzsXik4A5wG+pJpv7B+ALTQYVERG9UycRUJ5HsL7tR21/GXhds2FFRESv1LlY/JfyzOJFkv6D6oH0tRJIRESs++qc0A8p5d4DPAhsBbyhyaAiIqJ36owaurW8fUjSicBWwx5dGRERE1idUUOXS9pE0mbANcDJkk7otl1EREwMdbqGnmb7PuDvgdNtvxTYq9mwIiKiV+okgg0kzaB6yPx3Go4nIiJ6rE4i+Dfg+8BNtq+W9CyqewoiImIS6JoIbP+n7Rfbfnf5fLPtrqOGJJ0qabmk6zqWHSPpdkmLymvftQs/IiLWVpP3A3yFkW88+4zt2eV1UYPHj4iIGhpLBLavBO5uav8RETE++nGH8HskXVu6jjYdrZCkuZIWSFqwYsWKXsYXEdEqXW8ok/QkqjuJZ3WWt/1va3C8LwIfp3rm8ceB44H/MVJB2/OB+QCDg4Neg2NFREQNdeYaOh/4M7CQjieUrQnbdw69l3QyGY4aEdF3dRLBlrbHZbZRSTNsLysfD6R6HnJERPRRnUTwU0nb2V48lh1LOgPYHdhc0m1Uj7zcXdJsqq6hpcA7xxZuRESMtzqJYDfgMEm3UHUNCbDtF69uI9tzRlh8ythDjIiIJtVJBPs0HkVERPTNWKahjoiISShPGouIaLkkgoiIlksiiIhouSSCiIiWSyKIiGi5JIKIiJZLIoiIaLkkgoiIlksiiIhouSSCiIiWqzPXUKyhWUddWKvc0nmvbziSiIjRpUUQEdFySQQRES2XRBAR0XJJBBERLZdEEBHRchk1FLEW+jkyLKPSYrykRRAR0XKNJQJJp0paLum6jmWbSbpE0m/L302bOn5ERNTTZIvgK8Drhi07CrjU9nOBS8vniIjoo8YSge0rgbuHLd4fOK28Pw04oKnjR0REPb2+RjDd9rLy/o/A9B4fPyIihunbqCHbluTR1kuaC8wFmDlzZs/i6qbuSI1+7zMioq5etwjulDQDoPxdPlpB2/NtD9oeHBgY6FmAERFt0+tEcAFwaHl/KHB+j48fERHDNDl89AzgZ8DzJd0m6R3APGBvSb8F9iqfIyKijxq7RmB7ziirXt3UMSMiYuxyZ3FERMtlrqGIHhjLyLDMDRS9lhZBRETLJRFERLRcEkFERMslEUREtFwSQUREyyURRES0XBJBRETLJRFERLRcEkFERMslEUREtFymmIhYx4z3g4rq7q/u1Bbjvb/ov7QIIiJaLokgIqLlkggiIlouiSAiouWSCCIiWi6JICKi5ZIIIiJaLokgIqLl+nJDmaSlwP3Ao8BK24P9iCMiIvp7Z/Eetu/q4/EjIoJ0DUVEtF6/EoGBiyUtlDR3pAKS5kpaIGnBihUrehxeRER79CsR7GZ7R2Af4H9KeuXwArbn2x60PTgwMND7CCMiWqIvicD27eXvcuBcYOd+xBEREX1IBJKeKmnjoffAa4Dreh1HRERU+jFqaDpwrqSh43/D9vf6EEdERNCHRGD7ZmD7Xh83IiJGlieURUQjxvKktTzNrL9yH0FERMslEUREtFwSQUREyyURRES0XBJBRETLZdRQMZYRDhGT0WT6P1D3u2S0UiUtgoiIlksiiIhouSSCiIiWSyKIiGi5JIKIiJaT7X7H0NXg4KAXLFiwRttOppEQEdEfE3V0kaSFtge7lUuLICKi5ZIIIiJaLokgIqLlkggiIlouiSAiouUy11BExDhpYpRiL0YspUUQEdFySQQRES3Xl0Qg6XWSbpR0k6Sj+hFDRERUep4IJK0PfAHYB9gWmCNp217HERERlX60CHYGbrJ9s+1HgDOB/fsQR0RE0J9RQ1sAf+j4fBvw0uGFJM0F5paPD0i6sQexrY3Ngbv6HcQ6IPWwSupilQldFzpu3HY15npYy2M/s06hdXb4qO35wPx+x1GXpAV1Jnea7FIPq6QuVkldVNbVeuhH19DtwFYdn7csyyIiog/6kQiuBp4raWtJGwJvAS7oQxwREUEfuoZsr5T0HuD7wPrAqbaX9DqOBkyYbqyGpR5WSV2skrqorJP1MCEeTBMREc3JncURES2XRBAR0XJJBGMg6VRJyyVdN8r6p0n6tqRfSVoi6fBex9gLkraS9ENJ15fv+b4RykjSiWUakWsl7diPWJtWsy7eVupgsaSfStq+H7E2qU49dJR9iaSVkt7Yyxh7pW5dSNpd0qJS5opex/kEtvOq+QJeCewIXDfK+g8Dx5X3A8DdwIb9jruBepgB7Fjebwz8Bth2WJl9ge8CAnYBrup33H2si5cDm5b3+0zGuqhTD2Xd+sBlwEXAG/sddx//TUwDrgdmls9P72fMaRGMge0rqU7uoxYBNpYkYGopu7IXsfWS7WW2rynv7wduoLpjvNP+wOmu/ByYJmlGj0NtXJ26sP1T2/eUjz+nundmUqn5bwLgCOBbwPIehtdTNevircA5tn9fyvW1PpIIxtdJwAuAO4DFwPtsP9bfkJolaRawA3DVsFUjTSUy0olh0lhNXXR6B1VLadIarR4kbQEcCHyx91H1x2r+TTwP2FTS5ZIWSnp7r2PrtM5OMTFBvRZYBOwJPBu4RNKPbN/X37CaIWkq1a+790/W71hXnbqQtAdVItitl7H1Upd6+CxwpO3Hqkbz5NalLjYAdgJeDWwE/EzSz23/psdhPh5MjJ/DgXmuOv1uknQLsA3wi/6GNf4kTaH6R/512+eMUKQ1U4nUqAskvRj4ErCP7T/1Mr5eqVEPg8CZJQlsDuwraaXt83oYZk/UqIvbgD/ZfhB4UNKVwPZU1xN6Ll1D4+v3VBkeSdOB5wM39zWiBpRrIKcAN9g+YZRiFwBvL6OHdgH+bHtZz4LskTp1IWkmcA5wSL9+8TWtTj3Y3tr2LNuzgG8C756kSaDO/4/zgd0kbSDpKVQzMN/QqxiHS4tgDCSdAewObC7pNuBoYAqA7f8LfBz4iqTFVKNljrQ9YafeXY1dgUOAxZIWlWUfBmbC43VxEdXIoZuAv1C1liajOnXxMeDvgP9Tfg2v9Do4A+VaqlMPbdG1LmzfIOl7wLXAY8CXbI84LL0XMsVERETLpWsoIqLlkggiIlouiSAiouWSCCIiWi6JICKi5ZIIoqck/TdJZ0r6Xbm1/iJJz5M0S5IlHdFR9iRJh61mX4skndmTwNeSpMMkndTvOIaT9ED5O2u0WXVj8ksiiJ4pN9qcC1xu+9m2dwI+BEwvRZYD7yvPsu62rxdQzWT5CklPHaf4JsV9NZPle0TvJBFEL+0B/K3z5iLbv7L9o/JxBXApcGiNfc0BvgpcTDXTKZK2kfT4dB7lV+7i8n4nSVeUVsj3h2ZCLZN+fVbSAqoktJ+kqyT9UtIPyh3iSBqQdEmZO/5Lkm6VtHlZd7CkX5QWyv+TtH5Zfrik35SYdh3pS0jaTNJ5qp5X8HNJL5a0nqSlkqZ1lPutpOkljm9Jurq8di3rj5H0VUk/KfXSeYypki6VdI2qZyLsX6N+o0WSCKKXXgQs7FLmOOADQyfT1XgzcCZwBlVSwPavgQ0lbd1R5qwy78vnqea/3wk4Ffhkx742tD1o+3jgx8Autnco+//XUuZo4DLbL6SaHmEmPN4yeTOwq+3ZwKPA20qiOZYqAewGbDvK9zgW+KXtF1PdfXp6mbH2fKqZOpH0UuBW23cCnwM+Y/slwBuo5i8asi2wl+05w47xEHCg7R2pkvHxasOsb1FbmpCxTrF9s6SrqOZrH5GkQeAu27+XdDtwqqTNbN8NnE11Yp5X/r6Zas6nF1HNBgtVl1LnvEdndbzfkip5zAA2BG4py3ejnJhtf0/S0PMFXk01i+TVZd8bUXVxvZSqC2xFifksqqmHh9uN6oSO7csk/Z2kTUpMHwO+DLylI8a9gG07zuObqJrlEuAC238dqcqAT0l6JdV0BltQdcf9cYSy0UJJBNFLS4A6jyf8FNWv7tEe3zcH2EbS0vJ5E6qT6clUJ8z/lHQOYNu/lbQdsMT2y0bZ34Md7z8PnGD7Akm7A8d0iVXAabY/9ISF0gFdtuvmZ8BzJA0ABwCfKMvXo2qxPDTsePDE79HpbVRPzNvJ9t9KvT15LeOLSSRdQ9FLlwFPkjR3aEHpE39FZ6HSxXM9sN/wHUhaD3gTsF3HTJb7s6p76HdU3TMfZdWv6BuBAUkvK/uYIumFo8T4NFZNl915reIn5bhIeg2waVl+KfBGSU8v6zaT9EyqB5G8qvzCnwIcNMrxfkR1oqYknrts31emMj8XOIFqFsuhqasvpnrK11B9zB5lv8O/0/KSBPYAnlljm2iRJILomXJyOxDYqwwfXQL8OyN3UXySkR/p+Argdtt3dCy7kqq7ZOhRmGcBB1N1E2H7EaqWyHGSfkX18KCXjxLmMVQtioVA58yxxwKvKUMsDyox32/7euAjwMWSrgUuAWaUKbePofpl/xNGn2L4GGCnsu08nph8hr5HZ9fVe4HBcnH5euBdo+y309fLNouBtwO/rrFNtEhmH42oQdKTgEdtrywtiy+Wi8MRE16uEUTUMxM4u3RNPQL8Y5/jiRg3aRFERLRcrhFERLRcEkFERMslEUREtFwSQUREyyURRES03P8H9AxF0s+Zly8AAAAASUVORK5CYII=\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "average_ploidies = pdx.apply(np.mean)\n", + "p = plt.hist(average_ploidies,bins=30)\n", + "plt.title(\"Ploidy in PDX samples\")\n", + "plt.ylabel(\"n samples\")\n", + "plt.xlabel(\"CN Averaged over all\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>X-1004</th>\n", + " <th>X-1008</th>\n", + " <th>X-1027</th>\n", + " <th>X-1095</th>\n", + " <th>X-1119</th>\n", + " <th>X-1156</th>\n", + " <th>X-1167</th>\n", + " <th>X-1169</th>\n", + " <th>X-1172</th>\n", + " <th>X-1173</th>\n", + " <th>...</th>\n", + " <th>X-5694</th>\n", + " <th>X-5696</th>\n", + " <th>X-5713</th>\n", + " <th>X-5717</th>\n", + " <th>X-5727</th>\n", + " <th>X-5739</th>\n", + " <th>X-5808</th>\n", + " <th>X-5959</th>\n", + " <th>X-5975</th>\n", + " <th>X-6047</th>\n", + " </tr>\n", + " <tr>\n", + " <th>gene_id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>A1BG</th>\n", + " <td>0.367371</td>\n", + " <td>-0.321928</td>\n", + " <td>0.117695</td>\n", + " <td>0.056584</td>\n", + " <td>0.000000</td>\n", + " <td>0.978196</td>\n", + " <td>0.028569</td>\n", + " <td>2.509696</td>\n", + " <td>0.117695</td>\n", + " <td>0.007196</td>\n", + " <td>...</td>\n", + " <td>0.056584</td>\n", + " <td>0.070389</td>\n", + " <td>0.097611</td>\n", + " <td>0.560715</td>\n", + " <td>0.042644</td>\n", + " <td>0.049631</td>\n", + " <td>-0.007232</td>\n", + " <td>0.049631</td>\n", + " <td>-0.483985</td>\n", + " <td>0.021480</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1BG-AS1</th>\n", + " <td>0.367371</td>\n", + " <td>-0.321928</td>\n", + " <td>0.117695</td>\n", + " <td>0.056584</td>\n", + " <td>0.000000</td>\n", + " <td>0.978196</td>\n", + " <td>0.028569</td>\n", + " <td>2.509696</td>\n", + " <td>0.117695</td>\n", + " <td>0.007196</td>\n", + " <td>...</td>\n", + " <td>0.056584</td>\n", + " <td>0.070389</td>\n", + " <td>0.097611</td>\n", + " <td>0.560715</td>\n", + " <td>0.042644</td>\n", + " <td>0.049631</td>\n", + " <td>-0.007232</td>\n", + " <td>0.049631</td>\n", + " <td>-0.483985</td>\n", + " <td>0.021480</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1CF</th>\n", + " <td>0.521051</td>\n", + " <td>0.570463</td>\n", + " <td>0.007196</td>\n", + " <td>0.042644</td>\n", + " <td>0.070389</td>\n", + " <td>-0.340075</td>\n", + " <td>0.007196</td>\n", + " <td>-0.286304</td>\n", + " <td>-0.081614</td>\n", + " <td>-0.007232</td>\n", + " <td>...</td>\n", + " <td>0.028569</td>\n", + " <td>-1.043943</td>\n", + " <td>-0.340075</td>\n", + " <td>0.056584</td>\n", + " <td>-0.036526</td>\n", + " <td>-0.058894</td>\n", + " <td>-0.377070</td>\n", + " <td>-0.643856</td>\n", + " <td>-0.588574</td>\n", + " <td>0.070389</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 375 columns</p>\n", + "</div>" + ], + "text/plain": [ + " X-1004 X-1008 X-1027 X-1095 X-1119 X-1156 \\\n", + "gene_id \n", + "A1BG 0.367371 -0.321928 0.117695 0.056584 0.000000 0.978196 \n", + "A1BG-AS1 0.367371 -0.321928 0.117695 0.056584 0.000000 0.978196 \n", + "A1CF 0.521051 0.570463 0.007196 0.042644 0.070389 -0.340075 \n", + "\n", + " X-1167 X-1169 X-1172 X-1173 ... X-5694 \\\n", + "gene_id ... \n", + "A1BG 0.028569 2.509696 0.117695 0.007196 ... 0.056584 \n", + "A1BG-AS1 0.028569 2.509696 0.117695 0.007196 ... 0.056584 \n", + "A1CF 0.007196 -0.286304 -0.081614 -0.007232 ... 0.028569 \n", + "\n", + " X-5696 X-5713 X-5717 X-5727 X-5739 X-5808 \\\n", + "gene_id \n", + "A1BG 0.070389 0.097611 0.560715 0.042644 0.049631 -0.007232 \n", + "A1BG-AS1 0.070389 0.097611 0.560715 0.042644 0.049631 -0.007232 \n", + "A1CF -1.043943 -0.340075 0.056584 -0.036526 -0.058894 -0.377070 \n", + "\n", + " X-5959 X-5975 X-6047 \n", + "gene_id \n", + "A1BG 0.049631 -0.483985 0.021480 \n", + "A1BG-AS1 0.049631 -0.483985 0.021480 \n", + "A1CF -0.643856 -0.588574 0.070389 \n", + "\n", + "[3 rows x 375 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdx = pdx.applymap(lambda x: np.log2(x/2))\n", + "pdx.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>X-1004</th>\n", + " <th>X-1008</th>\n", + " <th>X-1027</th>\n", + " <th>X-1095</th>\n", + " <th>X-1119</th>\n", + " <th>X-1156</th>\n", + " <th>X-1167</th>\n", + " <th>X-1169</th>\n", + " <th>X-1172</th>\n", + " <th>X-1173</th>\n", + " <th>...</th>\n", + " <th>X-5694</th>\n", + " <th>X-5696</th>\n", + " <th>X-5713</th>\n", + " <th>X-5717</th>\n", + " <th>X-5727</th>\n", + " <th>X-5739</th>\n", + " <th>X-5808</th>\n", + " <th>X-5959</th>\n", + " <th>X-5975</th>\n", + " <th>X-6047</th>\n", + " </tr>\n", + " <tr>\n", + " <th>gene_id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>A1BG</th>\n", + " <td>0.367371</td>\n", + " <td>-0.321928</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.978196</td>\n", + " <td>0.0</td>\n", + " <td>2.509696</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.560715</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.00000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.483985</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1BG-AS1</th>\n", + " <td>0.367371</td>\n", + " <td>-0.321928</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.978196</td>\n", + " <td>0.0</td>\n", + " <td>2.509696</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.560715</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.00000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.483985</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1CF</th>\n", + " <td>0.521051</td>\n", + " <td>0.570463</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>-0.340075</td>\n", + " <td>0.0</td>\n", + " <td>-0.286304</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>-1.043943</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>-0.37707</td>\n", + " <td>-0.643856</td>\n", + " <td>-0.588574</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 375 columns</p>\n", + "</div>" + ], + "text/plain": [ + " X-1004 X-1008 X-1027 X-1095 X-1119 X-1156 X-1167 \\\n", + "gene_id \n", + "A1BG 0.367371 -0.321928 0.0 0.0 0.0 0.978196 0.0 \n", + "A1BG-AS1 0.367371 -0.321928 0.0 0.0 0.0 0.978196 0.0 \n", + "A1CF 0.521051 0.570463 0.0 0.0 0.0 -0.340075 0.0 \n", + "\n", + " X-1169 X-1172 X-1173 ... X-5694 X-5696 X-5713 \\\n", + "gene_id ... \n", + "A1BG 2.509696 0.0 0.0 ... 0.0 0.000000 0.000000 \n", + "A1BG-AS1 2.509696 0.0 0.0 ... 0.0 0.000000 0.000000 \n", + "A1CF -0.286304 0.0 0.0 ... 0.0 -1.043943 -0.340075 \n", + "\n", + " X-5717 X-5727 X-5739 X-5808 X-5959 X-5975 X-6047 \n", + "gene_id \n", + "A1BG 0.560715 0.0 0.0 0.00000 0.000000 -0.483985 0.0 \n", + "A1BG-AS1 0.560715 0.0 0.0 0.00000 0.000000 -0.483985 0.0 \n", + "A1CF 0.000000 0.0 0.0 -0.37707 -0.643856 -0.588574 0.0 \n", + "\n", + "[3 rows x 375 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdx = pdx.applymap(lambda x : clean_logR(x, pos_seg_mean_thr, neg_seg_mean_thr))\n", + "pdx.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mapped: 23313 \n", + "\tdirectly via main_mapper 21188 \n", + "\tvia alternative mapper 466 \n", + "\tvia one of multiple synonyms in alternative mapper 926 \n", + "\tLOC 733 \n", + "Unmapped: 200 \n", + "\trecognized symbols without Entrez ID 0 \n", + "\tmultiple query_ids map to the same target_id 0 \n", + "\tquery_ids map to multiple target_ids in the main mapper 0 \n", + "\tquery_ids map to multiple target_ids in the alternative mapper 52 \n", + "\tLOC not found in Entrez 29 \n", + "\tNot found at all: 119\n", + "Warning: query IDs mapping to duplicated target IDs in mapping table: 77\n", + "Warning: query IDs not mapped to any target IDs excluded: 200\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "IDs mapped to multiple target IDs are kept:\n", + " [143872, 286464, 51463, 642826, 653067, 399761, 647060, 284565, 84631, 161176, 341019, 83869, 9502, 83871, 728113, 729438, 4253, 645425, 26165, 6218, 728695, 100132948, 100134869, 84316, 200030, 642658, 100302179, 401508, 119016, 84458, 574445, 26095, 84968, 80759, 3192, 387707, 79741]\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>X-1004</th>\n", + " <th>X-1008</th>\n", + " <th>X-1027</th>\n", + " <th>X-1095</th>\n", + " <th>X-1119</th>\n", + " <th>X-1156</th>\n", + " <th>X-1167</th>\n", + " <th>X-1169</th>\n", + " <th>X-1172</th>\n", + " <th>X-1173</th>\n", + " <th>...</th>\n", + " <th>X-5694</th>\n", + " <th>X-5696</th>\n", + " <th>X-5713</th>\n", + " <th>X-5717</th>\n", + " <th>X-5727</th>\n", + " <th>X-5739</th>\n", + " <th>X-5808</th>\n", + " <th>X-5959</th>\n", + " <th>X-5975</th>\n", + " <th>X-6047</th>\n", + " </tr>\n", + " <tr>\n", + " <th>gene_id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.367371</td>\n", + " <td>-0.321928</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.978196</td>\n", + " <td>0.0</td>\n", + " <td>2.509696</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.560715</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.483985</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.761285</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.500802</td>\n", + " <td>0.0</td>\n", + " <td>0.700440</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.201634</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.739848</td>\n", + " <td>0.0</td>\n", + " <td>0.739848</td>\n", + " <td>0.327687</td>\n", + " <td>-0.494109</td>\n", + " <td>-0.535332</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>0.761285</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.500802</td>\n", + " <td>0.0</td>\n", + " <td>0.700440</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.201634</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.739848</td>\n", + " <td>0.0</td>\n", + " <td>0.739848</td>\n", + " <td>0.327687</td>\n", + " <td>-0.494109</td>\n", + " <td>-0.535332</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 375 columns</p>\n", + "</div>" + ], + "text/plain": [ + " X-1004 X-1008 X-1027 X-1095 X-1119 X-1156 X-1167 \\\n", + "gene_id \n", + "1 0.367371 -0.321928 0.0 0.000000 0.0 0.978196 0.0 \n", + "2 0.761285 0.000000 0.0 0.500802 0.0 0.700440 0.0 \n", + "3 0.761285 0.000000 0.0 0.500802 0.0 0.700440 0.0 \n", + "\n", + " X-1169 X-1172 X-1173 ... X-5694 X-5696 X-5713 X-5717 \\\n", + "gene_id ... \n", + "1 2.509696 0.000000 0.0 ... 0.0 0.0 0.0 0.560715 \n", + "2 0.000000 0.201634 0.0 ... 0.0 0.0 0.0 0.739848 \n", + "3 0.000000 0.201634 0.0 ... 0.0 0.0 0.0 0.739848 \n", + "\n", + " X-5727 X-5739 X-5808 X-5959 X-5975 X-6047 \n", + "gene_id \n", + "1 0.0 0.000000 0.000000 0.000000 -0.483985 0.0 \n", + "2 0.0 0.739848 0.327687 -0.494109 -0.535332 0.0 \n", + "3 0.0 0.739848 0.327687 -0.494109 -0.535332 0.0 \n", + "\n", + "[3 rows x 375 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdx,query2target,not_mapped = apply_mappers(pdx, ncbi_symbols, ncbi_synonyms, verbose = True,handle_duplicates = \"keep\")\n", + "pdx.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>X-1004</th>\n", + " <th>X-1008</th>\n", + " <th>X-1027</th>\n", + " <th>X-1095</th>\n", + " <th>X-1119</th>\n", + " <th>X-1156</th>\n", + " <th>X-1167</th>\n", + " <th>X-1169</th>\n", + " <th>X-1172</th>\n", + " <th>X-1173</th>\n", + " <th>...</th>\n", + " <th>X-5694</th>\n", + " <th>X-5696</th>\n", + " <th>X-5713</th>\n", + " <th>X-5717</th>\n", + " <th>X-5727</th>\n", + " <th>X-5739</th>\n", + " <th>X-5808</th>\n", + " <th>X-5959</th>\n", + " <th>X-5975</th>\n", + " <th>X-6047</th>\n", + " </tr>\n", + " <tr>\n", + " <th>gene_id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>143872</th>\n", + " <td>0.000000</td>\n", + " <td>0.560715</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.330973</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.350497</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.349235</td>\n", + " <td>0.000000</td>\n", + " <td>-0.321928</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.514573</td>\n", + " </tr>\n", + " <tr>\n", + " <th>143872</th>\n", + " <td>0.000000</td>\n", + " <td>0.560715</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.330973</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.350497</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.349235</td>\n", + " <td>0.000000</td>\n", + " <td>-0.321928</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.514573</td>\n", + " </tr>\n", + " <tr>\n", + " <th>286464</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>-1.321928</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>286464</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>-1.321928</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>286464</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>-1.321928</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>51463</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.839960</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>-0.349235</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>51463</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.839960</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.438293</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>642826</th>\n", + " <td>0.608809</td>\n", + " <td>0.859970</td>\n", + " <td>0.531069</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.871844</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-1.494109</td>\n", + " <td>-0.312939</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>642826</th>\n", + " <td>0.608809</td>\n", + " <td>0.859970</td>\n", + " <td>0.531069</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.871844</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-1.494109</td>\n", + " <td>-0.312939</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>653067</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>653067</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>653067</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>653067</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>399761</th>\n", + " <td>0.531069</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.790772</td>\n", + " <td>...</td>\n", + " <td>0.367371</td>\n", + " <td>-1.535332</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.599462</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>399761</th>\n", + " <td>0.531069</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.790772</td>\n", + " <td>...</td>\n", + " <td>0.367371</td>\n", + " <td>-1.535332</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.599462</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>647060</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.312939</td>\n", + " <td>0.000000</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.621488</td>\n", + " <td>0.000000</td>\n", + " <td>-0.377070</td>\n", + " <td>-0.749038</td>\n", + " <td>-0.405451</td>\n", + " <td>0.000000</td>\n", + " <td>-0.395929</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>647060</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.312939</td>\n", + " <td>0.000000</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.621488</td>\n", + " <td>0.000000</td>\n", + " <td>-0.377070</td>\n", + " <td>-0.749038</td>\n", + " <td>-0.405451</td>\n", + " <td>0.000000</td>\n", + " <td>-0.395929</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>284565</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.570463</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.411426</td>\n", + " <td>0.448901</td>\n", + " <td>...</td>\n", + " <td>0.469886</td>\n", + " <td>-0.504305</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>284565</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.310340</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.411426</td>\n", + " <td>0.448901</td>\n", + " <td>...</td>\n", + " <td>0.469886</td>\n", + " <td>-0.524915</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84631</th>\n", + " <td>-0.823677</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>-1.074001</td>\n", + " <td>0.000000</td>\n", + " <td>-0.875672</td>\n", + " <td>-0.678072</td>\n", + " <td>0.378512</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.500802</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84631</th>\n", + " <td>-0.823677</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>-1.074001</td>\n", + " <td>0.000000</td>\n", + " <td>-0.875672</td>\n", + " <td>-0.678072</td>\n", + " <td>0.378512</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.500802</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>161176</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.358454</td>\n", + " <td>0.250962</td>\n", + " <td>0.339137</td>\n", + " <td>...</td>\n", + " <td>-0.875672</td>\n", + " <td>0.000000</td>\n", + " <td>-0.483985</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.610433</td>\n", + " <td>0.000000</td>\n", + " <td>0.411426</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>161176</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.358454</td>\n", + " <td>0.250962</td>\n", + " <td>0.339137</td>\n", + " <td>...</td>\n", + " <td>-0.875672</td>\n", + " <td>0.000000</td>\n", + " <td>-0.483985</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.610433</td>\n", + " <td>0.000000</td>\n", + " <td>0.411426</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>341019</th>\n", + " <td>0.959770</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.304006</td>\n", + " <td>0.000000</td>\n", + " <td>-0.349235</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.666576</td>\n", + " <td>0.000000</td>\n", + " <td>-0.304006</td>\n", + " <td>-1.000000</td>\n", + " <td>-0.610433</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.444184</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>341019</th>\n", + " <td>0.959770</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.304006</td>\n", + " <td>0.000000</td>\n", + " <td>-0.349235</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.312939</td>\n", + " <td>0.000000</td>\n", + " <td>-0.304006</td>\n", + " <td>-1.000000</td>\n", + " <td>-0.610433</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.444184</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>83869</th>\n", + " <td>-3.000000</td>\n", + " <td>-3.184425</td>\n", + " <td>-3.321928</td>\n", + " <td>-3.000000</td>\n", + " <td>-2.599462</td>\n", + " <td>-3.556393</td>\n", + " <td>-2.785875</td>\n", + " <td>-0.875672</td>\n", + " <td>-2.736966</td>\n", + " <td>-2.514573</td>\n", + " <td>...</td>\n", + " <td>-2.556393</td>\n", + " <td>-3.943416</td>\n", + " <td>-2.785875</td>\n", + " <td>-2.152003</td>\n", + " <td>-3.556393</td>\n", + " <td>-3.251539</td>\n", + " <td>-2.736966</td>\n", + " <td>-3.473931</td>\n", + " <td>-3.120294</td>\n", + " <td>-3.643856</td>\n", + " </tr>\n", + " <tr>\n", + " <th>83869</th>\n", + " <td>-3.000000</td>\n", + " <td>-3.184425</td>\n", + " <td>-3.321928</td>\n", + " <td>-3.000000</td>\n", + " <td>-2.599462</td>\n", + " <td>-3.556393</td>\n", + " <td>-2.785875</td>\n", + " <td>-0.875672</td>\n", + " <td>-2.736966</td>\n", + " <td>-2.514573</td>\n", + " <td>...</td>\n", + " <td>-2.556393</td>\n", + " <td>-3.943416</td>\n", + " <td>-2.785875</td>\n", + " <td>-2.152003</td>\n", + " <td>-3.556393</td>\n", + " <td>-3.251539</td>\n", + " <td>-2.736966</td>\n", + " <td>-3.473931</td>\n", + " <td>-3.120294</td>\n", + " <td>-3.643856</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9502</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9502</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>83871</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.500802</td>\n", + " <td>0.000000</td>\n", + " <td>-0.304006</td>\n", + " <td>0.000000</td>\n", + " <td>-0.349235</td>\n", + " <td>0.327687</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.463947</td>\n", + " <td>-0.358454</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>-0.535332</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100134869</th>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.232661</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.389567</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.463947</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.790772</td>\n", + " <td>0.000000</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100134869</th>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.232661</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.389567</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.463947</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.790772</td>\n", + " <td>0.000000</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84316</th>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>-0.349235</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.985645</td>\n", + " <td>-0.358454</td>\n", + " <td>-0.340075</td>\n", + " <td>-0.971431</td>\n", + " <td>-0.545824</td>\n", + " <td>0.310340</td>\n", + " <td>0.438293</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84316</th>\n", + " <td>-0.810966</td>\n", + " <td>-0.689660</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.731183</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>1.060047</td>\n", + " <td>0.490570</td>\n", + " <td>0.000000</td>\n", + " <td>-0.632629</td>\n", + " <td>-0.655172</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>200030</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.839960</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.438293</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>200030</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.839960</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.438293</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>642658</th>\n", + " <td>1.049631</td>\n", + " <td>1.358959</td>\n", + " <td>0.000000</td>\n", + " <td>0.599318</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.321928</td>\n", + " <td>0.700440</td>\n", + " <td>0.298658</td>\n", + " <td>...</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>0.769772</td>\n", + " <td>0.232661</td>\n", + " <td>1.121015</td>\n", + " <td>0.831877</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>1.121015</td>\n", + " <td>0.459432</td>\n", + " </tr>\n", + " <tr>\n", + " <th>642658</th>\n", + " <td>1.049631</td>\n", + " <td>1.358959</td>\n", + " <td>0.000000</td>\n", + " <td>0.599318</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.321928</td>\n", + " <td>0.700440</td>\n", + " <td>0.298658</td>\n", + " <td>...</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>0.769772</td>\n", + " <td>0.232661</td>\n", + " <td>1.121015</td>\n", + " <td>0.831877</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>1.121015</td>\n", + " <td>0.459432</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100302179</th>\n", + " <td>0.778209</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.201634</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.463947</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100302179</th>\n", + " <td>0.778209</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.201634</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.463947</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>401508</th>\n", + " <td>0.232661</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>-0.666576</td>\n", + " <td>1.629939</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-2.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.330973</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>401508</th>\n", + " <td>0.232661</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>-0.666576</td>\n", + " <td>1.629939</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-2.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.330973</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>119016</th>\n", + " <td>0.378512</td>\n", + " <td>0.570463</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.043943</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.377070</td>\n", + " <td>-0.643856</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>119016</th>\n", + " <td>0.448901</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.242977</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84458</th>\n", + " <td>0.389567</td>\n", + " <td>-0.666576</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.610433</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84458</th>\n", + " <td>0.389567</td>\n", + " <td>-0.666576</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.610433</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>574445</th>\n", + " <td>0.490570</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.074001</td>\n", + " <td>0.000000</td>\n", + " <td>0.238787</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>-0.823677</td>\n", + " <td>-0.957356</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>-0.985645</td>\n", + " <td>-0.454032</td>\n", + " <td>0.000000</td>\n", + " <td>-0.599462</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>574445</th>\n", + " <td>0.490570</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.074001</td>\n", + " <td>0.000000</td>\n", + " <td>0.238787</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>-0.823677</td>\n", + " <td>-0.957356</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>-0.985645</td>\n", + " <td>-0.454032</td>\n", + " <td>0.000000</td>\n", + " <td>-0.599462</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26095</th>\n", + " <td>0.531069</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.790772</td>\n", + " <td>...</td>\n", + " <td>0.269033</td>\n", + " <td>-1.043943</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26095</th>\n", + " <td>0.531069</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.790772</td>\n", + " <td>...</td>\n", + " <td>0.269033</td>\n", + " <td>-1.043943</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84968</th>\n", + " <td>-0.736966</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.570463</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.875672</td>\n", + " <td>-0.678072</td>\n", + " <td>0.378512</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84968</th>\n", + " <td>-0.736966</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.570463</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.875672</td>\n", + " <td>-0.678072</td>\n", + " <td>0.378512</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>80759</th>\n", + " <td>-0.556393</td>\n", + " <td>0.599318</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.567041</td>\n", + " <td>-0.556393</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.621488</td>\n", + " <td>-0.454032</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.621488</td>\n", + " <td>0.761285</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>80759</th>\n", + " <td>-0.556393</td>\n", + " <td>0.599318</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.567041</td>\n", + " <td>-0.556393</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.621488</td>\n", + " <td>-0.454032</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.621488</td>\n", + " <td>0.761285</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3192</th>\n", + " <td>1.350497</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.220330</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>-0.268817</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.480265</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.330973</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>1.021480</td>\n", + " <td>0.599318</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3192</th>\n", + " <td>1.350497</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.220330</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>-0.268817</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.480265</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.330973</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>1.021480</td>\n", + " <td>0.599318</td>\n", + " </tr>\n", + " <tr>\n", + " <th>387707</th>\n", + " <td>0.389567</td>\n", + " <td>-0.666576</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.610433</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>387707</th>\n", + " <td>0.389567</td>\n", + " <td>-0.666576</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.610433</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>79741</th>\n", + " <td>0.000000</td>\n", + " <td>0.589763</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.242977</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>-0.798366</td>\n", + " <td>-0.985645</td>\n", + " <td>-0.358454</td>\n", + " <td>0.000000</td>\n", + " <td>0.700440</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>-0.588574</td>\n", + " <td>0.459432</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>79741</th>\n", + " <td>0.000000</td>\n", + " <td>0.589763</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.242977</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.220330</td>\n", + " <td>...</td>\n", + " <td>-0.798366</td>\n", + " <td>-0.985645</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.700440</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>-0.588574</td>\n", + " <td>0.459432</td>\n", + " <td>0.269033</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>77 rows × 375 columns</p>\n", + "</div>" + ], + "text/plain": [ + " X-1004 X-1008 X-1027 X-1095 X-1119 X-1156 \\\n", + "gene_id \n", + "143872 0.000000 0.560715 0.000000 0.000000 0.000000 -0.330973 \n", + "143872 0.000000 0.560715 0.000000 0.000000 0.000000 -0.330973 \n", + "286464 0.000000 0.378512 0.000000 0.550901 0.000000 -0.524915 \n", + "286464 0.000000 0.378512 0.000000 0.550901 0.000000 -0.524915 \n", + "286464 0.000000 0.378512 0.000000 0.550901 0.000000 -0.524915 \n", + "51463 1.238787 1.090853 0.000000 0.000000 0.000000 1.839960 \n", + "51463 1.238787 1.090853 0.000000 0.000000 0.000000 1.839960 \n", + "642826 0.608809 0.859970 0.531069 0.000000 0.000000 0.871844 \n", + "642826 0.608809 0.859970 0.531069 0.000000 0.000000 0.871844 \n", + "653067 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "653067 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "653067 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "653067 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "399761 0.531069 0.718088 0.000000 0.000000 0.000000 -0.251539 \n", + "399761 0.531069 0.718088 0.000000 0.000000 0.000000 -0.251539 \n", + "647060 0.000000 0.000000 0.000000 0.000000 0.000000 -0.312939 \n", + "647060 0.000000 0.000000 0.000000 0.000000 0.000000 -0.312939 \n", + "284565 1.238787 1.090853 0.000000 0.000000 0.000000 1.570463 \n", + "284565 1.238787 1.090853 0.000000 0.000000 0.000000 1.310340 \n", + "84631 -0.823677 -0.524915 0.000000 0.632268 0.000000 -1.074001 \n", + "84631 -0.823677 -0.524915 0.000000 0.632268 0.000000 -1.074001 \n", + "161176 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "161176 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "341019 0.959770 0.000000 0.000000 0.000000 0.000000 -0.304006 \n", + "341019 0.959770 0.000000 0.000000 0.000000 0.000000 -0.304006 \n", + "83869 -3.000000 -3.184425 -3.321928 -3.000000 -2.599462 -3.556393 \n", + "83869 -3.000000 -3.184425 -3.321928 -3.000000 -2.599462 -3.556393 \n", + "9502 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "9502 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "83871 0.000000 0.000000 0.000000 0.500802 0.000000 -0.304006 \n", + "... ... ... ... ... ... ... \n", + "100134869 0.000000 0.207893 0.000000 0.000000 0.000000 0.000000 \n", + "100134869 0.000000 0.207893 0.000000 0.000000 0.000000 0.000000 \n", + "84316 -0.251539 0.000000 0.000000 0.000000 0.000000 -0.340075 \n", + "84316 -0.810966 -0.689660 0.000000 0.000000 0.000000 0.718088 \n", + "200030 1.238787 1.090853 0.000000 0.000000 0.000000 1.839960 \n", + "200030 1.238787 1.090853 0.000000 0.000000 0.000000 1.839960 \n", + "642658 1.049631 1.358959 0.000000 0.599318 0.000000 0.000000 \n", + "642658 1.049631 1.358959 0.000000 0.599318 0.000000 0.000000 \n", + "100302179 0.778209 0.000000 0.000000 0.000000 0.000000 -0.340075 \n", + "100302179 0.778209 0.000000 0.000000 0.000000 0.000000 -0.340075 \n", + "401508 0.232661 0.000000 0.000000 0.000000 0.250962 0.000000 \n", + "401508 0.232661 0.000000 0.000000 0.000000 0.250962 0.000000 \n", + "119016 0.378512 0.570463 0.000000 0.000000 0.000000 -0.251539 \n", + "119016 0.448901 -0.286304 0.000000 0.000000 0.000000 -0.242977 \n", + "84458 0.389567 -0.666576 0.000000 0.000000 0.000000 -0.340075 \n", + "84458 0.389567 -0.666576 0.000000 0.000000 0.000000 -0.340075 \n", + "574445 0.490570 0.000000 -0.286304 0.000000 0.000000 -1.074001 \n", + "574445 0.490570 0.000000 -0.286304 0.000000 0.000000 -1.074001 \n", + "26095 0.531069 0.000000 0.000000 0.000000 0.000000 -0.251539 \n", + "26095 0.531069 0.000000 0.000000 0.000000 0.000000 -0.251539 \n", + "84968 -0.736966 -0.321928 0.000000 0.570463 0.000000 0.000000 \n", + "84968 -0.736966 -0.321928 0.000000 0.570463 0.000000 0.000000 \n", + "80759 -0.556393 0.599318 0.000000 0.000000 0.000000 0.250962 \n", + "80759 -0.556393 0.599318 0.000000 0.000000 0.000000 0.250962 \n", + "3192 1.350497 0.632268 0.000000 0.000000 0.000000 0.220330 \n", + "3192 1.350497 0.632268 0.000000 0.000000 0.000000 0.220330 \n", + "387707 0.389567 -0.666576 0.000000 0.000000 0.000000 -0.340075 \n", + "387707 0.389567 -0.666576 0.000000 0.000000 0.000000 -0.340075 \n", + "79741 0.000000 0.589763 0.000000 0.000000 0.000000 -0.242977 \n", + "79741 0.000000 0.589763 0.000000 0.000000 0.000000 -0.242977 \n", + "\n", + " X-1167 X-1169 X-1172 X-1173 ... X-5694 \\\n", + "gene_id ... \n", + "143872 0.000000 -0.367732 0.350497 0.000000 ... 0.000000 \n", + "143872 0.000000 -0.367732 0.350497 0.000000 ... 0.000000 \n", + "286464 0.000000 -0.902389 -1.321928 0.367371 ... 0.000000 \n", + "286464 0.000000 -0.902389 -1.321928 0.367371 ... 0.000000 \n", + "286464 0.000000 -0.902389 -1.321928 0.367371 ... 0.000000 \n", + "51463 0.000000 0.448901 0.000000 0.000000 ... 0.469886 \n", + "51463 0.000000 0.448901 0.000000 0.000000 ... 0.438293 \n", + "642826 0.000000 -0.286304 0.000000 0.000000 ... 0.000000 \n", + "642826 0.000000 -0.286304 0.000000 0.000000 ... 0.000000 \n", + "653067 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "653067 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "653067 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "653067 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "399761 0.000000 -0.286304 0.000000 0.790772 ... 0.367371 \n", + "399761 0.000000 -0.286304 0.000000 0.790772 ... 0.367371 \n", + "647060 0.000000 -0.321928 0.000000 0.618239 ... 0.000000 \n", + "647060 0.000000 -0.321928 0.000000 0.618239 ... 0.000000 \n", + "284565 0.000000 0.448901 0.411426 0.448901 ... 0.469886 \n", + "284565 0.000000 0.448901 0.411426 0.448901 ... 0.469886 \n", + "84631 0.000000 -0.875672 -0.678072 0.378512 ... 0.000000 \n", + "84631 0.000000 -0.875672 -0.678072 0.378512 ... 0.000000 \n", + "161176 0.000000 -0.358454 0.250962 0.339137 ... -0.875672 \n", + "161176 0.000000 -0.358454 0.250962 0.339137 ... -0.875672 \n", + "341019 0.000000 -0.349235 0.000000 0.000000 ... 0.000000 \n", + "341019 0.000000 -0.349235 0.000000 0.000000 ... 0.000000 \n", + "83869 -2.785875 -0.875672 -2.736966 -2.514573 ... -2.556393 \n", + "83869 -2.785875 -0.875672 -2.736966 -2.514573 ... -2.556393 \n", + "9502 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "9502 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "83871 0.000000 -0.349235 0.327687 0.000000 ... 0.000000 \n", + "... ... ... ... ... ... ... \n", + "100134869 0.000000 0.000000 0.232661 0.000000 ... 0.000000 \n", + "100134869 0.000000 0.000000 0.232661 0.000000 ... 0.000000 \n", + "84316 0.000000 -0.349235 -0.588574 0.000000 ... 0.000000 \n", + "84316 0.000000 0.000000 0.731183 0.367371 ... 0.000000 \n", + "200030 0.000000 0.448901 0.000000 0.000000 ... 0.438293 \n", + "200030 0.000000 0.448901 0.000000 0.000000 ... 0.438293 \n", + "642658 0.000000 0.321928 0.700440 0.298658 ... 0.632268 \n", + "642658 0.000000 0.321928 0.700440 0.298658 ... 0.632268 \n", + "100302179 0.000000 0.000000 0.000000 0.000000 ... 0.000000 \n", + "100302179 0.000000 0.000000 0.000000 0.000000 ... 0.000000 \n", + "401508 0.207893 0.000000 -0.666576 1.629939 ... 0.000000 \n", + "401508 0.207893 0.000000 -0.666576 1.629939 ... 0.000000 \n", + "119016 0.000000 -0.286304 0.000000 0.000000 ... 0.000000 \n", + "119016 0.000000 -0.286304 0.000000 0.000000 ... 0.000000 \n", + "84458 0.000000 -0.367732 0.000000 0.000000 ... 0.000000 \n", + "84458 0.000000 -0.367732 0.000000 0.000000 ... 0.000000 \n", + "574445 0.000000 0.238787 0.000000 0.000000 ... -0.823677 \n", + "574445 0.000000 0.238787 0.000000 0.000000 ... -0.823677 \n", + "26095 0.000000 -0.286304 0.000000 0.790772 ... 0.269033 \n", + "26095 0.000000 -0.286304 0.000000 0.790772 ... 0.269033 \n", + "84968 0.000000 -0.875672 -0.678072 0.378512 ... 0.000000 \n", + "84968 0.000000 -0.875672 -0.678072 0.378512 ... 0.000000 \n", + "80759 0.000000 -0.386468 -0.567041 -0.556393 ... 0.000000 \n", + "80759 0.000000 -0.386468 -0.567041 -0.556393 ... 0.000000 \n", + "3192 0.000000 0.448901 -0.268817 0.000000 ... 0.480265 \n", + "3192 0.000000 0.448901 -0.268817 0.000000 ... 0.480265 \n", + "387707 0.000000 -0.367732 0.000000 0.000000 ... 0.000000 \n", + "387707 0.000000 -0.367732 0.000000 0.000000 ... 0.000000 \n", + "79741 0.000000 0.207893 0.000000 0.000000 ... -0.798366 \n", + "79741 0.000000 0.207893 0.000000 0.220330 ... -0.798366 \n", + "\n", + " X-5696 X-5713 X-5717 X-5727 X-5739 X-5808 \\\n", + "gene_id \n", + "143872 -0.349235 0.000000 -0.321928 -1.014500 -0.588574 0.000000 \n", + "143872 -0.349235 0.000000 -0.321928 -1.014500 -0.588574 0.000000 \n", + "286464 -1.014500 0.718088 0.000000 -1.014500 0.000000 0.000000 \n", + "286464 -1.014500 0.718088 0.000000 -1.014500 0.000000 0.000000 \n", + "286464 -1.014500 0.718088 0.000000 -1.014500 0.000000 0.000000 \n", + "51463 0.000000 0.618239 0.201634 0.000000 -0.260152 -0.349235 \n", + "51463 0.000000 0.618239 0.201634 0.000000 -0.260152 0.000000 \n", + "642826 -1.014500 -0.473931 0.000000 0.618239 0.000000 -0.386468 \n", + "642826 -1.014500 -0.473931 0.000000 0.618239 0.000000 -0.386468 \n", + "653067 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "653067 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "653067 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "653067 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "399761 -1.535332 0.000000 0.000000 0.000000 0.000000 -0.599462 \n", + "399761 -1.535332 0.000000 0.000000 0.000000 0.000000 -0.599462 \n", + "647060 -0.621488 0.000000 -0.377070 -0.749038 -0.405451 0.000000 \n", + "647060 -0.621488 0.000000 -0.377070 -0.749038 -0.405451 0.000000 \n", + "284565 -0.504305 0.618239 0.201634 0.000000 -0.260152 0.000000 \n", + "284565 -0.524915 0.618239 0.201634 0.000000 -0.260152 0.000000 \n", + "84631 -1.014500 0.500802 0.000000 0.000000 0.000000 0.000000 \n", + "84631 -1.014500 0.500802 0.000000 0.000000 0.000000 0.000000 \n", + "161176 0.000000 -0.483985 0.000000 0.000000 -0.610433 0.000000 \n", + "161176 0.000000 -0.483985 0.000000 0.000000 -0.610433 0.000000 \n", + "341019 -0.666576 0.000000 -0.304006 -1.000000 -0.610433 0.000000 \n", + "341019 -0.312939 0.000000 -0.304006 -1.000000 -0.610433 0.000000 \n", + "83869 -3.943416 -2.785875 -2.152003 -3.556393 -3.251539 -2.736966 \n", + "83869 -3.943416 -2.785875 -2.152003 -3.556393 -3.251539 -2.736966 \n", + "9502 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "9502 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "83871 0.000000 -0.463947 -0.358454 0.000000 0.000000 0.000000 \n", + "... ... ... ... ... ... ... \n", + "100134869 0.389567 0.000000 0.000000 -0.463947 0.000000 -0.367732 \n", + "100134869 0.389567 0.000000 0.000000 -0.463947 0.000000 -0.367732 \n", + "84316 -0.985645 -0.358454 -0.340075 -0.971431 -0.545824 0.310340 \n", + "84316 0.000000 0.000000 -0.434403 1.060047 0.490570 0.000000 \n", + "200030 0.000000 0.618239 0.201634 0.000000 -0.260152 0.000000 \n", + "200030 0.000000 0.618239 0.201634 0.000000 -0.260152 0.000000 \n", + "642658 0.000000 0.769772 0.232661 1.121015 0.831877 -0.367732 \n", + "642658 0.000000 0.769772 0.232661 1.121015 0.831877 -0.367732 \n", + "100302179 0.000000 0.201634 -0.251539 0.000000 -0.463947 0.000000 \n", + "100302179 0.000000 0.201634 -0.251539 0.000000 -0.463947 0.000000 \n", + "401508 -2.321928 0.000000 0.000000 -0.260152 0.000000 -0.545824 \n", + "401508 -2.321928 0.000000 0.000000 -0.260152 0.000000 -0.545824 \n", + "119016 -1.043943 -0.524915 0.000000 0.000000 0.000000 -0.377070 \n", + "119016 -1.014500 -0.473931 0.000000 0.000000 0.000000 -0.367732 \n", + "84458 -1.014500 -0.473931 0.000000 0.000000 0.000000 -0.386468 \n", + "84458 -1.014500 -0.473931 0.000000 0.000000 0.000000 -0.386468 \n", + "574445 -0.957356 -0.321928 0.000000 -0.985645 -0.454032 0.000000 \n", + "574445 -0.957356 -0.321928 0.000000 -0.985645 -0.454032 0.000000 \n", + "26095 -1.043943 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "26095 -1.043943 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "84968 -1.000000 0.250962 0.000000 0.000000 0.000000 0.000000 \n", + "84968 -1.000000 0.250962 0.000000 0.000000 0.000000 0.000000 \n", + "80759 0.000000 -0.621488 -0.454032 0.000000 0.000000 -0.386468 \n", + "80759 0.000000 -0.621488 -0.454032 0.000000 0.000000 -0.386468 \n", + "3192 0.000000 0.207893 0.000000 0.000000 -0.330973 0.250962 \n", + "3192 0.000000 0.207893 0.000000 0.000000 -0.330973 0.250962 \n", + "387707 -1.014500 -0.473931 0.000000 0.000000 0.000000 -0.386468 \n", + "387707 -1.014500 -0.473931 0.000000 0.000000 0.000000 -0.386468 \n", + "79741 -0.985645 -0.358454 0.000000 0.700440 0.000000 -0.367732 \n", + "79741 -0.985645 0.000000 0.000000 0.700440 0.000000 -0.367732 \n", + "\n", + " X-5959 X-5975 X-6047 \n", + "gene_id \n", + "143872 0.000000 0.000000 -0.514573 \n", + "143872 0.000000 0.000000 -0.514573 \n", + "286464 0.000000 -1.494109 -1.494109 \n", + "286464 0.000000 -1.494109 -1.494109 \n", + "286464 0.000000 -1.494109 -1.494109 \n", + "51463 0.000000 0.469886 0.000000 \n", + "51463 0.000000 0.469886 0.000000 \n", + "642826 -1.494109 -0.312939 0.000000 \n", + "642826 -1.494109 -0.312939 0.000000 \n", + "653067 -0.577767 -1.494109 -1.494109 \n", + "653067 -0.577767 -1.494109 -1.494109 \n", + "653067 -0.577767 -1.494109 -1.494109 \n", + "653067 -0.577767 -1.494109 -1.494109 \n", + "399761 -0.588574 0.000000 0.000000 \n", + "399761 -0.588574 0.000000 0.000000 \n", + "647060 -0.395929 0.000000 0.000000 \n", + "647060 -0.395929 0.000000 0.000000 \n", + "284565 0.000000 0.469886 0.000000 \n", + "284565 0.000000 0.469886 0.000000 \n", + "84631 -0.545824 -0.545824 -0.535332 \n", + "84631 -0.545824 -0.545824 -0.535332 \n", + "161176 0.411426 0.000000 0.000000 \n", + "161176 0.411426 0.000000 0.000000 \n", + "341019 0.000000 -0.444184 0.000000 \n", + "341019 0.000000 -0.444184 0.000000 \n", + "83869 -3.473931 -3.120294 -3.643856 \n", + "83869 -3.473931 -3.120294 -3.643856 \n", + "9502 -0.577767 -1.494109 -1.494109 \n", + "9502 -0.577767 -1.494109 -1.494109 \n", + "83871 0.448901 -0.535332 0.000000 \n", + "... ... ... ... \n", + "100134869 0.790772 0.000000 -0.535332 \n", + "100134869 0.790772 0.000000 -0.535332 \n", + "84316 0.438293 -0.473931 0.000000 \n", + "84316 -0.632629 -0.655172 0.000000 \n", + "200030 0.000000 0.469886 0.000000 \n", + "200030 0.000000 0.469886 0.000000 \n", + "642658 0.000000 1.121015 0.459432 \n", + "642658 0.000000 1.121015 0.459432 \n", + "100302179 0.000000 0.000000 0.000000 \n", + "100302179 0.000000 0.000000 0.000000 \n", + "401508 -0.330973 0.000000 0.000000 \n", + "401508 -0.330973 0.000000 0.000000 \n", + "119016 -0.643856 0.000000 0.000000 \n", + "119016 -0.577767 0.000000 0.000000 \n", + "84458 -0.610433 -0.577767 0.000000 \n", + "84458 -0.610433 -0.577767 0.000000 \n", + "574445 -0.599462 -0.577767 0.000000 \n", + "574445 -0.599462 -0.577767 0.000000 \n", + "26095 -0.588574 0.000000 0.000000 \n", + "26095 -0.588574 0.000000 0.000000 \n", + "84968 -0.545824 -0.545824 -0.535332 \n", + "84968 -0.545824 -0.545824 -0.535332 \n", + "80759 -0.621488 0.761285 0.000000 \n", + "80759 -0.621488 0.761285 0.000000 \n", + "3192 0.000000 1.021480 0.599318 \n", + "3192 0.000000 1.021480 0.599318 \n", + "387707 -0.610433 -0.577767 0.000000 \n", + "387707 -0.610433 -0.577767 0.000000 \n", + "79741 -0.588574 0.459432 0.000000 \n", + "79741 -0.588574 0.459432 0.269033 \n", + "\n", + "[77 rows x 375 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dups = list(set(pdx[pdx.index.duplicated(keep=False)].index.values))\n", + "pdx.loc[dups,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "37 duplicated IDs in 77 rows found.\n", + "duplicate rows removed due to low correlation of duplicated profiles 4\n", + "Merged 73 duplicated rows into 35 rows\n" + ] + } + ], + "source": [ + "# most of these dupliates correspond to genes merged in the current assembly, e.g. gene - gene-AS\n", + "pdx = handle_dups(pdx,corr_thr = 0.75)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "pdx = pdx.T.sort_index().T\n", + "pdx.to_csv(preprocessed_dir+\"/\"+\"PDX\"+\".Segment_Mean.CNA.tsv\",\n", + " sep = \"\\t\",header=True,index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluation of the results\n", + "1). How many common genes between four datasets?\n", + "\n", + "2). Do CNA profiles of the same cell line from GDSC and CCLE correlate?\n", + "\n", + "3). Do CNA profiles of the same cancer type from TCGA and PDX look similar?\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "ename": "IOError", + "evalue": "File /home/olya/SFU/Hossein/v1/preprocessed/CNA/BRCA.Segment_Mean.CNA.tsv does not exist", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-39-1f476096b0ec>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m### 1). How many common genes between four datasets?\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m# we take BRCA from TCGA because\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtcga\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreprocessed_dir\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"BRCA\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\".Segment_Mean.CNA.tsv\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\\t\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;31m#print(tcga.head(3))\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mgdsc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpreprocessed_dir\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"GDSC\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\".Segment_Mean.CNA.tsv\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0msep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\\t\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 676\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[1;32m 677\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 678\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 679\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 680\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 440\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 441\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 442\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 785\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 786\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 787\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 788\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 789\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 1012\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1013\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1014\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1015\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1016\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'python'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/olya/miniconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m 1706\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'usecols'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0musecols\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1707\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1708\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1709\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1710\u001b[0m \u001b[0mpassed_names\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnames\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mIOError\u001b[0m: File /home/olya/SFU/Hossein/v1/preprocessed/CNA/BRCA.Segment_Mean.CNA.tsv does not exist" + ] + } + ], + "source": [ + "### 1). How many common genes between four datasets?\n", + "# we take BRCA from TCGA because \n", + "tcga = pd.read_csv(preprocessed_dir+\"BRCA\"+\".Segment_Mean.CNA.tsv\",sep = \"\\t\", index_col=0)\n", + "#print(tcga.head(3))\n", + "gdsc = pd.read_csv(preprocessed_dir+\"GDSC\"+\".Segment_Mean.CNA.tsv\",sep = \"\\t\", index_col=0)\n", + "#print(tcga.head(3))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### distribution of logR values in GDSC and CCLE \n", + "cn_values_gdsc = []\n", + "for row in df.iterrows():\n", + " cn_values_gdsc += list(row[1].values)\n", + "cn_values_ccle = []\n", + "for row in cna_table.iterrows():\n", + " cn_values_ccle+= list(row[1].values)\n", + "\n", + "cn_values_gdsc = sorted (cn_values_gdsc)\n", + "cn_values_ccle = sorted (cn_values_ccle)\n", + "plt.figure(figsize=(20,5))\n", + "plt.subplot(121)\n", + "tmp = plt.hist(cn_values_gdsc,bins=100,density = True,range=(-5,4))\n", + "plt.title(\"GDSC\")\n", + "plt.subplot(122)\n", + "tmp = plt.hist(cn_values_ccle,bins=100,density = True, range=(-5,4))\n", + "plt.title(\"CCLE\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}