--- a +++ b/preprocessing_scr/CNA.ipynb @@ -0,0 +1,7240 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import print_function\n", + "import pandas as pd\n", + "import os,sys\n", + "import pybedtools as pbt\n", + "from StringIO import StringIO\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import time\n", + "from mapper import expand, parse_mapping_table, apply_mappers\n", + "%matplotlib inline\n", + "\n", + "\n", + "chr_dict = dict(zip(range(1,22),map(str,range(1,22))))\n", + "chr_dict.update({22: 'X', 23: \"Y\"})\n", + "\n", + "root_dir = \"/home/olya/SFU/Hossein/v2/\"\n", + "gene_coords_file = root_dir + \"ref_GRCh37.p5_top_level.gff3.bed\" # must contain chromosome, start, end and Entrez Gene ID for hg19" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TCGA \n", + "\n", + "Assume that segmentation files from GDAC : http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/*/*snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt are downloaded\n", + "\n", + "1) Filtering segments:\n", + " - segments containing fewer than 5 probes are removed\n", + " - keep only segments with segment mean below -0.23 or above 0.2. This means that one copy gains and losses are detectable when their CCF (cancer cell fraction) is 0.3 or higher. \n", + " \n", + "TODO: remove segments overlapping with germline CNA found in normals (add this as the first step)\n", + "2). 
### functions for TCGA and CCLE #################################
def filter_lowconf_segments(df, num_marker_thr, pos_seg_mean_thr, neg_seg_mean_thr):
    """Drop low-confidence segments from a segmentation table.

    A row is kept only if it has at least `num_marker_thr` probes
    ("Num_Probes") AND its "Segment_Mean" is >= `pos_seg_mean_thr` or
    <= `neg_seg_mean_thr` (i.e. far enough from zero).

    Returns the filtered rows of `df`; the original index labels are kept.
    """
    enough_probes = df["Num_Probes"] >= num_marker_thr
    far_from_zero = (df["Segment_Mean"] >= pos_seg_mean_thr) | (df["Segment_Mean"] <= neg_seg_mean_thr)
    return df[enough_probes & far_from_zero]


def sample_type(barcode):
    """Classify a sample barcode as "Normal" or "Tumor".

    Characters 13..15 (0-based) of the barcode are compared with the codes
    10A/10B/10C/11A/11B/11C; e.g. "TCGA-ZJ-AAXJ-10A-01D-A42Q-01" -> "Normal".
    Everything else is reported as "Tumor".
    """
    # NOTE(review): other normal-like codes (e.g. 12-14, or vials beyond C) are
    # treated as tumor here -- confirm this is intended.
    if barcode[13:16] in ("10A", "10B", "10C", "11A", "11B", "11C"):
        return "Normal"
    return "Tumor"


def find_matching_normal(tumor_barcode, barcodes_list):
    """Return all barcodes from `barcodes_list` that are normals of the same patient.

    The patient is identified by the first 12 characters of `tumor_barcode`.
    The result is a (possibly empty) list, in the order of `barcodes_list`.
    """
    patient_id = tumor_barcode[:12]
    return [barcode for barcode in barcodes_list
            if barcode.startswith(patient_id) and sample_type(barcode) == "Normal"]


def cnv2bed(seg):
    """Convert a segmentation DataFrame into a pybedtools BedTool.

    Expects the columns Chromosome, Start, End, Segment_Mean, Sample and
    Num_Probes; the first three are renamed to chrom/start/stop and the
    columns are emitted in BED-like order.
    """
    cnv_bed = seg.rename({"Chromosome": "chrom", "Start": "start", "End": "stop"},
                         axis="columns")
    cnv_bed = cnv_bed.loc[:, ["chrom", "start", "stop", "Segment_Mean", "Sample", "Num_Probes"]]
    return pbt.BedTool.from_dataframe(cnv_bed)


def bed2cnv(cnv_bed):
    """Convert a BedTool (or anything whose str() is tab-separated BED text)
    produced by `cnv2bed` back into a segmentation DataFrame.

    Returns a DataFrame with columns Sample, Chromosome, Start, End,
    Num_Probes, Segment_Mean; an empty frame with these columns is returned
    when the BED text is empty.
    """
    seg_columns = ["Sample", "Chromosome", "Start", "End", "Num_Probes", "Segment_Mean"]
    bed_text = str(cnv_bed)
    if len(bed_text) == 0:
        return pd.DataFrame(columns=seg_columns)
    seg = pd.read_csv(StringIO(bed_text), sep="\t", header=None)
    # column order written by cnv2bed
    seg.columns = ["Chromosome", "Start", "End", "Segment_Mean", "Sample", "Num_Probes"]
    return seg.loc[:, seg_columns]


def remove_ovelapping_segments(tumor, normal, sample_name):
    """Remove tumor segments that are also present in the matched normal.

    Both tables are converted to BED and `bedtools subtract` is run with
    A=True (drop the whole tumor segment), f=0.8 and r=True (require a
    reciprocal overlap of at least 80%).

    A warning is printed to stderr when more than 5 segments and more than
    half of all tumor segments were removed. Returns the remaining tumor
    segments as a DataFrame (see `bed2cnv` for the column layout).
    """
    tumor_bed = cnv2bed(tumor)
    normal_bed = cnv2bed(normal)
    tumor_wo_germline = bed2cnv(tumor_bed.subtract(normal_bed, r=True, f=0.8, A=True))
    n_segs_total = tumor.shape[0]
    n_segs_removed = n_segs_total - tumor_wo_germline.shape[0]
    # Same condition as "removed/total > 0.5 and removed > 5", written without
    # a division so that an empty tumor table cannot raise ZeroDivisionError.
    if n_segs_removed > 5 and n_segs_removed > 0.5 * n_segs_total:
        print(n_segs_removed,"of",tumor.shape[0],"segments removed in",sample_name,"due to overlap with normal",file = sys.stderr)
    return tumor_wo_germline


def cnv2genelevel(cnv_bed, gene_intervals_bed, sample_name, verbose=True, sorted_index=""):
    """Aggregate segment-level copy number of one sample to gene level.

    cnv_bed            -- BedTool with the segments of the sample (see `cnv2bed`).
    gene_intervals_bed -- BedTool with gene coordinates.
                          # assumes exactly 4 columns (chrom, start, stop, gene ID),
                          # because the gene ID is read from column 3 and the
                          # segment mean from column 7 of the intersection -- TODO confirm
    sample_name        -- name of the single column of the returned frame.
    verbose            -- report the number of genes hit by several segments.
    sorted_index       -- gene IDs defining rows (and row order) of the result;
                          genes without any overlapping segment get NaN.
                          With the default "" the rows are returned as they are.

    If a gene overlaps more than one segment, the Segment_Mean with the
    largest absolute value is kept.

    Returns a one-column DataFrame indexed by integer gene ID. When nothing
    intersects, an empty DataFrame with the column `sample_name` is returned.
    """
    # intersect
    cnv2gene = str(gene_intervals_bed.intersect(cnv_bed, wb=True, wa=True))
    if len(cnv2gene) == 0:
        print(sample_name,"has no genes with altered CN",file = sys.stderr)
        # The original referenced an undefined global `sample` here.
        return pd.DataFrame(columns=[sample_name])
    cnv2gene = pd.read_csv(StringIO(cnv2gene), sep="\t", header=None)
    cnv2gene = cnv2gene[[3, 7]].copy()
    cnv2gene.columns = ["gene", "Segment_Mean"]

    # genes overlapping more than one segment: keep the most extreme Segment_Mean
    is_dup = cnv2gene.duplicated(subset=["gene"], keep=False)
    if is_dup.any():
        dups = cnv2gene.loc[is_dup].copy()  # copy: a column is added below
        if verbose:
            print(sample_name,"contain ",len(set(dups["gene"].values)),"genes overalpped with more than one segment",file=sys.stderr)
        dups["abs_seg_mean"] = dups["Segment_Mean"].abs()
        # row label of the most extreme segment per gene (first one wins on ties)
        most_extreme = dups.groupby("gene")["abs_seg_mean"].idxmax()
        cnv2gene = pd.concat([cnv2gene.loc[~is_dup], dups.loc[most_extreme]], sort=False)

    cnv2gene = cnv2gene[["gene", "Segment_Mean"]].set_index("gene", drop=True)
    cnv2gene = cnv2gene.rename(int, axis=0)

    # Add genes without any overlapping segment. reindex() inserts them as NaN,
    # which is what .loc[] did on old pandas; .loc[] raises KeyError for
    # missing labels on pandas >= 1.0.
    # NOTE(review): presumably the caller turns these NaNs into 0 (copy-neutral) -- confirm.
    if not (isinstance(sorted_index, str) and sorted_index == ""):
        cnv2gene = cnv2gene.reindex(sorted_index)
    cnv2gene.columns = [sample_name]
    return cnv2gene


### functions for GDSC and PDX #################################

def CN2log2R(col, median_ploidy=2, zero_cn_log2r=-4.32):
    """Translate GDSC copy-number codes of one sample into log2 ratios.

    col           -- Series indexed by gene; every value is a code
                     "max_cn,min_cn,zygosity,disruption", and "-1,-1,-,-"
                     marks a missing value.
    median_ploidy -- copy number used as the denominator of the ratio.
    zero_cn_log2r -- value reported for a copy number of 0, for which log2 is
                     undefined (the default -4.32 is annotated in the original
                     code as "CN=0 with 95% purity").

    For genes not flagged as disrupted ("D") the ratio of max_cn is used; for
    disrupted genes the ratio (of max_cn or min_cn) with the larger absolute
    value is used, max_cn winning ties. Missing codes become NaN.

    Returns a Series of log2 ratios keyed by gene.
    """
    # this is for GDSC only
    lRs = []
    genes = col.index.values
    for code in col.values:
        if code == "-1,-1,-,-":
            lRs.append(np.nan)  # np.NaN was removed in NumPy 2.0
            continue
        max_cn, min_cn, zygosity, disruption = code.split(",")
        if int(max_cn) == 0:
            lRs.append(zero_cn_log2r)
            continue
        max_lR = np.log2(float(max_cn) / median_ploidy)
        if disruption != "D":
            lRs.append(max_lR)
            continue
        if int(min_cn) == 0:
            min_lR = zero_cn_log2r
        else:
            min_lR = np.log2(float(min_cn) / median_ploidy)
        lRs.append(min_lR if abs(min_lR) > abs(max_lR) else max_lR)
    return pd.Series(dict(zip(genes, lRs)))


def define_avg_ploidy(col):
    """Estimate the ploidy of one sample from its GDSC copy-number codes.

    The copy number of a gene is the mean of max_cn and min_cn; codes equal
    to "-1,-1,-,-" are skipped.

    Returns a Series with
      avg_pl    -- mean copy number over all genes with a value,
      median_pl -- median copy number over genes not flagged as disrupted ("D").
    An entry is NaN when there is no gene to compute it from.
    """
    cn_all = []
    cn_non_disrupted = []
    for code in col.values:
        if code == "-1,-1,-,-":
            continue
        max_cn, min_cn, zygosity, disruption = code.split(",")
        cn = (int(max_cn) + int(min_cn)) * 0.5
        cn_all.append(cn)
        if disruption != "D":
            cn_non_disrupted.append(cn)
    # guard against samples without any usable code (was a ZeroDivisionError)
    avg_pl = sum(cn_all) / len(cn_all) if cn_all else np.nan
    median_pl = np.median(cn_non_disrupted) if cn_non_disrupted else np.nan
    return pd.Series({"avg_pl": avg_pl, "median_pl": median_pl})


def clean_logR(logR_value, pos_seg_mean_thr, neg_seg_mean_thr):
    """Return `logR_value` if it is >= `pos_seg_mean_thr` or <= `neg_seg_mean_thr`,
    otherwise 0.
    """
    # NOTE(review): NaN fails both comparisons and is therefore turned into 0 --
    # confirm that missing values are meant to be reported as copy-neutral.
    if logR_value >= pos_seg_mean_thr or logR_value <= neg_seg_mean_thr:
        return logR_value
    return 0


def handle_dups(df, corr_thr=0.75):
    '''Detect duplicated row IDs and merge or drop them.

    Rows sharing an ID are merged into one row if the average pairwise
    correlation of their profiles is >= corr_thr; otherwise all rows with
    that ID are dropped. A merged row holds, per column, the value with the
    largest absolute value.
    NOTE(review): the original docstring said "negative preferred", but on an
    exact tie (max == -min) the expression below returns the positive value --
    confirm which one is intended.

    Returns `df` itself if there are no duplicated IDs, otherwise a new
    DataFrame sorted by index. Progress is reported with print().
    '''
    dup_ids = list(set(df.index[df.index.duplicated()]))
    if len(dup_ids) == 0:
        print("No duplicated row IDs. Do nothing.")
        return df
    print(len(dup_ids), "duplicated IDs in",df.loc[dup_ids,:].shape[0],"rows found.")
    dups_merge = []   # average correlation >= corr_thr
    dups_remove = []  # average correlation <  corr_thr
    for dup in dup_ids:
        profiles = df.loc[dup, :]
        r = profiles.T.corr()
        n_dups = profiles.shape[0]
        # upper triangle of the correlation matrix = all pairwise comparisons
        r_avg = [r.iloc[i, j] for i in range(n_dups) for j in range(i + 1, n_dups)]
        if np.average(r_avg) < corr_thr:
            dups_remove.append(dup)
        else:
            dups_merge.append(dup)

    # remove dissimilar duplicates
    df_size = df.shape[0]
    df = df.loc[~df.index.isin(dups_remove), :]
    print("duplicate rows removed due to low correlation of duplicated profiles",df_size -df.shape[0] )
    df_size = df.shape[0]

    # merge similar duplicates (skipped when nothing qualifies, so that
    # groupby/agg is never run on an empty frame)
    if dups_merge:
        d1 = df.loc[~df.index.isin(dups_merge), :]
        d2 = df.loc[dups_merge, :]
        d2 = d2.groupby(d2.index).agg(lambda x: -max(-x.max(), -x.min(), key=abs))
        df = pd.concat([d1, d2])
    df = df.sort_index()
    print("Merged ",df_size-df.shape[0]+len(dups_merge),"duplicated rows into",len(dups_merge),"rows")
    return df
<td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>8900394</td>\n", + " <td>126596817</td>\n", + " <td>67487.0</td>\n", + " <td>0.0010</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57806</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>126596926</td>\n", + " <td>127130276</td>\n", + " <td>453.0</td>\n", + " <td>-1.0306</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57807</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>127132920</td>\n", + " <td>128342803</td>\n", + " <td>864.0</td>\n", + " <td>-0.0031</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57808</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>128342819</td>\n", + " <td>128350888</td>\n", + " <td>44.0</td>\n", + " <td>0.2824</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57809</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>128353007</td>\n", + " <td>134142530</td>\n", + " <td>3708.0</td>\n", + " <td>0.0082</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Sample Chromosome Start End \\\n", + "57803 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 456120 8896255 \n", + "57804 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 8899400 8899668 \n", + "57805 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 8900394 126596817 \n", + "57806 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 126596926 127130276 \n", + "57807 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 127132920 128342803 \n", + "57808 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 128342819 128350888 \n", + "57809 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 128353007 134142530 \n", + "\n", + " Num_Probes Segment_Mean \n", + "57803 4489.0 -0.0113 \n", + "57804 3.0 -1.3344 \n", + "57805 67487.0 0.0010 \n", + "57806 453.0 -1.0306 \n", + "57807 864.0 -0.0031 \n", + "57808 44.0 0.2824 \n", + "57809 3708.0 0.0082 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#file_path = 
\"../../TCGA/CNA/data/gdac.broadinstitute.org_CESC.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015082100.0.0/CESC.snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt\"\n", + "file_path = \"../../TCGA/CNA/data__2016_01_28/gdac.broadinstitute.org_CESC.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2016012800.0.0/CESC.snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt\"\n", + "df = pd.read_csv(file_path, sep = \"\\t\")\n", + "tumor_barcode = \"TCGA-ZJ-AAXJ-01A-11D-A42N-01\"\n", + "t = df.loc[df[\"Sample\"]==tumor_barcode,:]\n", + "t_shape = t.shape[0]\n", + "n = find_matching_normal(tumor_barcode,list(set(df[\"Sample\"].values)))\n", + "n = df.loc[df[\"Sample\"]==n[0],:]\n", + "print(\"segemtns in tumor\",t.shape[0],\"segemtns in normal\",n.shape[0])\n", + "\n", + "n.loc[n['Chromosome']==11,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Sample</th>\n", + " <th>Chromosome</th>\n", + " <th>Start</th>\n", + " <th>End</th>\n", + " <th>Num_Probes</th>\n", + " <th>Segment_Mean</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>57960</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>456120</td>\n", + " <td>64200041</td>\n", + " <td>34710.0</td>\n", + " <td>0.0054</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>57961</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>64208988</td>\n", + " <td>64319750</td>\n", + " <td>61.0</td>\n", + " <td>-0.6748</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57962</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>64325209</td>\n", + " <td>126596817</td>\n", + " <td>37207.0</td>\n", + " <td>0.0571</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57963</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>126596926</td>\n", + " <td>127130276</td>\n", + " <td>454.0</td>\n", + " <td>-1.0760</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57964</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>127132920</td>\n", + " <td>132080656</td>\n", + " <td>3591.0</td>\n", + " <td>0.0449</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57965</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>132080885</td>\n", + " <td>132099465</td>\n", + " <td>15.0</td>\n", + " <td>-0.6123</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57966</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>132099856</td>\n", + " <td>134142530</td>\n", + " <td>1010.0</td>\n", + " <td>0.0483</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Sample Chromosome Start End \\\n", + "57960 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 456120 64200041 \n", + "57961 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 64208988 64319750 \n", + "57962 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 64325209 126596817 \n", + "57963 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 126596926 127130276 \n", + "57964 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 127132920 132080656 \n", + "57965 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 132080885 132099465 \n", + "57966 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 132099856 134142530 \n", + "\n", + " Num_Probes Segment_Mean \n", + "57960 34710.0 0.0054 \n", + "57961 61.0 -0.6748 \n", + "57962 37207.0 0.0571 \n", + "57963 454.0 -1.0760 
\n", + "57964 3591.0 0.0449 \n", + "57965 15.0 -0.6123 \n", + "57966 1010.0 0.0483 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t.loc[t[\"Chromosome\"] ==11,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "segemtns in normal after dropping low.conf.: 38\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Sample</th>\n", + " <th>Chromosome</th>\n", + " <th>Start</th>\n", + " <th>End</th>\n", + " <th>Num_Probes</th>\n", + " <th>Segment_Mean</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>57804</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>8899400</td>\n", + " <td>8899668</td>\n", + " <td>3.0</td>\n", + " <td>-1.3344</td>\n", + " </tr>\n", + " <tr>\n", + " <th>57806</th>\n", + " <td>TCGA-ZJ-AAXJ-10A-01D-A42Q-01</td>\n", + " <td>11</td>\n", + " <td>126596926</td>\n", + " <td>127130276</td>\n", + " <td>453.0</td>\n", + " <td>-1.0306</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Sample Chromosome Start End \\\n", + "57804 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 8899400 8899668 \n", + "57806 TCGA-ZJ-AAXJ-10A-01D-A42Q-01 11 126596926 127130276 \n", + "\n", + " Num_Probes Segment_Mean \n", + "57804 3.0 -1.3344 \n", + "57806 453.0 -1.0306 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n = 
filter_lowconf_segments(n,0,0.46, -0.68 )\n", + "print(\"segemtns in normal after dropping low.conf.:\",n.shape[0])\n", + "n.loc[n['Chromosome']==11,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "segemtns in tumor after removing germlines: 194\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Sample</th>\n", + " <th>Chromosome</th>\n", + " <th>Start</th>\n", + " <th>End</th>\n", + " <th>Num_Probes</th>\n", + " <th>Segment_Mean</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>96</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>456120</td>\n", + " <td>64200041</td>\n", + " <td>34710.0</td>\n", + " <td>0.0054</td>\n", + " </tr>\n", + " <tr>\n", + " <th>97</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>64208988</td>\n", + " <td>64319750</td>\n", + " <td>61.0</td>\n", + " <td>-0.6748</td>\n", + " </tr>\n", + " <tr>\n", + " <th>98</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>64325209</td>\n", + " <td>126596817</td>\n", + " <td>37207.0</td>\n", + " <td>0.0571</td>\n", + " </tr>\n", + " <tr>\n", + " <th>99</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>127132920</td>\n", + " <td>132080656</td>\n", + " <td>3591.0</td>\n", + " <td>0.0449</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " 
<td>132080885</td>\n", + " <td>132099465</td>\n", + " <td>15.0</td>\n", + " <td>-0.6123</td>\n", + " </tr>\n", + " <tr>\n", + " <th>101</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>132099856</td>\n", + " <td>134142530</td>\n", + " <td>1010.0</td>\n", + " <td>0.0483</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Sample Chromosome Start End \\\n", + "96 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 456120 64200041 \n", + "97 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 64208988 64319750 \n", + "98 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 64325209 126596817 \n", + "99 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 127132920 132080656 \n", + "100 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 132080885 132099465 \n", + "101 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 132099856 134142530 \n", + "\n", + " Num_Probes Segment_Mean \n", + "96 34710.0 0.0054 \n", + "97 61.0 -0.6748 \n", + "98 37207.0 0.0571 \n", + "99 3591.0 0.0449 \n", + "100 15.0 -0.6123 \n", + "101 1010.0 0.0483 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "t = remove_ovelapping_segments(t, n,tumor_barcode)\n", + "print(\"segemtns in tumor after removing germlines:\",t.shape[0])\n", + "t.loc[t[\"Chromosome\"] ==11,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "segemtns in tumor after dropping low.conf.: 101\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Sample</th>\n", + " 
<th>Chromosome</th>\n", + " <th>Start</th>\n", + " <th>End</th>\n", + " <th>Num_Probes</th>\n", + " <th>Segment_Mean</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>97</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>64208988</td>\n", + " <td>64319750</td>\n", + " <td>61.0</td>\n", + " <td>-0.6748</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100</th>\n", + " <td>TCGA-ZJ-AAXJ-01A-11D-A42N-01</td>\n", + " <td>11</td>\n", + " <td>132080885</td>\n", + " <td>132099465</td>\n", + " <td>15.0</td>\n", + " <td>-0.6123</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Sample Chromosome Start End \\\n", + "97 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 64208988 64319750 \n", + "100 TCGA-ZJ-AAXJ-01A-11D-A42N-01 11 132080885 132099465 \n", + "\n", + " Num_Probes Segment_Mean \n", + "97 61.0 -0.6748 \n", + "100 15.0 -0.6123 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t = filter_lowconf_segments(t,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n", + "print(\"segemtns in tumor after dropping low.conf.:\",t.shape[0])\n", + "t.loc[t[\"Chromosome\"] ==11,:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TCGA " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HNSC samples: 1089 CNA events per sample on avg.: 101.275482094\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1089 tumors: 530 normals: 559\n", + "\ttumors without matched normal 28\n", + "\ttumors with at least one sCNA 497\n", + "\ttumors without any somatic CNA 5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "HNSC samples: 525 Segments per sample on avg.: 60.6876190476\n", + "ESCA samples: 373 CNA 
events per sample on avg.: 163.010723861\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 373 tumors: 185 normals: 188\n", + "\ttumors without matched normal 3\n", + "\ttumors with at least one sCNA 181\n", + "\ttumors without any somatic CNA 1\n", + "total samples: 248 tumors: 125 normals: 123\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "ESCA samples: 184 Segments per sample on avg.: 141.836956522\n", + "THYM samples: 248 CNA events per sample on avg.: 62.7862903226\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 5\n", + "\ttumors with at least one sCNA 95\n", + "\ttumors without any somatic CNA 25\n", + "total samples: 132 tumors: 66 normals: 66\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "THYM samples: 100 Segments per sample on avg.: 9.41\n", + "KICH samples: 132 CNA events per sample on avg.: 77.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 0\n", + "\ttumors with at least one sCNA 65\n", + "\ttumors without any somatic CNA 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "KICH samples: 65 Segments per sample on avg.: 51.4923076923\n", + "LUSC samples: 1032 CNA events per sample on avg.: 130.682170543\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1032 tumors: 501 normals: 531\n", + "\ttumors without matched normal 23\n", + "\ttumors with at least one sCNA 476\n", + "\ttumors without any somatic CNA 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "LUSC samples: 499 Segments per sample on avg.: 94.6533066132\n", + "BLCA samples: 797 CNA events per sample on avg.: 130.927227102\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "total samples: 797 tumors: 414 normals: 383\n", + "\ttumors without matched normal 46\n", + "\ttumors with at least one sCNA 366\n", + "\ttumors without any somatic CNA 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "BLCA samples: 412 Segments per sample on avg.: 94.8859223301\n", + "GBM samples: 1104 CNA events per sample on avg.: 133.018115942\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1104 tumors: 590 normals: 514\n", + "\ttumors without matched normal 78\n", + "\ttumors with at least one sCNA 511\n", + "\ttumors without any somatic CNA 1\n", + "total samples: 85 tumors: 36 normals: 49\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "GBM samples: 589 Segments per sample on avg.: 70.2139219015\n", + "CHOL samples: 85 CNA events per sample on avg.: 89.0588235294\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 0\n", + "\ttumors with at least one sCNA 36\n", + "\ttumors without any somatic CNA 0\n", + "total samples: 111 tumors: 56 normals: 55\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "CHOL samples: 36 Segments per sample on avg.: 56.6944444444\n", + "UCS samples: 111 CNA events per sample on avg.: 173.855855856\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 2\n", + "\ttumors with at least one sCNA 54\n", + "\ttumors without any somatic CNA 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "UCS samples: 56 Segments per sample on avg.: 179.125\n", + "LGG samples: 1015 CNA events per sample on avg.: 78.6118226601\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1015 tumors: 530 normals: 485\n", + "\ttumors without 
matched normal 33\n", + "\ttumors with at least one sCNA 494\n", + "\ttumors without any somatic CNA 3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "LGG samples: 527 Segments per sample on avg.: 29.1157495256\n", + "THCA samples: 1013 CNA events per sample on avg.: 54.4096742349\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1013 tumors: 506 normals: 507\n", + "\ttumors without matched normal 15\n", + "\ttumors with at least one sCNA 367\n", + "\ttumors without any somatic CNA 124\n", + "total samples: 365 tumors: 185 normals: 180\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "THCA samples: 382 Segments per sample on avg.: 3.8219895288\n", + "PAAD samples: 365 CNA events per sample on avg.: 95.3643835616\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 10\n", + "\ttumors with at least one sCNA 161\n", + "\ttumors without any somatic CNA 14\n", + "total samples: 1059 tumors: 529 normals: 530\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "PAAD samples: 171 Segments per sample on avg.: 32.4093567251\n", + "KIRC samples: 1059 CNA events per sample on avg.: 80.298394712\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 22\n", + "\ttumors with at least one sCNA 501\n", + "\ttumors without any somatic CNA 6\n", + "total samples: 160 tumors: 80 normals: 80\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "KIRC samples: 523 Segments per sample on avg.: 20.5009560229\n", + "UVM samples: 160 CNA events per sample on avg.: 81.08125\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 0\n", + "\ttumors with at least one sCNA 80\n", + "\ttumors 
without any somatic CNA 0\n", + "total samples: 586 tumors: 297 normals: 289\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "UVM samples: 80 Segments per sample on avg.: 38.425\n", + "CESC samples: 586 CNA events per sample on avg.: 101.450511945\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 16\n", + "\ttumors with at least one sCNA 280\n", + "\ttumors without any somatic CNA 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "CESC samples: 296 Segments per sample on avg.: 58.1351351351\n", + "LUAD samples: 1095 CNA events per sample on avg.: 105.78630137\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1095 tumors: 518 normals: 577\n", + "\ttumors without matched normal 19\n", + "\ttumors with at least one sCNA 494\n", + "\ttumors without any somatic CNA 5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "LUAD samples: 513 Segments per sample on avg.: 70.469785575\n", + "STAD samples: 904 CNA events per sample on avg.: 130.961283186\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 904 tumors: 442 normals: 462\n", + "\ttumors without matched normal 26\n", + "\ttumors with at least one sCNA 410\n", + "\ttumors without any somatic CNA 6\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "STAD samples: 436 Segments per sample on avg.: 96.4220183486\n", + "UCEC samples: 1089 CNA events per sample on avg.: 116.707070707\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1089 tumors: 540 normals: 549\n", + "\ttumors without matched normal 23\n", + "\ttumors with at least one sCNA 504\n", + "\ttumors without any somatic CNA 13\n" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "after filtering\n", + "UCEC samples: 527 Segments per sample on avg.: 78.89943074\n", + "SKCM samples: 937 CNA events per sample on avg.: 115.351120598\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 937 tumors: 472 normals: 465\n", + "\ttumors without matched normal 7\n", + "\ttumors with at least one sCNA 463\n", + "\ttumors without any somatic CNA 2\n", + "total samples: 172 tumors: 87 normals: 85\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "SKCM samples: 470 Segments per sample on avg.: 82.9957446809\n", + "MESO samples: 172 CNA events per sample on avg.: 106.598837209\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 2\n", + "\ttumors with at least one sCNA 82\n", + "\ttumors without any somatic CNA 3\n", + "total samples: 346 tumors: 168 normals: 178\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "MESO samples: 84 Segments per sample on avg.: 60.8333333333\n", + "PCPG samples: 346 CNA events per sample on avg.: 90.3352601156\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 6\n", + "\ttumors with at least one sCNA 159\n", + "\ttumors without any somatic CNA 3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "PCPG samples: 165 Segments per sample on avg.: 43.5878787879\n", + "STES samples: 1277 CNA events per sample on avg.: 140.322631167\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1277 tumors: 627 normals: 650\n", + "\ttumors without matched normal 29\n", + "\ttumors with at least one sCNA 591\n", + "\ttumors without any somatic CNA 7\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "STES samples: 620 Segments per 
sample on avg.: 109.9\n", + "SARC samples: 513 CNA events per sample on avg.: 208.068226121\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 513 tumors: 263 normals: 250\n", + "\ttumors without matched normal 17\n", + "\ttumors with at least one sCNA 245\n", + "\ttumors without any somatic CNA 1\n", + "total samples: 380 tumors: 191 normals: 189\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "SARC samples: 262 Segments per sample on avg.: 187.057251908\n", + "LAML samples: 380 CNA events per sample on avg.: 74.5368421053\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 3\n", + "\ttumors with at least one sCNA 167\n", + "\ttumors without any somatic CNA 21\n", + "total samples: 590 tumors: 288 normals: 302\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "LAML samples: 170 Segments per sample on avg.: 7.18823529412\n", + "KIRP samples: 590 CNA events per sample on avg.: 79.5152542373\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\ttumors without matched normal 15\n", + "\ttumors with at least one sCNA 271\n", + "\ttumors without any somatic CNA 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "KIRP samples: 286 Segments per sample on avg.: 21.8846153846\n", + "LIHC samples: 760 CNA events per sample on avg.: 122.8\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 760 tumors: 373 normals: 387\n", + "\ttumors without matched normal 21\n", + "\ttumors with at least one sCNA 348\n", + "\ttumors without any somatic CNA 4\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "LIHC samples: 369 Segments per sample on avg.: 81.1327913279\n", + "OV samples: 1168 CNA events per sample on avg.: 
224.04109589\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 1168 tumors: 597 normals: 571\n", + "\ttumors without matched normal 26\n", + "\ttumors with at least one sCNA 571\n", + "\ttumors without any somatic CNA 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "OV samples: 597 Segments per sample on avg.: 207.924623116\n", + "TGCT samples: 304 CNA events per sample on avg.: 83.8125\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 304 tumors: 156 normals: 148\n", + "\ttumors without matched normal 2\n", + "\ttumors with at least one sCNA 154\n", + "\ttumors without any somatic CNA 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "TGCT samples: 156 Segments per sample on avg.: 37.7820512821\n", + "COAD samples: 918 CNA events per sample on avg.: 98.6209150327\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 918 tumors: 453 normals: 465\n", + "\ttumors without matched normal 44\n", + "\ttumors with at least one sCNA 406\n", + "\ttumors without any somatic CNA 3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "COAD samples: 450 Segments per sample on avg.: 48.4755555556\n", + "BRCA samples: 2199 CNA events per sample on avg.: 129.35788995\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "total samples: 2199 tumors: 1088 normals: 1111\n", + "\ttumors without matched normal 35\n", + "\ttumors with at least one sCNA 1046\n", + "\ttumors without any somatic CNA 7\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "after filtering\n", + "BRCA samples: 1081 Segments per sample on avg.: 102.808510638\n", + "PRAD samples: 1023 CNA events per sample on avg.: 114.706744868\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + 
# Directory holding the GDAC downloads. For every "<name>.tar.gz" found here the
# unpacked directory "<name>/" is expected to contain the cohort's *.seg.txt file.
data_dir = "../../TCGA/CNA/data__2016_01_28//"

# Columns of the segmentation table that are carried through the filtering.
seg_columns = ["Sample","Chromosome","Start","End","Num_Probes","Segment_Mean"]

# Segment_Mean thresholds applied to matched normals: +1 / -1 copy present in 75%
# of the normal cells (log2(2.75/2) ~ 0.46, log2(1.25/2) ~ -0.68). They are kept
# this strict so that weak segments caused by slight tumor contamination of the
# normal are not treated as germline events.
normal_pos_seg_mean_thr = 0.46
normal_neg_seg_mean_thr = -0.68


def _per_sample_avg(n_rows, n_samples):
    """Return the average number of rows per sample, or 0.0 when there are no samples."""
    return float(n_rows)/n_samples if n_samples else 0.0


def _concat_segments(frames, columns):
    """Concatenate per-sample segment tables.

    pd.concat raises ValueError on an empty list, so an empty table with
    `columns` is returned when `frames` is empty.
    """
    if len(frames) == 0:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames)


dfs = {}                 # cohort -> filtered tumor segments
dfs_normals = {}         # cohort -> confident segments of normals matched to a tumor
tumors_without_CNA = {}  # cohort -> barcodes of tumors with no segment left after filtering

# sorted() makes the processing (and log) order reproducible between runs
for f in sorted(os.listdir(data_dir)):
    if not f.endswith("tar.gz"):
        continue
    fp = f.replace(".tar.gz","")
    name_parts = fp.split(".")
    cohort = name_parts[2].replace("org_","")
    file_path = fp+"/"+cohort+"."+name_parts[3].replace("Merge_","")+".seg.txt"
    df = pd.read_csv(os.path.join(data_dir,file_path), sep = "\t")

    # NOTE(review): chr_dict (first cell) is built from range(1,22) and then updated
    # with {22: 'X', 23: 'Y'}, so chromosome 22 is relabelled 'X' -- confirm intended.
    df["Chromosome"] = df["Chromosome"].map(chr_dict)
    n_samples = df["Sample"].nunique()
    print(cohort,"samples:",n_samples,
          "CNA events per sample on avg.:",_per_sample_avg(df.shape[0],n_samples))

    #### remove segments overlapping with segments in normals by 80% or more reciprocally ####
    df["type"] = df["Sample"].apply(sample_type)
    df_normals = df.loc[df["type"]== "Normal",seg_columns]
    df_tumors = df.loc[df["type"]== "Tumor",seg_columns]
    # split each table by sample once instead of masking the whole table per sample
    normal_segments = {sample: segs for sample, segs in df_normals.groupby("Sample")}
    tumor_segments = {sample: segs for sample, segs in df_tumors.groupby("Sample")}
    normal_samples = sorted(normal_segments)
    tumor_samples = sorted(tumor_segments)
    print("total samples:", n_samples,
          "tumors:",len(tumor_samples),"normals:",len(normal_samples),file= sys.stderr)

    tumors_without_somatic_CNA = []      # barcodes only: nothing left to keep
    tumors_germline_removed = []         # tables of tumors that had a matched normal
    tumors_without_matching_normal = []  # tables of tumors without a matched normal
    # normal barcode -> confident segments. Each normal is filtered and stored once,
    # even when several tumor samples of the same patient match it.
    # assumes remove_ovelapping_segments does not modify `normal` in place -- TODO confirm
    confident_normals = {}
    for tumor_sample in tumor_samples:
        tumor = tumor_segments[tumor_sample]
        matching_normals = find_matching_normal(tumor_sample,normal_samples)
        for normal_sample in matching_normals:
            if normal_sample not in confident_normals:
                confident_normals[normal_sample] = filter_lowconf_segments(
                    normal_segments[normal_sample],0,
                    normal_pos_seg_mean_thr,normal_neg_seg_mean_thr)
            tumor = remove_ovelapping_segments(tumor,confident_normals[normal_sample],tumor_sample)
        # the low-confidence filter is applied to every tumor, matched or not
        tumor = filter_lowconf_segments(tumor,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )
        if tumor.shape[0] == 0:
            tumors_without_somatic_CNA.append(tumor_sample)
        elif len(matching_normals) > 0:
            tumors_germline_removed.append(tumor)
        else:
            tumors_without_matching_normal.append(tumor)

    print("\ttumors without matched normal",len(tumors_without_matching_normal),file= sys.stderr)
    print("\ttumors with at least one sCNA",len(tumors_germline_removed),file= sys.stderr)
    print("\ttumors without any somatic CNA",len(tumors_without_somatic_CNA),file= sys.stderr)

    filtered_tumors = _concat_segments(tumors_germline_removed+tumors_without_matching_normal,seg_columns)
    dfs[cohort] = filtered_tumors
    filtered_normals = _concat_segments([confident_normals[s] for s in sorted(confident_normals)],seg_columns)
    dfs_normals[cohort] = filtered_normals
    tumors_without_CNA[cohort] = tumors_without_somatic_CNA
    print("after filtering")
    n_filtered_samples = filtered_tumors["Sample"].nunique()
    print(cohort,"samples:",n_filtered_samples,
          "Segments per sample on avg.:",_per_sample_avg(filtered_tumors.shape[0],n_filtered_samples))
# Maps the sequence identifiers used in the "chrom" column of the GFF3-derived
# gene table (NC_* accessions) to integer chromosome numbers 1..24; the number is
# simply the position of the accession in the list below.
# NOTE(review): 23 and 24 presumably stand for X and Y here, whereas chr_dict
# (first cell) produces the strings 'X'/'Y' for segment tables -- confirm that
# both naming schemes are reconciled before the tables are intersected.
rename_chroms = {
    accession: number
    for number, accession in enumerate([
        "NC_000001.10", "NC_000002.11", "NC_000003.11", "NC_000004.11",
        "NC_000005.9", "NC_000006.11", "NC_000007.13", "NC_000008.10",
        "NC_000009.11", "NC_000010.10", "NC_000011.9", "NC_000012.11",
        "NC_000013.10", "NC_000014.8", "NC_000015.9", "NC_000016.9",
        "NC_000017.10", "NC_000018.9", "NC_000019.9", "NC_000020.10",
        "NC_000021.8", "NC_000022.10", "NC_000023.10", "NC_000024.9",
    ], 1)
}
# Load the gene annotation (columns chrom, start, stop, gene, name), keep only
# genes located on the sequences listed in rename_chroms, replace the accession
# in "chrom" by its integer chromosome number and sort by genomic position.
gene_intervals = pd.read_csv(gene_coords_file, sep = "\t")
# .copy() so that the column assignment below modifies an independent frame
# rather than a slice of the table returned by read_csv (SettingWithCopyWarning)
gene_intervals = gene_intervals.loc[gene_intervals["chrom"].isin(list(rename_chroms)),:].copy()
# every remaining value is a key of rename_chroms, so map() cannot introduce NaN
gene_intervals["chrom"] = gene_intervals["chrom"].map(rename_chroms)
gene_intervals = gene_intervals.sort_values(by=["chrom","start","stop"],ascending=True)
# NOTE(review): this absolute path points to ".../v1/" while root_dir (first cell)
# points to ".../v2/" -- confirm which location later steps read from.
gene_intervals.to_csv("/home/olya/SFU/Hossein/v1/ref_GRCh37.p5_top_level.gff3.chroms_renamed.bed",sep = "\t",index=False)
print(gene_intervals.shape)
gene_intervals.head(4)
# Gene coordinates as a BedTool (chrom, start, stop, gene).
gene_intervals_bed = pbt.BedTool.from_dataframe(
    gene_intervals[["chrom","start","stop","gene"]]
)

# Copy-neutral baseline: one row per annotated gene, indexed by gene ID and
# sorted by it, with Segment_Mean set to 0 for every gene.
cnv_baseline = (
    gene_intervals.copy()
    .assign(Segment_Mean=0)
    [["gene","Segment_Mean"]]
    .set_index("gene",drop=True)
    .sort_index()
)
# gene IDs in the sorted order of the baseline table
sorted_index = list(cnv_baseline.index.values)
500 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-06-5410-01A-01D-1694-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GBM (36019, 590)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "STES\n", + "TCGA-MX-A5UG-01A-21D-A31K-01 has no genes with altered CN\n", + "TCGA-RD-A8NB-01A-12D-A396-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BR-7957-01A-11D-2200-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n", + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BR-6563-01A-13D-2052-01 has no genes with altered CN\n", + "TCGA-D7-6522-01A-11D-1799-01 has no genes with altered CN\n", + "TCGA-BR-7196-01A-11D-2052-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-D7-A6ET-01A-32D-A32M-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 500 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-HU-A4GJ-01A-11D-A253-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 600 processed.\n", + "STES (36019, 627)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "BLCA\n", + "TCGA-YC-A8S6-01A-31D-A38F-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-DK-A3WY-01A-11D-A22Y-01 has no genes with altered CN\n", + "TCGA-XF-A9SL-01A-11D-A390-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-E7-A7XN-01A-11D-A34T-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n", + "... 400 processed.\n", + "BLCA (36019, 414)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UCEC\n", + "TCGA-D1-A16Y-01A-31D-A12G-01 has no genes with altered CN\n", + "TCGA-BK-A6W4-01A-12D-A34P-01 has no genes with altered CN\n", + "TCGA-BS-A0V7-01A-21D-A120-01 has no genes with altered CN\n", + "TCGA-B5-A11Y-01A-21D-A10L-01 has no genes with altered CN\n", + "TCGA-D1-A17F-01A-11D-A12G-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AX-A062-01A-11D-A00X-01 has no genes with altered CN\n", + "TCGA-D1-A16D-01A-11D-A12G-01 has no genes with altered CN\n", + "TCGA-BG-A0VZ-01A-11D-A107-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AJ-A2QL-01A-11D-A18N-01 has no genes with altered CN\n", + "TCGA-BS-A0UA-01A-11D-A120-01 has no genes with altered CN\n", + "TCGA-B5-A11U-01A-11D-A120-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-EO-A3AU-01A-21D-A19X-01 has no genes with altered CN\n", + "TCGA-QF-A5YS-01A-11D-A31T-01 has no genes with altered CN\n", + "TCGA-D1-A0ZV-01A-11D-A10L-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-QS-A5YR-01A-31D-A31T-01 has no genes with altered CN\n", + "TCGA-DI-A1BU-01A-11D-A134-01 has no genes with altered CN\n", + "TCGA-AP-A0LG-01A-11D-A042-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 500 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-D1-A0ZS-01A-11D-A120-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UCEC (36019, 540)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "PCPG\n", + "TCGA-RW-A7CZ-01A-11D-A35C-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-WB-A817-01A-11D-A35H-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PCPG (36019, 168)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "STAD\n", + "TCGA-MX-A5UG-01A-21D-A31K-01 has no genes with altered CN\n", + "TCGA-RD-A8NB-01A-12D-A396-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BR-7957-01A-11D-2200-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BR-6563-01A-13D-2052-01 has no genes with altered CN\n", + "TCGA-D7-6522-01A-11D-1799-01 has no genes with altered CN\n", + "TCGA-BR-7196-01A-11D-2052-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-D7-A6ET-01A-32D-A32M-01 has no genes with altered CN\n", + "TCGA-HU-A4GJ-01A-11D-A253-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n", + "STAD (36019, 442)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CESC\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n", + "CESC (36019, 297)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UCS\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UCS (36019, 56)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TGCT\n", + "TCGA-YU-A90S-01A-11D-A434-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
100 processed.\n", + "TGCT (36019, 156)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "THCA\n", + "TCGA-EL-A4JZ-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-DJ-A13X-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-EL-A3ZT-01A-12D-A23L-01 has no genes with altered CN\n", + "TCGA-DE-A0XZ-01A-11D-A17S-01 has no genes with altered CN\n", + "TCGA-DJ-A2PP-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-KS-A4I5-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-DJ-A2PS-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-EL-A3GW-01A-11D-A201-01 has no genes with altered CN\n", + "TCGA-BJ-A0ZG-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-J8-A3O2-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-FY-A3RA-01A-11D-A21Y-01 has no genes with altered CN\n", + "TCGA-CE-A483-01A-11D-A23T-01 has no genes with altered CN\n", + "TCGA-EM-A1CW-01A-21D-A13V-01 has no genes with altered CN\n", + "TCGA-DJ-A4V4-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-E3-A3E1-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-ET-A2MZ-01A-12D-A19I-01 has no genes with altered CN\n", + "TCGA-E8-A414-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-EL-A3T6-01A-11D-A21Y-01 has no genes with altered CN\n", + "TCGA-DJ-A4V5-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-DJ-A3UY-01A-21D-A22C-01 has no genes with altered CN\n", + "TCGA-EL-A3D4-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-FY-A76V-01A-11D-A396-01 has no genes with altered CN\n", + "TCGA-FY-A4B3-01A-11D-A23T-01 has no genes with altered CN\n", + "TCGA-DJ-A3UO-01A-11D-A22C-01 has no genes with altered CN\n", + "TCGA-EL-A4K7-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-DJ-A1QI-01A-11D-A14V-01 has no genes with altered CN\n", + "TCGA-EL-A3N2-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-E3-A3E5-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-EM-A1YD-01A-11D-A14V-01 has no genes with 
altered CN\n", + "TCGA-GE-A2C6-01A-11D-A16M-01 has no genes with altered CN\n", + "TCGA-DJ-A2Q5-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-ET-A3DP-01A-11D-A219-01 has no genes with altered CN\n", + "TCGA-DJ-A4UT-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-DJ-A2PT-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-DJ-A4V2-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-L6-A4ET-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-BJ-A0ZJ-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-DE-A4M9-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-EL-A4KD-01A-11D-A256-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-QD-A8IV-01A-11D-A396-01 has no genes with altered CN\n", + "TCGA-ET-A3DV-01A-12D-A201-01 has no genes with altered CN\n", + "TCGA-EM-A22K-01A-11D-A17S-01 has no genes with altered CN\n", + "TCGA-DJ-A3VE-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-EL-A3D1-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-BJ-A2P4-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-CE-A3ME-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-E8-A417-01A-21D-A23L-01 has no genes with altered CN\n", + "TCGA-KS-A41I-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-FK-A3SB-01A-11D-A22C-01 has no genes with altered CN\n", + "TCGA-BJ-A28S-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-MK-A4N9-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-E8-A437-01A-12D-A23T-01 has no genes with altered CN\n", + "TCGA-EM-A3AP-01A-12D-A20A-01 has no genes with altered CN\n", + "TCGA-EL-A3TA-01A-12D-A22C-01 has no genes with altered CN\n", + "TCGA-IM-A41Z-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-EM-A2CQ-01A-11D-A17S-01 has no genes with altered CN\n", + "TCGA-EM-A3O7-01A-11D-A21Y-01 has no genes with altered 
CN\n", + "TCGA-FE-A3PC-01A-11D-A21Y-01 has no genes with altered CN\n", + "TCGA-DJ-A2PY-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-EM-A4FQ-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-EM-A3FO-01A-11D-A219-01 has no genes with altered CN\n", + "TCGA-BJ-A0Z9-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-EM-A3FK-01A-11D-A219-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-ET-A3BU-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-BJ-A0Z5-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-EL-A3MY-01A-11D-A219-01 has no genes with altered CN\n", + "TCGA-ET-A39L-01A-12D-A19I-01 has no genes with altered CN\n", + "TCGA-E8-A415-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-ET-A40Q-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-KS-A4I7-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-MK-A4N7-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-L6-A4EQ-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-FY-A3TY-01A-11D-A22Y-01 has no genes with altered CN\n", + "TCGA-ET-A2N1-01A-11D-A18E-01 has no genes with altered CN\n", + "TCGA-DJ-A2PO-01A-21D-A19I-01 has no genes with altered CN\n", + "TCGA-J8-A3O2-06A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-CE-A485-01A-11D-A23T-01 has no genes with altered CN\n", + "TCGA-ET-A3BX-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-DJ-A3VK-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-DE-A4M8-01A-21D-A256-01 has no genes with altered CN\n", + "TCGA-ET-A40T-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-BJ-A18Z-01A-21D-A13V-01 has no genes with altered CN\n", + "TCGA-DJ-A3UT-01A-11D-A22C-01 has no genes with altered CN\n", + "TCGA-DJ-A2Q2-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-BJ-A18Y-01A-11D-A13V-01 has no genes with altered CN\n", + 
"TCGA-ET-A39T-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-EL-A3CL-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-DJ-A4V0-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-EL-A3H8-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-ET-A39J-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-FY-A3I4-01A-11D-A219-01 has no genes with altered CN\n", + "TCGA-EM-A2CU-01A-12D-A17S-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-EM-A3FM-01A-11D-A219-01 has no genes with altered CN\n", + "TCGA-EM-A4FF-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-EL-A3GX-01A-11D-A201-01 has no genes with altered CN\n", + "TCGA-DJ-A3UN-01A-11D-A22C-01 has no genes with altered CN\n", + "TCGA-EM-A4FO-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-EL-A3TB-01A-11D-A22C-01 has no genes with altered CN\n", + "TCGA-ET-A25N-01A-11D-A16M-01 has no genes with altered CN\n", + "TCGA-ET-A39M-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-DE-A4MA-01A-11D-A256-01 has no genes with altered CN\n", + "TCGA-ET-A39O-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-DE-A0Y2-01A-11D-A10T-01 has no genes with altered CN\n", + "TCGA-FY-A3R8-01A-11D-A21Y-01 has no genes with altered CN\n", + "TCGA-EM-A3AL-01A-11D-A201-01 has no genes with altered CN\n", + "TCGA-EM-A2CN-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-FY-A3BL-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-EM-A1CS-01A-11D-A13V-01 has no genes with altered CN\n", + "TCGA-EL-A3D0-01A-12D-A201-01 has no genes with altered CN\n", + "TCGA-E3-A3DZ-01A-11D-A20A-01 has no genes with altered CN\n", + "TCGA-DJ-A1QF-01A-12D-A14V-01 has no genes with altered CN\n", + "TCGA-J8-A3YH-01A-11D-A22Y-01 has no genes with altered CN\n", + "TCGA-EL-A4K1-01A-11D-A256-01 has no genes with altered CN\n", + 
"TCGA-EM-A3O8-01A-11D-A21Y-01 has no genes with altered CN\n", + "TCGA-DJ-A3VJ-01A-11D-A23L-01 has no genes with altered CN\n", + "TCGA-BJ-A45D-01A-11D-A23T-01 has no genes with altered CN\n", + "TCGA-FY-A4B4-01A-11D-A23T-01 has no genes with altered CN\n", + "TCGA-EM-A1CU-01A-11D-A13V-01 has no genes with altered CN\n", + "TCGA-EL-A3CX-01A-11D-A19I-01 has no genes with altered CN\n", + "TCGA-ET-A25O-01A-11D-A16M-01 has no genes with altered CN\n", + "TCGA-E8-A433-01A-11D-A23L-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "THCA (36019, 506)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CHOL\n", + "TCGA-W5-AA2H-01A-31D-A416-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CHOL (36019, 36)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "HNSC\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n", + "... 300 processed.\n", + "... 400 processed.\n", + "... 500 processed.\n", + "HNSC (36019, 530)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UVM\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UVM (36019, 80)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "SKCM\n", + "TCGA-ER-A19A-06A-21D-A191-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n", + "... 300 processed.\n", + "... 
400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-EB-A4OZ-01A-12D-A25P-01 has no genes with altered CN\n", + "TCGA-EE-A2GK-06A-11D-A194-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SKCM (36019, 472)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "COAD\n", + "TCGA-G4-6302-01A-11D-1717-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AA-A03F-01A-11D-A080-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n", + "... 300 processed.\n", + "... 400 processed.\n", + "COAD (36019, 453)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ACC\n", + "TCGA-OR-A5KQ-01A-11D-A309-01 has no genes with altered CN\n", + "TCGA-OR-A5KV-01A-11D-A29H-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ACC (36019, 90)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "PAAD\n", + "TCGA-IB-AAUR-01A-21D-A38F-01 has no genes with altered CN\n", + "TCGA-HZ-8002-01A-11D-2200-01 has no genes with altered CN\n", + "TCGA-XD-AAUG-01A-61D-A40V-01 has no genes with altered CN\n", + "TCGA-Z5-AAPL-01A-12D-A40V-01 has no genes with altered CN\n", + "TCGA-IB-A5SQ-01A-11D-A32M-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-IB-AAUS-01A-12D-A38F-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PAAD (36019, 185)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "THYM\n", + "TCGA-4V-A9QW-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-ZB-A96B-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-X7-A8DB-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-X7-A8M4-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-X7-A8D8-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-3S-AAYX-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-YT-A95E-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-X7-A8M8-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-ZT-A8OM-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-ZB-A96E-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-3Q-A9WF-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-X7-A8M1-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-ZB-A96A-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-ZB-A96R-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-ZB-A963-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-ZC-AAAA-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-XM-A8RB-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-ZB-A96G-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-X7-A8M7-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-XU-AAXZ-01A-11D-A427-01 has no genes with altered CN\n", + "TCGA-XH-A853-01A-11D-A422-01 has no genes with altered CN\n", + "TCGA-XM-AAZ3-01A-11D-A422-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
100 processed.\n", + "THYM (36019, 125)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LUSC\n", + "TCGA-56-8623-01A-11D-2391-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n", + "... 300 processed.\n", + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-98-A53H-01A-12D-A25M-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LUSC (36019, 501)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "MESO\n", + "TCGA-TS-A8AS-01A-21D-A39Q-01 has no genes with altered CN\n", + "TCGA-TS-A7P8-01A-11D-A34B-01 has no genes with altered CN\n", + "TCGA-TS-A8AV-01A-12D-A39Q-01 has no genes with altered CN\n", + "TCGA-3H-AB3O-01A-11D-A39Q-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MESO (36019, 87)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OV\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n", + "... 300 processed.\n", + "... 400 processed.\n", + "... 500 processed.\n", + "OV (36019, 597)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "SARC\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-WK-A8Y0-10D-01D-A419-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-WK-A8XS-10E-01D-A37E-01 has no genes with altered CN\n", + "TCGA-QQ-A5V2-01A-11D-A32H-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SARC (36019, 263)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "KIRP\n", + "TCGA-Y8-A8S1-01A-11D-A36W-01 has no genes with altered CN\n", + "TCGA-GL-A4EM-01A-11D-A253-01 has no genes with altered CN\n", + "TCGA-4A-A93Y-01A-11D-A36W-01 has no genes with altered CN\n", + "TCGA-AL-3467-01A-02D-1348-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-A4-7828-01A-11D-2135-01 has no genes with altered CN\n", + "TCGA-DW-7838-01A-11D-2135-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n", + "KIRP (36019, 288)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LGG\n", + "TCGA-HT-8106-01A-11D-2391-01 has no genes with altered CN\n", + "TCGA-S9-A6WI-01A-21D-A33S-01 has no genes with altered CN\n", + "TCGA-HT-7602-01A-21D-2085-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-DU-7011-01A-11D-2023-01 has no genes with altered CN\n", + "TCGA-TM-A84B-12A-01D-A366-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-FG-8181-01A-11D-2252-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-FG-8189-01B-11D-A288-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-DU-5872-02A-21D-A36N-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 500 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-HT-7680-01A-11D-2252-01 has no genes with altered CN\n", + "TCGA-P5-A5EY-01A-11D-A27J-01 has no genes with altered CN\n", + "TCGA-CS-6669-01A-11D-1892-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LGG (36019, 530)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LAML\n", + "TCGA-AB-2884-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2932-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2842-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2969-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2826-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2836-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2871-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2845-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2840-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2837-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2844-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2854-03A-01D-0756-21 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AB-3006-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2931-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2851-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2978-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2880-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2922-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2947-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2998-03A-01D-0756-21 has no genes with altered CN\n", + "TCGA-AB-2824-03A-01D-0756-21 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LAML (36019, 191)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LIHC\n", + "TCGA-2V-A95S-10D-01D-A36Z-01 has no genes with altered CN\n", + "TCGA-UB-AA0V-01A-11D-A381-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-G3-A25V-01A-11D-A16U-01 has no genes with altered CN\n", + "TCGA-DD-A3A6-01A-11D-A22E-01 has no genes with altered CN\n", + "TCGA-DD-A4NL-01A-11D-A28W-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-ED-A5KG-01A-11D-A27H-01 has no genes with altered CN\n", + "TCGA-CC-A9FV-01A-11D-A36W-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-MR-A520-01A-11D-A25U-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LIHC (36019, 373)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "PRAD\n", + "TCGA-J9-A52C-01A-11D-A26L-01 has no genes with altered CN\n", + "TCGA-V1-A8MJ-01A-11D-A363-01 has no genes with altered CN\n", + "TCGA-XJ-A9DQ-01A-11D-A376-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-J4-A6G1-01A-11D-A30W-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-J4-A67R-01A-21D-A30D-01 has no genes with altered CN\n", + "TCGA-EJ-A7NJ-01A-22D-A34T-01 has no genes with altered CN\n", + "TCGA-EJ-7791-01A-11D-2112-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-EJ-A8FU-01A-11D-A363-01 has no genes with altered CN\n", + "TCGA-EJ-A6RC-01A-11D-A32A-01 has no genes with altered CN\n", + "TCGA-HC-7740-01A-11D-2112-01 has no genes with altered CN\n", + "TCGA-EJ-A65B-01A-12D-A30D-01 has no genes with altered CN\n", + "TCGA-HC-8260-01A-11D-2259-01 has no genes with altered CN\n", + "TCGA-FC-A8O0-01A-41D-A376-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-VN-A88I-01A-11D-A34T-01 has no genes with altered CN\n", + "TCGA-EJ-A7NK-01A-12D-A34T-01 has no genes with altered CN\n", + "TCGA-CH-5743-01A-21D-1574-01 has no genes with altered CN\n", + "TCGA-G9-6367-01A-11D-1785-01 has no genes with altered CN\n", + "TCGA-KC-A4BO-01A-61D-A256-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PRAD (36019, 493)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LUAD\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n", + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-L4-A4E6-01A-11D-A24C-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-44-3398-01A-01D-1877-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-55-8619-01A-11D-2389-01 has no genes with altered CN\n", + "TCGA-86-A4P8-01A-11D-A24O-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 500 processed.\n", + "LUAD (36019, 518)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "BRCA\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AO-A0JC-01A-11D-A059-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BH-A0H5-01A-21D-A111-01 has no genes with altered CN\n", + "TCGA-A2-A0CR-01A-11D-A227-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BH-A1FE-06A-11D-A20R-01 has no genes with altered CN\n", + "TCGA-AN-A0FN-01A-11D-A036-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n", + "... 500 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-PL-A8LY-01A-11D-A41E-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 600 processed.\n", + "... 700 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-GM-A3XG-01A-31D-A242-01 has no genes with altered CN\n", + "TCGA-LD-A74U-01A-13D-A33D-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 800 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-GM-A2DO-10D-01D-A18N-01 has no genes with altered CN\n", + "TCGA-A2-A0EP-01A-52D-A22W-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 900 processed.\n", + "... 
1000 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-AO-A1KO-01A-31D-A13J-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BRCA (36019, 1088)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "KIRC\n", + "TCGA-B4-5378-01A-01D-1499-01 has no genes with altered CN\n", + "TCGA-B0-5400-01A-01D-1499-01 has no genes with altered CN\n", + "TCGA-CJ-4890-01A-01D-1302-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 100 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-A3-A8OX-01A-11D-A36W-01 has no genes with altered CN\n", + "TCGA-B0-4817-01A-01D-1274-01 has no genes with altered CN\n", + "TCGA-B0-5080-01A-01D-1499-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 200 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-DV-A4VZ-01A-11D-A25U-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 300 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-CJ-4891-01A-01D-1302-01 has no genes with altered CN\n", + "TCGA-CJ-4889-01A-01D-1302-01 has no genes with altered CN\n", + "TCGA-BP-4769-01A-01D-1283-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 400 processed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TCGA-BP-4760-01A-02D-1417-01 has no genes with altered CN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "... 
500 processed.\n", + "KIRC (36019, 529)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "KICH\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KICH (36019, 66)\n" + ] + } + ], + "source": [ + "for cohort in dfs.keys():\n", + " print(cohort, file=sys.stderr)\n", + " df = dfs[cohort]\n", + " cna_table = []\n", + " n_samples = 0\n", + " for sample in list(set(df.Sample.values)):\n", + " n_samples +=1\n", + " cnv2gene = cnv2genelevel(cnv2bed(df[df.Sample == sample]),gene_intervals_bed,sample,\n", + " verbose = False,sorted_index = sorted_index)\n", + " cna_table.append(cnv2gene)\n", + " if n_samples % 100 == 0:\n", + " print(\"...\",n_samples, \"processed.\")\n", + " cna_table = pd.concat(cna_table,axis =1)\n", + " \n", + "\n", + " for sample in tumors_without_CNA[cohort]:\n", + " cna_table[sample] = 0\n", + " \n", + " cna_table.fillna(0, inplace = True)\n", + " cna_table.to_csv(preprocessed_dir+\"/TCGA-\"+cohort+\".Segment_Mean.CNA.tsv\",\n", + " sep = \"\\t\",header=True,index=True)\n", + " print(cohort,cna_table.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'t = time.time()\\ncnv2gene = cnv2genelevel(cnv2bed(df[df.Sample == sample]),gene_intervals_bed,sample,\\n verbose = False,sorted_index = sorted_index)\\nprint( time.time() - t)\\ncnv2gene'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"t = time.time()\n", + "cnv2gene = cnv2genelevel(cnv2bed(df[df.Sample == sample]),gene_intervals_bed,sample,\n", + " verbose = False,sorted_index = sorted_index)\n", + "print( time.time() - t)\n", + "cnv2gene\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CCLE \n", + "\n", + "the same pipeline as for TCGA except filtering out germline CNA (because no )\n", + "\n", + "wget 
https://data.broadinstitute.org/ccle_legacy_data/dna_copy_number/CCLE_copynumber_2013-12-03.seg.txt\n", + "\n", + "TODO: should we use a stricter Segment_Mean threshold here, since these data come from cell lines, whose purity should be 100%?" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "47 duplicated IDs in 94 rows found.\n", + "duplicate rows removed due to low correlation of duplicated profiles 0\n", + "Merged 94 duplicated rows into 47 rows\n", + "CCLE: genes: 35972 samples 1043\n" + ] + } + ], + "source": [ + "df = pd.read_csv(\"../../CCLE/CCLE_copynumber_2013-12-03.seg.txt\",sep = \"\\t\")\n", + "df.rename({\"CCLE_name\":\"Sample\"},inplace=True, axis=\"columns\")\n", + "df[\"End\"] = df[\"End\"].apply(int)\n", + "ccle = []\n", + "for sample_name in list(set(df[\"Sample\"].values)):\n", + " cl = df.loc[df[\"Sample\"]==sample_name, :]\n", + " # keep high-conf segments \n", + " cl_filtered = filter_lowconf_segments(cl,num_marker_thr,pos_seg_mean_thr, neg_seg_mean_thr )\n", + " #print(sample_name, cl.shape[0], \"after filtration\",cl_filtered.shape[0])\n", + " # map to genes \n", + " cnv2gene = cnv2genelevel(cnv2bed(cl_filtered),gene_intervals_bed,sample_name,\n", + " verbose = False,sorted_index = sorted_index)\n", + " ccle.append(cnv2gene)\n", + " \n", + "ccle = pd.concat(ccle,axis =1)\n", + "ccle.fillna(0, inplace = True)\n", + "ccle = handle_dups(ccle)\n", + "ccle.to_csv(preprocessed_dir+\"/\"+\"CCLE\"+\".Segment_Mean.CNA.tsv\",\n", + " sep = \"\\t\",header=True,index=True)\n", + "print(\"CCLE:\",\"genes:\",ccle.shape[0],\"samples\",ccle.shape[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GDSC\n", + "Assume that the supplementary file with gene-level CN has been downloaded:\n", + "\n", + "wget \n", + "\n", + "GDSC provides gene-level integer estimated CN, max. and min. CN over all segments covering a gene. 
In order to make it comparable with TCGA and CCLE, we divide estimated CN by CN of copy-neutral state and log2-transform it. \n", + "\n", + "1) Copy-neutral state was defined from average ploidy, as median of integer CN values in non-disrupted genes.\n", + "\n", + "2) Compute log2(CN/neutral-CN) for min and max CN; keep the value with most extreme estimate\n", + "\n", + "3) Replace estimates below thresholds with zeroes. \n", + "\n", + "\n", + "GDSC uses 4 comma-separated values for gene-level CN (max_cn,min_cn,zygosity,disruption): e.g. (from \"legend\" tab)\n", + "\n", + "2,2,H,-\tGene resides on a single genomic segment in a diploid region of the genome.\n", + "2,0,L,D\tGene spans multiple segments, highest copy number is 2 but part of the coding sequence is homozygously deleted, the gene is disrupted.\n", + "13,13,H,-\tGene resides on a single genomic segment of copy number 13 in a heterozygous part of the genome (amplification).\n", + "14,12,L,D\tGene spans multiple genomic segments all of which are amplified to 12 or more copies, some or all segments have LOH, the gene is disrupted.\n", + "0,0,0,-\tComplete gene sequence falls within a homozygous deletion.\n", + "-1,-1,-,- gene level CN not assigned\n", + "\n", + "* min and max CN are integers \n", + "* zygosity - can be L (LOH in any overlapping segment) or H (heterozygous) or 0 (homozygous deletion of the whole gene) or - (undefined)\n", + "* disruption - D (if disrupted) or \"-\" (not disrupted) \n", + "\n", + "Average ploidies of cell lines were downloaded from COSMIC:\n", + "\n", + "wget https://cog.sanger.ac.uk/cosmic/GRCh37/cell_lines/v86/PICNIC_average_ploidies.tsv?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1540792525&Signature=mcSB6oFv%2BXCF4%2Fezm4a3Ds1JXo4%3D\n", + "\n", + "wget ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/release-7.0/Gene_level_CN.xlsx\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + 
"<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>gene</th>\n", + " <th>chr</th>\n", + " <th>start</th>\n", + " <th>stop</th>\n", + " <th>201T</th>\n", + " <th>22RV1</th>\n", + " <th>23132-87</th>\n", + " <th>42-MG-BA</th>\n", + " <th>451Lu</th>\n", + " <th>5637</th>\n", + " <th>...</th>\n", + " <th>WSU-NHL</th>\n", + " <th>YAPC</th>\n", + " <th>YH-13</th>\n", + " <th>YKG-1</th>\n", + " <th>YMB-1-E</th>\n", + " <th>YT</th>\n", + " <th>ZR-75-30</th>\n", + " <th>huH-1</th>\n", + " <th>no-10</th>\n", + " <th>no-11</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1287381</td>\n", + " <td>924100</td>\n", + " <td>910924</td>\n", + " <td>687561</td>\n", + " <td>1287706</td>\n", + " <td>687452</td>\n", + " <td>...</td>\n", + " <td>909785</td>\n", + " <td>909904</td>\n", + " <td>909905</td>\n", + " <td>687592</td>\n", + " <td>1303911</td>\n", + " <td>946358</td>\n", + " <td>909907</td>\n", + " <td>1298146</td>\n", + " <td>908452</td>\n", + " <td>908450</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>DDX11L1</td>\n", + " <td>1</td>\n", + " <td>11869.0</td>\n", + " <td>14412.0</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>...</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " 
<td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>WASH7P</td>\n", + " <td>1</td>\n", + " <td>14363.0</td>\n", + " <td>29806.0</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>...</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " <td>-1,-1,-,-</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 1000 columns</p>\n", + "</div>" + ], + "text/plain": [ + " gene chr start stop 201T 22RV1 23132-87 42-MG-BA \\\n", + "0 NaN NaN NaN NaN 1287381 924100 910924 687561 \n", + "1 DDX11L1 1 11869.0 14412.0 -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "2 WASH7P 1 14363.0 29806.0 -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "\n", + " 451Lu 5637 ... WSU-NHL YAPC YH-13 \\\n", + "0 1287706 687452 ... 909785 909904 909905 \n", + "1 -1,-1,-,- -1,-1,-,- ... -1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "2 -1,-1,-,- -1,-1,-,- ... 
-1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "\n", + " YKG-1 YMB-1-E YT ZR-75-30 huH-1 no-10 no-11 \n", + "0 687592 1303911 946358 909907 1298146 908452 908450 \n", + "1 -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "2 -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- -1,-1,-,- \n", + "\n", + "[3 rows x 1000 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "GDSC_CNA = \"/home/olya/SFU/Hossein/GDSC/Gene_level_CN.xlsx\"\n", + "\n", + "gdsc = pd.read_excel(GDSC_CNA,\"Gene_level_CN\")\n", + "gdsc.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "25 gene IDs excluded due to string to datetime conversion in Excel.\n", + "Strings containing duplicated gene IDs: 0\n" + ] + } + ], + "source": [ + "gdsc.set_index(\"gene\",inplace = True)\n", + "gdsc.drop([\"chr\",\"start\",\"stop\"],inplace=True,axis=1)\n", + "gdsc.columns = gdsc.iloc[0,:]\n", + "gdsc = gdsc.iloc[1:,:]\n", + "gdsc.columns.name = None\n", + "# replace 2001-12-01 with DEC1 and remove gene names converted to datetimes\n", + "gdsc.index.values[37778] = \"DEC1\"\n", + "df_size = gdsc.shape[0]\n", + "ndxs=pd.Series(gdsc.index).apply(lambda x : type(x) == unicode or type(x) == str)\n", + "gdsc = gdsc.loc[gdsc.index.values[ndxs[ndxs].index],:]\n", + "print(df_size - gdsc.shape[0],\"gene IDs excluded due to string to datetime conversion in Excel.\")\n", + "\n", + "gdsc.index.name = \"gene_id\"\n", + "ids = gdsc.index\n", + "ids = list(set(ids[ids.duplicated()]))\n", + "print(\"Strings containing duplicated gene IDs:\",gdsc.loc[ids,:].shape[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distribution of average ploidies in GDSC\n", + "\n", + "We compared the average ploidies reported in PICNIC_average_ploidies.tsv (provided by COSMIC) with the average and median ploidies estimated from the gene-level CN profiles." + ] + }, + { + 
"cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1020\n", + "1016\n" + ] + } + ], + "source": [ + "GDSC_Ploidies = \"/home/olya/SFU/Hossein/GDSC/PICNIC_average_ploidies.tsv\"\n", + "GDSC_Ploidies = pd.read_csv(GDSC_Ploidies,sep = \"\\t\")\n", + "GDSC_Ploidies.drop(\"#sample_name\",axis = 1, inplace= True)\n", + "GDSC_Ploidies.set_index(\"sample_id\",inplace=True)\n", + "print(GDSC_Ploidies.shape[0])\n", + "GDSC_Ploidies.dropna(inplace=True)\n", + "print(GDSC_Ploidies.shape[0])\n", + "\n", + "est_ploidies = gdsc.apply(define_avg_ploidy).T\n", + "df_ploidies = pd.DataFrame.from_dict({\"est. avg. ploidy from CN profile\":est_ploidies[\"avg_pl\"],\"PICNIC avg. pl.\":GDSC_Ploidies[\"average_ploidy\"],\n", + " \"est. median. ploidy\":est_ploidies[\"median_pl\"]})\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 1440x360 with 3 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(20,5))\n", + "plt.subplot(131)\n", + "tmp = plt.hist(est_ploidies[\"avg_pl\"],bins=30)\n", + "plt.title(\"est. avg. ploidy from CN profile\")\n", + "plt.subplot(132)\n", + "tmp = plt.hist(sorted(list(GDSC_Ploidies[\"average_ploidy\"].values)),bins=30)\n", + "plt.title(\"PICNIC avg. pl.\")\n", + "plt.subplot(133)\n", + "tmp = plt.hist(est_ploidies[\"median_pl\"],bins=30)\n", + "plt.title(\"est. median ploidy\")\n", + "\n", + "tmp = df_ploidies.plot.scatter(x = \"est. avg. ploidy from CN profile\",y=\"PICNIC avg. 
pl.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# PICNIC average ploidy vs estimated copy-neutral \n", + "tmp = df_ploidies.boxplot(column=\"PICNIC avg. pl.\", by = \"est. median. ploidy\" )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Convert gene-level integer CN into log2R-like format in order to make it compatible with TCGA and CCLE\n", + "\n", + "1) Copy-neutral state was defined from average ploidy, as median of integer CN values in non-disrupted genes.\n", + "\n", + "2) Compute log2(CN/neutral-CN) for min and max CN; keep the value with most extreme estimate\n", + "\n", + "3) Replace estimates below thresholds with zeroes. " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.0" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "estimated_CN = est_ploidies[\"median_pl\"].to_dict()\n", + "estimated_CN[1287381]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>1287381</th>\n", + " <th>924100</th>\n", + " <th>910924</th>\n", + " <th>687561</th>\n", + " 
<th>1287706</th>\n", + " <th>687452</th>\n", + " <th>906798</th>\n", + " <th>906797</th>\n", + " <th>906800</th>\n", + " <th>910922</th>\n", + " <th>...</th>\n", + " <th>909785</th>\n", + " <th>909904</th>\n", + " <th>909905</th>\n", + " <th>687592</th>\n", + " <th>1303911</th>\n", + " <th>946358</th>\n", + " <th>909907</th>\n", + " <th>1298146</th>\n", + " <th>908452</th>\n", + " <th>908450</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>5S_rRNA</th>\n", + " <td>-4.320000</td>\n", + " <td>1.807355</td>\n", + " <td>2.0</td>\n", + " <td>-0.415037</td>\n", + " <td>-4.32</td>\n", + " <td>-4.320000</td>\n", + " <td>-1.00</td>\n", + " <td>2.807355</td>\n", + " <td>2.169925</td>\n", + " <td>-1.00</td>\n", + " <td>...</td>\n", + " <td>-4.32</td>\n", + " <td>2.222392</td>\n", + " <td>-4.32</td>\n", + " <td>-4.320000</td>\n", + " <td>-4.32</td>\n", + " <td>1.807355</td>\n", + " <td>-4.320000</td>\n", + " <td>1.736966</td>\n", + " <td>2.169925</td>\n", + " <td>-1.584963</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5_8S_rRNA</th>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.00</td>\n", + " <td>-0.584963</td>\n", + " <td>0.00</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.00</td>\n", + " <td>...</td>\n", + " <td>0.00</td>\n", + " <td>0.415037</td>\n", + " <td>0.00</td>\n", + " <td>-0.584963</td>\n", + " <td>-4.32</td>\n", + " <td>0.000000</td>\n", + " <td>-0.415037</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.584963</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7SK</th>\n", + " <td>1.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>-4.320000</td>\n", + " <td>-4.32</td>\n", + " <td>-0.584963</td>\n", + " <td>-4.32</td>\n", + " <td>0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>-4.32</td>\n", + " <td>...</td>\n", + " <td>-4.32</td>\n", + " <td>0.000000</td>\n", + " <td>-4.32</td>\n", + " 
<td>-4.320000</td>\n", + " <td>-4.32</td>\n", + " <td>-0.415037</td>\n", + " <td>-4.320000</td>\n", + " <td>-4.320000</td>\n", + " <td>0.000000</td>\n", + " <td>-4.320000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 996 columns</p>\n", + "</div>" + ], + "text/plain": [ + " 1287381 924100 910924 687561 1287706 687452 906798 \\\n", + "5S_rRNA -4.320000 1.807355 2.0 -0.415037 -4.32 -4.320000 -1.00 \n", + "5_8S_rRNA -0.584963 0.000000 0.0 -0.415037 0.00 -0.584963 0.00 \n", + "7SK 1.000000 0.000000 0.0 -4.320000 -4.32 -0.584963 -4.32 \n", + "\n", + " 906797 906800 910922 ... 909785 909904 909905 \\\n", + "5S_rRNA 2.807355 2.169925 -1.00 ... -4.32 2.222392 -4.32 \n", + "5_8S_rRNA 0.000000 0.000000 0.00 ... 0.00 0.415037 0.00 \n", + "7SK 0.584963 0.000000 -4.32 ... -4.32 0.000000 -4.32 \n", + "\n", + " 687592 1303911 946358 909907 1298146 908452 908450 \n", + "5S_rRNA -4.320000 -4.32 1.807355 -4.320000 1.736966 2.169925 -1.584963 \n", + "5_8S_rRNA -0.584963 -4.32 0.000000 -0.415037 0.000000 0.000000 -0.584963 \n", + "7SK -4.320000 -4.32 -0.415037 -4.320000 -4.320000 0.000000 -4.320000 \n", + "\n", + "[3 rows x 996 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gdsc = gdsc.apply(lambda x : CN2log2R(x,estimated_CN[x.name] ))\n", + "# drop genes without any determined value\n", + "gdsc = gdsc.dropna(axis=0,how=\"all\")\n", + "# fill with zeroes the remaining ones\n", + "gdsc.fillna(0,inplace=True)\n", + "gdsc.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "gdsc = gdsc.applymap(lambda x : clean_logR(x, pos_seg_mean_thr, neg_seg_mean_thr))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ok: no empty rows detected\n", + "Ok: no duplicated pairs detected\n", + "Ok: All Symbol rows are not 
empty.\n", + "Ok: All Symbol are mapped to GeneID\n", + "16 Symbol mapped to multiple GeneID\n", + "Ok: All GeneID are unique\n", + "59266 Symbol can be mapped directly to GeneID\n" + ] + } + ], + "source": [ + "NCBI = pd.read_csv(root_dir+\"Homo_sapiens.gene_info\",sep = \"\\t\")\n", + "NCBI = NCBI[[\"#tax_id\",\"GeneID\",\"Symbol\",\"Synonyms\",\"type_of_gene\"]]\n", + "NCBI = NCBI.loc[NCBI[\"#tax_id\"] == 9606]\n", + "NCBI = NCBI.loc[NCBI[\"type_of_gene\"] != \"unknown\"]\n", + "ncbi_symbols = parse_mapping_table(NCBI, \"Symbol\",\"GeneID\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ok: no empty rows detected\n", + "Ok: no duplicated pairs detected\n", + "Ok: All Synonyms rows are not empty.\n", + "Ok: All Synonyms are mapped to GeneID\n", + "3145 Synonyms mapped to multiple GeneID\n", + "49179 different Synonyms mapped to the same GeneID\n", + "10839 Synonyms can be mapped directly to GeneID\n" + ] + } + ], + "source": [ + "ncbi_synonyms = expand(NCBI[[\"Synonyms\",\"GeneID\"]],column=\"Synonyms\",sep=\"|\") \n", + "ncbi_synonyms = parse_mapping_table(ncbi_synonyms, \"Synonyms\",\"GeneID\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mapped: 24545 \n", + "\tdirectly via main_mapper 22363 \n", + "\tvia alternative mapper 766 \n", + "\tvia one of multiple synonyms in alternative mapper 1416 \n", + "\tLOC 0 \n", + "Unmapped: 21587 \n", + "\trecognized symbols without Entrez ID 0 \n", + "\tmultiple query_ids map to the same target_id 0 \n", + "\tquery_ids map to multiple target_ids in the main mapper 0 \n", + "\tquery_ids map to multiple target_ids in the alternative mapper 76 \n", + "\tLOC not found in Entrez 0 \n", + "\tNot found at all: 21511\n", + "Warning: query IDs mapping to duplicated target IDs in mapping table: 156\n", + 
"Warning: query IDs not mapped to any target IDs excluded: 21587\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/olya/miniconda2/lib/python2.7/site-packages/pandas/core/frame.py:3781: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " return super(DataFrame, self).rename(**kwargs)\n", + "IDs mapped to multiple target IDs are kept:\n", + " [143872, 286464, 140290, 414212, 414213, 51463, 642826, 84631, 574445, 399761, 100132115, 647060, 284565, 6551, 161176, 341019, 4253, 9502, 442416, 51236, 643749, 54438, 728113, 100302179, 414761, 29099, 729438, 256815, 10160, 645425, 653234, 644019, 26165, 3255, 644509, 2749, 653505, 653067, 643479, 100462820, 100418977, 26824, 79817, 6218, 728695, 100034743, 221262, 647507, 677844, 728917, 26583, 100289124, 84316, 200030, 768096, 642658, 23523, 401508, 23334, 119016, 106478953, 84458, 1517, 246126, 26095, 100033392, 92017, 374, 26871, 100132948, 125050, 387707, 653308, 79741, 728798]\n", + "mapper.py:204: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " df.sort_index(inplace=True)\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>1287381</th>\n", + " <th>924100</th>\n", + " <th>910924</th>\n", + " 
<th>687561</th>\n", + " <th>1287706</th>\n", + " <th>687452</th>\n", + " <th>906798</th>\n", + " <th>906797</th>\n", + " <th>906800</th>\n", + " <th>910922</th>\n", + " <th>...</th>\n", + " <th>909785</th>\n", + " <th>909904</th>\n", + " <th>909905</th>\n", + " <th>687592</th>\n", + " <th>1303911</th>\n", + " <th>946358</th>\n", + " <th>909907</th>\n", + " <th>1298146</th>\n", + " <th>908452</th>\n", + " <th>908450</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.584963</td>\n", + " <td>0.415037</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>0.415037</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.736966</td>\n", + " <td>0.321928</td>\n", + " <td>0.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.736966</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>-0.584963</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>-0.415037</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " 
<td>0.415037</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 996 columns</p>\n", + "</div>" + ], + "text/plain": [ + " 1287381 924100 910924 687561 1287706 687452 906798 \\\n", + "1 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 \n", + "2 0.000000 0.584963 0.0 -0.415037 0.321928 0.000000 0.584963 \n", + "9 -0.584963 0.584963 0.0 -0.415037 -0.415037 -0.584963 0.000000 \n", + "\n", + " 906797 906800 910922 ... 909785 909904 909905 \\\n", + "1 0.584963 0.0 0.0 ... 0.0 -0.584963 0.584963 \n", + "2 0.584963 0.0 0.0 ... 0.0 -0.584963 0.000000 \n", + "9 0.000000 0.0 0.0 ... 0.0 -0.584963 0.000000 \n", + "\n", + " 687592 1303911 946358 909907 1298146 908452 908450 \n", + "1 0.415037 0.000000 0.000000 0.0 0.000000 0.584963 0.415037 \n", + "2 0.000000 0.736966 0.321928 0.0 -0.584963 0.000000 0.736966 \n", + "9 0.415037 0.000000 0.000000 -1.0 -0.584963 0.000000 0.000000 \n", + "\n", + "[3 rows x 996 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gdsc,query2target,not_mapped = apply_mappers(gdsc, ncbi_symbols, ncbi_synonyms, verbose = True,handle_duplicates = \"keep\")\n", + "gdsc.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "75 duplicated IDs in 156 rows found.\n", + "duplicate rows removed due to low correlation of duplicated profiles 25\n", + "Merged 131 duplicated rows into 63 rows\n" + ] + } + ], + "source": [ + "gdsc = handle_dups(gdsc,corr_thr = 0.75)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " 
vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>683665</th>\n", + " <th>683667</th>\n", + " <th>684052</th>\n", + " <th>684055</th>\n", + " <th>684057</th>\n", + " <th>684059</th>\n", + " <th>684062</th>\n", + " <th>684072</th>\n", + " <th>684681</th>\n", + " <th>687448</th>\n", + " <th>...</th>\n", + " <th>1659818</th>\n", + " <th>1659819</th>\n", + " <th>1659823</th>\n", + " <th>1659928</th>\n", + " <th>1659929</th>\n", + " <th>1660034</th>\n", + " <th>1660035</th>\n", + " <th>1660036</th>\n", + " <th>1674021</th>\n", + " <th>1789883</th>\n", + " </tr>\n", + " <tr>\n", + " <th>gene_id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>-0.415037</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.000000</td>\n", + " <td>0.415037</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.0</td>\n", + " <td>-0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.321928</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.584963</td>\n", + 
" <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.584963</td>\n", + " <td>-0.415037</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>0.0</td>\n", + " <td>0.321928</td>\n", + " <td>0.0</td>\n", + " <td>0.584963</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>-0.584963</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>-1.0</td>\n", + " <td>-1.584963</td>\n", + " <td>0.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>0.0</td>\n", + " <td>0.321928</td>\n", + " <td>0.0</td>\n", + " <td>0.584963</td>\n", + " <td>0.584963</td>\n", + " <td>0.0</td>\n", + " <td>0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>-0.584963</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>-1.0</td>\n", + " <td>-1.584963</td>\n", + " <td>0.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>0.0</td>\n", + " <td>-1.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>-1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.321928</td>\n", + " <td>0.0</td>\n", + " <td>-0.415037</td>\n", + " <td>0.0</td>\n", + " <td>0.415037</td>\n", + " 
<td>0.584963</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 996 columns</p>\n", + "</div>" + ], + "text/plain": [ + " 683665 683667 684052 684055 684057 684059 684062 \\\n", + "gene_id \n", + "1 0.0 0.000000 0.0 0.000000 -0.415037 0.0 -0.415037 \n", + "2 0.0 0.000000 0.0 0.584963 0.000000 0.0 0.000000 \n", + "9 0.0 0.321928 0.0 0.584963 0.584963 0.0 0.321928 \n", + "10 0.0 0.321928 0.0 0.584963 0.584963 0.0 0.321928 \n", + "12 0.0 -1.000000 0.0 0.000000 -1.000000 0.0 -0.415037 \n", + "\n", + " 684072 684681 687448 ... 1659818 1659819 1659823 \\\n", + "gene_id ... \n", + "1 0.000000 0.415037 0.0 ... 0.0 0.0 0.000000 \n", + "2 0.584963 0.000000 0.0 ... 0.0 0.0 0.000000 \n", + "9 0.000000 -0.584963 0.0 ... 0.0 0.0 -0.415037 \n", + "10 0.000000 -0.584963 0.0 ... 0.0 0.0 -0.415037 \n", + "12 0.000000 0.000000 0.0 ... -1.0 0.0 0.321928 \n", + "\n", + " 1659928 1659929 1660034 1660035 1660036 1674021 1789883 \n", + "gene_id \n", + "1 0.0 -0.415037 0.0 -0.584963 0.000000 0.000000 0.321928 \n", + "2 0.0 0.000000 0.0 0.000000 0.000000 0.584963 -0.415037 \n", + "9 0.0 0.000000 -1.0 -1.584963 0.000000 -1.000000 -1.000000 \n", + "10 0.0 0.000000 -1.0 -1.584963 0.000000 -1.000000 -1.000000 \n", + "12 0.0 -0.415037 0.0 0.415037 0.584963 0.000000 0.000000 \n", + "\n", + "[5 rows x 996 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gdsc.index.name = \"gene_id\"\n", + "gdsc = gdsc.T.sort_index().T\n", + "gdsc.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "gdsc.to_csv(preprocessed_dir+\"/\"+\"GDSC\"+\".Segment_Mean.CNA.tsv\",\n", + " sep = \"\\t\",header=True,index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PDX \n", + "\n", + "For PDX dataset only gene-level estimated copy-number (non-integer) reported. 
\n", + "From ploidy distributions, calculated as average over all genes we concluded that CN estimates were called under assumption that copy-neutral state of each xenograft corresponds CN = 2.\n", + "\n", + "\n", + "For gene ID conversion we used the same approach as for RNA-seq." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(23852, 375)\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>X-1004</th>\n", + " <th>X-1008</th>\n", + " <th>X-1027</th>\n", + " <th>X-1095</th>\n", + " <th>X-1119</th>\n", + " <th>X-1156</th>\n", + " <th>X-1167</th>\n", + " <th>X-1169</th>\n", + " <th>X-1172</th>\n", + " <th>X-1173</th>\n", + " <th>...</th>\n", + " <th>X-5694</th>\n", + " <th>X-5696</th>\n", + " <th>X-5713</th>\n", + " <th>X-5717</th>\n", + " <th>X-5727</th>\n", + " <th>X-5739</th>\n", + " <th>X-5808</th>\n", + " <th>X-5959</th>\n", + " <th>X-5975</th>\n", + " <th>X-6047</th>\n", + " </tr>\n", + " <tr>\n", + " <th>Sample</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>A1BG</th>\n", + " <td>2.58</td>\n", + " <td>1.60</td>\n", + " <td>2.17</td>\n", + 
" <td>2.08</td>\n", + " <td>2.00</td>\n", + " <td>3.94</td>\n", + " <td>2.04</td>\n", + " <td>11.39</td>\n", + " <td>2.17</td>\n", + " <td>2.01</td>\n", + " <td>...</td>\n", + " <td>2.08</td>\n", + " <td>2.10</td>\n", + " <td>2.14</td>\n", + " <td>2.95</td>\n", + " <td>2.06</td>\n", + " <td>2.07</td>\n", + " <td>1.99</td>\n", + " <td>2.07</td>\n", + " <td>1.43</td>\n", + " <td>2.03</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1BG-AS1</th>\n", + " <td>2.58</td>\n", + " <td>1.60</td>\n", + " <td>2.17</td>\n", + " <td>2.08</td>\n", + " <td>2.00</td>\n", + " <td>3.94</td>\n", + " <td>2.04</td>\n", + " <td>11.39</td>\n", + " <td>2.17</td>\n", + " <td>2.01</td>\n", + " <td>...</td>\n", + " <td>2.08</td>\n", + " <td>2.10</td>\n", + " <td>2.14</td>\n", + " <td>2.95</td>\n", + " <td>2.06</td>\n", + " <td>2.07</td>\n", + " <td>1.99</td>\n", + " <td>2.07</td>\n", + " <td>1.43</td>\n", + " <td>2.03</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1CF</th>\n", + " <td>2.87</td>\n", + " <td>2.97</td>\n", + " <td>2.01</td>\n", + " <td>2.06</td>\n", + " <td>2.10</td>\n", + " <td>1.58</td>\n", + " <td>2.01</td>\n", + " <td>1.64</td>\n", + " <td>1.89</td>\n", + " <td>1.99</td>\n", + " <td>...</td>\n", + " <td>2.04</td>\n", + " <td>0.97</td>\n", + " <td>1.58</td>\n", + " <td>2.08</td>\n", + " <td>1.95</td>\n", + " <td>1.92</td>\n", + " <td>1.54</td>\n", + " <td>1.28</td>\n", + " <td>1.33</td>\n", + " <td>2.10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A2LD1</th>\n", + " <td>5.74</td>\n", + " <td>1.64</td>\n", + " <td>2.06</td>\n", + " <td>2.01</td>\n", + " <td>2.07</td>\n", + " <td>1.74</td>\n", + " <td>2.06</td>\n", + " <td>1.59</td>\n", + " <td>1.40</td>\n", + " <td>2.53</td>\n", + " <td>...</td>\n", + " <td>2.03</td>\n", + " <td>2.07</td>\n", + " <td>2.25</td>\n", + " <td>2.00</td>\n", + " <td>1.01</td>\n", + " <td>2.00</td>\n", + " <td>1.08</td>\n", + " <td>1.85</td>\n", + " <td>1.93</td>\n", + " <td>1.45</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>4 rows × 375 
columns</p>\n", + "</div>" + ], + "text/plain": [ + " X-1004 X-1008 X-1027 X-1095 X-1119 X-1156 X-1167 X-1169 \\\n", + "Sample \n", + "A1BG 2.58 1.60 2.17 2.08 2.00 3.94 2.04 11.39 \n", + "A1BG-AS1 2.58 1.60 2.17 2.08 2.00 3.94 2.04 11.39 \n", + "A1CF 2.87 2.97 2.01 2.06 2.10 1.58 2.01 1.64 \n", + "A2LD1 5.74 1.64 2.06 2.01 2.07 1.74 2.06 1.59 \n", + "\n", + " X-1172 X-1173 ... X-5694 X-5696 X-5713 X-5717 X-5727 \\\n", + "Sample ... \n", + "A1BG 2.17 2.01 ... 2.08 2.10 2.14 2.95 2.06 \n", + "A1BG-AS1 2.17 2.01 ... 2.08 2.10 2.14 2.95 2.06 \n", + "A1CF 1.89 1.99 ... 2.04 0.97 1.58 2.08 1.95 \n", + "A2LD1 1.40 2.53 ... 2.03 2.07 2.25 2.00 1.01 \n", + "\n", + " X-5739 X-5808 X-5959 X-5975 X-6047 \n", + "Sample \n", + "A1BG 2.07 1.99 2.07 1.43 2.03 \n", + "A1BG-AS1 2.07 1.99 2.07 1.43 2.03 \n", + "A1CF 1.92 1.54 1.28 1.33 2.10 \n", + "A2LD1 2.00 1.08 1.85 1.93 1.45 \n", + "\n", + "[4 rows x 375 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "PDX_xls = \"/home/olya/SFU/Hossein/PDX/nm.3954-S2.xlsx\"\n", + "pdx = pd.read_excel(PDX_xls,\"copy number\")\n", + "pdx.set_index(\"Sample\",drop=True,inplace=True)\n", + "focal = pdx.T[\"FocalCNScore\"]\n", + "pdx.drop([\"ArmLevelCNScore\",\"FocalCNScore\"],inplace = True)\n", + "print(pdx.shape)\n", + "pdx.head(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Strings containing duplicated gene IDs: 544\n", + "268 duplicated IDs in 544 rows found.\n", + "duplicate rows removed due to low correlation of duplicated profiles 134\n", + "Merged 410 duplicated rows into 205 rows\n" + ] + } + ], + "source": [ + "pdx.index.name = \"gene_id\"\n", + "ids = pdx.index\n", + "ids = list(set(ids[ids.duplicated()]))\n", + "print(\"Strings containing duplicated gene IDs:\",pdx.loc[ids,:].shape[0])\n", + "pdx = handle_dups(pdx,corr_thr = 0.75)\n" + ] + }, + { + 
"cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5,0,'CN Averaged over all')" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "average_ploidies = pdx.apply(np.mean)\n", + "p = plt.hist(average_ploidies,bins=30)\n", + "plt.title(\"Ploidy in PDX samples\")\n", + "plt.ylabel(\"n samples\")\n", + "plt.xlabel(\"CN Averaged over all\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>X-1004</th>\n", + " <th>X-1008</th>\n", + " <th>X-1027</th>\n", + " <th>X-1095</th>\n", + " <th>X-1119</th>\n", + " <th>X-1156</th>\n", + " <th>X-1167</th>\n", + " <th>X-1169</th>\n", + " <th>X-1172</th>\n", + " <th>X-1173</th>\n", + " <th>...</th>\n", + " <th>X-5694</th>\n", + " <th>X-5696</th>\n", + " <th>X-5713</th>\n", + " <th>X-5717</th>\n", + " <th>X-5727</th>\n", + " <th>X-5739</th>\n", + " <th>X-5808</th>\n", + " <th>X-5959</th>\n", + " <th>X-5975</th>\n", + " <th>X-6047</th>\n", + " </tr>\n", + " <tr>\n", + " <th>gene_id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " 
<th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>A1BG</th>\n", + " <td>0.367371</td>\n", + " <td>-0.321928</td>\n", + " <td>0.117695</td>\n", + " <td>0.056584</td>\n", + " <td>0.000000</td>\n", + " <td>0.978196</td>\n", + " <td>0.028569</td>\n", + " <td>2.509696</td>\n", + " <td>0.117695</td>\n", + " <td>0.007196</td>\n", + " <td>...</td>\n", + " <td>0.056584</td>\n", + " <td>0.070389</td>\n", + " <td>0.097611</td>\n", + " <td>0.560715</td>\n", + " <td>0.042644</td>\n", + " <td>0.049631</td>\n", + " <td>-0.007232</td>\n", + " <td>0.049631</td>\n", + " <td>-0.483985</td>\n", + " <td>0.021480</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1BG-AS1</th>\n", + " <td>0.367371</td>\n", + " <td>-0.321928</td>\n", + " <td>0.117695</td>\n", + " <td>0.056584</td>\n", + " <td>0.000000</td>\n", + " <td>0.978196</td>\n", + " <td>0.028569</td>\n", + " <td>2.509696</td>\n", + " <td>0.117695</td>\n", + " <td>0.007196</td>\n", + " <td>...</td>\n", + " <td>0.056584</td>\n", + " <td>0.070389</td>\n", + " <td>0.097611</td>\n", + " <td>0.560715</td>\n", + " <td>0.042644</td>\n", + " <td>0.049631</td>\n", + " <td>-0.007232</td>\n", + " <td>0.049631</td>\n", + " <td>-0.483985</td>\n", + " <td>0.021480</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1CF</th>\n", + " <td>0.521051</td>\n", + " <td>0.570463</td>\n", + " <td>0.007196</td>\n", + " <td>0.042644</td>\n", + " <td>0.070389</td>\n", + " <td>-0.340075</td>\n", + " <td>0.007196</td>\n", + " <td>-0.286304</td>\n", + " <td>-0.081614</td>\n", + " <td>-0.007232</td>\n", + " <td>...</td>\n", + " <td>0.028569</td>\n", + " <td>-1.043943</td>\n", + " <td>-0.340075</td>\n", + " <td>0.056584</td>\n", + " <td>-0.036526</td>\n", + " <td>-0.058894</td>\n", + " <td>-0.377070</td>\n", + " <td>-0.643856</td>\n", + " <td>-0.588574</td>\n", + " <td>0.070389</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 375 
columns</p>\n", + "</div>" + ], + "text/plain": [ + " X-1004 X-1008 X-1027 X-1095 X-1119 X-1156 \\\n", + "gene_id \n", + "A1BG 0.367371 -0.321928 0.117695 0.056584 0.000000 0.978196 \n", + "A1BG-AS1 0.367371 -0.321928 0.117695 0.056584 0.000000 0.978196 \n", + "A1CF 0.521051 0.570463 0.007196 0.042644 0.070389 -0.340075 \n", + "\n", + " X-1167 X-1169 X-1172 X-1173 ... X-5694 \\\n", + "gene_id ... \n", + "A1BG 0.028569 2.509696 0.117695 0.007196 ... 0.056584 \n", + "A1BG-AS1 0.028569 2.509696 0.117695 0.007196 ... 0.056584 \n", + "A1CF 0.007196 -0.286304 -0.081614 -0.007232 ... 0.028569 \n", + "\n", + " X-5696 X-5713 X-5717 X-5727 X-5739 X-5808 \\\n", + "gene_id \n", + "A1BG 0.070389 0.097611 0.560715 0.042644 0.049631 -0.007232 \n", + "A1BG-AS1 0.070389 0.097611 0.560715 0.042644 0.049631 -0.007232 \n", + "A1CF -1.043943 -0.340075 0.056584 -0.036526 -0.058894 -0.377070 \n", + "\n", + " X-5959 X-5975 X-6047 \n", + "gene_id \n", + "A1BG 0.049631 -0.483985 0.021480 \n", + "A1BG-AS1 0.049631 -0.483985 0.021480 \n", + "A1CF -0.643856 -0.588574 0.070389 \n", + "\n", + "[3 rows x 375 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdx = pdx.applymap(lambda x: np.log2(x/2))\n", + "pdx.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>X-1004</th>\n", + " <th>X-1008</th>\n", + " <th>X-1027</th>\n", + " <th>X-1095</th>\n", + " <th>X-1119</th>\n", + " <th>X-1156</th>\n", + " <th>X-1167</th>\n", + " 
<th>X-1169</th>\n", + " <th>X-1172</th>\n", + " <th>X-1173</th>\n", + " <th>...</th>\n", + " <th>X-5694</th>\n", + " <th>X-5696</th>\n", + " <th>X-5713</th>\n", + " <th>X-5717</th>\n", + " <th>X-5727</th>\n", + " <th>X-5739</th>\n", + " <th>X-5808</th>\n", + " <th>X-5959</th>\n", + " <th>X-5975</th>\n", + " <th>X-6047</th>\n", + " </tr>\n", + " <tr>\n", + " <th>gene_id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>A1BG</th>\n", + " <td>0.367371</td>\n", + " <td>-0.321928</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.978196</td>\n", + " <td>0.0</td>\n", + " <td>2.509696</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.560715</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.00000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.483985</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1BG-AS1</th>\n", + " <td>0.367371</td>\n", + " <td>-0.321928</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.978196</td>\n", + " <td>0.0</td>\n", + " <td>2.509696</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.560715</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.00000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.483985</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>A1CF</th>\n", + " <td>0.521051</td>\n", + " <td>0.570463</td>\n", + " <td>0.0</td>\n", 
+ " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>-0.340075</td>\n", + " <td>0.0</td>\n", + " <td>-0.286304</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>-1.043943</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>-0.37707</td>\n", + " <td>-0.643856</td>\n", + " <td>-0.588574</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 375 columns</p>\n", + "</div>" + ], + "text/plain": [ + " X-1004 X-1008 X-1027 X-1095 X-1119 X-1156 X-1167 \\\n", + "gene_id \n", + "A1BG 0.367371 -0.321928 0.0 0.0 0.0 0.978196 0.0 \n", + "A1BG-AS1 0.367371 -0.321928 0.0 0.0 0.0 0.978196 0.0 \n", + "A1CF 0.521051 0.570463 0.0 0.0 0.0 -0.340075 0.0 \n", + "\n", + " X-1169 X-1172 X-1173 ... X-5694 X-5696 X-5713 \\\n", + "gene_id ... \n", + "A1BG 2.509696 0.0 0.0 ... 0.0 0.000000 0.000000 \n", + "A1BG-AS1 2.509696 0.0 0.0 ... 0.0 0.000000 0.000000 \n", + "A1CF -0.286304 0.0 0.0 ... 
0.0 -1.043943 -0.340075 \n", + "\n", + " X-5717 X-5727 X-5739 X-5808 X-5959 X-5975 X-6047 \n", + "gene_id \n", + "A1BG 0.560715 0.0 0.0 0.00000 0.000000 -0.483985 0.0 \n", + "A1BG-AS1 0.560715 0.0 0.0 0.00000 0.000000 -0.483985 0.0 \n", + "A1CF 0.000000 0.0 0.0 -0.37707 -0.643856 -0.588574 0.0 \n", + "\n", + "[3 rows x 375 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdx = pdx.applymap(lambda x : clean_logR(x, pos_seg_mean_thr, neg_seg_mean_thr))\n", + "pdx.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mapped: 23313 \n", + "\tdirectly via main_mapper 21188 \n", + "\tvia alternative mapper 466 \n", + "\tvia one of multiple synonyms in alternative mapper 926 \n", + "\tLOC 733 \n", + "Unmapped: 200 \n", + "\trecognized symbols without Entrez ID 0 \n", + "\tmultiple query_ids map to the same target_id 0 \n", + "\tquery_ids map to multiple target_ids in the main mapper 0 \n", + "\tquery_ids map to multiple target_ids in the alternative mapper 52 \n", + "\tLOC not found in Entrez 29 \n", + "\tNot found at all: 119\n", + "Warning: query IDs mapping to duplicated target IDs in mapping table: 77\n", + "Warning: query IDs not mapped to any target IDs excluded: 200\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "IDs mapped to multiple target IDs are kept:\n", + " [143872, 286464, 51463, 642826, 653067, 399761, 647060, 284565, 84631, 161176, 341019, 83869, 9502, 83871, 728113, 729438, 4253, 645425, 26165, 6218, 728695, 100132948, 100134869, 84316, 200030, 642658, 100302179, 401508, 119016, 84458, 574445, 26095, 84968, 80759, 3192, 387707, 79741]\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", 
+ " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>X-1004</th>\n", + " <th>X-1008</th>\n", + " <th>X-1027</th>\n", + " <th>X-1095</th>\n", + " <th>X-1119</th>\n", + " <th>X-1156</th>\n", + " <th>X-1167</th>\n", + " <th>X-1169</th>\n", + " <th>X-1172</th>\n", + " <th>X-1173</th>\n", + " <th>...</th>\n", + " <th>X-5694</th>\n", + " <th>X-5696</th>\n", + " <th>X-5713</th>\n", + " <th>X-5717</th>\n", + " <th>X-5727</th>\n", + " <th>X-5739</th>\n", + " <th>X-5808</th>\n", + " <th>X-5959</th>\n", + " <th>X-5975</th>\n", + " <th>X-6047</th>\n", + " </tr>\n", + " <tr>\n", + " <th>gene_id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.367371</td>\n", + " <td>-0.321928</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.978196</td>\n", + " <td>0.0</td>\n", + " <td>2.509696</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.560715</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.483985</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.761285</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.500802</td>\n", + " <td>0.0</td>\n", + " <td>0.700440</td>\n", + " <td>0.0</td>\n", + " 
<td>0.000000</td>\n", + " <td>0.201634</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.739848</td>\n", + " <td>0.0</td>\n", + " <td>0.739848</td>\n", + " <td>0.327687</td>\n", + " <td>-0.494109</td>\n", + " <td>-0.535332</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>0.761285</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.500802</td>\n", + " <td>0.0</td>\n", + " <td>0.700440</td>\n", + " <td>0.0</td>\n", + " <td>0.000000</td>\n", + " <td>0.201634</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.739848</td>\n", + " <td>0.0</td>\n", + " <td>0.739848</td>\n", + " <td>0.327687</td>\n", + " <td>-0.494109</td>\n", + " <td>-0.535332</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>3 rows × 375 columns</p>\n", + "</div>" + ], + "text/plain": [ + " X-1004 X-1008 X-1027 X-1095 X-1119 X-1156 X-1167 \\\n", + "gene_id \n", + "1 0.367371 -0.321928 0.0 0.000000 0.0 0.978196 0.0 \n", + "2 0.761285 0.000000 0.0 0.500802 0.0 0.700440 0.0 \n", + "3 0.761285 0.000000 0.0 0.500802 0.0 0.700440 0.0 \n", + "\n", + " X-1169 X-1172 X-1173 ... X-5694 X-5696 X-5713 X-5717 \\\n", + "gene_id ... \n", + "1 2.509696 0.000000 0.0 ... 0.0 0.0 0.0 0.560715 \n", + "2 0.000000 0.201634 0.0 ... 0.0 0.0 0.0 0.739848 \n", + "3 0.000000 0.201634 0.0 ... 
0.0 0.0 0.0 0.739848 \n", + "\n", + " X-5727 X-5739 X-5808 X-5959 X-5975 X-6047 \n", + "gene_id \n", + "1 0.0 0.000000 0.000000 0.000000 -0.483985 0.0 \n", + "2 0.0 0.739848 0.327687 -0.494109 -0.535332 0.0 \n", + "3 0.0 0.739848 0.327687 -0.494109 -0.535332 0.0 \n", + "\n", + "[3 rows x 375 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pdx,query2target,not_mapped = apply_mappers(pdx, ncbi_symbols, ncbi_synonyms, verbose = True,handle_duplicates = \"keep\")\n", + "pdx.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>X-1004</th>\n", + " <th>X-1008</th>\n", + " <th>X-1027</th>\n", + " <th>X-1095</th>\n", + " <th>X-1119</th>\n", + " <th>X-1156</th>\n", + " <th>X-1167</th>\n", + " <th>X-1169</th>\n", + " <th>X-1172</th>\n", + " <th>X-1173</th>\n", + " <th>...</th>\n", + " <th>X-5694</th>\n", + " <th>X-5696</th>\n", + " <th>X-5713</th>\n", + " <th>X-5717</th>\n", + " <th>X-5727</th>\n", + " <th>X-5739</th>\n", + " <th>X-5808</th>\n", + " <th>X-5959</th>\n", + " <th>X-5975</th>\n", + " <th>X-6047</th>\n", + " </tr>\n", + " <tr>\n", + " <th>gene_id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + 
" <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>143872</th>\n", + " <td>0.000000</td>\n", + " <td>0.560715</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.330973</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.350497</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.349235</td>\n", + " <td>0.000000</td>\n", + " <td>-0.321928</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.514573</td>\n", + " </tr>\n", + " <tr>\n", + " <th>143872</th>\n", + " <td>0.000000</td>\n", + " <td>0.560715</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.330973</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.350497</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.349235</td>\n", + " <td>0.000000</td>\n", + " <td>-0.321928</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.514573</td>\n", + " </tr>\n", + " <tr>\n", + " <th>286464</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>-1.321928</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>286464</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " 
<td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>-1.321928</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>286464</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>-1.321928</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>51463</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.839960</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>-0.349235</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>51463</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.839960</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " 
<td>0.438293</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>642826</th>\n", + " <td>0.608809</td>\n", + " <td>0.859970</td>\n", + " <td>0.531069</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.871844</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-1.494109</td>\n", + " <td>-0.312939</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>642826</th>\n", + " <td>0.608809</td>\n", + " <td>0.859970</td>\n", + " <td>0.531069</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.871844</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-1.494109</td>\n", + " <td>-0.312939</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>653067</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + 
" <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>653067</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>653067</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>653067</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>399761</th>\n", + " <td>0.531069</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " 
<td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.790772</td>\n", + " <td>...</td>\n", + " <td>0.367371</td>\n", + " <td>-1.535332</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.599462</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>399761</th>\n", + " <td>0.531069</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.790772</td>\n", + " <td>...</td>\n", + " <td>0.367371</td>\n", + " <td>-1.535332</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.599462</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>647060</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.312939</td>\n", + " <td>0.000000</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.621488</td>\n", + " <td>0.000000</td>\n", + " <td>-0.377070</td>\n", + " <td>-0.749038</td>\n", + " <td>-0.405451</td>\n", + " <td>0.000000</td>\n", + " <td>-0.395929</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>647060</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.312939</td>\n", + " <td>0.000000</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>...</td>\n", + " 
<td>0.000000</td>\n", + " <td>-0.621488</td>\n", + " <td>0.000000</td>\n", + " <td>-0.377070</td>\n", + " <td>-0.749038</td>\n", + " <td>-0.405451</td>\n", + " <td>0.000000</td>\n", + " <td>-0.395929</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>284565</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.570463</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.411426</td>\n", + " <td>0.448901</td>\n", + " <td>...</td>\n", + " <td>0.469886</td>\n", + " <td>-0.504305</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>284565</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.310340</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.411426</td>\n", + " <td>0.448901</td>\n", + " <td>...</td>\n", + " <td>0.469886</td>\n", + " <td>-0.524915</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84631</th>\n", + " <td>-0.823677</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>-1.074001</td>\n", + " <td>0.000000</td>\n", + " <td>-0.875672</td>\n", + " <td>-0.678072</td>\n", + " <td>0.378512</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.500802</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " 
<td>-0.545824</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84631</th>\n", + " <td>-0.823677</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>-1.074001</td>\n", + " <td>0.000000</td>\n", + " <td>-0.875672</td>\n", + " <td>-0.678072</td>\n", + " <td>0.378512</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.500802</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>161176</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.358454</td>\n", + " <td>0.250962</td>\n", + " <td>0.339137</td>\n", + " <td>...</td>\n", + " <td>-0.875672</td>\n", + " <td>0.000000</td>\n", + " <td>-0.483985</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.610433</td>\n", + " <td>0.000000</td>\n", + " <td>0.411426</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>161176</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.358454</td>\n", + " <td>0.250962</td>\n", + " <td>0.339137</td>\n", + " <td>...</td>\n", + " <td>-0.875672</td>\n", + " <td>0.000000</td>\n", + " <td>-0.483985</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.610433</td>\n", + " <td>0.000000</td>\n", + " <td>0.411426</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>341019</th>\n", + " <td>0.959770</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + 
" <td>0.000000</td>\n", + " <td>-0.304006</td>\n", + " <td>0.000000</td>\n", + " <td>-0.349235</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.666576</td>\n", + " <td>0.000000</td>\n", + " <td>-0.304006</td>\n", + " <td>-1.000000</td>\n", + " <td>-0.610433</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.444184</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>341019</th>\n", + " <td>0.959770</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.304006</td>\n", + " <td>0.000000</td>\n", + " <td>-0.349235</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.312939</td>\n", + " <td>0.000000</td>\n", + " <td>-0.304006</td>\n", + " <td>-1.000000</td>\n", + " <td>-0.610433</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.444184</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>83869</th>\n", + " <td>-3.000000</td>\n", + " <td>-3.184425</td>\n", + " <td>-3.321928</td>\n", + " <td>-3.000000</td>\n", + " <td>-2.599462</td>\n", + " <td>-3.556393</td>\n", + " <td>-2.785875</td>\n", + " <td>-0.875672</td>\n", + " <td>-2.736966</td>\n", + " <td>-2.514573</td>\n", + " <td>...</td>\n", + " <td>-2.556393</td>\n", + " <td>-3.943416</td>\n", + " <td>-2.785875</td>\n", + " <td>-2.152003</td>\n", + " <td>-3.556393</td>\n", + " <td>-3.251539</td>\n", + " <td>-2.736966</td>\n", + " <td>-3.473931</td>\n", + " <td>-3.120294</td>\n", + " <td>-3.643856</td>\n", + " </tr>\n", + " <tr>\n", + " <th>83869</th>\n", + " <td>-3.000000</td>\n", + " <td>-3.184425</td>\n", + " <td>-3.321928</td>\n", + " <td>-3.000000</td>\n", + " <td>-2.599462</td>\n", + " <td>-3.556393</td>\n", + " <td>-2.785875</td>\n", + " <td>-0.875672</td>\n", + " <td>-2.736966</td>\n", + " <td>-2.514573</td>\n", + " <td>...</td>\n", + " 
<td>-2.556393</td>\n", + " <td>-3.943416</td>\n", + " <td>-2.785875</td>\n", + " <td>-2.152003</td>\n", + " <td>-3.556393</td>\n", + " <td>-3.251539</td>\n", + " <td>-2.736966</td>\n", + " <td>-3.473931</td>\n", + " <td>-3.120294</td>\n", + " <td>-3.643856</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9502</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9502</th>\n", + " <td>0.000000</td>\n", + " <td>0.378512</td>\n", + " <td>0.000000</td>\n", + " <td>0.550901</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>0.000000</td>\n", + " <td>-0.902389</td>\n", + " <td>1.220330</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.580145</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.577767</td>\n", + " <td>-1.494109</td>\n", + " <td>-1.494109</td>\n", + " </tr>\n", + " <tr>\n", + " <th>83871</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.500802</td>\n", + " <td>0.000000</td>\n", + " <td>-0.304006</td>\n", + " <td>0.000000</td>\n", + " <td>-0.349235</td>\n", + " <td>0.327687</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.463947</td>\n", + " <td>-0.358454</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " 
<td>0.448901</td>\n", + " <td>-0.535332</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100134869</th>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.232661</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.389567</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.463947</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.790772</td>\n", + " <td>0.000000</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100134869</th>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.232661</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.389567</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.463947</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.790772</td>\n", + " <td>0.000000</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84316</th>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " 
<td>-0.349235</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-0.985645</td>\n", + " <td>-0.358454</td>\n", + " <td>-0.340075</td>\n", + " <td>-0.971431</td>\n", + " <td>-0.545824</td>\n", + " <td>0.310340</td>\n", + " <td>0.438293</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84316</th>\n", + " <td>-0.810966</td>\n", + " <td>-0.689660</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.718088</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.731183</td>\n", + " <td>0.367371</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.434403</td>\n", + " <td>1.060047</td>\n", + " <td>0.490570</td>\n", + " <td>0.000000</td>\n", + " <td>-0.632629</td>\n", + " <td>-0.655172</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>200030</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.839960</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.438293</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>200030</th>\n", + " <td>1.238787</td>\n", + " <td>1.090853</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.839960</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.438293</td>\n", + " <td>0.000000</td>\n", + " <td>0.618239</td>\n", + " <td>0.201634</td>\n", + " 
<td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.469886</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>642658</th>\n", + " <td>1.049631</td>\n", + " <td>1.358959</td>\n", + " <td>0.000000</td>\n", + " <td>0.599318</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.321928</td>\n", + " <td>0.700440</td>\n", + " <td>0.298658</td>\n", + " <td>...</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>0.769772</td>\n", + " <td>0.232661</td>\n", + " <td>1.121015</td>\n", + " <td>0.831877</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>1.121015</td>\n", + " <td>0.459432</td>\n", + " </tr>\n", + " <tr>\n", + " <th>642658</th>\n", + " <td>1.049631</td>\n", + " <td>1.358959</td>\n", + " <td>0.000000</td>\n", + " <td>0.599318</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.321928</td>\n", + " <td>0.700440</td>\n", + " <td>0.298658</td>\n", + " <td>...</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>0.769772</td>\n", + " <td>0.232661</td>\n", + " <td>1.121015</td>\n", + " <td>0.831877</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>1.121015</td>\n", + " <td>0.459432</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100302179</th>\n", + " <td>0.778209</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.201634</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.463947</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>100302179</th>\n", + " 
<td>0.778209</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.201634</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.463947</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>401508</th>\n", + " <td>0.232661</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>-0.666576</td>\n", + " <td>1.629939</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-2.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.330973</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>401508</th>\n", + " <td>0.232661</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>-0.666576</td>\n", + " <td>1.629939</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-2.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.260152</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.330973</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>119016</th>\n", + " <td>0.378512</td>\n", + " <td>0.570463</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " 
<td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.043943</td>\n", + " <td>-0.524915</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.377070</td>\n", + " <td>-0.643856</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>119016</th>\n", + " <td>0.448901</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.242977</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84458</th>\n", + " <td>0.389567</td>\n", + " <td>-0.666576</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.610433</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84458</th>\n", + " <td>0.389567</td>\n", + " <td>-0.666576</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + 
" <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.610433</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>574445</th>\n", + " <td>0.490570</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.074001</td>\n", + " <td>0.000000</td>\n", + " <td>0.238787</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>-0.823677</td>\n", + " <td>-0.957356</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>-0.985645</td>\n", + " <td>-0.454032</td>\n", + " <td>0.000000</td>\n", + " <td>-0.599462</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>574445</th>\n", + " <td>0.490570</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.074001</td>\n", + " <td>0.000000</td>\n", + " <td>0.238787</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>-0.823677</td>\n", + " <td>-0.957356</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>-0.985645</td>\n", + " <td>-0.454032</td>\n", + " <td>0.000000</td>\n", + " <td>-0.599462</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26095</th>\n", + " <td>0.531069</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.790772</td>\n", + " <td>...</td>\n", + " <td>0.269033</td>\n", + " <td>-1.043943</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26095</th>\n", + " 
<td>0.531069</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.251539</td>\n", + " <td>0.000000</td>\n", + " <td>-0.286304</td>\n", + " <td>0.000000</td>\n", + " <td>0.790772</td>\n", + " <td>...</td>\n", + " <td>0.269033</td>\n", + " <td>-1.043943</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.588574</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84968</th>\n", + " <td>-0.736966</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.570463</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.875672</td>\n", + " <td>-0.678072</td>\n", + " <td>0.378512</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>84968</th>\n", + " <td>-0.736966</td>\n", + " <td>-0.321928</td>\n", + " <td>0.000000</td>\n", + " <td>0.570463</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.875672</td>\n", + " <td>-0.678072</td>\n", + " <td>0.378512</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.545824</td>\n", + " <td>-0.535332</td>\n", + " </tr>\n", + " <tr>\n", + " <th>80759</th>\n", + " <td>-0.556393</td>\n", + " <td>0.599318</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", 
+ " <td>-0.567041</td>\n", + " <td>-0.556393</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.621488</td>\n", + " <td>-0.454032</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.621488</td>\n", + " <td>0.761285</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>80759</th>\n", + " <td>-0.556393</td>\n", + " <td>0.599318</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.567041</td>\n", + " <td>-0.556393</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.621488</td>\n", + " <td>-0.454032</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.621488</td>\n", + " <td>0.761285</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3192</th>\n", + " <td>1.350497</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.220330</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>-0.268817</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.480265</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.330973</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>1.021480</td>\n", + " <td>0.599318</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3192</th>\n", + " <td>1.350497</td>\n", + " <td>0.632268</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.220330</td>\n", + " <td>0.000000</td>\n", + " <td>0.448901</td>\n", + " <td>-0.268817</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.480265</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " 
<td>-0.330973</td>\n", + " <td>0.250962</td>\n", + " <td>0.000000</td>\n", + " <td>1.021480</td>\n", + " <td>0.599318</td>\n", + " </tr>\n", + " <tr>\n", + " <th>387707</th>\n", + " <td>0.389567</td>\n", + " <td>-0.666576</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.610433</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>387707</th>\n", + " <td>0.389567</td>\n", + " <td>-0.666576</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.340075</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>-1.014500</td>\n", + " <td>-0.473931</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.386468</td>\n", + " <td>-0.610433</td>\n", + " <td>-0.577767</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>79741</th>\n", + " <td>0.000000</td>\n", + " <td>0.589763</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.242977</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>...</td>\n", + " <td>-0.798366</td>\n", + " <td>-0.985645</td>\n", + " <td>-0.358454</td>\n", + " <td>0.000000</td>\n", + " <td>0.700440</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>-0.588574</td>\n", + " <td>0.459432</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>79741</th>\n", + " <td>0.000000</td>\n", 
+ " <td>0.589763</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-0.242977</td>\n", + " <td>0.000000</td>\n", + " <td>0.207893</td>\n", + " <td>0.000000</td>\n", + " <td>0.220330</td>\n", + " <td>...</td>\n", + " <td>-0.798366</td>\n", + " <td>-0.985645</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.700440</td>\n", + " <td>0.000000</td>\n", + " <td>-0.367732</td>\n", + " <td>-0.588574</td>\n", + " <td>0.459432</td>\n", + " <td>0.269033</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>77 rows × 375 columns</p>\n", + "</div>" + ], + "text/plain": [ + " X-1004 X-1008 X-1027 X-1095 X-1119 X-1156 \\\n", + "gene_id \n", + "143872 0.000000 0.560715 0.000000 0.000000 0.000000 -0.330973 \n", + "143872 0.000000 0.560715 0.000000 0.000000 0.000000 -0.330973 \n", + "286464 0.000000 0.378512 0.000000 0.550901 0.000000 -0.524915 \n", + "286464 0.000000 0.378512 0.000000 0.550901 0.000000 -0.524915 \n", + "286464 0.000000 0.378512 0.000000 0.550901 0.000000 -0.524915 \n", + "51463 1.238787 1.090853 0.000000 0.000000 0.000000 1.839960 \n", + "51463 1.238787 1.090853 0.000000 0.000000 0.000000 1.839960 \n", + "642826 0.608809 0.859970 0.531069 0.000000 0.000000 0.871844 \n", + "642826 0.608809 0.859970 0.531069 0.000000 0.000000 0.871844 \n", + "653067 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "653067 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "653067 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "653067 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "399761 0.531069 0.718088 0.000000 0.000000 0.000000 -0.251539 \n", + "399761 0.531069 0.718088 0.000000 0.000000 0.000000 -0.251539 \n", + "647060 0.000000 0.000000 0.000000 0.000000 0.000000 -0.312939 \n", + "647060 0.000000 0.000000 0.000000 0.000000 0.000000 -0.312939 \n", + "284565 1.238787 1.090853 0.000000 0.000000 0.000000 1.570463 \n", + "284565 1.238787 1.090853 0.000000 
0.000000 0.000000 1.310340 \n", + "84631 -0.823677 -0.524915 0.000000 0.632268 0.000000 -1.074001 \n", + "84631 -0.823677 -0.524915 0.000000 0.632268 0.000000 -1.074001 \n", + "161176 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "161176 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "341019 0.959770 0.000000 0.000000 0.000000 0.000000 -0.304006 \n", + "341019 0.959770 0.000000 0.000000 0.000000 0.000000 -0.304006 \n", + "83869 -3.000000 -3.184425 -3.321928 -3.000000 -2.599462 -3.556393 \n", + "83869 -3.000000 -3.184425 -3.321928 -3.000000 -2.599462 -3.556393 \n", + "9502 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "9502 0.000000 0.378512 0.000000 0.550901 0.000000 -0.434403 \n", + "83871 0.000000 0.000000 0.000000 0.500802 0.000000 -0.304006 \n", + "... ... ... ... ... ... ... \n", + "100134869 0.000000 0.207893 0.000000 0.000000 0.000000 0.000000 \n", + "100134869 0.000000 0.207893 0.000000 0.000000 0.000000 0.000000 \n", + "84316 -0.251539 0.000000 0.000000 0.000000 0.000000 -0.340075 \n", + "84316 -0.810966 -0.689660 0.000000 0.000000 0.000000 0.718088 \n", + "200030 1.238787 1.090853 0.000000 0.000000 0.000000 1.839960 \n", + "200030 1.238787 1.090853 0.000000 0.000000 0.000000 1.839960 \n", + "642658 1.049631 1.358959 0.000000 0.599318 0.000000 0.000000 \n", + "642658 1.049631 1.358959 0.000000 0.599318 0.000000 0.000000 \n", + "100302179 0.778209 0.000000 0.000000 0.000000 0.000000 -0.340075 \n", + "100302179 0.778209 0.000000 0.000000 0.000000 0.000000 -0.340075 \n", + "401508 0.232661 0.000000 0.000000 0.000000 0.250962 0.000000 \n", + "401508 0.232661 0.000000 0.000000 0.000000 0.250962 0.000000 \n", + "119016 0.378512 0.570463 0.000000 0.000000 0.000000 -0.251539 \n", + "119016 0.448901 -0.286304 0.000000 0.000000 0.000000 -0.242977 \n", + "84458 0.389567 -0.666576 0.000000 0.000000 0.000000 -0.340075 \n", + "84458 0.389567 -0.666576 0.000000 0.000000 0.000000 -0.340075 \n", + "574445 0.490570 0.000000 
-0.286304 0.000000 0.000000 -1.074001 \n", + "574445 0.490570 0.000000 -0.286304 0.000000 0.000000 -1.074001 \n", + "26095 0.531069 0.000000 0.000000 0.000000 0.000000 -0.251539 \n", + "26095 0.531069 0.000000 0.000000 0.000000 0.000000 -0.251539 \n", + "84968 -0.736966 -0.321928 0.000000 0.570463 0.000000 0.000000 \n", + "84968 -0.736966 -0.321928 0.000000 0.570463 0.000000 0.000000 \n", + "80759 -0.556393 0.599318 0.000000 0.000000 0.000000 0.250962 \n", + "80759 -0.556393 0.599318 0.000000 0.000000 0.000000 0.250962 \n", + "3192 1.350497 0.632268 0.000000 0.000000 0.000000 0.220330 \n", + "3192 1.350497 0.632268 0.000000 0.000000 0.000000 0.220330 \n", + "387707 0.389567 -0.666576 0.000000 0.000000 0.000000 -0.340075 \n", + "387707 0.389567 -0.666576 0.000000 0.000000 0.000000 -0.340075 \n", + "79741 0.000000 0.589763 0.000000 0.000000 0.000000 -0.242977 \n", + "79741 0.000000 0.589763 0.000000 0.000000 0.000000 -0.242977 \n", + "\n", + " X-1167 X-1169 X-1172 X-1173 ... X-5694 \\\n", + "gene_id ... \n", + "143872 0.000000 -0.367732 0.350497 0.000000 ... 0.000000 \n", + "143872 0.000000 -0.367732 0.350497 0.000000 ... 0.000000 \n", + "286464 0.000000 -0.902389 -1.321928 0.367371 ... 0.000000 \n", + "286464 0.000000 -0.902389 -1.321928 0.367371 ... 0.000000 \n", + "286464 0.000000 -0.902389 -1.321928 0.367371 ... 0.000000 \n", + "51463 0.000000 0.448901 0.000000 0.000000 ... 0.469886 \n", + "51463 0.000000 0.448901 0.000000 0.000000 ... 0.438293 \n", + "642826 0.000000 -0.286304 0.000000 0.000000 ... 0.000000 \n", + "642826 0.000000 -0.286304 0.000000 0.000000 ... 0.000000 \n", + "653067 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "653067 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "653067 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "653067 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "399761 0.000000 -0.286304 0.000000 0.790772 ... 0.367371 \n", + "399761 0.000000 -0.286304 0.000000 0.790772 ... 
0.367371 \n", + "647060 0.000000 -0.321928 0.000000 0.618239 ... 0.000000 \n", + "647060 0.000000 -0.321928 0.000000 0.618239 ... 0.000000 \n", + "284565 0.000000 0.448901 0.411426 0.448901 ... 0.469886 \n", + "284565 0.000000 0.448901 0.411426 0.448901 ... 0.469886 \n", + "84631 0.000000 -0.875672 -0.678072 0.378512 ... 0.000000 \n", + "84631 0.000000 -0.875672 -0.678072 0.378512 ... 0.000000 \n", + "161176 0.000000 -0.358454 0.250962 0.339137 ... -0.875672 \n", + "161176 0.000000 -0.358454 0.250962 0.339137 ... -0.875672 \n", + "341019 0.000000 -0.349235 0.000000 0.000000 ... 0.000000 \n", + "341019 0.000000 -0.349235 0.000000 0.000000 ... 0.000000 \n", + "83869 -2.785875 -0.875672 -2.736966 -2.514573 ... -2.556393 \n", + "83869 -2.785875 -0.875672 -2.736966 -2.514573 ... -2.556393 \n", + "9502 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "9502 0.000000 -0.902389 1.220330 0.367371 ... 0.000000 \n", + "83871 0.000000 -0.349235 0.327687 0.000000 ... 0.000000 \n", + "... ... ... ... ... ... ... \n", + "100134869 0.000000 0.000000 0.232661 0.000000 ... 0.000000 \n", + "100134869 0.000000 0.000000 0.232661 0.000000 ... 0.000000 \n", + "84316 0.000000 -0.349235 -0.588574 0.000000 ... 0.000000 \n", + "84316 0.000000 0.000000 0.731183 0.367371 ... 0.000000 \n", + "200030 0.000000 0.448901 0.000000 0.000000 ... 0.438293 \n", + "200030 0.000000 0.448901 0.000000 0.000000 ... 0.438293 \n", + "642658 0.000000 0.321928 0.700440 0.298658 ... 0.632268 \n", + "642658 0.000000 0.321928 0.700440 0.298658 ... 0.632268 \n", + "100302179 0.000000 0.000000 0.000000 0.000000 ... 0.000000 \n", + "100302179 0.000000 0.000000 0.000000 0.000000 ... 0.000000 \n", + "401508 0.207893 0.000000 -0.666576 1.629939 ... 0.000000 \n", + "401508 0.207893 0.000000 -0.666576 1.629939 ... 0.000000 \n", + "119016 0.000000 -0.286304 0.000000 0.000000 ... 0.000000 \n", + "119016 0.000000 -0.286304 0.000000 0.000000 ... 0.000000 \n", + "84458 0.000000 -0.367732 0.000000 0.000000 ... 
0.000000 \n", + "84458 0.000000 -0.367732 0.000000 0.000000 ... 0.000000 \n", + "574445 0.000000 0.238787 0.000000 0.000000 ... -0.823677 \n", + "574445 0.000000 0.238787 0.000000 0.000000 ... -0.823677 \n", + "26095 0.000000 -0.286304 0.000000 0.790772 ... 0.269033 \n", + "26095 0.000000 -0.286304 0.000000 0.790772 ... 0.269033 \n", + "84968 0.000000 -0.875672 -0.678072 0.378512 ... 0.000000 \n", + "84968 0.000000 -0.875672 -0.678072 0.378512 ... 0.000000 \n", + "80759 0.000000 -0.386468 -0.567041 -0.556393 ... 0.000000 \n", + "80759 0.000000 -0.386468 -0.567041 -0.556393 ... 0.000000 \n", + "3192 0.000000 0.448901 -0.268817 0.000000 ... 0.480265 \n", + "3192 0.000000 0.448901 -0.268817 0.000000 ... 0.480265 \n", + "387707 0.000000 -0.367732 0.000000 0.000000 ... 0.000000 \n", + "387707 0.000000 -0.367732 0.000000 0.000000 ... 0.000000 \n", + "79741 0.000000 0.207893 0.000000 0.000000 ... -0.798366 \n", + "79741 0.000000 0.207893 0.000000 0.220330 ... -0.798366 \n", + "\n", + " X-5696 X-5713 X-5717 X-5727 X-5739 X-5808 \\\n", + "gene_id \n", + "143872 -0.349235 0.000000 -0.321928 -1.014500 -0.588574 0.000000 \n", + "143872 -0.349235 0.000000 -0.321928 -1.014500 -0.588574 0.000000 \n", + "286464 -1.014500 0.718088 0.000000 -1.014500 0.000000 0.000000 \n", + "286464 -1.014500 0.718088 0.000000 -1.014500 0.000000 0.000000 \n", + "286464 -1.014500 0.718088 0.000000 -1.014500 0.000000 0.000000 \n", + "51463 0.000000 0.618239 0.201634 0.000000 -0.260152 -0.349235 \n", + "51463 0.000000 0.618239 0.201634 0.000000 -0.260152 0.000000 \n", + "642826 -1.014500 -0.473931 0.000000 0.618239 0.000000 -0.386468 \n", + "642826 -1.014500 -0.473931 0.000000 0.618239 0.000000 -0.386468 \n", + "653067 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "653067 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "653067 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "653067 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "399761 
-1.535332 0.000000 0.000000 0.000000 0.000000 -0.599462 \n", + "399761 -1.535332 0.000000 0.000000 0.000000 0.000000 -0.599462 \n", + "647060 -0.621488 0.000000 -0.377070 -0.749038 -0.405451 0.000000 \n", + "647060 -0.621488 0.000000 -0.377070 -0.749038 -0.405451 0.000000 \n", + "284565 -0.504305 0.618239 0.201634 0.000000 -0.260152 0.000000 \n", + "284565 -0.524915 0.618239 0.201634 0.000000 -0.260152 0.000000 \n", + "84631 -1.014500 0.500802 0.000000 0.000000 0.000000 0.000000 \n", + "84631 -1.014500 0.500802 0.000000 0.000000 0.000000 0.000000 \n", + "161176 0.000000 -0.483985 0.000000 0.000000 -0.610433 0.000000 \n", + "161176 0.000000 -0.483985 0.000000 0.000000 -0.610433 0.000000 \n", + "341019 -0.666576 0.000000 -0.304006 -1.000000 -0.610433 0.000000 \n", + "341019 -0.312939 0.000000 -0.304006 -1.000000 -0.610433 0.000000 \n", + "83869 -3.943416 -2.785875 -2.152003 -3.556393 -3.251539 -2.736966 \n", + "83869 -3.943416 -2.785875 -2.152003 -3.556393 -3.251539 -2.736966 \n", + "9502 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "9502 -1.014500 0.580145 0.000000 -1.014500 0.000000 0.000000 \n", + "83871 0.000000 -0.463947 -0.358454 0.000000 0.000000 0.000000 \n", + "... ... ... ... ... ... ... 
\n", + "100134869 0.389567 0.000000 0.000000 -0.463947 0.000000 -0.367732 \n", + "100134869 0.389567 0.000000 0.000000 -0.463947 0.000000 -0.367732 \n", + "84316 -0.985645 -0.358454 -0.340075 -0.971431 -0.545824 0.310340 \n", + "84316 0.000000 0.000000 -0.434403 1.060047 0.490570 0.000000 \n", + "200030 0.000000 0.618239 0.201634 0.000000 -0.260152 0.000000 \n", + "200030 0.000000 0.618239 0.201634 0.000000 -0.260152 0.000000 \n", + "642658 0.000000 0.769772 0.232661 1.121015 0.831877 -0.367732 \n", + "642658 0.000000 0.769772 0.232661 1.121015 0.831877 -0.367732 \n", + "100302179 0.000000 0.201634 -0.251539 0.000000 -0.463947 0.000000 \n", + "100302179 0.000000 0.201634 -0.251539 0.000000 -0.463947 0.000000 \n", + "401508 -2.321928 0.000000 0.000000 -0.260152 0.000000 -0.545824 \n", + "401508 -2.321928 0.000000 0.000000 -0.260152 0.000000 -0.545824 \n", + "119016 -1.043943 -0.524915 0.000000 0.000000 0.000000 -0.377070 \n", + "119016 -1.014500 -0.473931 0.000000 0.000000 0.000000 -0.367732 \n", + "84458 -1.014500 -0.473931 0.000000 0.000000 0.000000 -0.386468 \n", + "84458 -1.014500 -0.473931 0.000000 0.000000 0.000000 -0.386468 \n", + "574445 -0.957356 -0.321928 0.000000 -0.985645 -0.454032 0.000000 \n", + "574445 -0.957356 -0.321928 0.000000 -0.985645 -0.454032 0.000000 \n", + "26095 -1.043943 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "26095 -1.043943 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "84968 -1.000000 0.250962 0.000000 0.000000 0.000000 0.000000 \n", + "84968 -1.000000 0.250962 0.000000 0.000000 0.000000 0.000000 \n", + "80759 0.000000 -0.621488 -0.454032 0.000000 0.000000 -0.386468 \n", + "80759 0.000000 -0.621488 -0.454032 0.000000 0.000000 -0.386468 \n", + "3192 0.000000 0.207893 0.000000 0.000000 -0.330973 0.250962 \n", + "3192 0.000000 0.207893 0.000000 0.000000 -0.330973 0.250962 \n", + "387707 -1.014500 -0.473931 0.000000 0.000000 0.000000 -0.386468 \n", + "387707 -1.014500 -0.473931 0.000000 0.000000 0.000000 -0.386468 \n", + 
"79741 -0.985645 -0.358454 0.000000 0.700440 0.000000 -0.367732 \n", + "79741 -0.985645 0.000000 0.000000 0.700440 0.000000 -0.367732 \n", + "\n", + " X-5959 X-5975 X-6047 \n", + "gene_id \n", + "143872 0.000000 0.000000 -0.514573 \n", + "143872 0.000000 0.000000 -0.514573 \n", + "286464 0.000000 -1.494109 -1.494109 \n", + "286464 0.000000 -1.494109 -1.494109 \n", + "286464 0.000000 -1.494109 -1.494109 \n", + "51463 0.000000 0.469886 0.000000 \n", + "51463 0.000000 0.469886 0.000000 \n", + "642826 -1.494109 -0.312939 0.000000 \n", + "642826 -1.494109 -0.312939 0.000000 \n", + "653067 -0.577767 -1.494109 -1.494109 \n", + "653067 -0.577767 -1.494109 -1.494109 \n", + "653067 -0.577767 -1.494109 -1.494109 \n", + "653067 -0.577767 -1.494109 -1.494109 \n", + "399761 -0.588574 0.000000 0.000000 \n", + "399761 -0.588574 0.000000 0.000000 \n", + "647060 -0.395929 0.000000 0.000000 \n", + "647060 -0.395929 0.000000 0.000000 \n", + "284565 0.000000 0.469886 0.000000 \n", + "284565 0.000000 0.469886 0.000000 \n", + "84631 -0.545824 -0.545824 -0.535332 \n", + "84631 -0.545824 -0.545824 -0.535332 \n", + "161176 0.411426 0.000000 0.000000 \n", + "161176 0.411426 0.000000 0.000000 \n", + "341019 0.000000 -0.444184 0.000000 \n", + "341019 0.000000 -0.444184 0.000000 \n", + "83869 -3.473931 -3.120294 -3.643856 \n", + "83869 -3.473931 -3.120294 -3.643856 \n", + "9502 -0.577767 -1.494109 -1.494109 \n", + "9502 -0.577767 -1.494109 -1.494109 \n", + "83871 0.448901 -0.535332 0.000000 \n", + "... ... ... ... 
\n", + "100134869 0.790772 0.000000 -0.535332 \n", + "100134869 0.790772 0.000000 -0.535332 \n", + "84316 0.438293 -0.473931 0.000000 \n", + "84316 -0.632629 -0.655172 0.000000 \n", + "200030 0.000000 0.469886 0.000000 \n", + "200030 0.000000 0.469886 0.000000 \n", + "642658 0.000000 1.121015 0.459432 \n", + "642658 0.000000 1.121015 0.459432 \n", + "100302179 0.000000 0.000000 0.000000 \n", + "100302179 0.000000 0.000000 0.000000 \n", + "401508 -0.330973 0.000000 0.000000 \n", + "401508 -0.330973 0.000000 0.000000 \n", + "119016 -0.643856 0.000000 0.000000 \n", + "119016 -0.577767 0.000000 0.000000 \n", + "84458 -0.610433 -0.577767 0.000000 \n", + "84458 -0.610433 -0.577767 0.000000 \n", + "574445 -0.599462 -0.577767 0.000000 \n", + "574445 -0.599462 -0.577767 0.000000 \n", + "26095 -0.588574 0.000000 0.000000 \n", + "26095 -0.588574 0.000000 0.000000 \n", + "84968 -0.545824 -0.545824 -0.535332 \n", + "84968 -0.545824 -0.545824 -0.535332 \n", + "80759 -0.621488 0.761285 0.000000 \n", + "80759 -0.621488 0.761285 0.000000 \n", + "3192 0.000000 1.021480 0.599318 \n", + "3192 0.000000 1.021480 0.599318 \n", + "387707 -0.610433 -0.577767 0.000000 \n", + "387707 -0.610433 -0.577767 0.000000 \n", + "79741 -0.588574 0.459432 0.000000 \n", + "79741 -0.588574 0.459432 0.269033 \n", + "\n", + "[77 rows x 375 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dups = list(set(pdx[pdx.index.duplicated(keep=False)].index.values))\n", + "pdx.loc[dups,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "37 duplicated IDs in 77 rows found.\n", + "duplicate rows removed due to low correlation of duplicated profiles 4\n", + "Merged 73 duplicated rows into 35 rows\n" + ] + } + ], + "source": [ + "# most of these dupliates correspond to genes merged in the current assembly, e.g. 
gene - gene-AS\n",
+    "pdx = handle_dups(pdx,corr_thr = 0.75)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# order the columns by label before saving (same result as the former double transpose)\n",
+    "pdx = pdx.sort_index(axis=1)\n",
+    "# preprocessed_dir already ends with \"/\"; os.path.join avoids the doubled slash\n",
+    "pdx.to_csv(os.path.join(preprocessed_dir, \"PDX.Segment_Mean.CNA.tsv\"),\n",
+    "           sep = \"\\t\",header=True,index=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Evaluation of the results\n",
+    "1). How many common genes between four datasets?\n",
+    "\n",
+    "2). Do CNA profiles of the same cell line from GDSC and CCLE correlate?\n",
+    "\n",
+    "3). Do CNA profiles of the same cancer type from TCGA and PDX look similar?\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### 1). How many common genes between four datasets?\n",
+    "# BRCA is used as the TCGA cohort here (TODO: document why this cohort was chosen)\n",
+    "tcga = pd.read_csv(os.path.join(preprocessed_dir, \"BRCA.Segment_Mean.CNA.tsv\"), sep = \"\\t\", index_col=0)\n",
+    "gdsc = pd.read_csv(os.path.join(preprocessed_dir, \"GDSC.Segment_Mean.CNA.tsv\"), sep = \"\\t\", index_col=0)\n",
+    "# NOTE(review): assumes genes are the row index of the saved tables - confirm\n",
+    "common_genes = tcga.index.intersection(gdsc.index)\n",
+    "print(\"TCGA-BRCA:\", tcga.shape, \"GDSC:\", gdsc.shape, \"common index entries:\", len(common_genes))\n",
+    "# TODO: add the CCLE and PDX tables to the intersection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### distribution of logR values in GDSC and CCLE \n",
+    "# NOTE(review): relies on `df` (presumably the GDSC table) and `cna_table` (presumably the\n",
+    "# CCLE table) left in the kernel by earlier cells - confirm this survives Restart & Run All\n",
+    "# flatten every table into one 1-D float array (row-major, same values as iterating over rows)\n",
+    "cn_values_gdsc = np.asarray(df.values, dtype=float).ravel()\n",
+    "cn_values_ccle = np.asarray(cna_table.values, dtype=float).ravel()\n",
+    "\n",
+    "fig, axes = plt.subplots(1, 2, figsize=(20,5))\n",
+    "panels = [(\"GDSC\", cn_values_gdsc), (\"CCLE\", cn_values_ccle)]\n",
+    "for ax, (title, values) in zip(axes, panels):\n",
+    "    # identical binning and range in both panels so the two distributions are comparable\n",
+    "    ax.hist(values,bins=100,density = True,range=(-5,4))\n",
+    "    ax.set_title(title)\n",
+    "    ax.set_xlabel(\"logR\")\n",
+    "    ax.set_ylabel(\"density\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}