1 lines (1 with data), 21.8 kB
{"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.8"},"colab":{"name":"Pre-processing_Ver_2.ipynb","provenance":[{"file_id":"1LHLqt70oFXTTnxJtZPYfCRBPqWPAO6T2","timestamp":1596702929681}],"collapsed_sections":[],"toc_visible":true},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","metadata":{"id":"UrM6jHdwy47-","colab_type":"text"},"source":["# Deep denoising auto-encoder and MLP based multi-output regression on TCGA multi-omics data\n","# Data Pre-processing"]},{"cell_type":"markdown","metadata":{"id":"hb-sT4V8y48J","colab_type":"text"},"source":["Note: You may skip this notebook if you already have the pre-processed data"]},{"cell_type":"markdown","metadata":{"id":"N2_z0jeay48M","colab_type":"text"},"source":["Importing libraries"]},{"cell_type":"code","metadata":{"trusted":false,"id":"_leYcysJy48P","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703444376,"user_tz":-330,"elapsed":1389,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["import pandas as pd\n","import numpy as np"],"execution_count":1,"outputs":[]},{"cell_type":"code","metadata":{"trusted":false,"id":"Dd1qV4aty48b","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703446015,"user_tz":-330,"elapsed":788,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["def intersection(list1, list2, list3):\n"," return list(set(list1) & set(list2) & set(list3))"],"execution_count":2,"outputs":[]},{"cell_type":"code","metadata":{"trusted":false,"id":"V2ZeJlWZy48j","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703467007,"user_tz":-330,"elapsed":1287,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["def extractMatchedIndices(list1, list2):\n"," comm = []\n"," for i in list2:\n"," for j in list1:\n"," if i in j:\n"," comm.append(list1.index(j))\n"," break #added in Ver. 2 to remove duplicate samples\n"," return comm"],"execution_count":3,"outputs":[]},{"cell_type":"code","metadata":{"trusted":false,"id":"RSWCxVCdy48r","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703467827,"user_tz":-330,"elapsed":1013,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["def remrows(data):\n"," t = int(0.8*data.shape[1])\n"," data = data.dropna(thresh=t)\n"," #data = data[(data.T != 0).any()]\n"," return data"],"execution_count":4,"outputs":[]},{"cell_type":"code","metadata":{"trusted":false,"id":"Vj8xAf3zy48x","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703472536,"user_tz":-330,"elapsed":1328,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["def remcolumns(data):\n"," t = int(0.8*data.shape[1])\n"," data = data.dropna(thresh=t,axis=1)\n"," #data = data.loc[:, (data != 0).any(axis=0)]\n"," return data"],"execution_count":5,"outputs":[]},{"cell_type":"code","metadata":{"trusted":false,"id":"njlD3x6ky483","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703476856,"user_tz":-330,"elapsed":1494,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["def tumor_normal_labels(list):\n"," sample = [x[13:15] for x in list]\n"," label = np.array([x in ['01','02','03','04','05','06','07','08','09'] for x in sample])\n"," label=1*label # converting boolean into int\n"," return label"],"execution_count":6,"outputs":[]},{"cell_type":"code","metadata":{"trusted":false,"id":"qtITXlnpy489","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703476864,"user_tz":-330,"elapsed":858,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["def myNormalize(data):\n"," (rows, cols) = data.shape\n"," mins = np.zeros(shape=(cols), dtype = np.float32)\n"," maxs = np.zeros(shape=(cols), dtype = np.float32)\n"," for j in range(cols):\n"," mins[j] = np.min(data[:,j])\n"," maxs[j] = np.max(data[:,j])\n"," \n"," result = np.copy(data)\n"," for i in range(rows):\n"," for j in range(cols):\n"," result[i,j] = (data[i,j] - mins[j]) / (maxs[j] - mins[j])\n"," return result"],"execution_count":7,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"1y5bSW22y49E","colab_type":"text"},"source":["# Loading data"]},{"cell_type":"code","metadata":{"id":"cmJBB9flgeAU","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":124},"executionInfo":{"status":"ok","timestamp":1596703501319,"user_tz":-330,"elapsed":23361,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}},"outputId":"572e8f7b-39ea-4c92-a090-0e72844fb006"},"source":["from google.colab import drive\n","drive.mount('/content/drive')"],"execution_count":8,"outputs":[{"output_type":"stream","text":["Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n","\n","Enter your authorization code:\n","··········\n","Mounted at /content/drive\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"of_LBPo4gsAu","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703505814,"user_tz":-330,"elapsed":1412,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["#ls \"/content/drive/My Drive\""],"execution_count":9,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"KkAP-qmsAGpf","colab_type":"text"},"source":["Change paths accordingly"]},{"cell_type":"code","metadata":{"id":"2R8C7R7bhruf","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703519437,"user_tz":-330,"elapsed":14442,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["dnaMeth = pd.read_table('/content/drive/My Drive/TCGA Data/LIHC/LIHC_Methylation450__SingleValue__TSS1500__Both.txt',delimiter='\\t',index_col=0)\n","cna = pd.read_table('/content/drive/My Drive/TCGA Data/LIHC/LIHC__genome_wide_snp_6__GeneLevelCNA.txt',delimiter='\\t',index_col=0)\n","rnaSeq = pd.read_table('/content/drive/My Drive/TCGA Data/LIHC/LIHC_RNASeq__illuminahiseq_rnaseqv2__GeneExp.txt',delimiter='\\t',index_col=1) # Using Entrez ID as row identifier')"],"execution_count":10,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"XOoX4JEyy49M","colab_type":"text"},"source":["Dropping redundant columns"]},{"cell_type":"code","metadata":{"trusted":false,"id":"aANBFACmy49O","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703521789,"user_tz":-330,"elapsed":1342,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["dnaMeth = dnaMeth.drop(dnaMeth.columns[[0]], axis=1)\n","rnaSeq = rnaSeq.drop(rnaSeq.columns[[0]], axis=1)\n","cna = cna.drop(cna.columns[[0,1]], axis=1)"],"execution_count":11,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"4V99fvVhUjMs","colab_type":"text"},"source":["# Pre-processing"]},{"cell_type":"markdown","metadata":{"id":"ZoTL3vkfy49U","colab_type":"text"},"source":["Extracting sample names using TCGA barcode"]},{"cell_type":"code","metadata":{"trusted":false,"id":"r6TzRh3ly49W","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703525576,"user_tz":-330,"elapsed":1035,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["dnaMethSamples = list(dnaMeth)\n","rnaSamples = list(rnaSeq)\n","cnaSamples = list(cna)\n","methID = [x[8:16] for x in dnaMethSamples]\n","rnaID = [x[8:16] for x in rnaSamples]\n","cnaID = [x[8:16] for x in cnaSamples]"],"execution_count":12,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"_vTq7gEWy49f","colab_type":"text"},"source":["Removing duplicates"]},{"cell_type":"code","metadata":{"trusted":false,"id":"j8x8iSBDy49h","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703527348,"user_tz":-330,"elapsed":718,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["methID=set(methID)\n","rnaID=set(rnaID)\n","cnaID=set(cnaID)"],"execution_count":13,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"9y8j7aoDy49m","colab_type":"text"},"source":["Reconverting into lists"]},{"cell_type":"code","metadata":{"trusted":false,"id":"RxUixMQ2y49o","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703529878,"user_tz":-330,"elapsed":1156,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["methID=list(methID)\n","rnaID=list(rnaID)\n","cnaID=list(cnaID)"],"execution_count":14,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"kxzms2pBy49u","colab_type":"text"},"source":["Finding out common samples"]},{"cell_type":"code","metadata":{"id":"bcmkmI6OstRa","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703533562,"user_tz":-330,"elapsed":1348,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["commonSamples = intersection(methID, rnaID, cnaID)\n","#commonMeth = extractMatchedIndices(methID,commonSamples)\n","#commonRNA = extractMatchedIndices(rnaID,commonSamples)\n","#commonCNA = extractMatchedIndices(cnaID,commonSamples)\n","\n","#changes made in Ver. 2\n","commonMeth = extractMatchedIndices(list(dnaMeth),commonSamples)\n","commonRNA = extractMatchedIndices(list(rnaSeq),commonSamples)\n","commonCNA = extractMatchedIndices(list(cna),commonSamples)"],"execution_count":15,"outputs":[]},{"cell_type":"code","metadata":{"id":"qRfgeSmpx7mg","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1596703538584,"user_tz":-330,"elapsed":1043,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}},"outputId":"12c8aaa3-4fd5-4dbe-b6c3-670b03271483"},"source":["#print(len(commonSamples), len(commonMeth),len(commonRNA),len(commonCNA))"],"execution_count":16,"outputs":[{"output_type":"stream","text":["404 404 404 404\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"Flahz8pMy490","colab_type":"text"},"source":["Removing rows (genes) having more than 20% missing values across all samples (patients)"]},{"cell_type":"code","metadata":{"trusted":false,"id":"6iP7d2oYy492","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703542796,"user_tz":-330,"elapsed":1370,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["dnaMeth = remrows(dnaMeth)\n","rnaSeq = remrows(rnaSeq)\n","cna = remrows(cna)"],"execution_count":17,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"qjmI6R16y496","colab_type":"text"},"source":["Removing columns (samples) having more than 20% missing values across all rows (genes)"]},{"cell_type":"code","metadata":{"trusted":false,"id":"G5jmX3Pmy497","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703544827,"user_tz":-330,"elapsed":1387,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["dnaMeth = remcolumns(dnaMeth)\n","rnaSeq = remcolumns(rnaSeq)\n","cna = remcolumns(cna)"],"execution_count":18,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"r7ovBPnVy4-A","colab_type":"text"},"source":["Reducing each omics data to common samples only"]},{"cell_type":"code","metadata":{"trusted":false,"id":"H5x0SPoHy4-B","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703546290,"user_tz":-330,"elapsed":893,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["dnaMeth = dnaMeth.iloc[:,commonMeth]\n","rnaSeq = rnaSeq.iloc[:,commonRNA]\n","cna = cna.iloc[:,commonCNA]"],"execution_count":19,"outputs":[]},{"cell_type":"code","metadata":{"id":"xzxRT25hiT80","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596703553779,"user_tz":-330,"elapsed":1309,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["#Validation\n","#df = pd.DataFrame(list(zip(*[commonSamples, list(dnaMeth),list(rnaSeq),list(cna)]))).add_prefix('Col')\n","#df.to_csv('/content/drive/My Drive/TCGA Data/LIHC/IDs_post_processing.csv', index=False)"],"execution_count":21,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"7AvbCr25y4-G","colab_type":"text"},"source":["Removing 1st quantile for rnaSeq"]},{"cell_type":"code","metadata":{"trusted":false,"id":"vL0ycXy1y4-I","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596704174054,"user_tz":-330,"elapsed":1217,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["rnaSeq_rowsum = rnaSeq.sum(axis=1)\n","ind = pd.DataFrame(rnaSeq_rowsum > rnaSeq_rowsum.quantile(0.25))\n","rnaSeq = rnaSeq[ind.values]"],"execution_count":22,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"1Hnr3rKky4-M","colab_type":"text"},"source":["Finding tumor and normal samples"]},{"cell_type":"code","metadata":{"trusted":false,"id":"0XEEHMSly4-N","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596704176840,"user_tz":-330,"elapsed":1400,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["labels = tumor_normal_labels(list(dnaMeth))"],"execution_count":23,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"IAyt-xYCy4-Q","colab_type":"text"},"source":["Imputing remaining missing values"]},{"cell_type":"code","metadata":{"id":"s1ImY6yzlFKt","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596704179390,"user_tz":-330,"elapsed":1324,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["from sklearn.impute import SimpleImputer"],"execution_count":24,"outputs":[]},{"cell_type":"code","metadata":{"id":"wfKrbg3Daoop","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596704181412,"user_tz":-330,"elapsed":1322,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["imp = SimpleImputer(missing_values=np.NaN, strategy='mean', copy=True)\n","imputedDNAMeth = imp.fit_transform(dnaMeth)\n","imputedRNASeq = imp.fit_transform(rnaSeq)\n","imputedCNA = imp.fit_transform(cna)"],"execution_count":25,"outputs":[]},{"cell_type":"code","metadata":{"trusted":false,"id":"pu_W1sUdy4-T","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596704184939,"user_tz":-330,"elapsed":1444,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["#imp = Imputer(missing_values='NaN', strategy='mean', axis=1, copy=True)\n","#imputedDNAMeth = imp.fit_transform(dnaMeth)\n","#imputedRNASeq = imp.fit_transform(rnaSeq)\n","#imputedCNA = imp.fit_transform(cna)"],"execution_count":26,"outputs":[]},{"cell_type":"code","metadata":{"trusted":false,"id":"rUiCY5wvy4-X","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596704184946,"user_tz":-330,"elapsed":1085,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["imputedDNAMeth=imputedDNAMeth.transpose()\n","imputedRNASeq=imputedRNASeq.transpose()\n","imputedCNA=imputedCNA.transpose()"],"execution_count":27,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"JMWVQovJy4-a","colab_type":"text"},"source":["Normalizing datasets using min-max normalization"]},{"cell_type":"code","metadata":{"trusted":false,"id":"t6lxChj8y4-b","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596704215966,"user_tz":-330,"elapsed":29860,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["normalized_DNAMeth = myNormalize(imputedDNAMeth)\n","normalized_RNASeq = myNormalize(imputedRNASeq)\n","normalized_CNA = myNormalize(imputedCNA)"],"execution_count":28,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"v3BsYUqlAWJ8","colab_type":"text"},"source":["Final dimensions after pre-processing"]},{"cell_type":"code","metadata":{"id":"F-4Ihkgl_KV_","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1596704222003,"user_tz":-330,"elapsed":1830,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}},"outputId":"7765ec5e-8c2d-459c-9d1f-9857fa94dcfe"},"source":["print(dnaMeth.shape, rnaSeq.shape, cna.shape)"],"execution_count":29,"outputs":[{"output_type":"stream","text":["(18996, 404) (15397, 404) (23604, 404)\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"zUo-N3ImUr9a","colab_type":"text"},"source":["# Exporting data"]},{"cell_type":"markdown","metadata":{"id":"Y4Mlmy1Ly4-g","colab_type":"text"},"source":["Saving pre-processed files"]},{"cell_type":"code","metadata":{"trusted":false,"id":"3sibFUody4-h","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596704608399,"user_tz":-330,"elapsed":1280,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["preprocessed_DNAMeth = pd.DataFrame(normalized_DNAMeth)\n","preprocessed_RNASeq = pd.DataFrame(normalized_RNASeq)\n","preprocessed_CNA = pd.DataFrame(normalized_CNA)\n","labels=pd.DataFrame(labels)"],"execution_count":31,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"lgVBld_Py4-k","colab_type":"text"},"source":["Exporting pre-processed data to csv files"]},{"cell_type":"code","metadata":{"trusted":true,"id":"DOKCZh_Jy4-n","colab_type":"code","colab":{},"executionInfo":{"status":"ok","timestamp":1596704658344,"user_tz":-330,"elapsed":48324,"user":{"displayName":"Dibyendu B. Seal","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhPAguO61Uo4uNw9cuJjCQ-Nv8Nt-Q0CYP9qR7W=s64","userId":"09389361721160752492"}}},"source":["preprocessed_DNAMeth.to_csv('/content/drive/My Drive/TCGA Data/Preprocessed_Data/LIHC_preprocessed_DNAMeth.csv',index=False)\n","preprocessed_RNASeq.to_csv('/content/drive/My Drive/TCGA Data/Preprocessed_Data/LIHC_preprocessed_RNASeq.csv',index=False)\n","preprocessed_CNA.to_csv('/content/drive/My Drive/TCGA Data/Preprocessed_Data/LIHC_preprocessed_CNA.csv',index=False)\n","labels.to_csv('/content/drive/My Drive/TCGA Data/Preprocessed_Data/LIHC_labels.csv',index=False)"],"execution_count":32,"outputs":[]}]}