--- a
+++ b/docs/usage/preprocess-downstream-analysis.md
@@ -0,0 +1,194 @@
+# Preparing data for downstream analyses
+
+## To prepare the data for classification
+
+```python
+X_multiomics, y = luad_data.load_data(omics="all", target=["pathologic_stage"], remove_duplicates=True)
+
+print(X_multiomics['MessengerRNA'].shape,
+      X_multiomics['MicroRNA'].shape,
+      X_multiomics['LncRNA'].shape,
+      y.shape)
+```
+
+> (338, 20472) (338, 1870) (338, 12727) (338, 1)
+
+
+```python
+print(y)
+```
+
+
+<div>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>pathologic_stage</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>TCGA-05-4390-01A</th>
+      <td>Stage I</td>
+    </tr>
+    <tr>
+      <th>TCGA-05-4405-01A</th>
+      <td>Stage I</td>
+    </tr>
+    <tr>
+      <th>TCGA-05-4410-01A</th>
+      <td>Stage I</td>
+    </tr>
+    <tr>
+      <th>TCGA-05-4417-01A</th>
+      <td>Stage I</td>
+    </tr>
+    <tr>
+      <th>TCGA-05-4424-01A</th>
+      <td>Stage II</td>
+    </tr>
+    <tr>
+      <th>TCGA-05-4427-01A</th>
+      <td>Stage II</td>
+    </tr>
+    <tr>
+      <th>TCGA-05-4433-01A</th>
+      <td>Stage I</td>
+    </tr>
+    <tr>
+      <th>TCGA-05-5423-01A</th>
+      <td>Stage II</td>
+    </tr>
+    <tr>
+      <th>TCGA-05-5425-01A</th>
+      <td>Stage II</td>
+    </tr>
+    <tr>
+      <th>TCGA-05-5428-01A</th>
+      <td>Stage II</td>
+    </tr>
+    <tr>
+      <th>TCGA-05-5715-01A</th>
+      <td>Stage I</td>
+    </tr>
+    <tr>
+      <th>TCGA-38-4631-01A</th>
+      <td>Stage I</td>
+    </tr>
+    <tr>
+      <th>TCGA-38-7271-01A</th>
+      <td>Stage I</td>
+    </tr>
+    <tr>
+      <th>TCGA-38-A44F-01A</th>
+      <td>Stage I</td>
+    </tr>
+    <tr>
+      <th>TCGA-44-2655-11A</th>
+      <td>Stage I</td>
+    </tr>
+  </tbody>
+</table>
+<p>336 rows × 1 columns</p>
+</div>
+
+
+## Log2 transform the mRNA, microRNA, and lncRNA expression values
+
+```python
+import numpy as np
+def expression_val_transform(x):
+    """Return log2(x + 1); the +1 pseudocount maps a zero expression value to 0 instead of -inf."""
+    return np.log2(x + 1)
+# np.log2 is a ufunc, so it is applied to the whole table at once rather than cell by cell.
+X_multiomics['MessengerRNA'] = expression_val_transform(X_multiomics['MessengerRNA'])
+X_multiomics['MicroRNA'] = expression_val_transform(X_multiomics['MicroRNA'])
+# X_multiomics['LncRNA'] = expression_val_transform(X_multiomics['LncRNA'])
+```
+
+## Classification of Cancer Stage
+
+```python
+from sklearn import preprocessing
+from sklearn import metrics
+from sklearn.svm import SVC, LinearSVC
+import sklearn.linear_model
+from sklearn.model_selection import train_test_split
+
+```
+
+
+```python
+binarizer = preprocessing.LabelEncoder()
+binarizer.fit(y)
+binarizer.transform(y)
+```
+
+
+
+    array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+           0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
+           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
+           0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+           0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
+           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
+           1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
+           0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
+           0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
+           0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
+           0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
+           1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
+           1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
+           1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
+           0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
+           0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
+           1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])
+
+
+
+
+```python
+for omic in ["MessengerRNA", "MicroRNA"]:
+    print(omic)
+    # Split first so the held-out samples take no part in fitting the preprocessing step.
+    X_train, X_test, Y_train, Y_test = \
+        train_test_split(X_multiomics[omic], y, test_size=0.3, random_state=np.random.randint(0, 10000), stratify=y)
+    print(X_train.shape, X_test.shape)
+
+    # Fit the mean-centering on the training split only, then apply the same shift to both splits.
+    scaler = sklearn.preprocessing.StandardScaler(copy=True, with_mean=True, with_std=False)
+    X_train = scaler.fit_transform(X_train)
+    X_test = scaler.transform(X_test)
+
+    model = LinearSVC(C=1e-2, penalty='l1', class_weight='balanced', dual=False, multi_class="ovr")
+#     model = sklearn.linear_model.LogisticRegression(C=1e-0, penalty='l1', fit_intercept=False, class_weight="balanced")
+#     model = SVC(C=1e0, kernel="rbf", class_weight="balanced", decision_function_shape="ovo")
+
+    model.fit(X=X_train, y=Y_train)
+    print("NONZERO", len(np.nonzero(model.coef_)[0]))
+    print("Training accuracy", metrics.accuracy_score(y_true=Y_train, y_pred=model.predict(X_train)))
+    print(metrics.classification_report(y_pred=model.predict(X_test), y_true=Y_test))
+
+```
+
+    MessengerRNA
+    (254, 20472) (109, 20472)
+    NONZERO 0
+    Training accuracy 0.6929133858267716
+                 precision    recall  f1-score   support
+
+        Stage I       0.69      1.00      0.82        75
+       Stage II       0.00      0.00      0.00        34
+
+    avg / total       0.47      0.69      0.56       109
+
+    MicroRNA
+    (254, 1870) (109, 1870)
+    NONZERO 0
+    Training accuracy 0.6929133858267716
+                 precision    recall  f1-score   support
+
+        Stage I       0.69      1.00      0.82        75
+       Stage II       0.00      0.00      0.00        34
+
+    avg / total       0.47      0.69      0.56       109