Switch to unified view

a b/docs/usage/preprocess-downstream-analysis.md
1
# Preparing data for downstream analyses
2
3
## Prepare the data for classification
4
5
```python
6
# Request all omics (omics="all") with "pathologic_stage" as the target,
# with remove_duplicates enabled.
X_multiomics, y = luad_data.load_data(
    omics="all",
    target=["pathologic_stage"],
    remove_duplicates=True,
)

# Print the shape of each expression table, followed by the label shape,
# all on one line.
print(
    *(X_multiomics[name].shape for name in ("MessengerRNA", "MicroRNA", "LncRNA")),
    y.shape,
)
12
```
13
14
> (338, 20472) (338, 1870) (338, 12727) (338, 1)
15
16
17
```python
18
# Show the target labels: one pathologic_stage value per row.
print(y)
19
```
20
21
22
<div>
23
<table border="1" class="dataframe">
24
  <thead>
25
    <tr style="text-align: right;">
26
      <th></th>
27
      <th>pathologic_stage</th>
28
    </tr>
29
  </thead>
30
  <tbody>
31
    <tr>
32
      <th>TCGA-05-4390-01A</th>
33
      <td>Stage I</td>
34
    </tr>
35
    <tr>
36
      <th>TCGA-05-4405-01A</th>
37
      <td>Stage I</td>
38
    </tr>
39
    <tr>
40
      <th>TCGA-05-4410-01A</th>
41
      <td>Stage I</td>
42
    </tr>
43
    <tr>
44
      <th>TCGA-05-4417-01A</th>
45
      <td>Stage I</td>
46
    </tr>
47
    <tr>
48
      <th>TCGA-05-4424-01A</th>
49
      <td>Stage II</td>
50
    </tr>
51
    <tr>
52
      <th>TCGA-05-4427-01A</th>
53
      <td>Stage II</td>
54
    </tr>
55
    <tr>
56
      <th>TCGA-05-4433-01A</th>
57
      <td>Stage I</td>
58
    </tr>
59
    <tr>
60
      <th>TCGA-05-5423-01A</th>
61
      <td>Stage II</td>
62
    </tr>
63
    <tr>
64
      <th>TCGA-05-5425-01A</th>
65
      <td>Stage II</td>
66
    </tr>
67
    <tr>
68
      <th>TCGA-05-5428-01A</th>
69
      <td>Stage II</td>
70
    </tr>
71
    <tr>
72
      <th>TCGA-05-5715-01A</th>
73
      <td>Stage I</td>
74
    </tr>
75
    <tr>
76
      <th>TCGA-38-4631-01A</th>
77
      <td>Stage I</td>
78
    </tr>
79
    <tr>
80
      <th>TCGA-38-7271-01A</th>
81
      <td>Stage I</td>
82
    </tr>
83
    <tr>
84
      <th>TCGA-38-A44F-01A</th>
85
      <td>Stage I</td>
86
    </tr>
87
    <tr>
88
      <th>TCGA-44-2655-11A</th>
89
      <td>Stage I</td>
90
    </tr>
91
  </tbody>
92
</table>
93
<p>336 rows × 1 columns</p>
94
</div>
95
96
97
98
## Log2-transform the mRNA and microRNA expression values (lncRNA is left untransformed)
99
100
101
```python
102
def expression_val_transform(x):
    """Return log2(x + 1) of *x*, element-wise.

    The +1 pseudocount keeps zero expression values finite (log2(1) == 0).
    Because ``np.log2`` is a NumPy ufunc, *x* may be a scalar, an array, or a
    whole pandas DataFrame.
    """
    return np.log2(x + 1)


# Transform each table in a single vectorized call rather than cell by cell
# with DataFrame.applymap, which is slow on tables this wide and is
# deprecated in pandas >= 2.1.
X_multiomics['MessengerRNA'] = expression_val_transform(X_multiomics['MessengerRNA'])
X_multiomics['MicroRNA'] = expression_val_transform(X_multiomics['MicroRNA'])
# LncRNA is left untransformed; uncomment to apply the same transform:
# X_multiomics['LncRNA'] = expression_val_transform(X_multiomics['LncRNA'])
107
```
108
109
## Classification of Cancer Stage
110
111
112
```python
113
from sklearn import preprocessing
114
from sklearn import metrics
115
from sklearn.svm import SVC, LinearSVC
116
import sklearn.linear_model
117
from sklearn.model_selection import train_test_split
118
119
```
120
121
122
```python
123
# Encode the stage labels as integers. LabelEncoder expects a 1-D sequence of
# labels, so pass the single target column rather than the (n, 1) DataFrame.
# In the output shown below, "Stage I" maps to 0 and "Stage II" maps to 1.
binarizer = preprocessing.LabelEncoder()
binarizer.fit_transform(y["pathologic_stage"])
126
```
127
128
129
    array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
130
           0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
131
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
132
           0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
133
           0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
134
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
135
           1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
136
           0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
137
           0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
138
           0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
139
           0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
140
           1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
141
           1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
142
           1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
143
           0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
144
           0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
145
           1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])
146
147
148
149
150
```python
151
for omic in ["MessengerRNA", "MicroRNA"]:
    print(omic)

    # Hold out 30% of the samples, stratified on the stage labels so both
    # splits keep the same class balance. The seed is drawn at random, so the
    # split (and therefore the scores) differ from run to run.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_multiomics[omic], y,
        test_size=0.3,
        random_state=np.random.randint(0, 10000),
        stratify=y,
    )
    print(X_train.shape, X_test.shape)

    # Mean-center the features (with_std=False: no variance scaling).
    # The scaler is fitted on the training split only, so no test-set
    # statistics leak into training, and the same shift is then applied to
    # the test split so the model is evaluated on consistently centered data.
    scaler = sklearn.preprocessing.StandardScaler(copy=True, with_mean=True, with_std=False)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # scikit-learn estimators and metrics expect 1-D label arrays, not
    # single-column (n, 1) frames.
    Y_train = np.ravel(Y_train)
    Y_test = np.ravel(Y_test)

    model = LinearSVC(C=1e-2, penalty='l1', class_weight='balanced', dual=False, multi_class="ovr")
#     model = sklearn.linear_model.LogisticRegression(C=1e-0, penalty='l1', fit_intercept=False, class_weight="balanced")
#     model = SVC(C=1e0, kernel="rbf", class_weight="balanced", decision_function_shape="ovo")

    model.fit(X=X_train, y=Y_train)
    # Number of coefficients the L1 penalty left non-zero.
    print("NONZERO", len(np.nonzero(model.coef_)[0]))
    # Metric functions take (y_true, y_pred) in that order.
    print("Training accuracy", metrics.accuracy_score(Y_train, model.predict(X_train)))
    print(metrics.classification_report(y_true=Y_test, y_pred=model.predict(X_test)))
171
172
```
173
174
    MessengerRNA
175
    (254, 20472) (109, 20472)
176
    NONZERO 0
177
    Training accuracy 0.6929133858267716
178
                 precision    recall  f1-score   support
179
180
        Stage I       0.69      1.00      0.82        75
181
       Stage II       0.00      0.00      0.00        34
182
183
    avg / total       0.47      0.69      0.56       109
184
185
    MicroRNA
186
    (254, 1870) (109, 1870)
187
    NONZERO 0
188
    Training accuracy 0.6929133858267716
189
                 precision    recall  f1-score   support
190
191
        Stage I       0.69      1.00      0.82        75
192
       Stage II       0.00      0.00      0.00        34
193
194
    avg / total       0.47      0.69      0.56       109