|
a |
|
b/docs/usage/preprocess-downstream-analysis.md |
|
|
1 |
# Preparing data for downstream analyses |
|
|
2 |
|
|
|
3 |
## To prepare the data for classification |
|
|
4 |
|
|
|
5 |
```python |
|
|
6 |
# Load all omics layers together with the pathologic-stage target column
# (remove_duplicates=True is passed through to the loader).
X_multiomics, y = luad_data.load_data(
    omics="all",
    target=["pathologic_stage"],
    remove_duplicates=True,
)

# Print the shape of each expression table, followed by the shape of the labels.
omic_shapes = [X_multiomics[name].shape for name in ('MessengerRNA', 'MicroRNA', 'LncRNA')]
print(*omic_shapes, y.shape)
|
|
12 |
``` |
|
|
13 |
|
|
|
14 |
> (338, 20472) (338, 1870) (338, 12727) (338, 1) |
|
|
15 |
|
|
|
16 |
|
|
|
17 |
```python |
|
|
18 |
print(y) |
|
|
19 |
``` |
|
|
20 |
|
|
|
21 |
|
|
|
22 |
<div> |
|
|
23 |
<table border="1" class="dataframe"> |
|
|
24 |
<thead> |
|
|
25 |
<tr style="text-align: right;"> |
|
|
26 |
<th></th> |
|
|
27 |
<th>pathologic_stage</th> |
|
|
28 |
</tr> |
|
|
29 |
</thead> |
|
|
30 |
<tbody> |
|
|
31 |
<tr> |
|
|
32 |
<th>TCGA-05-4390-01A</th> |
|
|
33 |
<td>Stage I</td> |
|
|
34 |
</tr> |
|
|
35 |
<tr> |
|
|
36 |
<th>TCGA-05-4405-01A</th> |
|
|
37 |
<td>Stage I</td> |
|
|
38 |
</tr> |
|
|
39 |
<tr> |
|
|
40 |
<th>TCGA-05-4410-01A</th> |
|
|
41 |
<td>Stage I</td> |
|
|
42 |
</tr> |
|
|
43 |
<tr> |
|
|
44 |
<th>TCGA-05-4417-01A</th> |
|
|
45 |
<td>Stage I</td> |
|
|
46 |
</tr> |
|
|
47 |
<tr> |
|
|
48 |
<th>TCGA-05-4424-01A</th> |
|
|
49 |
<td>Stage II</td> |
|
|
50 |
</tr> |
|
|
51 |
<tr> |
|
|
52 |
<th>TCGA-05-4427-01A</th> |
|
|
53 |
<td>Stage II</td> |
|
|
54 |
</tr> |
|
|
55 |
<tr> |
|
|
56 |
<th>TCGA-05-4433-01A</th> |
|
|
57 |
<td>Stage I</td> |
|
|
58 |
</tr> |
|
|
59 |
<tr> |
|
|
60 |
<th>TCGA-05-5423-01A</th> |
|
|
61 |
<td>Stage II</td> |
|
|
62 |
</tr> |
|
|
63 |
<tr> |
|
|
64 |
<th>TCGA-05-5425-01A</th> |
|
|
65 |
<td>Stage II</td> |
|
|
66 |
</tr> |
|
|
67 |
<tr> |
|
|
68 |
<th>TCGA-05-5428-01A</th> |
|
|
69 |
<td>Stage II</td> |
|
|
70 |
</tr> |
|
|
71 |
<tr> |
|
|
72 |
<th>TCGA-05-5715-01A</th> |
|
|
73 |
<td>Stage I</td> |
|
|
74 |
</tr> |
|
|
75 |
<tr> |
|
|
76 |
<th>TCGA-38-4631-01A</th> |
|
|
77 |
<td>Stage I</td> |
|
|
78 |
</tr> |
|
|
79 |
<tr> |
|
|
80 |
<th>TCGA-38-7271-01A</th> |
|
|
81 |
<td>Stage I</td> |
|
|
82 |
</tr> |
|
|
83 |
<tr> |
|
|
84 |
<th>TCGA-38-A44F-01A</th> |
|
|
85 |
<td>Stage I</td> |
|
|
86 |
</tr> |
|
|
87 |
<tr> |
|
|
88 |
<th>TCGA-44-2655-11A</th> |
|
|
89 |
<td>Stage I</td> |
|
|
90 |
</tr> |
|
|
91 |
</tbody> |
|
|
92 |
</table> |
|
|
93 |
<p>336 rows × 1 columns</p> |
|
|
94 |
</div> |
|
|
95 |
|
|
|
96 |
|
|
|
97 |
|
|
|
98 |
## Log2 transform the mRNA and microRNA expression values (the lncRNA transform is left commented out) |
|
|
99 |
|
|
|
100 |
|
|
|
101 |
```python |
|
|
102 |
def expression_val_transform(x):
    """Return the base-2 logarithm of ``x + 1``.

    The +1 offset maps an input of 0 to 0 rather than to -inf.
    """
    shifted = x + 1
    return np.log2(shifted)
|
|
104 |
# np.log2 is a vectorized ufunc, so the transform can be applied to each whole
# DataFrame in one call instead of cell by cell through the deprecated
# DataFrame.applymap; the resulting values are the same.
X_multiomics['MessengerRNA'] = expression_val_transform(X_multiomics['MessengerRNA'])
X_multiomics['MicroRNA'] = expression_val_transform(X_multiomics['MicroRNA'])
# X_multiomics['LncRNA'] = expression_val_transform(X_multiomics['LncRNA'])
|
|
107 |
``` |
|
|
108 |
|
|
|
109 |
## Classification of Cancer Stage |
|
|
110 |
|
|
|
111 |
|
|
|
112 |
```python |
|
|
113 |
from sklearn import preprocessing |
|
|
114 |
from sklearn import metrics |
|
|
115 |
from sklearn.svm import SVC, LinearSVC |
|
|
116 |
import sklearn.linear_model |
|
|
117 |
from sklearn.model_selection import train_test_split |
|
|
118 |
|
|
|
119 |
``` |
|
|
120 |
|
|
|
121 |
|
|
|
122 |
```python |
|
|
123 |
# LabelEncoder expects a 1-D array of labels, but y is a single-column
# DataFrame; flatten it first so sklearn does not warn about a column vector.
binarizer = preprocessing.LabelEncoder()
binarizer.fit_transform(y.values.ravel())
|
|
126 |
``` |
|
|
127 |
|
|
|
128 |
|
|
|
129 |
array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, |
|
|
130 |
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, |
|
|
131 |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, |
|
|
132 |
0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
|
133 |
0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, |
|
|
134 |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, |
|
|
135 |
1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, |
|
|
136 |
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, |
|
|
137 |
0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, |
|
|
138 |
0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, |
|
|
139 |
0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, |
|
|
140 |
1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, |
|
|
141 |
1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, |
|
|
142 |
1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, |
|
|
143 |
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, |
|
|
144 |
0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, |
|
|
145 |
1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]) |
|
|
146 |
|
|
|
147 |
|
|
|
148 |
|
|
|
149 |
|
|
|
150 |
```python |
|
|
151 |
for omic in ["MessengerRNA", "MicroRNA"]:
    print(omic)

    # Stratified 70/30 split so both splits keep the stage proportions.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_multiomics[omic], y,
        test_size=0.3,
        random_state=np.random.randint(0, 10000),
        stratify=y,
    )
    print(X_train.shape, X_test.shape)

    # Mean-center only (with_std=False). The scaler is fitted on the training
    # split alone so no test-set statistics leak into training, and the same
    # centering is then applied to BOTH splits so the model is evaluated on
    # data in the same space it was trained on.
    scaler = sklearn.preprocessing.StandardScaler(copy=True, with_mean=True, with_std=False)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = LinearSVC(C=1e-2, penalty='l1', class_weight='balanced', dual=False, multi_class="ovr")
    # model = sklearn.linear_model.LogisticRegression(C=1e-0, penalty='l1', fit_intercept=False, class_weight="balanced")
    # model = SVC(C=1e0, kernel="rbf", class_weight="balanced", decision_function_shape="ovo")

    model.fit(X=X_train, y=Y_train)
    # Number of non-zero coefficients kept by the L1 penalty.
    print("NONZERO", len(np.nonzero(model.coef_)[0]))
    print("Training accuracy", metrics.accuracy_score(y_true=Y_train, y_pred=model.predict(X_train)))
    print(metrics.classification_report(y_pred=model.predict(X_test), y_true=Y_test))
|
|
171 |
|
|
|
172 |
``` |
|
|
173 |
|
|
|
174 |
MessengerRNA |
|
|
175 |
(254, 20472) (109, 20472) |
|
|
176 |
NONZERO 0 |
|
|
177 |
Training accuracy 0.6929133858267716 |
|
|
178 |
precision recall f1-score support |
|
|
179 |
|
|
|
180 |
Stage I 0.69 1.00 0.82 75 |
|
|
181 |
Stage II 0.00 0.00 0.00 34 |
|
|
182 |
|
|
|
183 |
avg / total 0.47 0.69 0.56 109 |
|
|
184 |
|
|
|
185 |
MicroRNA |
|
|
186 |
(254, 1870) (109, 1870) |
|
|
187 |
NONZERO 0 |
|
|
188 |
Training accuracy 0.6929133858267716 |
|
|
189 |
precision recall f1-score support |
|
|
190 |
|
|
|
191 |
Stage I 0.69 1.00 0.82 75 |
|
|
192 |
Stage II 0.00 0.00 0.00 34 |
|
|
193 |
|
|
|
194 |
avg / total 0.47 0.69 0.56 109 |