|
a |
|
b/preprocessing/Preprocessing_Hemap_featurematrix_generation.R |
|
|
1 |
#*************************************************************************************************** |
|
|
2 |
#******************************* Make immunology FM ********************************************** |
|
|
3 |
#*************************************************************************************************** |
|
|
4 |
library(mclust) |
|
|
5 |
library(data.table) |
|
|
6 |
library(parallel) |
|
|
7 |
|
|
|
8 |
source("/research/users/ppolonen/git_home/common_scripts/featurematrix/functions_generate_fm.R") |
|
|
9 |
|
|
|
10 |
# WD |
|
|
11 |
# WD: every relative path read or written below is resolved against this directory.
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/")

# GEXP
# load() returns the name of the stored object; it is fetched with get() and
# transposed. Further down the row names receive the "N:GEXP:" prefix and the
# column names are used as sample identifiers.
gexp_object <- load("data9544_with_gene_symbols.RData")
gexp <- t(get(gexp_object))

# annotation table
# `check` is a previously saved annotation object; after filtering, its first
# column is compared with the first column of `annot` to list new samples.
check_object <- load("Hemap_immunology_Annotations_8304.Rdata")
check <- get(check_object)
annot <- read.delim(
  "anno_coord_data9544_15pct_bw2.5_updated.txt",
  stringsAsFactors = FALSE,
  header = TRUE
)
|
|
19 |
|
|
|
20 |
# listing files, not needed anymore |
|
|
21 |
# f=list.files("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/Annotations_immunology/", pattern = ".csv", full.names = T) |
|
|
22 |
# f=f[-1] |
|
|
23 |
# annot_normal=do.call(rbind, lapply(f, read.csv, header=F, stringsAsFactors=F, skip=1)) |
|
|
24 |
|
|
|
25 |
#******************************** filtering ************************************* |
|
|
26 |
# Ex vivo treatment labels under which a NonCancerHealthy sample is still kept.
# NOTE(review): " differentiation followed by LPS+IFNg" starts with a space;
# confirm that this is how the label is spelled in the annotation table.
exvivotreatments_allowed <- c(
  "none", "na", "control", "activation", "differentiation",
  "differentiation followed by activation",
  " differentiation followed by LPS+IFNg",
  "differentiation followed by IFN",
  "differentiation followed by inflammatory cytokines",
  "differentiation followed by LPS",
  "differentiation followed by CD40L",
  "differentiation followed by Poly(I:C)",
  "differentiation (IL-4)",
  "differentiation with EPO",
  "anti-IgM", "IL-2", "IL-3", "IL-3+CpG"
)

# One logical mask per exclusion criterion (TRUE = sample is removed).
# Healthy samples are removed unless untreated in vivo AND ex vivo treatment is allowed.
rm_healthy <- annot$Sample.type %in% c("NonCancerHealthy") &
  !(annot$In.vivo.treatment %in% c("na", "none", "no") &
      annot$Ex.vivo.treatment %in% exvivotreatments_allowed)
rm_cancer_prolif <- annot$Sample.type %in% c("Cancer", "Prolif") &
  annot$Ex.vivo.treatment != "none"
rm_celline <- annot$Sample.type %in% c("CellLine") &
  !annot$Ex.vivo.treatment %in% c("none", "na", "control")
rm_treatments <- annot$GSE.identifier..experiment. %in% c("GSE26661") |
  annot$GSM.identifier..sample. %in% c("GSM425497", "GSM425499")
rm_noncancer <- !annot$Sample.type %in% c("Cancer", "Prolif", "CellLine", "NonCancerHealthy")
rm_mm_pbmc <- annot$colorClass == "MM" &
  annot$Sample.isolation != "CD138+ plasma cells"

# cluster exclusion
# AML,GSE7538,24
# CLL,GSE18866,GSE9250,32
# LP,GSE12453,GSE7345,6
# MM,GSE24147,GSE24522,18
# MP,GSE12079,GSE15811,13
# TCL,GSE14879,16 microdissected, do not remove
rm_cluster_differ <- annot$GSE.identifier..experiment. %in% c(
  "GSE7538", "GSE9250", "GSE12453", "GSE18866", "GSE24522",
  "GSE7345", "GSE12079", "GSE15811", "GSE24147"
) & annot$Sample.type == "Cancer"

# Union of all criteria, computed once and reused for every subset below.
rm_any <- rm_healthy | rm_cancer_prolif | rm_celline | rm_treatments |
  rm_cluster_differ | rm_noncancer | rm_mm_pbmc

# Record why each sample is dropped. The column is created explicitly instead
# of relying on indexed assignment into a missing column. Later assignments
# overwrite earlier ones, so a sample matching several criteria keeps the
# reason assigned last.
annot_left_out <- annot
annot_left_out$reason_removed <- NA_character_
annot_left_out$reason_removed[rm_healthy] <- "Normal Cell sample, ex-vivo or in-vivo treated"
annot_left_out$reason_removed[rm_cluster_differ] <- "Cancer sample, outlier cluster"
annot_left_out$reason_removed[rm_celline] <- "Cell line sample, ex-vivo treated"
annot_left_out$reason_removed[rm_cancer_prolif | rm_treatments] <- "Cancer sample, ex-vivo treated"
annot_left_out$reason_removed[rm_noncancer] <- "NonCancer sample, not healthy"
# Fix: this criterion selects MM samples whose isolation is not
# "CD138+ plasma cells"; it previously reused the NonCancer reason text.
annot_left_out$reason_removed[rm_mm_pbmc] <- "MM sample, not CD138+ plasma cell isolated"

annot_left_out <- annot_left_out[rm_any, ]
fwrite(annot_left_out, "hemap_1072_leftout_reasonremoved.txt", sep = "\t")

# Apply the same mask to annotation rows and expression columns; this relies on
# the two being in the same sample order.
annot <- annot[!rm_any, ]
gexp <- gexp[, !rm_any]

# Samples retained now that are absent from the previous annotation object.
newSamples <- annot[!annot[, 1] %in% check[, 1], ]
fwrite(newSamples, "new_samples_in_hemap_8472.txt", sep = "\t")

# Dimensions of the part of the previous annotation that is still retained.
dim(check[check[, 1] %in% annot[, 1], ])
|
|
63 |
|
|
|
64 |
|
|
|
65 |
#**************************************************************************************** |
|
|
66 |
# annot[rm_healthy&annot$GSM.identifier..sample.%in%annot_normal[,1],] # excluded from previous normals |
|
|
67 |
|
|
|
68 |
# new |
|
|
69 |
# Simplify the CLASS labels of healthy samples by stripping lineage prefixes
# and placeholder tokens, then harmonize MAINCLASS / colorClass for them.
healthy <- annot$Sample.type %in% c("NonCancerHealthy")

healthy_class <- gsub(
  "NonCancerHealthy_|NonCancerHealthy|na_", "",
  gsub(
    "_StemCell_|Myeloid_|_na_na|1_na_na|2_na_na|Lymphoid_|_G.*.|_na_G0|_na_G1.*.|T-|B-|M1-Differentiating|Erythroid_",
    "",
    annot$CLASS[healthy]
  )
)

# Healthy samples whose label collapsed to "na". The mask is captured BEFORE
# the label is replaced: the colorClass assignment below previously re-tested
# CLASS == "na" after the replacement and therefore never matched anything.
unlabelled <- healthy_class %in% "na"
healthy_class[unlabelled] <- "LymphNode_GerminalCentre"
annot$CLASS[healthy] <- healthy_class

annot$CLASS[annot$CLASS %in% "CD4+Tcell"] <- "RestingCD4+Tcell"
annot$CLASS[annot$CLASS %in% "CD8+Tcell"] <- "RestingCD8+Tcell"

annot$MAINCLASS[healthy] <- "NonCancerHealthy"
annot$colorClass[which(healthy)[unlabelled]] <- "Lymphoid"
|
|
76 |
|
|
|
77 |
# old |
|
|
78 |
# annot$CLASS[match(annot_normal[,1], annot$GSM.identifier..sample.)]=annot_normal[,5] |
|
|
79 |
# annot$MAINCLASS[match(annot_normal[,1], annot$GSM.identifier..sample.)]=annot_normal[,3] |
|
|
80 |
# annot$colorClass[match(annot_normal[,1], annot$GSM.identifier..sample.)]=annot_normal[,4] |
|
|
81 |
|
|
|
82 |
#*********************************** compute geometric mean ******************************** |
|
|
83 |
# get certain genes GEXP |
|
|
84 |
# Prefix expression rows so they carry the feature-matrix naming scheme.
rownames(gexp) <- paste0("N:GEXP:", rownames(gexp))

# Cytolytic score: rows whose name matches any of the listed gene symbols
# (regex match, not exact match), moved from log2 to linear scale with a 0.01
# offset, summarized per sample with gm_mean(), then returned to log2.
# gm_mean is not defined in this section (presumably comes from the sourced
# functions_generate_fm.R).
cytolytic_rows <- grep("GZMA|PRF1|GNLY|GZMH|GZMM", rownames(gexp))
dat_a <- gexp[cytolytic_rows, ]
# dat_a=matrix[grep("GZMA|PRF1", rownames(matrix)),]
dat <- 2^dat_a + 0.01
rownames(dat)
gm1 <- log2(t(apply(dat, 2, gm_mean)))
rownames(gm1) <- "CytolyticScore"

# also HLA (exact row-name match)
hla1_rows <- paste0("N:GEXP:", c("B2M", "HLA-A", "HLA-B", "HLA-C"))
dat_a2 <- gexp[rownames(gexp) %in% hla1_rows, ]

dat2 <- 2^dat_a2 + 0.01
rownames(dat2)
gm2 <- log2(t(apply(dat2, 2, gm_mean)))
rownames(gm2) <- "HLAIScore"

# also HLAII (exact row-name match)
hla2_rows <- paste0(
  "N:GEXP:",
  c("HLA-DMA", "HLA-DMB", "HLA-DPA1", "HLA-DPB1", "HLA-DRA", "HLA-DRB1")
)
dat_a3 <- gexp[rownames(gexp) %in% hla2_rows, ]

dat3 <- 2^dat_a3 + 0.01
rownames(dat3)
gm3 <- log2(t(apply(dat3, 2, gm_mean)))
rownames(gm3) <- "HLAIIScore"
|
|
113 |
|
|
|
114 |
# Bin a one-row score matrix into "low" / "medium" / "high" per sample.
#   score      : 1 x n numeric matrix (one value per sample)
#   label      : row name given to the result
#   sample_ids : column names given to the result
# Returns a 1 x n data.frame of character labels. Scores are z-scaled across
# samples; z >= 1 is "high", z <= -1 is "low", everything else (including
# samples whose z-score is NA) stays "medium".
classify_by_zscore <- function(score, label, sample_ids) {
  z <- as.numeric(scale(t(score)))
  level <- rep("medium", length(score))
  # which() ignores NA z-scores instead of passing NA into the index
  level[which(z >= 1)] <- "high"
  level[which(z <= -1)] <- "low"
  out <- data.frame(t(level), stringsAsFactors = FALSE)
  rownames(out) <- label
  colnames(out) <- sample_ids
  out
}

classification1 <- classify_by_zscore(gm1, "CytolyticScore", colnames(gexp))
classification2 <- classify_by_zscore(gm2, "HLAIScore", colnames(gexp))
classification3 <- classify_by_zscore(gm3, "HLAIIScore", colnames(gexp))

# Samples in rows, one column per score class.
classification <- data.frame(
  t(rbind(classification1, classification2, classification3)),
  stringsAsFactors = FALSE
)
|
|
136 |
|
|
|
137 |
|
|
|
138 |
# Continuous immune scores: samples in rows, one column per score, converted
# to feature rows by make.features() (defined outside this section).
immunoscores <- as.data.frame(t(rbind(gm1, gm2, gm3)))
immunoscoresfm <- make.features(immunoscores, datatype = "SAMP", prefix = "")
colnames(immunoscoresfm) <- colnames(gexp)

# Same for the low/medium/high classes.
immunoscores_class_fm <- make.features(classification, datatype = "SAMP", prefix = "")
colnames(immunoscores_class_fm) <- colnames(gexp)

# excluding categorical here, they slow everything down!
# NOTE(review): the class features ARE included in the list below, which does
# not match the comment above; confirm which is intended.
l.data_list <- list(gexp, immunoscoresfm, immunoscores_class_fm)
data_list <- do.call(rbind, l.data_list)
data_list <- data.frame(data_list)
|
|
148 |
|
|
|
149 |
# ******************************** Infer cell fractions *********************************** |
|
|
150 |
|
|
|
151 |
# cibersort |
|
|
152 |
# cibersort
# Rows of the results file become columns after the transpose, so the first
# column of the file is expected to hold the sample identifiers.
results <- read.delim(
  "CIBERSORT-Results.txt",
  row.names = 1, header = TRUE, stringsAsFactors = FALSE
)
colnames(results) <- paste0("N:SAMP:CIBERSORT_", gsub(" |-|\\.", "_", colnames(results)))
cibersort <- t(results)

# Align columns to gexp by sample id. The previous %in% filter kept the file's
# own column order, while the rows are later combined with the other feature
# tables purely by position, so any difference in order would silently shift
# values between samples.
missing_cibersort <- setdiff(colnames(gexp), colnames(cibersort))
if (length(missing_cibersort) > 0) {
  stop(
    "CIBERSORT results are missing ", length(missing_cibersort),
    " sample(s) present in gexp, e.g. ",
    paste(head(missing_cibersort, 5), collapse = ", "),
    call. = FALSE
  )
}
cibersort <- cibersort[, match(colnames(gexp), colnames(cibersort)), drop = FALSE]

MCP <- get(load("MCP_counter_data.Rdata"))
rownames(MCP) <- paste0("N:SAMP:", rownames(MCP))

missing_mcp <- setdiff(colnames(gexp), colnames(MCP))
if (length(missing_mcp) > 0) {
  stop(
    "MCP counter data are missing ", length(missing_mcp),
    " sample(s) present in gexp, e.g. ",
    paste(head(missing_mcp, 5), collapse = ", "),
    call. = FALSE
  )
}
MCP <- MCP[, match(colnames(gexp), colnames(MCP)), drop = FALSE]

l.fractions <- list(cibersort, MCP)
fractions <- data.frame(do.call(rbind, l.fractions), stringsAsFactors = FALSE)
|
|
163 |
|
|
|
164 |
#********************************** Clinical data ****************************** |
|
|
165 |
|
|
|
166 |
# Survival tables: every file in the working directory whose name contains
# ".info.tsv". The dots are escaped so they match literally; unescaped they
# matched any character.
files <- list.files(".", pattern = "\\.info\\.tsv", full.names = TRUE)
if (length(files) == 0) {
  stop("No '.info.tsv' survival files found in ", getwd(), call. = FALSE)
}

surv_data <- do.call(rbind, lapply(files, read.delim, header = TRUE, stringsAsFactors = FALSE))
# Keep retained samples (column 1 = sample id) with a non-missing third column.
surv_data <- surv_data[surv_data[, 1] %in% colnames(gexp), ]
surv_data <- surv_data[!is.na(surv_data[, 3]), ]

# Columns 2 and 3 become the two survival feature rows.
surv_d <- t(surv_data[, 2:3])
colnames(surv_d) <- surv_data[, 1]
rownames(surv_d) <- c("N:CLIN:OS_Time", "B:CLIN:OS_Status")

# Expand and reorder to the gexp sample order; samples without data become NA.
surv_d <- surv_d[, match(colnames(gexp), colnames(surv_d))]
colnames(surv_d) <- colnames(gexp)
|
|
178 |
|
|
|
179 |
#******************************* tumor percentage ******************************** |
|
|
180 |
Sys.setlocale(locale = "C")
# Two-column table without header: column 1 is matched against
# Sample.isolation, column 2 against colorClass.
sorteds <- read.delim("sorted_samples.txt", stringsAsFactors = FALSE, header = FALSE)

# Sample ids matching any (isolation, colorClass) pair.
# seq_len() keeps the loop empty when the table has no rows.
gsm <- unlist(lapply(seq_len(nrow(sorteds)), function(i) {
  annot$GSM.identifier..sample.[
    annot$Sample.isolation %in% sorteds[i, 1] & annot$colorClass %in% sorteds[i, 2]
  ]
}))
# Additionally flag every sample whose isolation mentions CD303.
add <- annot$GSM.identifier..sample.[grepl("CD303", annot$Sample.isolation)]

# 1 x n indicator row, stored as 0/1 from the start. Previously it was logical
# and only became numeric as a side effect of the assignment below, so its type
# depended on whether any note matched.
sorted <- t(annot$GSM.identifier..sample. %in% c(gsm, add)) * 1

rownames(sorted) <- "B:CLIN:CELLS_SORTED"
colnames(sorted) <- colnames(gexp)

# Samples whose additional notes mention "Padiatr" are also marked as sorted.
sorted[, grepl("Padiatr", annot$Additional.notes)] <- 1
|
|
194 |
# sorted[,annot$Additional.notes=="The leukemic blasts were sorted based on CD41, CD7, CD117, CD33, and CD34 antibodies as previously described (Klin. Padiatr. 217, 126-134)."] = 1 |
|
|
195 |
|
|
|
196 |
# Strip the free-text keys and symbols around the purity value.
tumor_per <- gsub(
  "blast%: |>=|%|blast cell percentage: |t_cell_purity: |>|;|blast count, % of sample, -1=unavailable : ",
  "",
  annot$Purity.Tumor.cell.content
)
tumor_per[tumor_per == "high"] <- 80
# Cap at 100; entries that are not numeric yet give NA here and are left as is.
tumor_per[as.numeric(tumor_per) > 100] <- 100
# Fix: the key stripped above states "-1=unavailable", so -1 is recorded as
# missing rather than as 0 percent.
tumor_per[tumor_per == "-1"] <- NA
tumor_per[tumor_per %in% c("n./a.", "na")] <- NA

# MALT annotations, aligned to the gexp sample order (NA rows where absent).
# NOTE: `replace` masks base::replace(); the name is kept in case code outside
# this section refers to it.
malt <- read.delim("clinical_annotations_MALT.txt", stringsAsFactors = FALSE, header = TRUE)
replace <- malt[match(colnames(gexp), malt$GSMID), ]
from_malt <- grepl("Percentage of tumor", tumor_per)
tumor_per[from_malt] <- replace$X..Tumor[from_malt]

tumor_percentage <- data.matrix(t(as.numeric(tumor_per)))
rownames(tumor_percentage) <- "N:SAMP:BLAST_TUMOR_PERCENTAGE"
colnames(tumor_percentage) <- colnames(gexp)

# T-cell percentages
T_per <- data.matrix(t(as.numeric(replace$X..T.cells)))
rownames(T_per) <- "N:SAMP:TCELL_PERCENTAGE"
colnames(T_per) <- colnames(gexp)

# Tissue percentage, taken from the X..Lung column of the MALT table.
tissue_per <- data.matrix(t(as.numeric(replace$X..Lung)))
rownames(tissue_per) <- "N:SAMP:TISSUE_PERCENTAGE"
colnames(tissue_per) <- colnames(gexp)
|
|
218 |
|
|
|
219 |
|
|
|
220 |
#*************************** adding some lymphoma annotation ************************************* |
|
|
221 |
# GSE10846 annotation: restrict to retained samples, then expand to the annot
# row order (rows for samples without an entry become NA).
anno <- read.delim(
  "GSE10846_series_matrix_info_ipi_clean.txt",
  stringsAsFactors = FALSE, header = TRUE
)
anno <- anno[anno[, 1] %in% annot[, 1], ]

anno <- anno[match(annot[, 1], anno[, 1]), ]

# Samples that have an entry in the GSE10846 table.
in_gse10846 <- annot[, 1] %in% anno[, 1]

# Replace the in vivo treatment with the chemotherapy value, stripped of its
# "key: " prefix and of semicolons.
annot$In.vivo.treatment[in_gse10846] <- gsub("*.*: |;", "", anno$chemotherapy[in_gse10846])
annot$In.vivo.treatment[annot$In.vivo.treatment %in% "NA"] <- NA

annot$dlbcl_ipi <- NA
annot$dlbcl_ipi[in_gse10846] <- anno$ipi[in_gse10846]

# 0/1 indicator rows per regimen; NA where the treatment itself is NA.
treatment_unknown <- is.na(annot$In.vivo.treatment)

CHOP <- t((annot$In.vivo.treatment %in% "CHOP-Like Regimen") * 1)
CHOP[treatment_unknown] <- NA
rownames(CHOP) <- "B:CLIN:Chemotherapy_CHOP"
colnames(CHOP) <- colnames(gexp)

RCHOP <- t((annot$In.vivo.treatment %in% "R-CHOP-Like Regimen") * 1)
RCHOP[treatment_unknown] <- NA
rownames(RCHOP) <- "B:CLIN:Chemotherapy_RCHOP"
colnames(RCHOP) <- colnames(gexp)
|
|
240 |
|
|
|
241 |
# add cytogenetic information |
|
|
242 |
# Cytogenetic indicator vectors (samples in rows of the file).
genetics_org <- read.delim(
  "AML_preBALL_cytogenetics_vectors.txt",
  stringsAsFactors = FALSE, header = TRUE, row.names = 1
)
# Fix: align to the gexp sample order by id. The previous %in% filter kept the
# file's row order, but these rows are later stacked with the other clinical
# rows by position. Samples absent from the file become NA columns.
genetics <- genetics_org[match(colnames(gexp), rownames(genetics_org)), , drop = FALSE]
genetics <- t(genetics) * 1
colnames(genetics) <- colnames(gexp)
rownames(genetics) <- paste0("B:CLIN:GENETICS_", rownames(genetics))

# Clean the free-text cytogenetics annotation.
cytogenetic <- annot$Cytogenetics
cytogenetic[cytogenetic %in% c("na", "n/a", "", " ")] <- NA
cytogenetic[grepl("without|unknown|remainingcytogenetics|no del13q|crlf2 fish: Normal|crlf2 fish: n/a", cytogenetic)] <- NA
# NOTE(review): "ormal" also matches text containing "abnormal"; confirm that
# no such entries reach this line.
cytogenetic[grepl("ormal", cytogenetic)] <- "normal_karyotype"
cytogenetic[grepl("MLL", cytogenetic)] <- "MLL"

# Ordered (pattern, replacement) rewrites; each is applied to the result of
# the previous one, so the order matters.
cytogenetic_rewrites <- list(
  c("complex aberrant karyotype", "complex karyotype"),
  c("hyperdiploid karyotype", "hyperdiploid"),
  c("TAL$", "TAL1"),
  c("remaining cytogenetics|other abNormalities", "other"),
  c("fish:|trisomy 8 |;deletion|; complex karyotype|, complex karyotype|: positive| chromosomal aberrations|/API2-MALT1|/API2-MALT1 negative|deletion *.*: negative|/IGH-MALT1|, plus other| plus other", ""),
  c("trisomy ", "trisomy"),
  c("TEL deleted", "TEL_deleted"),
  c("p13.1", "p13"),
  c(";$", ""),
  c("\\+ ", "+"),
  c("i\\(", "inv("),
  c("complex karyotype", "complex_karyotype"),
  c("^ ", "")
)
for (rewrite in cytogenetic_rewrites) {
  cytogenetic <- gsub(rewrite[1], rewrite[2], cytogenetic)
}
annot$cytogenetic_clean <- cytogenetic

# Split into individual terms. An earlier space-only split was overwritten
# immediately and has been dropped.
cytogenetic_terms <- unlist(strsplit(cytogenetic, "; |; | |, | |/"))
cytogenetic_terms <- gsub(" ", "", cytogenetic_terms)

# Keep terms that occur more than five times.
# NOTE(review): an empty-string term could pass this filter if the split
# produces empty tokens; check the names in `shared`.
terms <- table(cytogenetic_terms)
shared <- names(terms)[terms > 5]

# One row per shared term, built by FIND_LOGICAL (defined outside this section).
cytogenetics <- do.call(rbind, mclapply(shared, FIND_LOGICAL, cytogenetic, mc.cores = 8))
colnames(cytogenetics) <- colnames(gexp)
|
|
277 |
#*********************** |
|
|
278 |
|
|
|
279 |
# age annotations |
|
|
280 |
# Age: strip keys and units, convert month values to years, map the coarse
# labels to fixed numbers, and blank out values above 100.
age <- gsub("*.*: |^ |;| yr| age| years|-.*.$| Years|d 32.8|d 54.8", "", annot$Age)
age[age %in% c("na", "n/a", "", "not available")] <- NA
in_months <- grepl("month|Month", age)
age[in_months] <- signif(
  as.numeric(gsub(" months.*.| months| Months", "", age[in_months])) / 12,
  2
)
age[age == "Adult"] <- 30
age[age == "Children"] <- 5
age[age == "pediatric"] <- 1
age <- t(as.numeric(age))
age[age > 100] <- NA
rownames(age) <- "N:CLIN:AGE"
colnames(age) <- colnames(gexp)

# gender annotations
# Upper-case first, drop the explicit "unknown" spellings, remove spaces,
# semicolons and key prefixes, then collapse to "female" / "male".
gender <- toupper(annot$Gender)
gender[gender %in% c("GENDER: NOT AVAILABLE", "GENDER: NA;", "SEX: UNKNOWN;")] <- NA
gender <- gsub("GENDER:|SEX/AGE:|/.*.|SEX:", "", gsub(" |;", "", gender))
gender[gender %in% c("F", "FEMALE", "WOMAN")] <- "female"
gender[gender %in% c("M", "MALE", "MAN")] <- "male"
gender <- t(gender)
rownames(gender) <- "C:CLIN:GENDER"
colnames(gender) <- colnames(gexp)

# race annotations
# Each step sees the result of the previous one; anything that ends up outside
# the five harmonized labels is set to NA.
race <- toupper(annot$Race)
race[grepl("AGE", race)] <- NA
race[grepl("OTHER", race)] <- "OTHER"
race[grepl("AFRICAN|RACE: AA;|RACE: B;", race)] <- "AFRICAN"
race[grepl("HISPANIC|RACE: H; ", race)] <- "HISPANIC"
race[grepl("EUROPEAN|CAUCASIAN|ANGLO-AMERICAN|WHITE|RACE: W;|RACE: C;", race)] <- "EUROPEAN"
race[grepl("ASIAN", race)] <- "ASIAN"
race[!grepl("ASIAN|AFRICAN|HISPANIC|EUROPEAN|OTHER", race)] <- NA

race <- t(race)
rownames(race) <- "C:CLIN:RACE"
colnames(race) <- colnames(gexp)
|
|
315 |
|
|
|
316 |
#*************************** adding some myeloma annotation ************************************* |
|
|
317 |
# Two myeloma clinical tables, read with read.delim2.
annomm <- read.delim2("GSE24080_MM_clininfo_GSMid_clean.txt", stringsAsFactors = FALSE, header = TRUE)
annomm2 <- read.delim2("GSE19784_MM_survival_GSMid_iss_clean.txt", stringsAsFactors = FALSE, header = TRUE)

# b2m: "<0.5" is replaced by "0.5" before the conversion; all three columns
# have commas replaced by dots and are then converted to numeric.
annomm$b2m <- gsub("<0.5", "0.5", annomm$b2m)
annomm$b2m <- as.numeric(gsub(",", ".", annomm$b2m))

annomm$aspc <- as.numeric(gsub(",", ".", annomm$aspc))
annomm$bmpc <- as.numeric(gsub(",", ".", annomm$bmpc))

# first combine the two:
# (rbindlist comes from data.table, attached at the top of the script)
combmm <- data.frame(rbindlist(list(annomm, annomm2), fill = TRUE), stringsAsFactors = FALSE)

# Expand to the gexp sample order; samples without myeloma data become NA rows.
combmm <- combmm[match(colnames(gexp), combmm$accession), ]

# now add these vectors to annot table
# Samples that have a myeloma record.
has_mm <- colnames(gexp) %in% combmm$accession

age[has_mm] <- signif(combmm$age[has_mm], 3)
# NOTE(review): sex values are copied without harmonization; confirm they are
# already spelled like the "female" / "male" labels used for the other samples.
gender[has_mm] <- combmm$sex[has_mm]

combmm$race[combmm$race %in% "other"] <- "OTHER"
combmm$race[combmm$race %in% "white"] <- "EUROPEAN"
race[has_mm] <- combmm$race[has_mm]

# time and status:
surv_d[1, has_mm] <- signif(combmm$os_time[has_mm], 3)
surv_d[2, has_mm] <- signif(combmm$os_censor[has_mm], 3)

# pfs time and status
pfs <- t(cbind(combmm$pfs_time, combmm$pfs_censor))
colnames(pfs) <- colnames(gexp)
rownames(pfs) <- c("N:CLIN:PFS_Time", "B:CLIN:PFS_Status")

# other myeloma annotations:
# Columns are selected by position (4 and 8) in the combined table.
otherMM_C <- t(combmm[, c(4, 8)])
rownames(otherMM_C) <- paste0("C:CLIN:MM_", toupper(rownames(otherMM_C)))
colnames(otherMM_C) <- colnames(gexp)

# numeric myeloma:
# Columns are selected by position (9 to 20) in the combined table.
otherMM_N <- t(combmm[, 9:20])
rownames(otherMM_N) <- paste0("N:CLIN:MM_", toupper(rownames(otherMM_N)))
colnames(otherMM_N) <- colnames(gexp)

add_mm <- t(combmm$cytogenetic_abnormalities)
colnames(add_mm) <- colnames(gexp)
rownames(add_mm) <- "B:CLIN:MM_CYTOGENETIC_ABNORMALITIES"

# Stack all clinical rows into one table (features in rows, samples in columns).
l.clin <- list(
  sorted, surv_d, pfs, tumor_percentage, T_per, tissue_per,
  CHOP, RCHOP, gender, age, race,
  cytogenetics, genetics, add_mm, otherMM_N, otherMM_C
)
clin <- data.frame(do.call(rbind, l.clin), stringsAsFactors = FALSE)
|
|
366 |
|
|
|
367 |
#****************************** make annotation clusters ****************************************** |
|
|
368 |
# acute / chronic leukemia grouping, driven by colorClass; MAINCLASS values
# containing "NonCancer" take precedence because they are assigned last.
annot$acute <- rep("other", nrow(annot))
annot$acute[annot$colorClass %in% c("AML", "pre-B-ALL", "T-ALL")] <- "acute_leukemias"
annot$acute[annot$colorClass %in% c("CLL", "CML")] <- "chronic_leukemias"
annot$acute[grepl("NonCancer", annot$MAINCLASS)] <- "NonCancer"

annot$disease <- rep("other", nrow(annot))
annot$disease[grepl("NonCancer", annot$Sub.maps.available)] <- "NonCancer"

# CLASS2: copy of CLASS with the cell-line classes collapsed to four labels.
annot$CLASS2 <- annot$CLASS
for (cellline_label in c("CellLine_Myeloma", "CellLine_Leukemia", "CellLine_Lymphoma", "CellLine_mix")) {
  annot$CLASS2[grepl(cellline_label, annot$CLASS2)] <- cellline_label
}

# disease: each term is matched as a regex against MAINCLASS; terms later in
# the vector overwrite earlier ones.
findthese <- c(
  "NonCancer", "Cancer_Leukemia", "Cancer_Myeloma", "Cancer_Lymphoma",
  "CellLine_Leukemia", "CellLine_Lymphoma", "CellLine_Myeloma",
  "Prolif_Lymphoproliferative_ALPS", "Prolif_Lymphoproliferative_MPN",
  "Prolif_Myeloproliferative_LCH_LC", "Prolif_Myeloproliferative_MDS"
)

for (f in findthese) {
  annot$disease[grepl(f, annot$MAINCLASS)] <- f
}

# subclasses: first by MAINCLASS, then refined by colorClass.
annot$subclasses <- rep("other", nrow(annot))
findthese <- c(
  "NonCancer", "Cancer_Myeloma", "Cancer_Lymphoma",
  "CellLine_Leukemia", "CellLine_Lymphoma", "CellLine_Myeloma",
  "Prolif_Lymphoproliferative_ALPS", "Prolif_Lymphoproliferative_MPN",
  "Prolif_Myeloproliferative_LCH_LC", "Prolif_Myeloproliferative_MDS"
)
# NOTE(review): "Lymphoid" comes after "B-Lymphoid" and "T-Lymphoid" and also
# matches them, so those two labels are always overwritten by "Lymphoid";
# confirm that this is intended.
findthese2 <- c(
  "T-ALL", "pre-B-ALL", "AML", "CML", "CLL", "BCL", "TCL",
  "B-Lymphoid", "T-Lymphoid", "Lymphoid", "Myeloid", "Erythroid", "StemCell"
)

for (f in findthese) {
  annot$subclasses[grepl(f, annot$MAINCLASS)] <- f
}
for (f in findthese2) {
  annot$subclasses[grepl(f, annot$colorClass)] <- f
}
DLBCL <- c(
  "Cancer_Lymphoma_BCL_DLBCL_ABC",
  "Cancer_Lymphoma_BCL_DLBCL_GCB",
  "Cancer_Lymphoma_BCL_DLBCL_na"
)
annot$subclasses[annot$CLASS2 %in% DLBCL] <- "BCL_DLBCL"

annot$CLASS <- gsub("_na|_check|_testicular", "", annot$CLASS)

# lymphoma annotations
# B- and T-cell lymphoma samples, excluding cell lines and non-cancer classes.
# The masks are taken before the "Cancer_" prefix is stripped from CLASS.
not_cellline_or_normal <- !grepl("CellLine", annot$CLASS) & !grepl("NonCancer", annot$CLASS)
bLY <- grepl("Lymphoma_BCL", annot$CLASS) & not_cellline_or_normal
tLY <- grepl("Lymphoma_TCL", annot$CLASS) & not_cellline_or_normal
annot$CLASS <- gsub("Cancer_", "", annot$CLASS)

# tbLY holds the CLASS label for lymphoma samples and NA for everything else.
annot$tbLY <- NA
annot$tbLY[bLY | tLY] <- annot$CLASS[bLY | tLY]
|
|
409 |
|
|
|
410 |
# these are the terms to look for |
|
|
411 |
# table(annot$disease) |
|
|
412 |
# table(annot$colorClass) |
|
|
413 |
# table(annot$acute) |
|
|
414 |
# table(annot$subclasses) |
|
|
415 |
# table(annot$tbLY) |
|
|
416 |
|
|
|
417 |
#******************************************************************************************* |
|
|
418 |
#*************************** annotation clusters ******************************************** |
|
|
419 |
|
|
|
420 |
#****************************** make clusters ****************************************** |
|
|
421 |
|
|
|
422 |
# make immunological normal annotation vectors |
|
|
423 |
# plotNormals: coarse cell-type label for plotting. Starts as "Other", is
# blanked for everything that is not NonCancerHealthy, and is then set by regex
# rules on CLASS. The rules run in the listed order on all samples, so a later
# match overwrites an earlier one.
annot$plotNormals <- "Other"

lv <- !annot$Sample.type %in% "NonCancerHealthy"
annot$plotNormals[lv] <- ""

plot_normal_patterns <- c(
  "RestingBcell|NaiveBcell|MemoryBcell|BcellActivated",
  "GerminalCentre",
  "PlasmaBcell",
  "Tcell|NaturalKillerCell",
  "DendriticCell",
  "Langerhans",
  "Eryth|Platelet",
  "Monocyte",
  "Macrophage",
  "Neutrophil",
  "MyeloidProgenitor",
  "HematopoieticStemCell",
  "^Mononuclear",
  "LymphNode"
)
plot_normal_labels <- c(
  "B cell",
  "Germinal centre cell",
  "Plasma cell",
  "T/NK cell",
  "Dendritic cell",
  "Langerhans cell",
  "Erythroid",
  "Monocyte",
  "Macrophage",
  "Neutrophil",
  "Myeloid progenitor",
  "HSC",
  "PBMC",
  "Lymph node"
)

for (rule in seq_along(plot_normal_patterns)) {
  lv <- grepl(plot_normal_patterns[rule], annot$CLASS)
  annot$plotNormals[lv] <- plot_normal_labels[rule]
}

# Label sets for the individual plot types; not used within this section.
HLAplot_normals <- c(
  "B cell", "Plasma cell", "T/NK cell", "Dendritic cell", "Erythroid",
  "Monocyte", "Macrophage", "Neutrophil", "Myeloid progenitor", "HSC"
)
cytolyticplot_normals <- c("PBMC", "Lymph node")
costimplot_normals <- c(HLAplot_normals, "PBMC", "Lymph node", "Langerhans cell", "Germinal centre cell")
|
|
473 |
|
|
|
474 |
# immunoNormals: harmonized immune cell type derived from the lineage/origin
# category. Rules are regexes applied in order to all samples; a later match
# overwrites an earlier one.
annot$immunoNormals <- annot$Category.specifying.lineage.tumor.origin

# Rules applied before the non-healthy samples are blanked.
# NOTE(review): "+" is a regex quantifier here, so "CD4+" matches "CD4"
# followed by any further 4s and "CD8|CD8+TcellActivated" reduces to matching
# "CD8"; confirm that no unintended categories are caught.
early_patterns <- c(
  "CD8|CD8+TcellActivated",
  "NaturalKiller",
  "M2-Macrophage",
  "M1-Macrophage",
  "DendriticCell",
  "Monocyte",
  "CD4+"
)
early_labels <- c(
  "CD8+Tcell",
  "NKCell",
  "M2-Macrophage",
  "M1-Macrophage",
  "DendriticCell",
  "Monocyte",
  "CD4+Tcell"
)
for (rule in seq_along(early_patterns)) {
  lv <- grepl(early_patterns[rule], annot$Category.specifying.lineage.tumor.origin)
  annot$immunoNormals[lv] <- early_labels[rule]
}

# Everything that is not NonCancerHealthy is blanked at this point.
lv <- !annot$Sample.type %in% "NonCancerHealthy"
annot$immunoNormals[lv] <- ""

# Rules applied after the blanking; they can label non-healthy samples again.
late_patterns <- c(
  "Eryth|Platelet",
  "CD3",
  "GerminalCentre",
  "^Tcell$|^TcellActivated$|^TcellResting$",
  "^ActivatedBcell$|^RestingBcell$|^BcellActivated$"
)
late_labels <- c(
  "Erythroid",
  "Tcell",
  "GerminalCentreCell",
  "Tcell",
  "Bcell"
)
for (rule in seq_along(late_patterns)) {
  lv <- grepl(late_patterns[rule], annot$Category.specifying.lineage.tumor.origin)
  annot$immunoNormals[lv] <- late_labels[rule]
}
|
|
514 |
|
|
|
515 |
# annotated clusters |
|
|
516 |
# annotated clusters
# One set of comparison rows per annotation column, built by FUN_MAKE_ALL
# (defined outside this section). tbLY was previously computed twice with
# identical arguments; the duplicate call has been dropped.
tbLY <- FUN_MAKE_ALL(annot$tbLY, "annotated_class", annot$tbLY, 0)
subclasses <- FUN_MAKE_ALL(annot$subclasses, "annotated_class", annot$subclasses, 0)
acute_chronic <- FUN_MAKE_ALL(annot$acute, "annotated_class", annot$acute, 0)
colorClass <- FUN_MAKE_ALL(annot$colorClass, "annotated_class", annot$colorClass, 0)
disease <- FUN_MAKE_ALL(annot$disease, "annotated_class", annot$disease, 0)
fullclass <- FUN_MAKE_ALL(annot$CLASS2, "annotated_class", annot$CLASS2, 0)
immunoclass <- FUN_MAKE_ALL(annot$immunoNormals, "annotated_class_immunoNormals", annot$immunoNormals, 0)

# Stack all comparisons, keeping the first occurrence of each row name.
l.comparisons <- list(disease, colorClass, acute_chronic, subclasses, tbLY, fullclass, immunoclass)
comparisons <- do.call(rbind, l.comparisons)
comparisons <- data.frame(
  data.matrix(comparisons[!duplicated(rownames(comparisons)), ]),
  stringsAsFactors = FALSE
)
colnames(comparisons) <- colnames(gexp)

# test if all rows are fine, should be >1 values
# Every comparison row must contain both a 0 and a 1. The unique values are
# collected row by row into a list: apply(comparisons, 1, unique) returns a
# matrix whenever all rows have the same number of unique values, and the
# element-wise check on that matrix then failed for valid input.
comparison_values <- as.matrix(comparisons)
A <- lapply(seq_len(nrow(comparison_values)), function(i) unique(comparison_values[i, ]))
names(A) <- rownames(comparison_values)

B <- vapply(A, function(d) sum(d %in% c(1, 0)) >= 2, logical(1))

if (!all(B)) stop("Check comparisons, impossible comparisons made")
|
|
536 |
|
|
|
537 |
# categorical feats |
|
|
538 |
# Categorical features, one per annotation column, built by
# FUN_MAKE_CATEGORICAL (defined outside this section).
class1 <- FUN_MAKE_CATEGORICAL(annot$tbLY, "annotated_class_BCL_TCL")
class2 <- FUN_MAKE_CATEGORICAL(annot$colorClass, "annotated_class_colorclass")
class3 <- FUN_MAKE_CATEGORICAL(annot$disease, "annotated_class_disease")
class4 <- FUN_MAKE_CATEGORICAL(annot$immunoNormals, "annotated_class_immunoNormals")

# Stack them and keep the first occurrence of each row name.
# Note: l.comparisons is reused here and no longer holds the binary comparisons.
l.comparisons <- list(class1, class2, class3, class4)
comparisons_cat <- do.call(rbind, l.comparisons)
first_occurrence <- !duplicated(rownames(comparisons_cat))
comparisons_cat <- data.frame(comparisons_cat[first_occurrence, ], stringsAsFactors = FALSE)

colnames(comparisons_cat) <- colnames(gexp)
|
|
548 |
|
|
|
549 |
|
|
|
550 |
#**************************************************************************************************** |
|
|
551 |
#******************** Next we start to create features of these individual data types *************** |
|
|
552 |
#**************************************************************************************************** |
|
|
553 |
|
|
|
554 |
# Combine all feature tables into one feature matrix. Rows are bound by column
# position (use.names = FALSE), so every table must have its samples in the
# same order.
l.fm <- list(data_list, clin, fractions, comparisons)

fm <- rbindlist(l.fm, use.names = FALSE, fill = FALSE)

fm <- data.frame(fm, stringsAsFactors = FALSE)
rownames(fm) <- unlist(lapply(l.fm, rownames))

# NOTE: `matrix` masks base::matrix(); the name is kept because the object is
# saved under it.
matrix <- fm

# also add clinicaldata to annotations
# Fix: the non-categorical clinical rows are converted to numbers by value.
# clin is built by row-binding numeric rows with character rows (gender, race),
# which makes every column character; data.matrix() turns character columns
# into factor level codes, so it did not return the actual values here.
is_categorical <- grepl("^C:", rownames(clin))
numclin <- t(as.matrix(clin[!is_categorical, , drop = FALSE]))
if (is.character(numclin)) {
  # logical rows appear as "TRUE"/"FALSE" after the coercion to character
  numclin[numclin %in% "TRUE"] <- "1"
  numclin[numclin %in% "FALSE"] <- "0"
}
# Entries that are not numbers become NA (with a coercion warning).
storage.mode(numclin) <- "double"

numchr <- t(clin[is_categorical, ])
colnames(numclin) <- gsub(".:CLIN:|.:GEXP:|", "", colnames(numclin))
colnames(numchr) <- gsub(".:CLIN:|.:GEXP:|", "", colnames(numchr))

annot_add <- data.frame(numclin, numchr, stringsAsFactors = FALSE)

# Cell fractions with samples in rows and without the feature prefix.
fractions2 <- t(fractions)
colnames(fractions2) <- gsub("N:SAMP:", "", colnames(fractions2))

# Extended annotation table: original annotations plus clinical values, the
# three immune scores, their classes and the cell fractions. All parts are
# combined by row position.
annot2 <- data.frame(
  annot, annot_add,
  "CytolyticScore" = as.numeric(gm1),
  "HLAIScore" = as.numeric(gm2),
  "HLAIIScore" = as.numeric(gm3),
  classification, fractions2,
  stringsAsFactors = FALSE
)
|
|
575 |
|
|
|
576 |
|
|
|
577 |
# annot |
|
|
578 |
# Mean-shift cluster assignments, restricted to retained samples.
clusters <- read.delim("AML_15pct_BHSNE_mean-shift.txt", stringsAsFactors = FALSE, header = TRUE)
clusters <- clusters[clusters$ID %in% annot$GSM.identifier..sample., ]

matrix_sub <- matrix[, colnames(matrix) %in% clusters$ID]
annot_sub <- annot[annot$GSM.identifier..sample. %in% clusters$ID, ]

# Fix: put the cluster table in the same sample order as annot_sub. The labels
# below are derived from `clusters` but are used to index annot_sub, which
# follows the order of `annot`; without this step the two only line up if the
# file happens to be sorted like the annotation table.
clusters <- clusters[match(annot_sub$GSM.identifier..sample., clusters$ID), ]
clusters_cancermap <- clusters$X1.5..cluster

# TCGA clusters
# Column 2 of the mapping table holds the cluster ids; the row groups below
# define which of them form each TCGA cluster.
cluster_mapping <- read.delim(
  "Table_TCGA_cluster_AML_cluster_assignment.txt",
  header = TRUE, stringsAsFactors = FALSE, sep = "\t"
)
TCGA_cluster <- rep("NA", nrow(annot_sub))

tcga_mapping_rows <- list(1, 2, 3:5, 6:10, 11, 12:15, 16:17)
for (k in seq_along(tcga_mapping_rows)) {
  in_group <- clusters_cancermap %in% cluster_mapping[tcga_mapping_rows[[k]], 2]
  TCGA_cluster[in_group] <- paste0("TCGA_AML_cluster_", k)
}

# Named list of sample ids per TCGA cluster; unassigned samples ("NA") are dropped.
tcga_labels <- unique(TCGA_cluster)
n <- lapply(tcga_labels, function(i) annot_sub$GSM.identifier..sample.[TCGA_cluster %in% i])
names(n) <- tcga_labels
n <- n[!tcga_labels %in% "NA"]
save(n, file = "Hemap_immunology_TCGA_clusters.Rdata")
|
|
601 |
|
|
|
602 |
# Store the feature matrix and the extended annotation table.
save(matrix, file = "Hemap_immunology_fm.Rdata")
save(annot2, file = "Hemap_immunology_Annotations.Rdata")
write.table(
  annot2,
  "Hemap_immunology_Annotations.tsv",
  sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE
)

# The .tsv feature matrix is written in two steps: a header line ("N:SAMP"
# followed by the column names), then the rows with their row names appended.
fm_header <- t(c("N:SAMP", as.character(colnames(matrix))))
write.table(
  fm_header,
  file = "Hemap_immunology_fm.tsv",
  sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE, append = FALSE
)
write.table(
  matrix,
  file = "Hemap_immunology_fm.tsv",
  sep = "\t", col.names = FALSE, row.names = TRUE, quote = FALSE, append = TRUE
)
|
|
608 |
|
|
|
609 |
# Harmonise the overall-survival data to months and store the result.
# NOTE(review): only the .Rdata file is rewritten at the end of this step;
# Hemap_immunology_Annotations.tsv was written before it and is not
# regenerated -- confirm that is intended.
load("Hemap_immunology_Annotations.Rdata")

# Show the distinct combinations of annotation columns 2 and 4 among the
# samples that have a survival time.
unique(cbind(annot2[!is.na(annot2$OS_Time), c(2, 4)]))

# OS_Time of these series is multiplied by 12 (presumably recorded in years
# -- verify against the source data).
os_x12_series <- c("GSE10846,GSE11318", "GSE10846", "GSE10846,GSE17372", "GSE11877")
os_x12 <- !is.na(annot2$OS_Time) & annot2[, 2] %in% os_x12_series
annot2$OS_Time[os_x12] <- annot2$OS_Time[os_x12] * 12

# For myeloma, truncate follow-up at 60 months (5-year survival) so the
# data sets can be compared.
modify <- !is.na(annot2$OS_Time) & annot2[, 2] %in% c("GSE16716,GSE24080")
find <- annot2$OS_Time > 60 & modify

# A status of 1 recorded after 60 months is reset to 0, then the time itself
# is capped. The status mask is computed before either column is changed.
find2 <- annot2$OS_Status == 1 & modify
annot2$OS_Status[find & find2] <- 0
annot2$OS_Time[find] <- 60

save(annot2, file = "Hemap_immunology_Annotations.Rdata")
|
|
625 |
|
|
|
626 |
#**************************************************************************************** |
|
|
627 |
# This FM can then be used as a backbone for other FMs. GSVA and clusters must be added |
|
|
628 |
#**************************************************************************************** |
|
|
629 |
|
|
|
630 |
# Reload the base feature matrix and the annotation table from disk.
matrix <- get(load("Hemap_immunology_fm.Rdata"))
annot <- get(load("Hemap_immunology_Annotations.Rdata"))

#********************************** Full map **********************************
# Map coordinates / cluster table, restricted to the annotated samples.
clusters <- read.delim("anno_coord_data9544_15pct_bw2.5_updated.txt", stringsAsFactors = FALSE, header = TRUE)
clusters <- clusters[clusters$ID %in% annot$GSM.identifier..sample., ]

#*********************************** GSVA input ****************************
gsva <- get(load("data8238_dufva_immunological_genes_updated_2016_GSVA_geneperm_lean_eFDR.Rdata"))
bindea <- get(load("data8238_all_samples_dufva_bindea_2013_geneset_GSVA.Rdata"))
# This load() is relied on to create `gsva_es` in the workspace; it is used
# on the next line.
load("data8238_all_samples_Combined_pathway_signatures_210616_GSVA.Rdata")

# Stack the three GSVA tables and keep the first row of every row name.
gsva_es <- rbind(gsva, bindea, gsva_es)
gsva_es <- gsva_es[!duplicated(rownames(gsva_es)), ]

# Put the GSVA columns in the column order of the feature matrix.
gsva_col_idx <- match(colnames(matrix), colnames(gsva_es))
gsva_es <- data.frame(gsva_es[, gsva_col_idx])
colnames(gsva_es) <- colnames(matrix)
|
|
647 |
|
|
|
648 |
#**************************** Cancermap clusters ****************************
# Cluster column of the full map, one value per row of `clusters`.
clusters_cancermap <- clusters$X2.5..cluster

# Comparison features. Every FUN_MAKE_ALL() call passes the same label
# ("cancermap_cluster"); the calls differ in their first, third and fourth
# argument.
# NOTE(review): vectors taken from `annot` are paired with a vector taken
# from `clusters`; this presumes both tables list the samples in the same
# order -- confirm.
cluster_cancermap <- FUN_MAKE_ALL(clusters_cancermap, "cancermap_cluster", clusters_cancermap, 0)
subclasses <- FUN_MAKE_ALL(annot$subclasses, "cancermap_cluster", clusters_cancermap, 0.8)
acute_chronic <- FUN_MAKE_ALL(annot$acute, "cancermap_cluster", clusters_cancermap, 0.8)
colorClass <- FUN_MAKE_ALL(annot$colorClass, "cancermap_cluster", clusters_cancermap, 0.8)
disease <- FUN_MAKE_ALL(annot$disease, "cancermap_cluster", annot$disease, 0.8)
fullclass <- FUN_MAKE_ALL(annot$CLASS2, "cancermap_cluster", annot$CLASS2, 0.8)

class_cancermap <- FUN_MAKE_CATEGORICAL(clusters_cancermap, "cancermap_cluster")

# Stack all comparison features, keep the first row of every row name and
# label the columns like the feature matrix.
l.comparisons <- list(
  cluster_cancermap, subclasses, acute_chronic, colorClass,
  disease, fullclass, class_cancermap
)
comparisons_cat <- do.call(rbind, l.comparisons)
comparisons_cat <- data.frame(
  comparisons_cat[!duplicated(rownames(comparisons_cat)), ],
  stringsAsFactors = FALSE
)
colnames(comparisons_cat) <- colnames(matrix)
|
|
663 |
|
|
|
664 |
# Combine the base feature matrix, the GSVA scores and the comparison
# features. rbindlist() binds by column position and its result carries no
# row names, so they are restored from the inputs.
l.fm <- list(matrix, gsva_es, comparisons_cat)

library(data.table)
fm <- data.frame(
  rbindlist(l.fm, use.names = FALSE, fill = FALSE),
  stringsAsFactors = FALSE
)
rownames(fm) <- unlist(lapply(l.fm, rownames))

# Drop features (rows) whose values are all NA.
rows_all_na <- apply(fm, 1, function(feature_row) all(is.na(feature_row)))
fm <- fm[!rows_all_na, ]

save(fm, file = "Hemap_immunology_fm_cancermap.Rdata")

# Header line first ("N:SAMP" + column names), then the rows appended.
fm_header <- t(c("N:SAMP", as.character(colnames(fm))))
write.table(
  fm_header,
  file = "Hemap_immunology_fm_cancermap.tsv",
  sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE, append = FALSE
)
write.table(
  fm,
  file = "Hemap_immunology_fm_cancermap.tsv",
  sep = "\t", col.names = FALSE, row.names = TRUE, quote = FALSE, append = TRUE
)
|
|
681 |
|
|
|
682 |
|
|
|
683 |
#********************************** Lymphoma ********************************** |
|
|
684 |
# Reload the base feature matrix and the annotation table from disk.
matrix <- get(load("Hemap_immunology_fm.Rdata"))
annot <- get(load("Hemap_immunology_Annotations.Rdata"))

# Lymphoma map clusters, restricted to the annotated samples.
clusters <- read.delim("Hemap_Lymphoma_15pct_genes_BHSNE_mean-shift.txt", stringsAsFactors = FALSE, header = TRUE)
clusters <- clusters[clusters$ID %in% annot$GSM.identifier..sample., ]

# Feature matrix columns and annotation rows of the samples on this map.
matrix_sub <- matrix[, colnames(matrix) %in% clusters$ID]
annot_sub <- annot[annot$GSM.identifier..sample. %in% clusters$ID, ]

# This load() is relied on to create `gsva_es` in the workspace; spaces in
# its row names are replaced by underscores.
load("data9544_LYMPHOMA_all_samples_Combined_pathway_drug_signatures_2017_GSVA.Rdata")
rownames(gsva_es) <- gsub(" ", "_", rownames(gsva_es))

# Put the GSVA columns in the column order of the subset feature matrix.
gsva_col_idx <- match(colnames(matrix_sub), colnames(gsva_es))
gsva_es <- data.frame(gsva_es[, gsva_col_idx])
colnames(gsva_es) <- colnames(matrix_sub)

# Cluster column of the lymphoma map, one value per row of `clusters`.
clusters_cancermap <- clusters$X1.5..cluster
|
|
702 |
|
|
|
703 |
# Comparison features for the lymphoma map. All calls use the label
# "cancermap_cluster"; they differ in the first and last argument.
# NOTE(review): `annot_sub` columns are paired with `clusters_cancermap`,
# which presumes both tables list the samples in the same order -- confirm.
cluster_subtypes <- FUN_MAKE_ALL(annot_sub$CLASS, "cancermap_cluster", clusters_cancermap, 0.8)
cluster_cancermap <- FUN_MAKE_ALL(clusters_cancermap, "cancermap_cluster", clusters_cancermap, 0)
cluster_BCL_TCL <- FUN_MAKE_ALL(annot_sub$tbLY, "cancermap_cluster", clusters_cancermap, 0.8)
class_cancermap <- FUN_MAKE_CATEGORICAL(clusters_cancermap, "cancermap_cluster")

# Stack the features, keep the first row of every row name and convert the
# result with data.matrix() before wrapping it in a data frame.
l.comparisons <- list(cluster_subtypes, cluster_cancermap, cluster_BCL_TCL, class_cancermap)
comparisons_cat <- do.call(rbind, l.comparisons)
comparisons_cat <- data.frame(
  data.matrix(comparisons_cat[!duplicated(rownames(comparisons_cat)), ]),
  stringsAsFactors = FALSE
)
|
|
712 |
|
|
|
713 |
|
|
|
714 |
# Combine the subset feature matrix, the GSVA scores and the comparison
# features. rbindlist() binds by column position and its result carries no
# row names, so they are restored from the inputs.
l.fm <- list(matrix_sub, data.frame(gsva_es), comparisons_cat)

library(data.table)
fm <- data.frame(
  rbindlist(l.fm, use.names = FALSE, fill = FALSE),
  stringsAsFactors = FALSE
)
rownames(fm) <- unlist(lapply(l.fm, rownames))

# Drop features (rows) whose values are all NA.
rows_all_na <- apply(fm, 1, function(feature_row) all(is.na(feature_row)))
fm <- fm[!rows_all_na, ]

save(fm, file = "Hemap_LYMPHOMA_immunology_fm.Rdata")

# Header line first ("N:SAMP" + column names), then the rows appended.
fm_header <- t(c("N:SAMP", as.character(colnames(fm))))
write.table(
  fm_header,
  file = "Hemap_LYMPHOMA_immunology_fm.tsv",
  sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE, append = FALSE
)
write.table(
  fm,
  file = "Hemap_LYMPHOMA_immunology_fm.tsv",
  sep = "\t", col.names = FALSE, row.names = TRUE, quote = FALSE, append = TRUE
)
|
|
731 |
|
|
|
732 |
#********************************** AML ********************************** |
|
|
733 |
|
|
|
734 |
# Reload the base feature matrix and the annotation table from disk.
matrix <- get(load("Hemap_immunology_fm.Rdata"))
annot <- get(load("Hemap_immunology_Annotations.Rdata"))

# AML map clusters, restricted to the annotated samples.
clusters <- read.delim("AML_15pct_BHSNE_mean-shift.txt", stringsAsFactors = FALSE, header = TRUE)
clusters <- clusters[clusters$ID %in% annot$GSM.identifier..sample., ]

# Cluster column of the AML map, one value per row of `clusters`.
clusters_cancermap <- clusters$X1.5..cluster

# Feature matrix columns and annotation rows of the samples on this map.
matrix_sub <- matrix[, colnames(matrix) %in% clusters$ID]
annot_sub <- annot[annot$GSM.identifier..sample. %in% clusters$ID, ]

# GSVA input. The loaded object is bound to `gsva_es` explicitly: the
# statements below work on `gsva_es`, and relying on the name stored inside
# the .Rdata file would silently reuse the `gsva_es` left over from the
# previous section if the stored object had a different name.
gsva <- get(load("data9544_AML_all_samples_Combined_pathway_drug_signatures_2017_GSVA.Rdata"))
gsva_es <- gsva

# Put the GSVA columns in the column order of the subset feature matrix.
gsva_col_idx <- match(colnames(matrix_sub), colnames(gsva_es))
gsva_es <- data.frame(gsva_es[, gsva_col_idx])
colnames(gsva_es) <- colnames(matrix_sub)
|
|
752 |
|
|
|
753 |
# TCGA clusters: label every AML sample with the TCGA cluster whose rows in
# `cluster_mapping` (column 2) contain the sample's cancermap cluster.
cluster_mapping <- read.delim("Table_TCGA_cluster_AML_cluster_assignment.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t")

# "NA" (a string) marks samples without a TCGA cluster.
annot_sub$TCGA_cluster <- rep("NA", dim(annot_sub)[1])

# Rows of `cluster_mapping` that make up TCGA clusters 1..7. The labels are
# assigned in order, so a later group overwrites an earlier one on overlap.
# NOTE(review): the mask follows the row order of `clusters` but is applied
# to `annot_sub`; this presumes both list the samples in the same order.
tcga_mapping_rows <- list(1, 2, 3:5, 6:10, 11, 12:15, 16:17)
for (tcga_i in seq_along(tcga_mapping_rows)) {
  in_tcga_cluster <- clusters_cancermap %in% cluster_mapping[tcga_mapping_rows[[tcga_i]], 2]
  annot_sub$TCGA_cluster[in_tcga_cluster] <- paste0("TCGA_AML_cluster_", tcga_i)
}
|
|
765 |
|
|
|
766 |
# Comparison features for the AML map. All calls use the label
# "cancermap_cluster"; they differ in the first, third and last argument.
cluster_TCGA <- FUN_MAKE_ALL(annot_sub$TCGA_cluster, "cancermap_cluster", annot_sub$TCGA_cluster, 0.9)
cluster_cancermap <- FUN_MAKE_ALL(clusters_cancermap, "cancermap_cluster", clusters_cancermap, 0)
cluster_subtypes <- FUN_MAKE_ALL(annot_sub$CLASS, "cancermap_cluster", clusters_cancermap, 0.9)
class_cancermap <- FUN_MAKE_CATEGORICAL(clusters_cancermap, "cancermap_cluster")
class_TCGA <- FUN_MAKE_CATEGORICAL(annot_sub$TCGA_cluster, "cancermap_cluster")

# Stack the features, keep the first row of every row name and convert the
# result with data.matrix() before wrapping it in a data frame.
l.comparisons <- list(
  cluster_TCGA, cluster_cancermap, cluster_subtypes,
  class_cancermap, class_TCGA
)
comparisons_cat <- do.call(rbind, l.comparisons)
comparisons_cat <- data.frame(
  data.matrix(comparisons_cat[!duplicated(rownames(comparisons_cat)), ]),
  stringsAsFactors = FALSE
)
|
|
776 |
|
|
|
777 |
# Combine the subset feature matrix, the GSVA scores and the comparison
# features. rbindlist() binds by column position and its result carries no
# row names, so they are restored from the inputs.
l.fm <- list(matrix_sub, data.frame(gsva_es), comparisons_cat)

library(data.table)
fm <- data.frame(
  rbindlist(l.fm, use.names = FALSE, fill = FALSE),
  stringsAsFactors = FALSE
)
rownames(fm) <- unlist(lapply(l.fm, rownames))

# Drop features (rows) whose values are all NA.
rows_all_na <- apply(fm, 1, function(feature_row) all(is.na(feature_row)))
fm <- fm[!rows_all_na, ]

save(fm, file = "Hemap_AML_immunology_fm.Rdata")

# Header line first ("N:SAMP" + column names), then the rows appended.
fm_header <- t(c("N:SAMP", as.character(colnames(fm))))
write.table(
  fm_header,
  file = "Hemap_AML_immunology_fm.tsv",
  sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE, append = FALSE
)
write.table(
  fm,
  file = "Hemap_AML_immunology_fm.tsv",
  sep = "\t", col.names = FALSE, row.names = TRUE, quote = FALSE, append = TRUE
)
|
|
794 |
|
|
|
795 |
# write.table(all_isolation, file="Hemap_all_isolation.tsv", sep="\t", col.names=F, row.names=F, quote=FALSE, append=F) |