--- a +++ b/preprocessing/Preprocessing_Hemap_featurematrix_generation.R @@ -0,0 +1,795 @@ +#*************************************************************************************************** +#******************************* Make immunology FM ********************************************** +#*************************************************************************************************** +library(mclust) +library(data.table) +library(parallel) + +source("/research/users/ppolonen/git_home/common_scripts/featurematrix/functions_generate_fm.R") + +# WD +setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/") + +# GEXP +gexp=t(get(load("data9544_with_gene_symbols.RData"))) + +# annotation table +check=get(load("Hemap_immunology_Annotations_8304.Rdata")) +annot=read.delim("anno_coord_data9544_15pct_bw2.5_updated.txt", stringsAsFactors=F, header=T) + +# listing files, not needed anymore +# f=list.files("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/Annotations_immunology/", pattern = ".csv", full.names = T) +# f=f[-1] +# annot_normal=do.call(rbind, lapply(f, read.csv, header=F, stringsAsFactors=F, skip=1)) + +#******************************** filtering ************************************* +exvivotreatments_allowed <- c("none", "na", "control", "activation", "differentiation", "differentiation followed by activation", " differentiation followed by LPS+IFNg", "differentiation followed by IFN", "differentiation followed by inflammatory cytokines", "differentiation followed by LPS", "differentiation followed by CD40L", "differentiation followed by Poly(I:C)", "differentiation (IL-4)", "differentiation with EPO", "anti-IgM", "IL-2", "IL-3", "IL-3+CpG") + +rm_healthy = annot$Sample.type%in%c("NonCancerHealthy")&!(annot$In.vivo.treatment%in%c("na", "none", "no")&annot$Ex.vivo.treatment%in%exvivotreatments_allowed) +rm_cancer_prolif = annot$Sample.type%in%c("Cancer", "Prolif")&!annot$Ex.vivo.treatment=="none" +rm_celline = annot$Sample.type%in%c("CellLine")&!annot$Ex.vivo.treatment%in%c("none", "na", "control") +rm_treatments = annot$GSE.identifier..experiment.%in%c("GSE26661")|annot$GSM.identifier..sample.%in%c("GSM425497", "GSM425499") +rm_noncancer = !annot$Sample.type%in%c("Cancer", "Prolif", "CellLine", "NonCancerHealthy") +rm_mm_pbmc = annot$colorClass=="MM"&!annot$Sample.isolation=="CD138+ plasma cells" +# cluster exclusion +# AML,GSE7538,24 +# CLL,GSE18866,GSE9250,32 +# LP,GSE12453,GSE7345,6 +# MM,GSE24147,GSE24522,18 +# MP,GSE12079,GSE15811,13 +# TCL,GSE14879,16 microdissected, do not remove + +rm_cluster_differ=annot$GSE.identifier..experiment.%in%c("GSE7538", "GSE9250", "GSE12453", "GSE18866","GSE24522", "GSE7345", "GSE12079", "GSE15811", "GSE24147")&annot$Sample.type=="Cancer" + +annot_left_out = annot + +annot_left_out$reason_removed[rm_healthy]="Normal Cell sample, ex-vivo or in-vivo treated" +annot_left_out$reason_removed[rm_cluster_differ]="Cancer sample, outlier cluster" +annot_left_out$reason_removed[rm_celline]="Cell line sample, ex-vivo treated" +annot_left_out$reason_removed[rm_cancer_prolif|rm_treatments]="Cancer sample, ex-vivo treated" +annot_left_out$reason_removed[rm_noncancer]="NonCancer sample, not healthy" +annot_left_out$reason_removed[rm_mm_pbmc]="NonCancer sample, not healthy" + +annot_left_out <- annot_left_out[(rm_noncancer|rm_healthy|rm_cancer_prolif|rm_celline|rm_treatments|rm_cluster_differ|rm_mm_pbmc),] +fwrite(annot_left_out, "hemap_1072_leftout_reasonremoved.txt", sep="\t") + +annot <- annot[!(rm_healthy|rm_cancer_prolif|rm_celline|rm_treatments|rm_cluster_differ|rm_noncancer|rm_mm_pbmc),] +gexp <- gexp[,!(rm_healthy|rm_cancer_prolif|rm_celline|rm_treatments|rm_cluster_differ|rm_noncancer|rm_mm_pbmc)] + +newSamples=annot[!annot[,1]%in%check[,1],] +fwrite(newSamples, "new_samples_in_hemap_8472.txt", sep="\t") + +dim(check[check[,1]%in%annot[,1],]) + + +#**************************************************************************************** +# annot[rm_healthy&annot$GSM.identifier..sample.%in%annot_normal[,1],] # excluded from previous normals + +# new +annot$CLASS[annot$Sample.type%in%c("NonCancerHealthy")]=gsub("NonCancerHealthy_|NonCancerHealthy|na_","", gsub("_StemCell_|Myeloid_|_na_na|1_na_na|2_na_na|Lymphoid_|_G.*.|_na_G0|_na_G1.*.|T-|B-|M1-Differentiating|Erythroid_","", annot$CLASS[annot$Sample.type%in%c("NonCancerHealthy")])) +annot$CLASS[annot$Sample.type%in%c("NonCancerHealthy")][annot$CLASS[annot$Sample.type%in%c("NonCancerHealthy")]=="na"]="LymphNode_GerminalCentre" +annot$CLASS[annot$CLASS=="CD4+Tcell"]="RestingCD4+Tcell" +annot$CLASS[annot$CLASS=="CD8+Tcell"]="RestingCD8+Tcell" + +annot$MAINCLASS[annot$Sample.type%in%c("NonCancerHealthy")]="NonCancerHealthy" +annot$colorClass[annot$Sample.type%in%c("NonCancerHealthy")][annot$CLASS[annot$Sample.type%in%c("NonCancerHealthy")]=="na"]="Lymphoid" + +# old +# annot$CLASS[match(annot_normal[,1], annot$GSM.identifier..sample.)]=annot_normal[,5] +# annot$MAINCLASS[match(annot_normal[,1], annot$GSM.identifier..sample.)]=annot_normal[,3] +# annot$colorClass[match(annot_normal[,1], annot$GSM.identifier..sample.)]=annot_normal[,4] + +#*********************************** compute geometric mean ******************************** +# get certain genes GEXP +rownames(gexp)=paste("N:GEXP:", rownames(gexp), sep="") + +dat_a=gexp[grep("GZMA|PRF1|GNLY|GZMH|GZMM", rownames(gexp)),] +# dat_a=matrix[grep("GZMA|PRF1", rownames(matrix)),] +dat=2^dat_a+0.01 +rownames(dat) +gm1=log2(t(apply(dat, 2, gm_mean))) +rownames(gm1)="CytolyticScore" + +# also HLA +dat_a2=gexp[rownames(gexp)%in%c("N:GEXP:B2M", "N:GEXP:HLA-A", "N:GEXP:HLA-B", "N:GEXP:HLA-C"),] + +dat2=2^dat_a2+0.01 +rownames(dat2) +gm2=log2(t(apply(dat2, 2, gm_mean))) +rownames(gm2)="HLAIScore" + +# also HLAII +dat_a3=gexp[rownames(gexp)%in%c("N:GEXP:HLA-DMA", + "N:GEXP:HLA-DMB", + "N:GEXP:HLA-DPA1", + "N:GEXP:HLA-DPB1", + "N:GEXP:HLA-DRA", + "N:GEXP:HLA-DRB1"),] + +dat3=2^dat_a3+0.01 +rownames(dat3) +gm3=log2(t(apply(dat3, 2, gm_mean))) +rownames(gm3)="HLAIIScore" + +classification1=data.frame(t(rep("medium", length(gm1))), stringsAsFactors = F) +zscore=as.numeric(scale(t(gm1))) +classification1[zscore>=1]="high" +classification1[zscore<=(-1)]="low" +rownames(classification1)="CytolyticScore" +colnames(classification1)=colnames(gexp) + +classification2=data.frame(t(rep("medium", length(gm2))), stringsAsFactors = F) +zscore=as.numeric(scale(t(gm2))) +classification2[zscore>=1]="high" +classification2[zscore<=(-1)]="low" +rownames(classification2)="HLAIScore" +colnames(classification2)=colnames(gexp) + +classification3=data.frame(t(rep("medium", length(gm3))), stringsAsFactors = F) +zscore=as.numeric(scale(t(gm3))) +classification3[zscore>=1]="high" +classification3[zscore<=(-1)]="low" +rownames(classification3)="HLAIIScore" +colnames(classification3)=colnames(gexp) + +classification=data.frame(t(rbind(classification1,classification2,classification3)), stringsAsFactors = F) + + +immunoscores=as.data.frame(t(rbind(gm1, gm2, gm3))) +immunoscoresfm=make.features(immunoscores, datatype="SAMP", prefix="") +colnames(immunoscoresfm)=colnames(gexp) + +immunoscores_class_fm=make.features(classification, datatype="SAMP", prefix="") +colnames(immunoscores_class_fm)=colnames(gexp) + +# excluding categorical here, they slow everything down! +l.data_list=list(gexp, immunoscoresfm, immunoscores_class_fm) +data_list=data.frame(do.call(rbind, l.data_list)) + +# ******************************** Infer cell fractions *********************************** + +# cibersort +results=read.delim("CIBERSORT-Results.txt", row.names = 1, header=T, stringsAsFactors = F) +colnames(results)=paste0("N:SAMP:CIBERSORT_", gsub(" |-|\\.", "_", colnames(results)), "") +cibersort=t(results) +cibersort=cibersort[,colnames(cibersort)%in%colnames(gexp)] + +MCP=get(load("MCP_counter_data.Rdata")) +rownames(MCP)=paste0("N:SAMP:", rownames(MCP), "") +MCP=MCP[,colnames(MCP)%in%colnames(cibersort)] + +l.fractions=list(cibersort, MCP) +fractions=data.frame(do.call(rbind, l.fractions), stringsAsFactors = F) + +#********************************** Clinical data ****************************** + +files=list.files(".", pattern=".info.tsv", full.names = T) + +surv_data=do.call(rbind, lapply(files, read.delim, header=T, stringsAsFactors=F)) +surv_data=surv_data[surv_data[,1]%in%colnames(gexp),] +surv_data=surv_data[!is.na(surv_data[,3]),] + +surv_d=t(surv_data[,2:3]) +colnames(surv_d)=surv_data[,1] +rownames(surv_d)=c("N:CLIN:OS_Time", "B:CLIN:OS_Status") + +surv_d=surv_d[,match(colnames(gexp), colnames(surv_d))] +colnames(surv_d)=colnames(gexp) + +#******************************* tumor percentage ******************************** +Sys.setlocale(locale="C") +sorteds=read.delim("sorted_samples.txt", stringsAsFactors=F, header=F) + +gsm=unlist(lapply(1:dim(sorteds)[1], function(i){ + annot$GSM.identifier..sample.[annot$Sample.isolation%in%sorteds[i,1]&annot$colorClass%in%sorteds[i,2]] +})) +add=annot$GSM.identifier..sample.[grepl("CD303", annot$Sample.isolation)] + +sorted=t(annot$GSM.identifier..sample.%in%c(gsm, add)) + +rownames(sorted)="B:CLIN:CELLS_SORTED" +colnames(sorted)=colnames(gexp) + +sorted[,grepl("Padiatr", annot$Additional.notes)]=1 +# sorted[,annot$Additional.notes=="The leukemic blasts were sorted based on CD41, CD7, CD117, CD33, and CD34 antibodies as previously described (Klin. Padiatr. 217, 126-134)."] = 1 + +tumor_per=gsub("blast%: |>=|%|blast cell percentage: |t_cell_purity: |>|;|blast count, % of sample, -1=unavailable : ","", annot$Purity.Tumor.cell.content) +tumor_per[tumor_per=="high"]=80 +tumor_per[as.numeric(tumor_per)>100]=100 +tumor_per[tumor_per=="-1"]=0 +tumor_per[tumor_per%in%c("n./a.", "na")]=NA + +malt=read.delim("clinical_annotations_MALT.txt", stringsAsFactors=F, header=T) +replace=malt[match(colnames(gexp), malt$GSMID),] +tumor_per[grepl("Percentage of tumor", tumor_per)]=replace$X..Tumor[grepl("Percentage of tumor", tumor_per)] + +tumor_percentage=data.matrix(t(as.numeric(tumor_per))) +rownames(tumor_percentage)="N:SAMP:BLAST_TUMOR_PERCENTAGE" +colnames(tumor_percentage)=colnames(gexp) + +# T-cell percentages +T_per=data.matrix(t(as.numeric(replace$X..T.cells))) +rownames(T_per)="N:SAMP:TCELL_PERCENTAGE" +colnames(T_per)=colnames(gexp) + +tissue_per=data.matrix(t(as.numeric(replace$X..Lung))) +rownames(tissue_per)="N:SAMP:TISSUE_PERCENTAGE" +colnames(tissue_per)=colnames(gexp) + + +#*************************** adding some lymphoma annotation ************************************* +anno = read.delim("GSE10846_series_matrix_info_ipi_clean.txt", stringsAsFactors=F, header=T) +anno=anno[anno[,1]%in%annot[,1],] + +anno=anno[match(annot[,1], anno[,1]),] + +annot$In.vivo.treatment[annot[,1]%in%anno[,1]]=gsub("*.*: |;", "", anno$chemotherapy[annot[,1]%in%anno[,1]]) +annot$In.vivo.treatment[annot$In.vivo.treatment%in%"NA"]=NA + +annot$dlbcl_ipi=NA +annot$dlbcl_ipi[annot[,1]%in%anno[,1]]=anno$ipi[annot[,1]%in%anno[,1]] + +CHOP=t(annot$In.vivo.treatment%in%"CHOP-Like Regimen"*1) +RCHOP=t(annot$In.vivo.treatment%in%"R-CHOP-Like Regimen"*1) +CHOP[is.na(annot$In.vivo.treatment)]=NA +RCHOP[is.na(annot$In.vivo.treatment)]=NA +rownames(CHOP)="B:CLIN:Chemotherapy_CHOP" +rownames(RCHOP)="B:CLIN:Chemotherapy_RCHOP" +colnames(CHOP)=colnames(gexp) +colnames(RCHOP)=colnames(gexp) + +# add cytogenetic information +genetics_org=read.delim("AML_preBALL_cytogenetics_vectors.txt", stringsAsFactors=F, header=T, row.names=1) +genetics=genetics_org[rownames(genetics_org)%in%colnames(gexp),] +genetics=t(genetics)*1 +rownames(genetics)=paste("B:CLIN:", "GENETICS_", rownames(genetics), "", sep="") + +cytogenetic=annot$Cytogenetics +cytogenetic[cytogenetic%in%c("na", "n/a", "", " ")]=NA +cytogenetic[grepl("without|unknown|remainingcytogenetics|no del13q|crlf2 fish: Normal|crlf2 fish: n/a", cytogenetic)]=NA +cytogenetic[grepl("ormal", cytogenetic)]="normal_karyotype" +cytogenetic[grepl("MLL", cytogenetic)]="MLL" +cytogenetic=gsub("complex aberrant karyotype", "complex karyotype", cytogenetic) +cytogenetic=gsub("hyperdiploid karyotype", "hyperdiploid", cytogenetic) +cytogenetic=gsub("TAL$", "TAL1", cytogenetic) +cytogenetic=gsub("remaining cytogenetics|other abNormalities", "other", cytogenetic) +cytogenetic=gsub("fish:|trisomy 8 |;deletion|; complex karyotype|, complex karyotype|: positive| chromosomal aberrations|/API2-MALT1|/API2-MALT1 negative|deletion *.*: negative|/IGH-MALT1|, plus other| plus other", "", cytogenetic) +cytogenetic=gsub("trisomy ", "trisomy", cytogenetic) +cytogenetic=gsub("TEL deleted", "TEL_deleted", cytogenetic) +cytogenetic=gsub("p13.1", "p13", cytogenetic) +cytogenetic=gsub(";$", "", cytogenetic) +cytogenetic=gsub("\\+ ", "+", cytogenetic) +cytogenetic=gsub("i\\(", "inv(", cytogenetic) +cytogenetic=gsub("complex karyotype", "complex_karyotype", cytogenetic) +cytogenetic=gsub("^ ", "", cytogenetic) +annot$cytogenetic_clean=cytogenetic + +cytogenetic_terms=sort(unique(unlist(strsplit(cytogenetic, " ")))) + +cytogenetic_terms=unlist(strsplit(cytogenetic, "; |; | |, | |/")) +cytogenetic_terms=gsub(" ", "", cytogenetic_terms) + +terms=table(cytogenetic_terms) +shared=names(terms)[terms>5] + +cytogenetics=do.call(rbind, mclapply(shared,FIND_LOGICAL, cytogenetic, mc.cores=8)) +colnames(cytogenetics)=colnames(gexp) +#*********************** + +# age annotations +age=gsub("*.*: |^ |;| yr| age| years|-.*.$| Years|d 32.8|d 54.8", "", annot$Age) +age[age%in%c("na", "n/a", "", "not available")]=NA +age[grepl("month|Month", age)]=signif(as.numeric(gsub(" months.*.| months| Months", "", age[grepl("month|Month", age)]))/12, 2) +age[age=="Adult"]=30 +age[age=="Children"]=5 +age[age=="pediatric"]=1 +age=t(as.numeric(age)) +age[age>100]=NA +rownames(age)="N:CLIN:AGE" +colnames(age)=colnames(gexp) + +# gender annotations +gender=toupper(annot$Gender) +gender[gender%in%c("GENDER: NOT AVAILABLE", "GENDER: NA;", "SEX: UNKNOWN;")]=NA +gender=gsub(" |;", "", gender) +gender=gsub("GENDER:|SEX/AGE:|/.*.|SEX:", "", gender) +gender[gender%in%c("F", "FEMALE", "WOMAN")]="female" +gender[gender%in%c("M", "MALE", "MAN")]="male" +gender=t(gender) +rownames(gender)="C:CLIN:GENDER" +colnames(gender)=colnames(gexp) + +# race annotations +race=toupper(annot$Race) +race[grep("AGE", race)]=NA +race[grep("OTHER", race)]="OTHER" +race[grep("AFRICAN|RACE: AA;|RACE: B;", race)]="AFRICAN" +race[grep("HISPANIC|RACE: H; ", race)]="HISPANIC" +race[grepl("EUROPEAN|CAUCASIAN|ANGLO-AMERICAN|WHITE|RACE: W;|RACE: C;", race)]="EUROPEAN" +race[grep("ASIAN", race)]="ASIAN" +race[!grepl("ASIAN|AFRICAN|HISPANIC|EUROPEAN|OTHER", race)]=NA + +race=t(race) +rownames(race)="C:CLIN:RACE" +colnames(race)=colnames(gexp) + +#*************************** adding some myeloma annotation ************************************* +annomm = read.delim2("GSE24080_MM_clininfo_GSMid_clean.txt", stringsAsFactors=F, header=T) +annomm2 = read.delim2("GSE19784_MM_survival_GSMid_iss_clean.txt", stringsAsFactors=F, header=T) + +annomm$b2m=gsub("<0.5", "0.5", annomm$b2m) +annomm$b2m=as.numeric(gsub(",", ".", annomm$b2m)) + +annomm$aspc=as.numeric(gsub(",", ".", annomm$aspc)) +annomm$bmpc=as.numeric(gsub(",", ".", annomm$bmpc)) + +# first combine the two: +library(data.table) +combmm=data.frame(rbindlist(list(annomm, annomm2), fill = TRUE), stringsAsFactors = F) + +combmm=combmm[match(colnames(gexp), combmm$accession),] + +# now add these vectors to annot table +age[colnames(gexp)%in%combmm$accession]=signif(combmm$age[colnames(gexp)%in%combmm$accession], 3) +gender[colnames(gexp)%in%combmm$accession]=combmm$sex[colnames(gexp)%in%combmm$accession] + +combmm$race[combmm$race%in%"other"]="OTHER" +combmm$race[combmm$race%in%"white"]="EUROPEAN" +race[colnames(gexp)%in%combmm$accession]=combmm$race[colnames(gexp)%in%combmm$accession] + +# time and status: +surv_d[1,colnames(gexp)%in%combmm$accession]=signif(combmm$os_time[colnames(gexp)%in%combmm$accession], 3) +surv_d[2,colnames(gexp)%in%combmm$accession]=signif(combmm$os_censor[colnames(gexp)%in%combmm$accession], 3) + +# pfs time and status +pfs=cbind(combmm$pfs_time, combmm$pfs_censor) +pfs=t(pfs) +colnames(pfs)=colnames(gexp) +rownames(pfs)=c("N:CLIN:PFS_Time", "B:CLIN:PFS_Status") + +# other myeloma annotations: +otherMM_C=t(combmm[,c(4,8)]) +rownames(otherMM_C)=paste0("C:CLIN:MM_", toupper(rownames(otherMM_C)), "") +colnames(otherMM_C)=colnames(gexp) + +# numeric myeloma: +otherMM_N=t(combmm[,c(9:20)]) +rownames(otherMM_N)=paste0("N:CLIN:MM_", toupper(rownames(otherMM_N)), "") +colnames(otherMM_N)=colnames(gexp) + +add_mm=t(combmm$cytogenetic_abnormalities) +colnames(add_mm)=colnames(gexp) +rownames(add_mm)=c("B:CLIN:MM_CYTOGENETIC_ABNORMALITIES") + +l.clin=list(sorted, surv_d, pfs, tumor_percentage,T_per,tissue_per, CHOP, RCHOP, gender, age,race, cytogenetics, genetics, add_mm, otherMM_N, otherMM_C) +clin=data.frame(do.call(rbind, l.clin), stringsAsFactors = F) + +#****************************** make annotation clusters ****************************************** +annot$acute=rep("other", dim(annot)[1]) +annot$acute[annot$colorClass=="AML"|annot$colorClass=="pre-B-ALL"|annot$colorClass=="T-ALL"]="acute_leukemias" +annot$acute[annot$colorClass=="CLL"|annot$colorClass=="CML"]="chronic_leukemias" +annot$acute[grepl("NonCancer", annot$MAINCLASS)]="NonCancer" +annot$disease=rep("other", dim(annot)[1]) +annot$disease[grepl("NonCancer", annot$Sub.maps.available)]="NonCancer" +annot$CLASS2=annot$CLASS +annot$CLASS2[grepl("CellLine_Myeloma", annot$CLASS2)]="CellLine_Myeloma" +annot$CLASS2[grepl("CellLine_Leukemia", annot$CLASS2)]="CellLine_Leukemia" +annot$CLASS2[grepl("CellLine_Lymphoma", annot$CLASS2)]="CellLine_Lymphoma" +annot$CLASS2[grepl("CellLine_mix", annot$CLASS2)]="CellLine_mix" + +findthese=c("NonCancer", "Cancer_Leukemia", "Cancer_Myeloma", "Cancer_Lymphoma","CellLine_Leukemia","CellLine_Lymphoma","CellLine_Myeloma","Prolif_Lymphoproliferative_ALPS","Prolif_Lymphoproliferative_MPN", "Prolif_Myeloproliferative_LCH_LC", "Prolif_Myeloproliferative_MDS") + +for(f in findthese){ + annot$disease[grepl(f, annot$MAINCLASS)]=f +} + +annot$subclasses=rep("other", dim(annot)[1]) +findthese=c("NonCancer", "Cancer_Myeloma", "Cancer_Lymphoma", "CellLine_Leukemia", "CellLine_Lymphoma","CellLine_Myeloma","Prolif_Lymphoproliferative_ALPS","Prolif_Lymphoproliferative_MPN", "Prolif_Myeloproliferative_LCH_LC", "Prolif_Myeloproliferative_MDS") +findthese2=c("T-ALL", "pre-B-ALL", "AML","CML","CLL", "BCL", "TCL", "B-Lymphoid", "T-Lymphoid","Lymphoid", "Myeloid", "Erythroid", "StemCell") + +for(f in findthese){ + annot$subclasses[grepl(f, annot$MAINCLASS)]=f +} +for(f in findthese2){ + annot$subclasses[grepl(f, annot$colorClass)]=f +} +DLBCL=c("Cancer_Lymphoma_BCL_DLBCL_ABC", "Cancer_Lymphoma_BCL_DLBCL_GCB", "Cancer_Lymphoma_BCL_DLBCL_na") +annot$subclasses[annot$CLASS2%in%DLBCL]="BCL_DLBCL" + + +annot$CLASS=gsub("_na|_check|_testicular", "",annot$CLASS) + +# lymphoma annotations +bLY=(1:nrow(annot)%in%grep("Lymphoma_BCL",annot$CLASS))&(!1:nrow(annot)%in%grep("CellLine",annot$CLASS))&(!1:nrow(annot)%in%grep("NonCancer",annot$CLASS)) +tLY=(1:nrow(annot)%in%grep("Lymphoma_TCL",annot$CLASS))&(!1:nrow(annot)%in%grep("CellLine",annot$CLASS))&(!1:nrow(annot)%in%grep("NonCancer",annot$CLASS)) +annot$CLASS=gsub("Cancer_", "", annot$CLASS) + +annot$tbLY=NA +annot$tbLY[bLY|tLY]=annot$CLASS[bLY|tLY] + +# these are the terms to look for +# table(annot$disease) +# table(annot$colorClass) +# table(annot$acute) +# table(annot$subclasses) +# table(annot$tbLY) + +#******************************************************************************************* +#*************************** annotation clusters ******************************************** + +#****************************** make clusters ****************************************** + +# make immunological normal annotation vectors +annot$plotNormals = "Other" + +lv=!annot$Sample.type%in%"NonCancerHealthy" +annot$plotNormals[lv]="" + +lv=grepl("RestingBcell|NaiveBcell|MemoryBcell|BcellActivated", annot$CLASS) +annot$plotNormals[lv]="B cell" + +lv=grepl("GerminalCentre", annot$CLASS) +annot$plotNormals[lv]="Germinal centre cell" + +lv=grepl("PlasmaBcell", annot$CLASS) +annot$plotNormals[lv]="Plasma cell" + +lv=grepl("Tcell|NaturalKillerCell", annot$CLASS) +annot$plotNormals[lv]="T/NK cell" + +lv=grepl("DendriticCell", annot$CLASS) +annot$plotNormals[lv]="Dendritic cell" + +lv=grepl("Langerhans", annot$CLASS) +annot$plotNormals[lv]="Langerhans cell" + +lv=grepl("Eryth|Platelet", annot$CLASS) +annot$plotNormals[lv]="Erythroid" + +lv=grepl("Monocyte", annot$CLASS) +annot$plotNormals[lv]="Monocyte" + +lv=grepl("Macrophage", annot$CLASS) +annot$plotNormals[lv]="Macrophage" + +lv=grepl("Neutrophil", annot$CLASS) +annot$plotNormals[lv]="Neutrophil" + +lv=grepl("MyeloidProgenitor", annot$CLASS) +annot$plotNormals[lv]="Myeloid progenitor" + +lv=grepl("HematopoieticStemCell", annot$CLASS) +annot$plotNormals[lv]="HSC" + +lv=grepl("^Mononuclear", annot$CLASS) +annot$plotNormals[lv]="PBMC" + +lv=grepl("LymphNode", annot$CLASS) +annot$plotNormals[lv]="Lymph node" + +HLAplot_normals <- c("B cell", "Plasma cell", "T/NK cell", "Dendritic cell", "Erythroid", "Monocyte", "Macrophage", "Neutrophil", "Myeloid progenitor", "HSC") +cytolyticplot_normals <- c("PBMC", "Lymph node") +costimplot_normals <- c(HLAplot_normals, "PBMC", "Lymph node", "Langerhans cell", "Germinal centre cell") + +annot$immunoNormals=annot$Category.specifying.lineage.tumor.origin + +lv=grepl("CD8|CD8+TcellActivated", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="CD8+Tcell" + +lv=grepl("NaturalKiller", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="NKCell" + +lv=grepl("M2-Macrophage", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="M2-Macrophage" + +lv=grepl("M1-Macrophage", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="M1-Macrophage" + +lv=grepl("DendriticCell", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="DendriticCell" + +lv=grepl("Monocyte", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="Monocyte" + +lv=grepl("CD4+", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="CD4+Tcell" + +lv=!annot$Sample.type%in%"NonCancerHealthy" +annot$immunoNormals[lv]="" + +lv=grepl("Eryth|Platelet", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="Erythroid" + +lv=grepl("CD3", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="Tcell" + +lv=grepl("GerminalCentre", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="GerminalCentreCell" + +lv=grepl("^Tcell$|^TcellActivated$|^TcellResting$", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="Tcell" + +lv=grepl("^ActivatedBcell$|^RestingBcell$|^BcellActivated$", annot$Category.specifying.lineage.tumor.origin) +annot$immunoNormals[lv]="Bcell" + +# annotated clusters +tbLY=FUN_MAKE_ALL(annot$tbLY, "annotated_class", annot$tbLY, 0) +subclasses=FUN_MAKE_ALL(annot$subclasses, "annotated_class", annot$subclasses, 0) +acute_chronic=FUN_MAKE_ALL(annot$acute, "annotated_class", annot$acute, 0) +colorClass=FUN_MAKE_ALL(annot$colorClass, "annotated_class", annot$colorClass, 0) +disease=FUN_MAKE_ALL(annot$disease, "annotated_class", annot$disease, 0) +tbLY=FUN_MAKE_ALL(annot$tbLY, "annotated_class", annot$tbLY, 0) +fullclass=FUN_MAKE_ALL(annot$CLASS2, "annotated_class", annot$CLASS2, 0) +immunoclass=FUN_MAKE_ALL(annot$immunoNormals, "annotated_class_immunoNormals", annot$immunoNormals, 0) + +l.comparisons=list(disease, colorClass, acute_chronic,subclasses, tbLY, fullclass, immunoclass) +comparisons=do.call(rbind, l.comparisons) +comparisons=data.frame(data.matrix(comparisons[!duplicated(rownames(comparisons)),]), stringsAsFactors = F) +colnames(comparisons)=colnames(gexp) + +# test if all rows are fine, should be >1 values +A=apply(comparisons, 1, unique) + +B=unlist(lapply(A, function(d)sum(d%in%c(1,0))>=2)) + +if(!all(B))stop("Check comparisons, impossible comparisons made") + +# categorical feats +class1=FUN_MAKE_CATEGORICAL(annot$tbLY, "annotated_class_BCL_TCL") +class2=FUN_MAKE_CATEGORICAL(annot$colorClass, "annotated_class_colorclass") +class3=FUN_MAKE_CATEGORICAL(annot$disease, "annotated_class_disease") +class4=FUN_MAKE_CATEGORICAL(annot$immunoNormals, "annotated_class_immunoNormals") + +l.comparisons=list(class1, class2, class3, class4) +comparisons_cat=do.call(rbind, l.comparisons) +comparisons_cat=data.frame(comparisons_cat[!duplicated(rownames(comparisons_cat)),], stringsAsFactors = F) + +colnames(comparisons_cat)=colnames(gexp) + + +#**************************************************************************************************** +#******************** Next we start to create features of these individual data types *************** +#**************************************************************************************************** + +l.fm=list(data_list,clin,fractions, comparisons) + +library(data.table) +fm=rbindlist(l.fm, use.names=F, fill=F) + +fm=data.frame(fm, stringsAsFactors=F) +rownames(fm)=unlist(lapply(l.fm, rownames)) + +matrix=fm + +# also add clinicaldata to annotations +numclin=t(data.matrix(clin[!grepl("^C:", rownames(clin)),])) +numchr=t(clin[grepl("^C:", rownames(clin)),]) +colnames(numclin)=gsub(".:CLIN:|.:GEXP:|", "", colnames(numclin)) +colnames(numchr)=gsub(".:CLIN:|.:GEXP:|", "", colnames(numchr)) + +annot_add=data.frame(numclin, numchr, stringsAsFactors = F) + +fractions2=t(fractions) +colnames(fractions2)=gsub("N:SAMP:", "", colnames(fractions2)) +annot2=data.frame(annot, annot_add, "CytolyticScore"=as.numeric(gm1) ,"HLAIScore"=as.numeric(gm2), "HLAIIScore"=as.numeric(gm3),classification,fractions2, stringsAsFactors = F) + + +# annot +clusters=read.delim("AML_15pct_BHSNE_mean-shift.txt", stringsAsFactors=F, header=T) +clusters=clusters[clusters$ID%in%annot$GSM.identifier..sample.,] +clusters_cancermap=clusters$X1.5..cluster + +matrix_sub=matrix[,colnames(matrix)%in%clusters$ID] +annot_sub=annot[annot$GSM.identifier..sample.%in%clusters$ID,] + +# TCGA clusters +cluster_mapping=read.delim("Table_TCGA_cluster_AML_cluster_assignment.txt", header=T, stringsAsFactors=F, sep="\t") +TCGA_cluster=rep("NA", dim(annot_sub)[1]) + +TCGA_cluster[clusters_cancermap%in%cluster_mapping[1,2]]="TCGA_AML_cluster_1" +TCGA_cluster[clusters_cancermap%in%cluster_mapping[2,2]]="TCGA_AML_cluster_2" +TCGA_cluster[clusters_cancermap%in%cluster_mapping[3:5,2]]="TCGA_AML_cluster_3" +TCGA_cluster[clusters_cancermap%in%cluster_mapping[6:10,2]]="TCGA_AML_cluster_4" +TCGA_cluster[clusters_cancermap%in%cluster_mapping[11,2]]="TCGA_AML_cluster_5" +TCGA_cluster[clusters_cancermap%in%cluster_mapping[12:15,2]]="TCGA_AML_cluster_6" +TCGA_cluster[clusters_cancermap%in%cluster_mapping[16:17,2]]="TCGA_AML_cluster_7" + +n=lapply(unique(TCGA_cluster), function(i)annot_sub$GSM.identifier..sample.[TCGA_cluster%in%i]) +names(n)=unique(TCGA_cluster) +n=n[!unique(TCGA_cluster)%in%"NA"] +save(n, file="Hemap_immunology_TCGA_clusters.Rdata") + +save(matrix, file="Hemap_immunology_fm.Rdata") +save(annot2, file="Hemap_immunology_Annotations.Rdata") +write.table(annot2,"Hemap_immunology_Annotations.tsv", sep="\t", col.names=T, row.names=F, quote=FALSE) + +write.table(t(c("N:SAMP", as.character(colnames(matrix)))), file="Hemap_immunology_fm.tsv", sep="\t", col.names=F, row.names=F, quote=FALSE, append=F) +write.table(matrix, file="Hemap_immunology_fm.tsv", sep="\t", col.names=F, row.names=T, quote=FALSE, append=T) + +# make a small fix here to harmonize survival data to months: +load("Hemap_immunology_Annotations.Rdata") + +unique(cbind(annot2[!is.na(annot2$OS_Time), c(2,4)])) +annot2$OS_Time[!is.na(annot2$OS_Time)&annot2[,2]%in%c("GSE10846,GSE11318", "GSE10846", "GSE10846,GSE17372", "GSE11877")]=annot2$OS_Time[!is.na(annot2$OS_Time)&annot2[,2]%in%c("GSE10846,GSE11318", "GSE10846", "GSE10846,GSE17372", "GSE11877")]*12 + +# for myeloma, transform data to 5y survival to compare data sets: +modify=!is.na(annot2$OS_Time)&annot2[,2]%in%c("GSE16716,GSE24080") +find=annot2$OS_Time>60&modify + +# change status to alive if dead later +find2=annot2$OS_Status==1&modify +annot2$OS_Status[find&find2]=0 +annot2$OS_Time[find]=60 + +save(annot2, file="Hemap_immunology_Annotations.Rdata") + +#**************************************************************************************** +# This FM can then be used as a backbone for other FMs. GSVA and clusters must be added +#**************************************************************************************** + +matrix=get(load("Hemap_immunology_fm.Rdata")) +annot=get(load("Hemap_immunology_Annotations.Rdata")) + +#********************************** Full map ********************************** +clusters=read.delim("anno_coord_data9544_15pct_bw2.5_updated.txt", stringsAsFactors=F, header=T) +clusters=clusters[clusters$ID%in%annot$GSM.identifier..sample.,] + +#*********************************** GSVA input **************************** +gsva=get(load("data8238_dufva_immunological_genes_updated_2016_GSVA_geneperm_lean_eFDR.Rdata")) +bindea=get(load("data8238_all_samples_dufva_bindea_2013_geneset_GSVA.Rdata")) +load("data8238_all_samples_Combined_pathway_signatures_210616_GSVA.Rdata") +gsva_es=rbind(gsva, bindea, gsva_es) +gsva_es=gsva_es[!duplicated(rownames(gsva_es)),] + +# match cols gsva +gsva_es=data.frame(gsva_es[,match(colnames(matrix), colnames(gsva_es))]) +colnames(gsva_es)=colnames(matrix) + +#**************************** Cancermap clusters **************************** +clusters_cancermap=clusters$X2.5..cluster +cluster_cancermap=FUN_MAKE_ALL(clusters_cancermap, "cancermap_cluster", clusters_cancermap, 0) +subclasses=FUN_MAKE_ALL(annot$subclasses, "cancermap_cluster", clusters_cancermap, 0.8) +acute_chronic=FUN_MAKE_ALL(annot$acute, "cancermap_cluster", clusters_cancermap, 0.8) +colorClass=FUN_MAKE_ALL(annot$colorClass, "cancermap_cluster", clusters_cancermap, 0.8) +disease=FUN_MAKE_ALL(annot$disease, "cancermap_cluster", annot$disease, 0.8) +fullclass=FUN_MAKE_ALL(annot$CLASS2, "cancermap_cluster", annot$CLASS2, 0.8) + +class_cancermap=FUN_MAKE_CATEGORICAL(clusters_cancermap, "cancermap_cluster") + +l.comparisons=list(cluster_cancermap, subclasses, acute_chronic, colorClass, disease,fullclass, class_cancermap) +comparisons_cat=do.call(rbind, l.comparisons) +comparisons_cat=data.frame(comparisons_cat[!duplicated(rownames(comparisons_cat)),], stringsAsFactors = F) +colnames(comparisons_cat)=colnames(matrix) + +# combine +l.fm=list(matrix, gsva_es, comparisons_cat) + +library(data.table) +fm=rbindlist(l.fm, use.names=F, fill=F) + +fm=data.frame(fm, stringsAsFactors=F) +rownames(fm)=unlist(lapply(l.fm, rownames)) + +# remove rows with few values or NAs +rm=apply(fm, 1, function(v)all(is.na(v))) +fm=fm[!rm,] + +save(fm, file="Hemap_immunology_fm_cancermap.Rdata") + +write.table(t(c("N:SAMP", as.character(colnames(fm)))), file="Hemap_immunology_fm_cancermap.tsv", sep="\t", col.names=F, row.names=F, quote=FALSE, append=F) +write.table(fm, file="Hemap_immunology_fm_cancermap.tsv", sep="\t", col.names=F, row.names=T, quote=FALSE, append=T) + + +#********************************** Lymphoma ********************************** +matrix=get(load("Hemap_immunology_fm.Rdata")) +annot=get(load("Hemap_immunology_Annotations.Rdata")) + +# annot +clusters=read.delim("Hemap_Lymphoma_15pct_genes_BHSNE_mean-shift.txt", stringsAsFactors=F, header=T) +clusters=clusters[clusters$ID%in%annot$GSM.identifier..sample.,] + +matrix_sub=matrix[,colnames(matrix)%in%clusters$ID] +annot_sub=annot[annot$GSM.identifier..sample.%in%clusters$ID,] + +load("data9544_LYMPHOMA_all_samples_Combined_pathway_drug_signatures_2017_GSVA.Rdata") +rownames(gsva_es)=gsub(" ", "_", rownames(gsva_es)) + +# match cols gsva +gsva_es=data.frame(gsva_es[,match(colnames(matrix_sub), colnames(gsva_es))]) +colnames(gsva_es)=colnames(matrix_sub) + +clusters_cancermap=clusters$X1.5..cluster + +# comparisons +cluster_subtypes=FUN_MAKE_ALL(annot_sub$CLASS, "cancermap_cluster", clusters_cancermap, 0.8) +cluster_cancermap=FUN_MAKE_ALL(clusters_cancermap, "cancermap_cluster", clusters_cancermap, 0) +cluster_BCL_TCL=FUN_MAKE_ALL(annot_sub$tbLY, "cancermap_cluster", clusters_cancermap, 0.8) +class_cancermap=FUN_MAKE_CATEGORICAL(clusters_cancermap, "cancermap_cluster") + +l.comparisons=list(cluster_subtypes, cluster_cancermap, cluster_BCL_TCL, class_cancermap) +comparisons_cat=do.call(rbind, l.comparisons) +comparisons_cat=data.frame(data.matrix(comparisons_cat[!duplicated(rownames(comparisons_cat)),]), stringsAsFactors = F) + + +# combine +l.fm=list(matrix_sub, data.frame(gsva_es), comparisons_cat) + +library(data.table) +fm=rbindlist(l.fm, use.names=F, fill=F) + +fm=data.frame(fm, stringsAsFactors=F) +rownames(fm)=unlist(lapply(l.fm, rownames)) + +# remove rows with few values or NAs +rm=apply(fm, 1, function(v)all(is.na(v))) +fm=fm[!rm,] + +save(fm, file="Hemap_LYMPHOMA_immunology_fm.Rdata") + +write.table(t(c("N:SAMP", as.character(colnames(fm)))), file="Hemap_LYMPHOMA_immunology_fm.tsv", sep="\t", col.names=F, row.names=F, quote=FALSE, append=F) +write.table(fm, file="Hemap_LYMPHOMA_immunology_fm.tsv", sep="\t", col.names=F, row.names=T, quote=FALSE, append=T) + +#********************************** AML ********************************** + +matrix=get(load("Hemap_immunology_fm.Rdata")) +annot=get(load("Hemap_immunology_Annotations.Rdata")) + +# annot +clusters=read.delim("AML_15pct_BHSNE_mean-shift.txt", stringsAsFactors=F, header=T) +clusters=clusters[clusters$ID%in%annot$GSM.identifier..sample.,] + +clusters_cancermap=clusters$X1.5..cluster + +matrix_sub=matrix[,colnames(matrix)%in%clusters$ID] +annot_sub=annot[annot$GSM.identifier..sample.%in%clusters$ID,] + +# GSVA input +gsva=get(load("data9544_AML_all_samples_Combined_pathway_drug_signatures_2017_GSVA.Rdata")) + +# match cols gsva +gsva_es=data.frame(gsva_es[,match(colnames(matrix_sub), colnames(gsva_es))]) +colnames(gsva_es)=colnames(matrix_sub) + +# TCGA clusters +cluster_mapping=read.delim("Table_TCGA_cluster_AML_cluster_assignment.txt", header=T, stringsAsFactors=F, sep="\t") + +annot_sub$TCGA_cluster=rep("NA", dim(annot_sub)[1]) + +annot_sub$TCGA_cluster[clusters_cancermap%in%cluster_mapping[1,2]]="TCGA_AML_cluster_1" +annot_sub$TCGA_cluster[clusters_cancermap%in%cluster_mapping[2,2]]="TCGA_AML_cluster_2" +annot_sub$TCGA_cluster[clusters_cancermap%in%cluster_mapping[3:5,2]]="TCGA_AML_cluster_3" +annot_sub$TCGA_cluster[clusters_cancermap%in%cluster_mapping[6:10,2]]="TCGA_AML_cluster_4" +annot_sub$TCGA_cluster[clusters_cancermap%in%cluster_mapping[11,2]]="TCGA_AML_cluster_5" +annot_sub$TCGA_cluster[clusters_cancermap%in%cluster_mapping[12:15,2]]="TCGA_AML_cluster_6" +annot_sub$TCGA_cluster[clusters_cancermap%in%cluster_mapping[16:17,2]]="TCGA_AML_cluster_7" + +# comparisons +cluster_TCGA=FUN_MAKE_ALL(annot_sub$TCGA_cluster, "cancermap_cluster", annot_sub$TCGA_cluster, 0.9) +cluster_cancermap=FUN_MAKE_ALL(clusters_cancermap, "cancermap_cluster", clusters_cancermap, 0) +cluster_subtypes=FUN_MAKE_ALL(annot_sub$CLASS, "cancermap_cluster", clusters_cancermap, 0.9) +class_cancermap=FUN_MAKE_CATEGORICAL(clusters_cancermap, "cancermap_cluster") +class_TCGA=FUN_MAKE_CATEGORICAL(annot_sub$TCGA_cluster, "cancermap_cluster") + +l.comparisons=list(cluster_TCGA, cluster_cancermap, cluster_subtypes, class_cancermap, class_TCGA) +comparisons_cat=do.call(rbind, l.comparisons) +comparisons_cat=data.frame(data.matrix(comparisons_cat[!duplicated(rownames(comparisons_cat)),]), stringsAsFactors = F) + +# combine +l.fm=list(matrix_sub, data.frame(gsva_es), comparisons_cat) + +library(data.table) +fm=rbindlist(l.fm, use.names=F, fill=F) + +fm=data.frame(fm, stringsAsFactors=F) +rownames(fm)=unlist(lapply(l.fm, rownames)) + +# remove rows with few values or NAs +rm=apply(fm, 1, function(v)all(is.na(v))) +fm=fm[!rm,] + +save(fm, file="Hemap_AML_immunology_fm.Rdata") + +write.table(t(c("N:SAMP", as.character(colnames(fm)))), file="Hemap_AML_immunology_fm.tsv", sep="\t", col.names=F, row.names=F, quote=FALSE, append=F) +write.table(fm, file="Hemap_AML_immunology_fm.tsv", sep="\t", col.names=F, row.names=T, quote=FALSE, append=T) + +# write.table(all_isolation, file="Hemap_all_isolation.tsv", sep="\t", col.names=F, row.names=F, quote=FALSE, append=F)