ImmunogenomicLandscape-BC / Git / [8e0848] /preprocessing/Preprocessing_Hemap_featurematrix

Models:
WandaB/
ImmunogenomicLandscape-BC
Downloads: 1
[8e0848]: / preprocessing / Preprocessing_Hemap_featurematrix_generation.R
History
Download this file
796 lines (592 with data), 35.6 kB

#***************************************************************************************************
#******************************* Make immunology FM **********************************************
#***************************************************************************************************
library(mclust)
library(data.table)
library(parallel)

source("/research/users/ppolonen/git_home/common_scripts/featurematrix/functions_generate_fm.R")

# Working directory: all relative file names used below are resolved against
# this data folder.
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/")

# Gene expression matrix, transposed after loading. Later code subsets its
# columns with per-sample masks built from `annot`, so columns are treated as
# samples.
gexp <- t(get(load("data9544_with_gene_symbols.RData")))

# Previous annotation release (`check`) and the current annotation table
# (`annot`). `check` is only used to report which samples are new.
check <- get(load("Hemap_immunology_Annotations_8304.Rdata"))
annot <- read.delim(
  "anno_coord_data9544_15pct_bw2.5_updated.txt",
  stringsAsFactors = FALSE,
  header = TRUE
)

# listing files, not needed anymore
# f=list.files("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/Annotations_immunology/", pattern = ".csv", full.names = T)
# f=f[-1]
# annot_normal=do.call(rbind, lapply(f, read.csv, header=F, stringsAsFactors=F, skip=1))

#******************************** filtering *************************************
# Ex vivo treatments that are tolerated for healthy (NonCancerHealthy) samples.
# "differentiation followed by LPS+IFNg" is listed with and without a leading
# space: the list originally carried only the space-prefixed spelling, which
# looks like a typo and would drop samples annotated without the space.
exvivotreatments_allowed <- c(
  "none", "na", "control", "activation", "differentiation",
  "differentiation followed by activation",
  " differentiation followed by LPS+IFNg",
  "differentiation followed by LPS+IFNg",
  "differentiation followed by IFN",
  "differentiation followed by inflammatory cytokines",
  "differentiation followed by LPS",
  "differentiation followed by CD40L",
  "differentiation followed by Poly(I:C)",
  "differentiation (IL-4)",
  "differentiation with EPO",
  "anti-IgM", "IL-2", "IL-3", "IL-3+CpG"
)

# One logical mask per exclusion rule (TRUE = sample is removed).
# Healthy samples: keep only untreated in vivo AND an allowed ex vivo treatment.
rm_healthy <- annot$Sample.type %in% c("NonCancerHealthy") &
  !(annot$In.vivo.treatment %in% c("na", "none", "no") &
      annot$Ex.vivo.treatment %in% exvivotreatments_allowed)
# Cancer / proliferative samples: any ex vivo treatment other than "none".
rm_cancer_prolif <- annot$Sample.type %in% c("Cancer", "Prolif") &
  !(annot$Ex.vivo.treatment == "none")
# Cell lines: ex vivo treated.
rm_celline <- annot$Sample.type %in% c("CellLine") &
  !annot$Ex.vivo.treatment %in% c("none", "na", "control")
# Explicitly listed experiment / samples.
rm_treatments <- annot$GSE.identifier..experiment. %in% c("GSE26661") |
  annot$GSM.identifier..sample. %in% c("GSM425497", "GSM425499")
# Anything that is not one of the four supported sample types.
rm_noncancer <- !annot$Sample.type %in% c("Cancer", "Prolif", "CellLine", "NonCancerHealthy")
# Myeloma samples that are not CD138+ plasma cell isolates.
rm_mm_pbmc <- annot$colorClass == "MM" &
  !(annot$Sample.isolation == "CD138+ plasma cells")

# cluster exclusion
# AML,GSE7538,24
# CLL,GSE18866,GSE9250,32
# LP,GSE12453,GSE7345,6
# MM,GSE24147,GSE24522,18
# MP,GSE12079,GSE15811,13
# TCL,GSE14879,16 microdissected, do not remove
rm_cluster_differ <- annot$GSE.identifier..experiment. %in%
  c("GSE7538", "GSE9250", "GSE12453", "GSE18866", "GSE24522",
    "GSE7345", "GSE12079", "GSE15811", "GSE24147") &
  annot$Sample.type == "Cancer"

# Union of all exclusion rules, computed once so that the left-out table,
# `annot` and `gexp` are guaranteed to be filtered with the same mask.
rm_any <- rm_noncancer | rm_healthy | rm_cancer_prolif | rm_celline |
  rm_treatments | rm_cluster_differ | rm_mm_pbmc

# Record why each sample was excluded. Later assignments overwrite earlier
# ones when a sample matches several rules, so the order below matters.
annot_left_out <- annot
annot_left_out$reason_removed <- NA_character_
annot_left_out$reason_removed[rm_healthy] <- "Normal Cell sample, ex-vivo or in-vivo treated"
annot_left_out$reason_removed[rm_cluster_differ] <- "Cancer sample, outlier cluster"
annot_left_out$reason_removed[rm_celline] <- "Cell line sample, ex-vivo treated"
annot_left_out$reason_removed[rm_cancer_prolif | rm_treatments] <- "Cancer sample, ex-vivo treated"
annot_left_out$reason_removed[rm_noncancer] <- "NonCancer sample, not healthy"
# This rule previously reused the NonCancer label (copy-paste); it now gets a
# reason that describes the actual rule.
annot_left_out$reason_removed[rm_mm_pbmc] <- "Myeloma sample, not CD138+ plasma cells"

annot_left_out <- annot_left_out[rm_any, ]
fwrite(annot_left_out, "hemap_1072_leftout_reasonremoved.txt", sep = "\t")

# Apply the same mask to annotation rows and expression columns.
annot <- annot[!rm_any, ]
gexp <- gexp[, !rm_any]

# Samples that were not part of the previous annotation release.
newSamples <- annot[!annot[, 1] %in% check[, 1], ]
fwrite(newSamples, "new_samples_in_hemap_8472.txt", sep = "\t")

# Number of previously annotated samples that are still retained.
dim(check[check[, 1] %in% annot[, 1], ])


#****************************************************************************************
# annot[rm_healthy&annot$GSM.identifier..sample.%in%annot_normal[,1],] # excluded from previous normals

# new
# Simplify CLASS labels of healthy samples by stripping lineage / state
# prefixes and suffixes (inner gsub first, then the outer one).
healthy_idx <- which(annot$Sample.type %in% c("NonCancerHealthy"))
healthy_class <- gsub(
  "NonCancerHealthy_|NonCancerHealthy|na_", "",
  gsub(
    "_StemCell_|Myeloid_|_na_na|1_na_na|2_na_na|Lymphoid_|_G.*.|_na_G0|_na_G1.*.|T-|B-|M1-Differentiating|Erythroid_",
    "",
    annot$CLASS[healthy_idx]
  )
)

# Healthy samples whose cleaned label is just "na" are relabelled as
# germinal-centre lymph node samples. Their positions are remembered BEFORE
# the label is overwritten: the colorClass update below used to test
# CLASS == "na" after the relabelling and therefore never matched anything.
unlabelled <- which(healthy_class == "na")
germinal_idx <- healthy_idx[unlabelled]
healthy_class[unlabelled] <- "LymphNode_GerminalCentre"
annot$CLASS[healthy_idx] <- healthy_class

annot$CLASS[annot$CLASS == "CD4+Tcell"] <- "RestingCD4+Tcell"
annot$CLASS[annot$CLASS == "CD8+Tcell"] <- "RestingCD8+Tcell"

annot$MAINCLASS[healthy_idx] <- "NonCancerHealthy"
annot$colorClass[germinal_idx] <- "Lymphoid"

# old
# annot$CLASS[match(annot_normal[,1], annot$GSM.identifier..sample.)]=annot_normal[,5]
# annot$MAINCLASS[match(annot_normal[,1], annot$GSM.identifier..sample.)]=annot_normal[,3]
# annot$colorClass[match(annot_normal[,1], annot$GSM.identifier..sample.)]=annot_normal[,4]

#*********************************** compute geometric mean ********************************
# Tag every expression row as a numeric gene-expression feature.
rownames(gexp) <- paste0("N:GEXP:", rownames(gexp))

# Cytolytic score: per-sample gm_mean() over the matching granzyme / perforin /
# granulysin rows. Values are mapped through 2^x + 0.01 first and the result
# is put back on a log2 scale.
# NOTE(review): grep() does substring matching here, unlike the exact
# rowname matching used for the HLA scores below.
dat_a <- gexp[grep("GZMA|PRF1|GNLY|GZMH|GZMM", rownames(gexp)), ]
# dat_a=matrix[grep("GZMA|PRF1", rownames(matrix)),]
dat <- 2^dat_a + 0.01
rownames(dat)
gm1 <- log2(t(apply(dat, 2, gm_mean)))
rownames(gm1) <- "CytolyticScore"

# also HLA
hla1_rows <- c("N:GEXP:B2M", "N:GEXP:HLA-A", "N:GEXP:HLA-B", "N:GEXP:HLA-C")
dat_a2 <- gexp[rownames(gexp) %in% hla1_rows, ]

dat2 <- 2^dat_a2 + 0.01
rownames(dat2)
gm2 <- log2(t(apply(dat2, 2, gm_mean)))
rownames(gm2) <- "HLAIScore"

# also HLAII
hla2_rows <- c(
  "N:GEXP:HLA-DMA",
  "N:GEXP:HLA-DMB",
  "N:GEXP:HLA-DPA1",
  "N:GEXP:HLA-DPB1",
  "N:GEXP:HLA-DRA",
  "N:GEXP:HLA-DRB1"
)
dat_a3 <- gexp[rownames(gexp) %in% hla2_rows, ]

dat3 <- 2^dat_a3 + 0.01
rownames(dat3)
gm3 <- log2(t(apply(dat3, 2, gm_mean)))
rownames(gm3) <- "HLAIIScore"

# Discretise each score: z-score >= 1 is "high", <= -1 is "low", everything
# else stays "medium". Each classification is a one-row data frame with one
# column per sample, so the logical index below selects columns.
classification1 <- data.frame(t(rep("medium", length(gm1))), stringsAsFactors = FALSE)
zscore <- as.numeric(scale(t(gm1)))
classification1[zscore >= 1] <- "high"
classification1[zscore <= (-1)] <- "low"
rownames(classification1) <- "CytolyticScore"
colnames(classification1) <- colnames(gexp)

classification2 <- data.frame(t(rep("medium", length(gm2))), stringsAsFactors = FALSE)
zscore <- as.numeric(scale(t(gm2)))
classification2[zscore >= 1] <- "high"
classification2[zscore <= (-1)] <- "low"
rownames(classification2) <- "HLAIScore"
colnames(classification2) <- colnames(gexp)

classification3 <- data.frame(t(rep("medium", length(gm3))), stringsAsFactors = FALSE)
zscore <- as.numeric(scale(t(gm3)))
classification3[zscore >= 1] <- "high"
classification3[zscore <= (-1)] <- "low"
rownames(classification3) <- "HLAIIScore"
colnames(classification3) <- colnames(gexp)

# Samples in rows, one column per score class.
classification <- data.frame(
  t(rbind(classification1, classification2, classification3)),
  stringsAsFactors = FALSE
)


# Continuous scores and their classes as feature-matrix rows (via the sourced
# make.features() helper).
immunoscores <- as.data.frame(t(rbind(gm1, gm2, gm3)))
immunoscoresfm <- make.features(immunoscores, datatype = "SAMP", prefix = "")
colnames(immunoscoresfm) <- colnames(gexp)

immunoscores_class_fm <- make.features(classification, datatype = "SAMP", prefix = "")
colnames(immunoscores_class_fm) <- colnames(gexp)

# excluding categorical here, they slow everything down!
# NOTE(review): immunoscores_class_fm is still part of the list below -
# confirm whether that matches the comment above.
l.data_list <- list(gexp, immunoscoresfm, immunoscores_class_fm)
data_list <- data.frame(do.call(rbind, l.data_list))

# ******************************** Infer cell fractions ***********************************

# cibersort
# Estimates are read with samples in rows and transposed to features x samples.
results <- read.delim("CIBERSORT-Results.txt", row.names = 1, header = TRUE, stringsAsFactors = FALSE)
colnames(results) <- paste0("N:SAMP:CIBERSORT_", gsub(" |-|\\.", "_", colnames(results)), "")
cibersort <- t(results)

# Align columns to the expression matrix by sample name. A plain %in% filter
# keeps the source file's own column order (and drops missing samples), while
# the later row-binding of feature blocks is purely positional, so any
# difference in order would silently assign values to the wrong sample.
# Samples without estimates become all-NA columns.
cibersort_pos <- match(colnames(gexp), colnames(cibersort))
if (anyNA(cibersort_pos)) {
  warning(sum(is.na(cibersort_pos)), " samples have no CIBERSORT estimates; filled with NA", call. = FALSE)
}
cibersort <- cibersort[, cibersort_pos, drop = FALSE]
colnames(cibersort) <- colnames(gexp)

# MCP-counter estimates, aligned to the expression matrix the same way.
MCP <- as.matrix(get(load("MCP_counter_data.Rdata")))
rownames(MCP) <- paste0("N:SAMP:", rownames(MCP), "")
mcp_pos <- match(colnames(gexp), colnames(MCP))
if (anyNA(mcp_pos)) {
  warning(sum(is.na(mcp_pos)), " samples have no MCP-counter estimates; filled with NA", call. = FALSE)
}
MCP <- MCP[, mcp_pos, drop = FALSE]
colnames(MCP) <- colnames(gexp)

l.fractions <- list(cibersort, MCP)
fractions <- data.frame(do.call(rbind, l.fractions), stringsAsFactors = FALSE)

#********************************** Clinical data ******************************

# Collect every survival info table found in the working directory.
files <- list.files(".", pattern = ".info.tsv", full.names = TRUE)

surv_tables <- lapply(files, read.delim, header = TRUE, stringsAsFactors = FALSE)
surv_data <- do.call(rbind, surv_tables)

# Keep rows whose first column is a retained sample and whose third column
# is not missing.
surv_data <- surv_data[surv_data[, 1] %in% colnames(gexp), ]
surv_data <- surv_data[!is.na(surv_data[, 3]), ]

# Columns 2 and 3 become the two survival feature rows, named by sample.
surv_d <- t(surv_data[, 2:3])
colnames(surv_d) <- surv_data[, 1]
rownames(surv_d) <- c("N:CLIN:OS_Time", "B:CLIN:OS_Status")

# Expand to the full sample set in expression-matrix order; samples without
# survival data end up as NA columns.
surv_d <- surv_d[, match(colnames(gexp), colnames(surv_d))]
colnames(surv_d) <- colnames(gexp)

#******************************* tumor percentage ********************************
Sys.setlocale(locale = "C")
# Each row of sorted_samples.txt pairs a sample-isolation value (column 1)
# with a colorClass value (column 2).
sorteds <- read.delim("sorted_samples.txt", stringsAsFactors = FALSE, header = FALSE)

# seq_len() instead of 1:n so that an empty table yields no iterations.
gsm <- unlist(lapply(seq_len(nrow(sorteds)), function(i) {
  annot$GSM.identifier..sample.[
    annot$Sample.isolation %in% sorteds[i, 1] & annot$colorClass %in% sorteds[i, 2]
  ]
}))
add <- annot$GSM.identifier..sample.[grepl("CD303", annot$Sample.isolation)]

# Sorted-cell flag (1 x samples).
sorted <- t(annot$GSM.identifier..sample. %in% c(gsm, add))

rownames(sorted) <- "B:CLIN:CELLS_SORTED"
colnames(sorted) <- colnames(gexp)

# Samples whose notes cite the "Klin. Padiatr." sorting protocol are flagged too.
sorted[, grepl("Padiatr", annot$Additional.notes)] <- 1
# sorted[,annot$Additional.notes=="The leukemic blasts were sorted based on CD41, CD7, CD117, CD33, and CD34 antibodies as previously described (Klin. Padiatr. 217, 126-134)."] = 1

# Strip the free-text prefixes and symbols around the purity value.
tumor_per <- gsub(
  "blast%: |>=|%|blast cell percentage: |t_cell_purity: |>|;|blast count, % of sample, -1=unavailable : ",
  "",
  annot$Purity.Tumor.cell.content
)
tumor_per[tumor_per == "high"] <- 80
# as.numeric() yields NA (with a warning) for entries that are still text;
# those are left untouched by this cap.
tumor_per[as.numeric(tumor_per) > 100] <- 100
# The annotation text itself defines -1 as "unavailable", so it is treated as
# missing. It was previously recoded to 0, i.e. reported as 0 % tumour content.
tumor_per[tumor_per %in% c("-1", "n./a.", "na")] <- NA

# MALT lymphoma clinical table, aligned to the expression matrix columns.
malt <- read.delim("clinical_annotations_MALT.txt", stringsAsFactors = FALSE, header = TRUE)
replace <- malt[match(colnames(gexp), malt$GSMID), ]
from_malt <- grepl("Percentage of tumor", tumor_per)
tumor_per[from_malt] <- replace$X..Tumor[from_malt]

tumor_percentage <- data.matrix(t(as.numeric(tumor_per)))
rownames(tumor_percentage) <- "N:SAMP:BLAST_TUMOR_PERCENTAGE"
colnames(tumor_percentage) <- colnames(gexp)

# T-cell percentages
T_per <- data.matrix(t(as.numeric(replace$X..T.cells)))
rownames(T_per) <- "N:SAMP:TCELL_PERCENTAGE"
colnames(T_per) <- colnames(gexp)

tissue_per <- data.matrix(t(as.numeric(replace$X..Lung)))
rownames(tissue_per) <- "N:SAMP:TISSUE_PERCENTAGE"
colnames(tissue_per) <- colnames(gexp)


#*************************** adding some lymphoma annotation *************************************
# GSE10846 series annotation, restricted to retained samples and then expanded
# to one row per `annot` row (samples without an entry become NA rows).
anno <- read.delim("GSE10846_series_matrix_info_ipi_clean.txt", stringsAsFactors = FALSE, header = TRUE)
anno <- anno[anno[, 1] %in% annot[, 1], ]

anno <- anno[match(annot[, 1], anno[, 1]), ]

# Rows of `annot` that have an entry in the series annotation.
has_anno <- annot[, 1] %in% anno[, 1]

# Overwrite the in vivo treatment with the cleaned chemotherapy field.
annot$In.vivo.treatment[has_anno] <- gsub("*.*: |;", "", anno$chemotherapy[has_anno])
annot$In.vivo.treatment[annot$In.vivo.treatment %in% "NA"] <- NA

# IPI value, NA for samples without an entry.
annot$dlbcl_ipi <- NA
annot$dlbcl_ipi[has_anno] <- anno$ipi[has_anno]

# 0/1 regimen indicators; NA where the treatment itself is missing.
no_treatment_info <- is.na(annot$In.vivo.treatment)
CHOP <- t((annot$In.vivo.treatment %in% "CHOP-Like Regimen") * 1)
RCHOP <- t((annot$In.vivo.treatment %in% "R-CHOP-Like Regimen") * 1)
CHOP[no_treatment_info] <- NA
RCHOP[no_treatment_info] <- NA
rownames(CHOP) <- "B:CLIN:Chemotherapy_CHOP"
rownames(RCHOP) <- "B:CLIN:Chemotherapy_RCHOP"
colnames(CHOP) <- colnames(gexp)
colnames(RCHOP) <- colnames(gexp)

# add cytogenetic information
# Pre-computed cytogenetic indicator vectors (samples in rows). Rows are
# aligned to the expression matrix by sample name; a plain %in% filter would
# keep the file's own order and drop missing samples, which breaks the
# positional row-binding with the other clinical features.
genetics_org <- read.delim("AML_preBALL_cytogenetics_vectors.txt", stringsAsFactors = FALSE, header = TRUE, row.names = 1)
genetics <- genetics_org[match(colnames(gexp), rownames(genetics_org)), , drop = FALSE]
genetics <- t(genetics) * 1
colnames(genetics) <- colnames(gexp)
rownames(genetics) <- paste0("B:CLIN:", "GENETICS_", rownames(genetics))

# Normalise the free-text cytogenetics annotation step by step. The order of
# the substitutions matters.
cytogenetic <- annot$Cytogenetics
cytogenetic[cytogenetic %in% c("na", "n/a", "", " ")] <- NA
cytogenetic[grepl("without|unknown|remainingcytogenetics|no del13q|crlf2 fish: Normal|crlf2 fish: n/a", cytogenetic)] <- NA
# "ormal" also matches "abnormal"/"abNormalities". Those entries are excluded
# here so they are not relabelled as a normal karyotype; otherwise the
# "other abNormalities" -> "other" rule further down could never apply.
mentions_abnormal <- grepl("abnormal", cytogenetic, ignore.case = TRUE)
cytogenetic[grepl("ormal", cytogenetic) & !mentions_abnormal] <- "normal_karyotype"
cytogenetic[grepl("MLL", cytogenetic)] <- "MLL"
cytogenetic <- gsub("complex aberrant karyotype", "complex karyotype", cytogenetic)
cytogenetic <- gsub("hyperdiploid karyotype", "hyperdiploid", cytogenetic)
cytogenetic <- gsub("TAL$", "TAL1", cytogenetic)
cytogenetic <- gsub("remaining cytogenetics|other abNormalities", "other", cytogenetic)
cytogenetic <- gsub("fish:|trisomy 8 |;deletion|; complex karyotype|, complex karyotype|: positive| chromosomal aberrations|/API2-MALT1|/API2-MALT1 negative|deletion *.*: negative|/IGH-MALT1|, plus other| plus other", "", cytogenetic)
cytogenetic <- gsub("trisomy ", "trisomy", cytogenetic)
cytogenetic <- gsub("TEL deleted", "TEL_deleted", cytogenetic)
cytogenetic <- gsub("p13.1", "p13", cytogenetic)
cytogenetic <- gsub(";$", "", cytogenetic)
cytogenetic <- gsub("\\+ ", "+", cytogenetic)
cytogenetic <- gsub("i\\(", "inv(", cytogenetic)
cytogenetic <- gsub("complex karyotype", "complex_karyotype", cytogenetic)
cytogenetic <- gsub("^ ", "", cytogenetic)
annot$cytogenetic_clean <- cytogenetic

# Split the cleaned strings into individual terms. (An earlier
# space-only split was immediately overwritten and has been dropped.)
cytogenetic_terms <- unlist(strsplit(cytogenetic, "; |; |  |, | |/"))
cytogenetic_terms <- gsub(" ", "", cytogenetic_terms)

# Keep terms that occur more than five times.
terms <- table(cytogenetic_terms)
shared <- names(terms)[terms > 5]

# One row per shared term, built by the sourced FIND_LOGICAL helper.
cytogenetics <- do.call(rbind, mclapply(shared, FIND_LOGICAL, cytogenetic, mc.cores = 8))
colnames(cytogenetics) <- colnames(gexp)
#***********************

# age annotations
# Strip labels / units from the free-text age field, convert month values to
# years and replace the coarse categories by fixed numbers.
age <- gsub("*.*: |^ |;| yr| age| years|-.*.$| Years|d 32.8|d 54.8", "", annot$Age)
age[age %in% c("na", "n/a", "", "not available")] <- NA
in_months <- grepl("month|Month", age)
age[in_months] <- signif(
  as.numeric(gsub(" months.*.| months| Months", "", age[in_months])) / 12,
  2
)
age[age == "Adult"] <- 30
age[age == "Children"] <- 5
age[age == "pediatric"] <- 1
age <- t(as.numeric(age))
# Values above 100 are treated as invalid.
age[age > 100] <- NA
rownames(age) <- "N:CLIN:AGE"
colnames(age) <- colnames(gexp)

# gender annotations
# Upper-case, drop "not available" variants, strip labels, then map the
# remaining spellings to "female" / "male".
gender <- toupper(annot$Gender)
gender[gender %in% c("GENDER: NOT AVAILABLE", "GENDER: NA;", "SEX: UNKNOWN;")] <- NA
gender <- gsub(" |;", "", gender)
gender <- gsub("GENDER:|SEX/AGE:|/.*.|SEX:", "", gender)
gender[gender %in% c("F", "FEMALE", "WOMAN")] <- "female"
gender[gender %in% c("M", "MALE", "MAN")] <- "male"
gender <- t(gender)
rownames(gender) <- "C:CLIN:GENDER"
colnames(gender) <- colnames(gexp)

# race annotations
# Collapse the free-text race field into five categories; anything that does
# not match one of them becomes NA. Later rules overwrite earlier ones.
race <- toupper(annot$Race)
race[grepl("AGE", race)] <- NA
race[grepl("OTHER", race)] <- "OTHER"
race[grepl("AFRICAN|RACE: AA;|RACE: B;", race)] <- "AFRICAN"
# NOTE(review): the "RACE: H; " alternative has a trailing space, unlike the
# other "RACE: X;" patterns - confirm this is intended.
race[grepl("HISPANIC|RACE: H; ", race)] <- "HISPANIC"
race[grepl("EUROPEAN|CAUCASIAN|ANGLO-AMERICAN|WHITE|RACE: W;|RACE: C;", race)] <- "EUROPEAN"
race[grepl("ASIAN", race)] <- "ASIAN"
race[!grepl("ASIAN|AFRICAN|HISPANIC|EUROPEAN|OTHER", race)] <- NA

race <- t(race)
rownames(race) <- "C:CLIN:RACE"
colnames(race) <- colnames(gexp)

#*************************** adding some myeloma annotation *************************************
# Two myeloma clinical tables, read with read.delim2.
annomm <- read.delim2("GSE24080_MM_clininfo_GSMid_clean.txt", stringsAsFactors = FALSE, header = TRUE)
annomm2 <- read.delim2("GSE19784_MM_survival_GSMid_iss_clean.txt", stringsAsFactors = FALSE, header = TRUE)

# b2m: "<0.5" is taken as 0.5; decimal commas are converted before parsing.
annomm$b2m <- gsub("<0.5", "0.5", annomm$b2m)
annomm$b2m <- as.numeric(gsub(",", ".", annomm$b2m))

annomm$aspc <- as.numeric(gsub(",", ".", annomm$aspc))
annomm$bmpc <- as.numeric(gsub(",", ".", annomm$bmpc))

# first combine the two:
library(data.table)
combmm <- data.frame(rbindlist(list(annomm, annomm2), fill = TRUE), stringsAsFactors = FALSE)

# One row per expression-matrix column; samples without an entry are NA rows.
combmm <- combmm[match(colnames(gexp), combmm$accession), ]

# now add these vectors to annot table
# Samples covered by the myeloma tables get their age / sex / race from there.
in_mm <- colnames(gexp) %in% combmm$accession
age[in_mm] <- signif(combmm$age[in_mm], 3)
gender[in_mm] <- combmm$sex[in_mm]

combmm$race[combmm$race %in% "other"] <- "OTHER"
combmm$race[combmm$race %in% "white"] <- "EUROPEAN"
race[in_mm] <- combmm$race[in_mm]

# time and status:
surv_d[1, in_mm] <- signif(combmm$os_time[in_mm], 3)
surv_d[2, in_mm] <- signif(combmm$os_censor[in_mm], 3)

# pfs time and status
pfs <- t(cbind(combmm$pfs_time, combmm$pfs_censor))
colnames(pfs) <- colnames(gexp)
rownames(pfs) <- c("N:CLIN:PFS_Time", "B:CLIN:PFS_Status")

# other myeloma annotations:
# Columns are picked by position (4 and 8).
otherMM_C <- t(combmm[, c(4, 8)])
rownames(otherMM_C) <- paste0("C:CLIN:MM_", toupper(rownames(otherMM_C)), "")
colnames(otherMM_C) <- colnames(gexp)

# numeric myeloma:
# Columns are picked by position (9 to 20).
otherMM_N <- t(combmm[, c(9:20)])
rownames(otherMM_N) <- paste0("N:CLIN:MM_", toupper(rownames(otherMM_N)), "")
colnames(otherMM_N) <- colnames(gexp)

add_mm <- t(combmm$cytogenetic_abnormalities)
colnames(add_mm) <- colnames(gexp)
rownames(add_mm) <- c("B:CLIN:MM_CYTOGENETIC_ABNORMALITIES")

# Stack every clinical feature block (features in rows, samples in columns).
l.clin <- list(
  sorted, surv_d, pfs, tumor_percentage, T_per, tissue_per, CHOP, RCHOP,
  gender, age, race, cytogenetics, genetics, add_mm, otherMM_N, otherMM_C
)
clin <- data.frame(do.call(rbind, l.clin), stringsAsFactors = FALSE)

#****************************** make annotation clusters ******************************************
# acute / chronic leukemia grouping derived from colorClass and MAINCLASS.
annot$acute <- rep("other", dim(annot)[1])
annot$acute[annot$colorClass %in% c("AML", "pre-B-ALL", "T-ALL")] <- "acute_leukemias"
annot$acute[annot$colorClass %in% c("CLL", "CML")] <- "chronic_leukemias"
annot$acute[grepl("NonCancer", annot$MAINCLASS)] <- "NonCancer"

# Disease grouping: start from "other" and overwrite with each term found in
# MAINCLASS (later terms win).
annot$disease <- rep("other", dim(annot)[1])
annot$disease[grepl("NonCancer", annot$Sub.maps.available)] <- "NonCancer"

# CLASS2: CLASS with the cell-line subclasses collapsed to four groups.
annot$CLASS2 <- annot$CLASS
annot$CLASS2[grepl("CellLine_Myeloma", annot$CLASS2)] <- "CellLine_Myeloma"
annot$CLASS2[grepl("CellLine_Leukemia", annot$CLASS2)] <- "CellLine_Leukemia"
annot$CLASS2[grepl("CellLine_Lymphoma", annot$CLASS2)] <- "CellLine_Lymphoma"
annot$CLASS2[grepl("CellLine_mix", annot$CLASS2)] <- "CellLine_mix"

findthese <- c(
  "NonCancer", "Cancer_Leukemia", "Cancer_Myeloma", "Cancer_Lymphoma",
  "CellLine_Leukemia", "CellLine_Lymphoma", "CellLine_Myeloma",
  "Prolif_Lymphoproliferative_ALPS", "Prolif_Lymphoproliferative_MPN",
  "Prolif_Myeloproliferative_LCH_LC", "Prolif_Myeloproliferative_MDS"
)

for (f in findthese) {
  annot$disease[grepl(f, annot$MAINCLASS)] <- f
}

# Subclasses: MAINCLASS terms first, then colorClass terms on top of them.
annot$subclasses <- rep("other", dim(annot)[1])
findthese <- c(
  "NonCancer", "Cancer_Myeloma", "Cancer_Lymphoma", "CellLine_Leukemia",
  "CellLine_Lymphoma", "CellLine_Myeloma",
  "Prolif_Lymphoproliferative_ALPS", "Prolif_Lymphoproliferative_MPN",
  "Prolif_Myeloproliferative_LCH_LC", "Prolif_Myeloproliferative_MDS"
)
findthese2 <- c(
  "T-ALL", "pre-B-ALL", "AML", "CML", "CLL", "BCL", "TCL", "B-Lymphoid",
  "T-Lymphoid", "Lymphoid", "Myeloid", "Erythroid", "StemCell"
)

for (f in findthese) {
  annot$subclasses[grepl(f, annot$MAINCLASS)] <- f
}
for (f in findthese2) {
  annot$subclasses[grepl(f, annot$colorClass)] <- f
}
DLBCL <- c("Cancer_Lymphoma_BCL_DLBCL_ABC", "Cancer_Lymphoma_BCL_DLBCL_GCB", "Cancer_Lymphoma_BCL_DLBCL_na")
annot$subclasses[annot$CLASS2 %in% DLBCL] <- "BCL_DLBCL"


annot$CLASS <- gsub("_na|_check|_testicular", "", annot$CLASS)

# lymphoma annotations
# B- and T-cell lymphoma masks, excluding cell lines and non-cancer samples.
# They are computed before the "Cancer_" prefix is stripped from CLASS.
bLY <- (1:nrow(annot) %in% grep("Lymphoma_BCL", annot$CLASS)) &
  (!1:nrow(annot) %in% grep("CellLine", annot$CLASS)) &
  (!1:nrow(annot) %in% grep("NonCancer", annot$CLASS))
tLY <- (1:nrow(annot) %in% grep("Lymphoma_TCL", annot$CLASS)) &
  (!1:nrow(annot) %in% grep("CellLine", annot$CLASS)) &
  (!1:nrow(annot) %in% grep("NonCancer", annot$CLASS))
annot$CLASS <- gsub("Cancer_", "", annot$CLASS)

# tbLY holds the (prefix-stripped) class for lymphoma samples, NA elsewhere.
annot$tbLY <- NA
annot$tbLY[bLY | tLY] <- annot$CLASS[bLY | tLY]

# these are the terms to look for
# table(annot$disease)
# table(annot$colorClass)
# table(annot$acute)
# table(annot$subclasses)
# table(annot$tbLY)

#*******************************************************************************************
#*************************** annotation clusters ********************************************

#****************************** make clusters ******************************************

# make immunological normal annotation vectors
# plotNormals: healthy samples default to "Other", all remaining samples to "".
annot$plotNormals <- "Other"

lv <- !annot$Sample.type %in% "NonCancerHealthy"
annot$plotNormals[lv] <- ""

# Ordered pattern -> label rules matched against CLASS. Rules are applied top
# to bottom, so a later match overwrites an earlier one.
plot_normal_rules <- c(
  "RestingBcell|NaiveBcell|MemoryBcell|BcellActivated" = "B cell",
  "GerminalCentre" = "Germinal centre cell",
  "PlasmaBcell" = "Plasma cell",
  "Tcell|NaturalKillerCell" = "T/NK cell",
  "DendriticCell" = "Dendritic cell",
  "Langerhans" = "Langerhans cell",
  "Eryth|Platelet" = "Erythroid",
  "Monocyte" = "Monocyte",
  "Macrophage" = "Macrophage",
  "Neutrophil" = "Neutrophil",
  "MyeloidProgenitor" = "Myeloid progenitor",
  "HematopoieticStemCell" = "HSC",
  "^Mononuclear" = "PBMC",
  "LymphNode" = "Lymph node"
)
for (pat in names(plot_normal_rules)) {
  lv <- grepl(pat, annot$CLASS)
  annot$plotNormals[lv] <- plot_normal_rules[[pat]]
}

# Groups of normal cell types used by the different plots.
HLAplot_normals <- c("B cell", "Plasma cell", "T/NK cell", "Dendritic cell", "Erythroid", "Monocyte", "Macrophage", "Neutrophil", "Myeloid progenitor", "HSC")
cytolyticplot_normals <- c("PBMC", "Lymph node")
costimplot_normals <- c(HLAplot_normals, "PBMC", "Lymph node", "Langerhans cell", "Germinal centre cell")

# immunoNormals starts from the lineage / tumor-origin category and is
# simplified with ordered rules matched against that same (unchanged) column.
annot$immunoNormals <- annot$Category.specifying.lineage.tumor.origin

# Rules applied BEFORE non-healthy samples are blanked.
# NOTE(review): "CD4+" and "CD8+..." are regular expressions, so "+" is a
# quantifier, not a literal plus sign - confirm that is intended.
immuno_rules_first <- c(
  "CD8|CD8+TcellActivated" = "CD8+Tcell",
  "NaturalKiller" = "NKCell",
  "M2-Macrophage" = "M2-Macrophage",
  "M1-Macrophage" = "M1-Macrophage",
  "DendriticCell" = "DendriticCell",
  "Monocyte" = "Monocyte",
  "CD4+" = "CD4+Tcell"
)
for (pat in names(immuno_rules_first)) {
  lv <- grepl(pat, annot$Category.specifying.lineage.tumor.origin)
  annot$immunoNormals[lv] <- immuno_rules_first[[pat]]
}

# Blank every sample that is not a healthy normal.
lv <- !annot$Sample.type %in% "NonCancerHealthy"
annot$immunoNormals[lv] <- ""

# Rules applied AFTER the blanking step.
# NOTE(review): these can assign labels to non-healthy samples again, because
# they run after the blanking above - confirm this ordering is intended.
immuno_rules_second <- c(
  "Eryth|Platelet" = "Erythroid",
  "CD3" = "Tcell",
  "GerminalCentre" = "GerminalCentreCell",
  "^Tcell$|^TcellActivated$|^TcellResting$" = "Tcell",
  "^ActivatedBcell$|^RestingBcell$|^BcellActivated$" = "Bcell"
)
for (pat in names(immuno_rules_second)) {
  lv <- grepl(pat, annot$Category.specifying.lineage.tumor.origin)
  annot$immunoNormals[lv] <- immuno_rules_second[[pat]]
}

# annotated clusters
# Comparison features for every annotation vector, built by the sourced
# FUN_MAKE_ALL helper.
tbLY <- FUN_MAKE_ALL(annot$tbLY, "annotated_class", annot$tbLY, 0)
subclasses <- FUN_MAKE_ALL(annot$subclasses, "annotated_class", annot$subclasses, 0)
acute_chronic <- FUN_MAKE_ALL(annot$acute, "annotated_class", annot$acute, 0)
colorClass <- FUN_MAKE_ALL(annot$colorClass, "annotated_class", annot$colorClass, 0)
disease <- FUN_MAKE_ALL(annot$disease, "annotated_class", annot$disease, 0)
# NOTE(review): same call as the first tbLY line above; looks redundant.
tbLY <- FUN_MAKE_ALL(annot$tbLY, "annotated_class", annot$tbLY, 0)
fullclass <- FUN_MAKE_ALL(annot$CLASS2, "annotated_class", annot$CLASS2, 0)
immunoclass <- FUN_MAKE_ALL(annot$immunoNormals, "annotated_class_immunoNormals", annot$immunoNormals, 0)

# Stack all comparisons and keep the first occurrence of each row name.
l.comparisons <- list(disease, colorClass, acute_chronic, subclasses, tbLY, fullclass, immunoclass)
comparisons <- do.call(rbind, l.comparisons)
unique_rows <- !duplicated(rownames(comparisons))
comparisons <- data.frame(data.matrix(comparisons[unique_rows, ]), stringsAsFactors = FALSE)
colnames(comparisons) <- colnames(gexp)

# test if all rows are fine, should be >1 values
# Every comparison row has to contain both a 0 and a 1.
A <- apply(comparisons, 1, unique)

B <- unlist(lapply(A, function(d) sum(d %in% c(1, 0)) >= 2))

if (!all(B)) {
  stop("Check comparisons, impossible comparisons made")
}

# categorical feats
class1 <- FUN_MAKE_CATEGORICAL(annot$tbLY, "annotated_class_BCL_TCL")
class2 <- FUN_MAKE_CATEGORICAL(annot$colorClass, "annotated_class_colorclass")
class3 <- FUN_MAKE_CATEGORICAL(annot$disease, "annotated_class_disease")
class4 <- FUN_MAKE_CATEGORICAL(annot$immunoNormals, "annotated_class_immunoNormals")

l.comparisons <- list(class1, class2, class3, class4)
comparisons_cat <- do.call(rbind, l.comparisons)
unique_cat_rows <- !duplicated(rownames(comparisons_cat))
comparisons_cat <- data.frame(comparisons_cat[unique_cat_rows, ], stringsAsFactors = FALSE)

colnames(comparisons_cat) <- colnames(gexp)


#****************************************************************************************************
#******************** Next we start to create features of these individual data types ***************
#****************************************************************************************************

# Stack all feature blocks positionally (columns are NOT matched by name) and
# restore the feature names as row names afterwards.
l.fm <- list(data_list, clin, fractions, comparisons)

fm <- rbindlist(l.fm, use.names = FALSE, fill = FALSE)

fm <- data.frame(fm, stringsAsFactors = FALSE)
rownames(fm) <- unlist(lapply(l.fm, rownames))

matrix <- fm

# also add clinicaldata to annotations
# `clin` is built by row-binding numeric and character feature rows, so all of
# its columns are character. data.matrix() must not be used for the numeric
# part: since R 4.0.0 it converts character columns to factor codes instead
# of parsing the numbers. The values are parsed explicitly instead; entries
# that are not numbers become NA (with a coercion warning).
is_categorical <- grepl("^C:", rownames(clin))
numclin <- t(as.matrix(clin[!is_categorical, , drop = FALSE]))
storage.mode(numclin) <- "double"
numchr <- t(clin[is_categorical, ])
colnames(numclin) <- gsub(".:CLIN:|.:GEXP:|", "", colnames(numclin))
colnames(numchr) <- gsub(".:CLIN:|.:GEXP:|", "", colnames(numchr))

annot_add <- data.frame(numclin, numchr, stringsAsFactors = FALSE)

# Extended annotation table: original annotations plus clinical features,
# immune scores, score classes and cell fraction estimates (samples in rows).
fractions2 <- t(fractions)
colnames(fractions2) <- gsub("N:SAMP:", "", colnames(fractions2))
annot2 <- data.frame(
  annot, annot_add,
  "CytolyticScore" = as.numeric(gm1),
  "HLAIScore" = as.numeric(gm2),
  "HLAIIScore" = as.numeric(gm3),
  classification, fractions2,
  stringsAsFactors = FALSE
)


# annot
# AML mean-shift clustering, restricted to retained samples.
clusters <- read.delim("AML_15pct_BHSNE_mean-shift.txt", stringsAsFactors = FALSE, header = TRUE)
clusters <- clusters[clusters$ID %in% annot$GSM.identifier..sample., ]

matrix_sub <- matrix[, colnames(matrix) %in% clusters$ID]
annot_sub <- annot[annot$GSM.identifier..sample. %in% clusters$ID, ]

# Put the cluster table into the same sample order as annot_sub. The cluster
# labels below are paired with annot_sub rows by position, so relying on the
# cluster file's own order could attach labels to the wrong samples.
clusters <- clusters[match(annot_sub$GSM.identifier..sample., clusters$ID), ]
clusters_cancermap <- clusters$X1.5..cluster

# TCGA clusters
# Rows of the mapping table (column 2 = cancermap cluster) that make up each
# TCGA AML cluster, in cluster-number order.
cluster_mapping <- read.delim("Table_TCGA_cluster_AML_cluster_assignment.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t")
TCGA_cluster <- rep("NA", dim(annot_sub)[1])

tcga_mapping_rows <- list(1, 2, 3:5, 6:10, 11, 12:15, 16:17)
for (k in seq_along(tcga_mapping_rows)) {
  in_cluster <- clusters_cancermap %in% cluster_mapping[tcga_mapping_rows[[k]], 2]
  TCGA_cluster[in_cluster] <- paste0("TCGA_AML_cluster_", k)
}

# Sample IDs per TCGA cluster; unassigned samples ("NA") are dropped.
n <- lapply(unique(TCGA_cluster), function(i) annot_sub$GSM.identifier..sample.[TCGA_cluster %in% i])
names(n) <- unique(TCGA_cluster)
n <- n[!unique(TCGA_cluster) %in% "NA"]
save(n, file = "Hemap_immunology_TCGA_clusters.Rdata")

save(matrix, file = "Hemap_immunology_fm.Rdata")
save(annot2, file = "Hemap_immunology_Annotations.Rdata")
write.table(annot2, "Hemap_immunology_Annotations.tsv", sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)

# Feature matrix as TSV: a header line first, then the rows appended.
write.table(t(c("N:SAMP", as.character(colnames(matrix)))), file = "Hemap_immunology_fm.tsv", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE, append = FALSE)
write.table(matrix, file = "Hemap_immunology_fm.tsv", sep = "\t", col.names = FALSE, row.names = TRUE, quote = FALSE, append = TRUE)

# make a small fix here to harmonize survival data to months:
load("Hemap_immunology_Annotations.Rdata")

# Overview of which experiments (columns 2 and 4) carry survival times.
unique(cbind(annot2[!is.na(annot2$OS_Time), c(2, 4)]))

# Survival times of these experiments are multiplied by 12.
scale_to_months <- !is.na(annot2$OS_Time) &
  annot2[, 2] %in% c("GSE10846,GSE11318", "GSE10846", "GSE10846,GSE17372", "GSE11877")
annot2$OS_Time[scale_to_months] <- annot2$OS_Time[scale_to_months] * 12

# for myeloma, transform data to 5y survival to compare data sets:
modify <- !is.na(annot2$OS_Time) & annot2[, 2] %in% c("GSE16716,GSE24080")
find <- annot2$OS_Time > 60 & modify

# change status to alive if dead later
# Events after month 60 are censored: status is set to 0 and time capped at 60.
find2 <- annot2$OS_Status == 1 & modify
annot2$OS_Status[find & find2] <- 0
annot2$OS_Time[find] <- 60

save(annot2, file = "Hemap_immunology_Annotations.Rdata")

#****************************************************************************************
# This FM can then be used as a backbone for other FMs. GSVA and clusters must be added
#****************************************************************************************

# Reload the saved backbone feature matrix and the extended annotations.
matrix <- get(load("Hemap_immunology_fm.Rdata"))
annot <- get(load("Hemap_immunology_Annotations.Rdata"))

#********************************** Full map **********************************
clusters <- read.delim("anno_coord_data9544_15pct_bw2.5_updated.txt", stringsAsFactors = FALSE, header = TRUE)
clusters <- clusters[clusters$ID %in% annot$GSM.identifier..sample., ]

#*********************************** GSVA input ****************************
# Three GSVA result sets. The third load() is expected to create `gsva_es`
# directly; all three are stacked and the first occurrence of each row name
# is kept.
gsva <- get(load("data8238_dufva_immunological_genes_updated_2016_GSVA_geneperm_lean_eFDR.Rdata"))
bindea <- get(load("data8238_all_samples_dufva_bindea_2013_geneset_GSVA.Rdata"))
load("data8238_all_samples_Combined_pathway_signatures_210616_GSVA.Rdata")
gsva_es <- rbind(gsva, bindea, gsva_es)
gsva_es <- gsva_es[!duplicated(rownames(gsva_es)), ]

# match cols gsva
gsva_cols <- match(colnames(matrix), colnames(gsva_es))
gsva_es <- data.frame(gsva_es[, gsva_cols])
colnames(gsva_es) <- colnames(matrix)

#**************************** Cancermap clusters ****************************
clusters_cancermap <- clusters$X2.5..cluster
cluster_cancermap <- FUN_MAKE_ALL(clusters_cancermap, "cancermap_cluster", clusters_cancermap, 0)
subclasses <- FUN_MAKE_ALL(annot$subclasses, "cancermap_cluster", clusters_cancermap, 0.8)
acute_chronic <- FUN_MAKE_ALL(annot$acute, "cancermap_cluster", clusters_cancermap, 0.8)
colorClass <- FUN_MAKE_ALL(annot$colorClass, "cancermap_cluster", clusters_cancermap, 0.8)
# NOTE(review): the next two calls pass the annotation itself as third
# argument, whereas the calls above pass clusters_cancermap - confirm intent.
disease <- FUN_MAKE_ALL(annot$disease, "cancermap_cluster", annot$disease, 0.8)
fullclass <- FUN_MAKE_ALL(annot$CLASS2, "cancermap_cluster", annot$CLASS2, 0.8)

class_cancermap <- FUN_MAKE_CATEGORICAL(clusters_cancermap, "cancermap_cluster")

l.comparisons <- list(cluster_cancermap, subclasses, acute_chronic, colorClass, disease, fullclass, class_cancermap)
comparisons_cat <- do.call(rbind, l.comparisons)
keep_rows <- !duplicated(rownames(comparisons_cat))
comparisons_cat <- data.frame(comparisons_cat[keep_rows, ], stringsAsFactors = FALSE)
colnames(comparisons_cat) <- colnames(matrix)

# combine
l.fm <- list(matrix, gsva_es, comparisons_cat)

library(data.table)
fm <- rbindlist(l.fm, use.names = FALSE, fill = FALSE)

fm <- data.frame(fm, stringsAsFactors = FALSE)
rownames(fm) <- unlist(lapply(l.fm, rownames))

# remove rows with few values or NAs
# (only rows that are entirely NA are dropped)
rm <- apply(fm, 1, function(v) all(is.na(v)))
fm <- fm[!rm, ]

save(fm, file = "Hemap_immunology_fm_cancermap.Rdata")

write.table(t(c("N:SAMP", as.character(colnames(fm)))), file = "Hemap_immunology_fm_cancermap.tsv", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE, append = FALSE)
write.table(fm, file = "Hemap_immunology_fm_cancermap.tsv", sep = "\t", col.names = FALSE, row.names = TRUE, quote = FALSE, append = TRUE)


#********************************** Lymphoma **********************************
matrix <- get(load("Hemap_immunology_fm.Rdata"))
annot <- get(load("Hemap_immunology_Annotations.Rdata"))

# annot
# Lymphoma mean-shift clustering, restricted to retained samples.
clusters <- read.delim("Hemap_Lymphoma_15pct_genes_BHSNE_mean-shift.txt", stringsAsFactors = FALSE, header = TRUE)
clusters <- clusters[clusters$ID %in% annot$GSM.identifier..sample., ]

matrix_sub <- matrix[, colnames(matrix) %in% clusters$ID]
annot_sub <- annot[annot$GSM.identifier..sample. %in% clusters$ID, ]

# Put the cluster table into the same sample order as annot_sub. The cluster
# labels are combined with annot_sub columns position by position below, so
# relying on the cluster file's own order could pair labels with the wrong
# samples.
clusters <- clusters[match(annot_sub$GSM.identifier..sample., clusters$ID), ]

# This load() is expected to create `gsva_es`.
load("data9544_LYMPHOMA_all_samples_Combined_pathway_drug_signatures_2017_GSVA.Rdata")
rownames(gsva_es) <- gsub(" ", "_", rownames(gsva_es))

# match cols gsva
gsva_es <- data.frame(gsva_es[, match(colnames(matrix_sub), colnames(gsva_es))])
colnames(gsva_es) <- colnames(matrix_sub)

clusters_cancermap <- clusters$X1.5..cluster

# comparisons
cluster_subtypes <- FUN_MAKE_ALL(annot_sub$CLASS, "cancermap_cluster", clusters_cancermap, 0.8)
cluster_cancermap <- FUN_MAKE_ALL(clusters_cancermap, "cancermap_cluster", clusters_cancermap, 0)
cluster_BCL_TCL <- FUN_MAKE_ALL(annot_sub$tbLY, "cancermap_cluster", clusters_cancermap, 0.8)
class_cancermap <- FUN_MAKE_CATEGORICAL(clusters_cancermap, "cancermap_cluster")

# Stack the comparisons and keep the first occurrence of each row name.
l.comparisons <- list(cluster_subtypes, cluster_cancermap, cluster_BCL_TCL, class_cancermap)
comparisons_cat <- do.call(rbind, l.comparisons)
comparisons_cat <- data.frame(data.matrix(comparisons_cat[!duplicated(rownames(comparisons_cat)), ]), stringsAsFactors = FALSE)


# combine
l.fm <- list(matrix_sub, data.frame(gsva_es), comparisons_cat)

fm <- rbindlist(l.fm, use.names = FALSE, fill = FALSE)

fm <- data.frame(fm, stringsAsFactors = FALSE)
rownames(fm) <- unlist(lapply(l.fm, rownames))

# remove rows with few values or NAs
# (only rows that are entirely NA are dropped)
rm <- apply(fm, 1, function(v) all(is.na(v)))
fm <- fm[!rm, ]

save(fm, file = "Hemap_LYMPHOMA_immunology_fm.Rdata")

write.table(t(c("N:SAMP", as.character(colnames(fm)))), file = "Hemap_LYMPHOMA_immunology_fm.tsv", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE, append = FALSE)
write.table(fm, file = "Hemap_LYMPHOMA_immunology_fm.tsv", sep = "\t", col.names = FALSE, row.names = TRUE, quote = FALSE, append = TRUE)

#********************************** AML **********************************

matrix <- get(load("Hemap_immunology_fm.Rdata"))
annot <- get(load("Hemap_immunology_Annotations.Rdata"))

# annot
# AML mean-shift clustering, restricted to retained samples.
clusters <- read.delim("AML_15pct_BHSNE_mean-shift.txt", stringsAsFactors = FALSE, header = TRUE)
clusters <- clusters[clusters$ID %in% annot$GSM.identifier..sample., ]

matrix_sub <- matrix[, colnames(matrix) %in% clusters$ID]
annot_sub <- annot[annot$GSM.identifier..sample. %in% clusters$ID, ]

# Put the cluster table into the same sample order as annot_sub. The cluster
# labels are paired with annot_sub rows by position below, so relying on the
# cluster file's own order could attach labels to the wrong samples.
clusters <- clusters[match(annot_sub$GSM.identifier..sample., clusters$ID), ]
clusters_cancermap <- clusters$X1.5..cluster

# GSVA input
# The loaded object is bound to `gsva_es` explicitly. Previously it was only
# assigned to `gsva` while the code below read `gsva_es`, which would silently
# reuse the lymphoma GSVA matrix from the previous section whenever the object
# stored in this file is not itself named `gsva_es`.
gsva <- get(load("data9544_AML_all_samples_Combined_pathway_drug_signatures_2017_GSVA.Rdata"))
gsva_es <- gsva

# match cols gsva
gsva_es <- data.frame(gsva_es[, match(colnames(matrix_sub), colnames(gsva_es))])
colnames(gsva_es) <- colnames(matrix_sub)

# TCGA clusters
# Rows of the mapping table (column 2 = cancermap cluster) that make up each
# TCGA AML cluster, in cluster-number order.
cluster_mapping <- read.delim("Table_TCGA_cluster_AML_cluster_assignment.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t")

annot_sub$TCGA_cluster <- rep("NA", dim(annot_sub)[1])

tcga_mapping_rows <- list(1, 2, 3:5, 6:10, 11, 12:15, 16:17)
for (k in seq_along(tcga_mapping_rows)) {
  in_cluster <- clusters_cancermap %in% cluster_mapping[tcga_mapping_rows[[k]], 2]
  annot_sub$TCGA_cluster[in_cluster] <- paste0("TCGA_AML_cluster_", k)
}

# comparisons
cluster_TCGA <- FUN_MAKE_ALL(annot_sub$TCGA_cluster, "cancermap_cluster", annot_sub$TCGA_cluster, 0.9)
cluster_cancermap <- FUN_MAKE_ALL(clusters_cancermap, "cancermap_cluster", clusters_cancermap, 0)
cluster_subtypes <- FUN_MAKE_ALL(annot_sub$CLASS, "cancermap_cluster", clusters_cancermap, 0.9)
class_cancermap <- FUN_MAKE_CATEGORICAL(clusters_cancermap, "cancermap_cluster")
class_TCGA <- FUN_MAKE_CATEGORICAL(annot_sub$TCGA_cluster, "cancermap_cluster")

# Stack the comparisons and keep the first occurrence of each row name.
l.comparisons <- list(cluster_TCGA, cluster_cancermap, cluster_subtypes, class_cancermap, class_TCGA)
comparisons_cat <- do.call(rbind, l.comparisons)
comparisons_cat <- data.frame(data.matrix(comparisons_cat[!duplicated(rownames(comparisons_cat)), ]), stringsAsFactors = FALSE)

# combine
l.fm <- list(matrix_sub, data.frame(gsva_es), comparisons_cat)

fm <- rbindlist(l.fm, use.names = FALSE, fill = FALSE)

fm <- data.frame(fm, stringsAsFactors = FALSE)
rownames(fm) <- unlist(lapply(l.fm, rownames))

# remove rows with few values or NAs
# (only rows that are entirely NA are dropped)
rm <- apply(fm, 1, function(v) all(is.na(v)))
fm <- fm[!rm, ]

save(fm, file = "Hemap_AML_immunology_fm.Rdata")

write.table(t(c("N:SAMP", as.character(colnames(fm)))), file = "Hemap_AML_immunology_fm.tsv", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE, append = FALSE)
write.table(fm, file = "Hemap_AML_immunology_fm.tsv", sep = "\t", col.names = FALSE, row.names = TRUE, quote = FALSE, append = TRUE)

# write.table(all_isolation, file="Hemap_all_isolation.tsv", sep="\t", col.names=F, row.names=F, quote=FALSE, append=F)