--- a +++ b/common_scripts/statistics/mixtureModel.R @@ -0,0 +1,60 @@ +mixtureM.gene=function(gene, d.matrix, FIX=F){ + library(mclust) + library(parallel) + dat=d.matrix[rownames(d.matrix)%in%gene,,drop=F] + model=Mclust(as.numeric(dat),2) + + # first test if we can find clear components: + class=model$classification + + class[class == 1] = -1 # background + class[class == 2] = 1 # expressed high + + for (j in 1:dim(dat)[2]) { # Adjust the classes. + if (dat[,j] < 4) { + class[j] = -1 + next + } + if (model$uncertainty[j] > 0.1) { + class[j] = 0 + next + } + if (dat[,j] > 10) { + class[j] = 1 + } + } + + # in some cases background distribution does not exist (gene always expressed, -1 missing) + # or over 90% of the samples are uncertain + # in these cases use original model component for expressed - notexpressed + # These cases have only one distribution, try to identify if it is background distribution or highly expressed gene distribution + if(FIX){ + + # this means, that only one distribution was found, not two: + if((sum(class==-1)==0|sum(class==0)>0.9*length(class))&!all(class==1)){ + + # odds, how many samples have expression above/below 6 + odds=sum(dat[class==0]>=6)/sum(class==0) + odds2=sum(dat[class==0]<6)/sum(class==0) + + # 60% of the uncertain have expression above 6 + if(odds>0.6){ + class[model$classification == 2] = 1 + } + + # 60% of the uncertain have expression below 6 + if(odds2>0.6){ + class[model$classification == 1] = -1 + } + } + } + + ret_class=data.matrix(t(class)) + dimnames(ret_class)=dimnames(dat) + return(ret_class) +} + +# data=t(get(load("/research/groups/sysgen/PROJECTS/HEMAP/HEMAP/dat2figs/data/data9544_with_gene_symbols.RData"))) +# d.matrix=data +# profile=do.call(rbind, mclapply(rownames(data), mixtureM.gene, data, FIX=T, mc.cores=8)) +# save(profile, file="mixtureM_profile.Rdata")