ImmunogenomicLandscape-BC / Git / [8e0848] /preprocessing/Preprocessing_normalize_hguarray

Models:
WandaB/
ImmunogenomicLandscape-BC
Downloads: 1
[8e0848]: / preprocessing / Preprocessing_normalize_hguarray_GSE98588.R
History
Download this file
183 lines (144 with data), 6.6 kB

##Before running this script create pdata.txt file that has 1st column CEL file names, then other sample source/type description columns

##The results are created using affymetrix annotations and alternive cdf from BrainArray: REFSEQ mapping used!
## alternative cdf files can be obtained from http://brainarray.mbni.med.umich.edu/Brainarray/Database/CustomCDF/CDF_download.asp
## installation of alternative probe mapping packages (requires AnnotationDBI Bioconductor package!) from shell: sudo R CMD INSTALL package_name
# you need 3 packages from brainarray, cdf probe db, download from link if you don´t have them

setwd("/research/groups/sysgen/raw_data/petri/GSE98588_RAW/")
pdata="pdata.txt"

##---------------------------------------------------------

## set these variables to match the normalization you want
useALT_probemap=TRUE #Use alternative probe or not
useGCRMA=F #Use advanced version of GCRMA or not

##---------------------------------------------------------

normalize.hgu.133plus=function(pdata, outputpath=getwd(), useALT_probemap=T, useGCRMA=T){
  ## record time it takes to run
  ptm <- proc.time()
  
  ##REQUIRED packages
  
  require(affy)
  require(gcrma)
  require(affyQCReport)
  require(annaffy)
  
  if(useALT_probemap){
    require(hgu133plus2hsentrezgcdf)
    require(hgu133plus2hsentrezgprobe)
    require(hgu133plus2hsentrezg.db)
  }
  
  require(hgu133plus2cdf)
  require(hgu133plus2.db)
  require(hgu133plus2probe)
  
  if(useGCRMA){
    ##---------------------------------------------------------
    ## PROBE AFFINITY CALCULATION
    
    ## for GC-RMA background correction we need to compute the probe affinity model
    ## check if the files already exist, if not, create them
    
    ## Using Affymetrix probe mapping
    if(!useALT_probemap){
      if(!file.exists("affinity_human133Plus2.RData")){
        affinity.info.human133plus2=compute.affinities("hgu133Plus2")
        save(affinity.info.human133plus2,file="affinity_human133Plus2.RData")
      }
      load("affinity_human133Plus2.RData")
    }
    
    ## Using alternative probe mapping (make sure that these libraries have been installed! cdf, probe and .db packages)
    if(useALT_probemap){
      if(!file.exists("affinity_human133Plus2alt.RData")){
        affinity.info.human133plus2alt=compute.affinities("hgu133Plus2_Hs_ENTREZG")
        save(affinity.info.human133plus2alt,file="affinity_human133Plus2alt.RData")
      }
      load("affinity_human133Plus2alt.RData")
    }
    
    ##---------------------------------------------------------
  }
  ##read in pdata.txt and determine filenames to normalize based on it
  pheno=read.AnnotatedDataFrame(pdata)
  Filenames=rownames(pData(pheno))
  gsms=pheno$Accession
  rownames(pData(pheno))=Filenames
  experiment_ID=pheno$gse[1]
  
  ## AFFYMETRIX PROBE MAPPING
  ## GC-RMA normalization
  if(!useALT_probemap&useGCRMA){
    eset=just.gcrma(filenames=Filenames, phenoData=pheno, normalize=TRUE, type="fullmodel", fast=FALSE, cdfname="hgu133Plus2",affinity.info=affinity.info.human133plus2)
  }
  
  ## RMA normalization
  if(!useALT_probemap&!useGCRMA){
    eset=just.rma(filenames=Filenames, phenoData=pheno, normalize=TRUE, cdfname="hgu133Plus2")
  }
  
  ## ALTERNATIVE PROBE MAPPING
  ## GC-RMA normalization
  if(useALT_probemap&useGCRMA){
    eset=just.gcrma(filenames=Filenames, phenoData=pheno, normalize=TRUE, type="fullmodel", fast=FALSE, cdfname="hgu133Plus2_Hs_ENTREZG",affinity.info=affinity.info.human133plus2alt)
  }
  
  ## RMA normalization
  if(useALT_probemap&!useGCRMA){
    eset=just.rma(filenames=Filenames, phenoData=pheno, normalize=TRUE, cdfname="hgu133Plus2_Hs_ENTREZG")
  }
  
  ##---------------------------------------------------------
  ## PUT TOGETHER A TABLE WITH ANNOTATIONS
  
  ##annotation data
  probeIDs=featureNames(eset)
  
  ## find and remove ctrl probe data
  ctrls=grep("AFFX", probeIDs)
  discard=1:length(probeIDs)%in%ctrls
  probeIDs=probeIDs[!discard]
  # probeIDs=gsub("_at", "", probeIDs)
  eset=eset[!discard,]
  
  entrezIDs=aafLocusLink(probeIDs, "hgu133plus2hsentrezg.db")
  entrezIDv=as.vector(entrezIDs, mode="character")
  
  symbols=aafSymbol(probeIDs, "hgu133plus2.db")
  symbolv=as.vector(symbols, mode="character")
  
  library(org.Hs.eg.db)
  keys=entrezIDv
  
  symbolv <- mapIds(org.Hs.eg.db,
                    keys=keys,
                    column="SYMBOL",
                    keytype="ENTREZID",
                    multiVals="first")
  all(keys==names(symbolv))
  
  rm=is.na(symbolv)
  
  symbolv[is.na(symbolv)]=keys[is.na(symbolv)]
  
  ##put together in a dataframe
  DF=data.frame(exprs(eset),stringsAsFactors=FALSE)
  colnames(DF)=paste(gsub("_.*.", "", gsms), pheno$Title, sep="_")
  
  DF=DF[!(duplicated(symbolv)|symbolv=="integer(0)"),]
  rownames(DF)=symbolv[!(duplicated(symbolv)|symbolv=="integer(0)")]
  ##---------------------------------------------------------
  
  ## variance filter
  gvar=apply(exprs(eset),1,var)
  var_median=median(gvar)
  select=gvar>var_median
  png(paste(experiment_ID, "variance_histogram.png", sep="_"), bg="transparent")
  hist(gvar, main=paste("distribution of gene variance, median : ",var_median,sep=""), xlab="variance")
  dev.off()
  
  gexprs=apply(exprs(eset),1,mean) #mean exprs level for each gene
  exprs_median=median(gexprs)
  png(paste(experiment_ID, "exprs_histogram.png", sep="_"), bg="transparent")
  hist(gexprs, main=paste("distribution of gene expression, median : ",exprs_median,sep=""), xlab="variance")
  dev.off()
  
  png(paste(experiment_ID, "exprs_vs_variance.png", sep="_"), bg="transparent")
  plot(gexprs,gvar, main="exprs level vs variance", xlab="gexprs", ylab="variance")
  dev.off()
  
  ## use variance as filter
  DFfilt=DF[select,]
  ##---------------------------------------------------------
  ## WRITE OUTPUT FILES
  
  if(!useALT_probemap&useGCRMA){
    output=paste(experiment_ID,"affyID_gcrma_normalized.txt", sep="_")
  }
  if(!useALT_probemap&!useGCRMA){
    output=paste(experiment_ID,"affyID_rma_normalized.txt", sep="_")
  }
  if(useALT_probemap&useGCRMA){
    output=paste(experiment_ID,"symbol_gcrma_normalized.txt", sep="_")
  }
  if(useALT_probemap&!useGCRMA){
    output=paste(experiment_ID,"symbol_rma_normalized.txt", sep="_")
  }
  
  save(DF, file=file.path(gsub(".txt", ".Rdata", output)))
  
  write.table(DF,file=output, sep="\t", row.names=T, quote=FALSE)
  write.table(DFfilt, file=paste("filt", output,sep="_"), sep="\t", row.names=T, quote=FALSE)
  ptmf = proc.time() - ptm
  print(ptmf)
}

# run
normalize.hgu.133plus(pdata)