[4ff317]: / FL_integration / RNASeq / IDconversion_Annotation.Rmd

Download this file

102 lines (79 with data), 4.5 kB

---
title: "Genesymbols"
output: html_notebook
---

```{r getting genesymbols }
# Annotate the combined FL/normal + DLBCL Salmon TPM matrix with records from
# an Ensembl GTF obtained through AnnotationHub, then write the complete and
# the protein-coding-only annotated tables as tab-delimited files.

# Strongly penalise scientific notation when numbers are printed or written.
options(scipen = 999)
library(GenomicRanges)
library(AnnotationHub)
library(stringr)
library(dplyr)
library(DESeq2)
library(tidyverse)
# Attached last (as in the original load order); provides fwrite().
library(data.table)
# format(Readcount, scientific = FALSE)
# Replace "e_" with "e-" in the input file obtained from Salmon.

DLBCL <- read.table(
  "TSD_EXPORTED_METADATA_SAMPEID/DLBCL-salmon_TPM_Patiendsampleid.txt",
  header = TRUE, sep = "\t", row.names = 1
)
# Drop the K562 and KMH2 columns from the DLBCL matrix.
DLBCL <- dplyr::select(DLBCL, -c("K562", "KMH2"))
view(DLBCL)
FL <- read.table(
  "TSD_EXPORTED_METADATA_SAMPEID/FL_Normal_salmon_TPM_Patiendsampleid.txt",
  header = TRUE, sep = "\t", row.names = 1
)
metaData <- read.csv(
  "TSD_EXPORTED_METADATA_SAMPEID/metadata.txt",
  header = TRUE, sep = "\t"
)
metaData

# cbind() joins the two matrices by row position, not by row name, so a
# different feature order in the two files would silently misalign the values.
if (!identical(rownames(FL), rownames(DLBCL))) {
  warning(
    "Row names of FL and DLBCL differ; cbind() combines rows by position.",
    call. = FALSE
  )
}
tpmcount <- cbind(FL, DLBCL)

view(tpmcount)
# Move the row names into a regular column so they can be matched below.
tpmcount <- rownames_to_column(tpmcount, var = "EnsembleID")
ah <- AnnotationHub()
resdata1 <- tpmcount

## For checking files in the browser:
# d <- display(ah) # choose AH75121
# query(ah, c("ensembl", "gtf3", "human"))
# AH10684 is
# ensembl/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf_0.0.1.RData
gc <- ah[["AH10684"]]
# Original author's note on alternatives: GRCh38.100 version; use AH63757 for
# Ensembl GRCh38.93.gtf version 32, UCSC TxDb.
head(gc)
# Removes the decimal point (version suffix) from the annotation gene IDs:
# EnsembleID <- sapply(strsplit(gc$gene_id, "\\."), function(x) x[1])
# format(resdata1, scientific = FALSE)
tpmcount
EnsembleID <- as.data.frame(resdata1$EnsembleID)
head(EnsembleID)

# Combining Ensembl IDs with annotations:
# combined_annotate <- cbind(gc, EnsembleID)
head(resdata1$EnsembleID)
# For every ID in the expression table, find the position of its first
# occurrence in the annotation's gene_id column (NA when there is none).
overlaps <- match(resdata1$EnsembleID, gc$gene_id)
if (anyNA(overlaps)) {
  warning(
    sum(is.na(overlaps)),
    " EnsembleID value(s) have no match in gc$gene_id.",
    call. = FALSE
  )
}
# overlaps
# Keep one annotation record per expression row, in expression-table order.
# NOTE(review): if `gc` is a GRanges object, NA subscripts are an error here
# and cbind() with a data.frame may need as.data.frame(stored) - confirm.
stored <- gc[overlaps, ]
bedfile <- cbind(stored, resdata1)
stored
head(bedfile)
# make.unique() appends .1, .2, ... to repeated gene names.
genesymbol <- make.unique(as.character(bedfile$gene_name))

# gene_name_unqiue <- as.data.frame(gene_name)

bedfile <- cbind(genesymbol, bedfile)
currentjob <- "fl_normal_dlbcl"
# `sep` belongs to fwrite(), not to paste0(): passing it inside paste0()
# appended a tab to the file name and left the output comma-separated.
fwrite(
  bedfile,
  paste0(currentjob, "_Completefile_with_annotation.bed"),
  sep = "\t"
)

# Keep only the rows whose `source` column is "protein_coding".
protein_coding_bedfile <- bedfile[bedfile$source %in% c("protein_coding"), ]
protein_coding_bedfile

fwrite(
  protein_coding_bedfile,
  paste0(currentjob, "_Proteincoding_with_annotation.bed"),
  sep = "\t"
)
```

```{r proteincoding genes only }
library(tidyverse)
# Write a gene-symbol x sample TPM table restricted to the columns listed
# below (per the original author's note: the follicular lymphoma cohort and
# normal healthy samples), then copy it to the shared input-files folder.
# Requires `protein_coding_bedfile` from the "getting genesymbols" chunk.

currentjob <- "fl_normal"

# Columns to keep: the gene symbol followed by the selected sample columns.
fl_normal_columns <- c(
  "genesymbol",
  "P1_1", "P1_2", "P2_1", "P2_2", "P4_1", "P6_3", "P8_2", "P9_2", "P10_2",
  "P11_2", "P12_1", "P14_1", "P14_2", "P23_1", "P27_1", "P27_2", "P28_1",
  "P29_1", "P29_2", "P30_2", "P31_1", "P31_4", "P33_1", "P34_2", "P35_1",
  "P35_2", "P36_2", "P40_1", "P41_1", "P42_1", "P42_2", "P43_1", "P44_1",
  "P44_2", "P46_1", "P46_3", "P48_1", "P49_1", "P49_2", "P50_1", "P56_1",
  "P56_2", "P75_1", "P11_1", "P9_1", "P3_1", "P3_2", "P46_2", "P6_2",
  "P57_1", "P58_1", "JP6_2", "P10_1", "P21_1", "P21_2", "P23_2", "P23_3",
  "P25_2", "P27_3", "P31_3", "P32_1", "P34_1", "P43_2", "P59_1", "P61_1",
  "P75_2", "P75_3",
  "BC1_Stim", "BC1_Unstim", "BC2_Stim", "BC2_Unstim",
  "BC3_Stim", "BC3_Unstim", "BC4_Stim", "BC4_Unstim",
  "T25ii_GC", "T25ii_Memory", "T25ii_Naive",
  "T31_GC", "T31_Memory", "T31_Naive",
  "T7_GC", "T7_Memory", "T7_Naive",
  "T9_GC", "T9_Memory", "T9_Naive"
)
DE_Coordinate_FL <- protein_coding_bedfile[, fl_normal_columns]

# Build the file name once so the write and the copy use the same path.
# paste0() has no `sep` argument: the previous `sep = "\t"` was pasted onto
# the end of the file name (trailing tab) and the table was written with the
# default separator, so the copy below could not find the file.
tpm_file <- paste0(currentjob, "_proteincoding_genesymbol_TPM.txt")
data.table::fwrite(DE_Coordinate_FL, tpm_file, sep = "\t")
# Manual follow-up (original author's note): remove the lines where the value
# 1 appears in all samples, using a regex in Sublime Text, before downstream
# analysis.

copied <- file.copy(
  tpm_file,
  "/Dropbox (UiO)/LymphomaBiology/FL1/FL1_RNASEQ/0_INPUTFILES/1_SALMON_TPM_matrices/inputfiles/"
)
# file.copy() returns FALSE instead of raising an error, so report failures.
if (!copied) {
  warning("Could not copy ", tpm_file, " to the input-files folder.",
          call. = FALSE)
}

```

```{r Removing BC_Stimulated Samples }
# Build the matrix for GEO submission: read the FL/normal Salmon TPM table,
# drop the BC samples (stimulated and unstimulated) plus P34_1, P35_1 and
# JP6_2 (per the original author's note, low-tumor-purity samples), and write
# the result as CSV.
count_matrix <- read.delim(
  "/UiO Dropbox/Ankush Sharma/LymphomaBiology/FL1/FL1_RNASEQ/0_INPUTFILES/1_SALMON_TPM_matrices/TSD_EXPORTED_METADATA_SAMPLEID/FL_Normal_salmon_TPM_Patiendsampleid.txt",
  header = TRUE, sep = "\t", row.names = 1
)

# List of samples to be removed
samples_to_remove <- c(
  "BC1_Stim", "BC1_Unstim", "BC2_Stim", "BC2_Unstim",
  "BC3_Stim", "BC3_Unstim", "BC4_Stim", "BC4_Unstim",
  "P34_1", "P35_1", "JP6_2"
)

# Report requested samples that are not columns of the matrix (for example a
# misspelt sample ID), as the superseded one_of() helper used to do.
absent_samples <- setdiff(samples_to_remove, colnames(count_matrix))
if (length(absent_samples) > 0) {
  warning(
    "Samples not found in count matrix: ",
    paste(absent_samples, collapse = ", "),
    call. = FALSE
  )
}

# Remove the specified samples from the count matrix. The call is
# namespace-qualified so that this chunk does not depend on dplyr being
# attached, nor on select() not being masked by another package.
count_matrix <- dplyr::select(
  count_matrix,
  !dplyr::any_of(samples_to_remove)
)
write.csv(count_matrix,"/UiO Dropbox//Ankush Sharma/LymphomaBiology/FL1/FL1_RNASEQ/GEOSubmission/CountMatrixFL_Normal_SalmonTPM.csv")

```