Software:
Database:
Create directories and copy raw sequencing files:
mkdir 00_rawdata 01_fastp 02_cutadapt 03_rsem
mv *.fastq 00_rawdata
cd 00_rawdata/
ls *_1.fastq.gz > ../filelist
cd ..
Adapter sequence detection:
for i in `cat filelist`
do
j=${i/_1./_2.}
fastp \
--detect_adapter_for_pe \
-i 00_rawdata/$i \
-I 00_rawdata/$j \
-o 01_fastp/$i \
-O 01_fastp/$j \
-z 4 -q 20 -u 40 -n 6
done
Remove adapter:
for i in `cat filelist`
do
j=${i/_1./_2.}
cutadapt \
--discard-trimmed \
-O 18 \
-j 20 \
-b AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \
-B AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT \
-o 02_cutadapt/$i \
-p 02_cutadapt/$j \
00_rawdata/$i \
00_rawdata/$j
done
Install RSEM:
wget https://github.com/deweylab/RSEM/archive/v1.3.3.tar.gz
tar -zxvf v1.3.3.tar.gz
cd rsem
make install prefix=/software/biosoft/rsem/
echo 'PATH=$PATH:/software/biosoft/rsem/bin/' >> ~/.bashrc
source ~/.bashrc
Build index:
rsem-prepare-reference \
-p 20 \
--gtf \
./gencode.v39.annotation.gtf \
--hisat2-hca \
./GRCh38.primary_assembly.genome.fa \
./grch38
Align RNA-seq against human genome and calculate gene expression:
for i in `cat filelist`
do
j=${i/_1./_2.}
k=${i%%_*}
rsem-calculate-expression \
--paired-end \
-p 20 \
--hisat2-hca \
02_cutadapt/$i \
02_cutadapt/$j \
/software/biosoft/rsem/database/grch38/grch38 \
03_rsem/$k
done
The count and FPKM data of all genes in each sample should be present in [SampleID].genes.results
Concatenate the results of each sample to generate the file count.txt
data<-read.table("count.txt",row.names=1,sep="\t",header=T)
group<-read.table("group.txt",sep="\t",header=T) ## optional metadata
dds <- DESeqDataSetFromMatrix(countData = data, colData = group, design= ~ Group)
vst<-assay(varianceStabilizingTransformation(dds))
write.table(vst,"transcriptome.txt",append=FALSE,sep="\t",quote=FALSE)