|
a |
|
b/R/Gene_Dictionary.R |
|
|
1 |
# Gene_Dictionary.R |
|
|
2 |
|
|
|
3 |
if (!require(data.table)) { |
|
|
4 |
install.packages("data.table") |
|
|
5 |
library(data.table) |
|
|
6 |
} |
|
|
7 |
if (!require(biomaRt)) { |
|
|
8 |
BiocInstaller::biocLite("biomaRt") |
|
|
9 |
library(biomaRt) |
|
|
10 |
} |
|
|
11 |
if (!require(curl)) { |
|
|
12 |
install.packages("curl") |
|
|
13 |
library(curl) |
|
|
14 |
} |
|
|
15 |
|
|
|
16 |
|
|
|
17 |
# ==== Create a dictionary for all genes ==== |
|
|
18 |
# Load a gene expression file |
|
|
19 |
exp_file <- get(load("Data/TCGA/SummarizedExperiments/Exp/TCGA-ACC_GeneExpression.rdata")) |
|
|
20 |
all_genes <- rownames(exp_file) |
|
|
21 |
# Use an older version of biomaRt to ensure hg19 coordinates are retrieved |
|
|
22 |
ensembl_75 = useMart( |
|
|
23 |
biomart = "ENSEMBL_MART_ENSEMBL", |
|
|
24 |
host = "feb2014.archive.ensembl.org", |
|
|
25 |
path = "/biomart/martservice", |
|
|
26 |
dataset = "hsapiens_gene_ensembl" |
|
|
27 |
) |
|
|
28 |
atts <- listAttributes(ensembl_75, what = "name") |
|
|
29 |
atts[grep(pattern = "gene", x = atts, ignore.case = T)] |
|
|
30 |
atts[grep(pattern = "exon", x = atts, ignore.case = T)] |
|
|
31 |
atts[grep(pattern = "location", x = atts, ignore.case = T)] |
|
|
32 |
atts[grep(pattern = "entrez", x = atts, ignore.case = T)] |
|
|
33 |
# Retrieve relevant attributes |
|
|
34 |
length(all_genes) |
|
|
35 |
dict <- biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene", |
|
|
36 |
"external_gene_id", |
|
|
37 |
"gene_biotype", |
|
|
38 |
'chromosome_name', 'start_position', |
|
|
39 |
'end_position', 'strand'), |
|
|
40 |
filters = "ensembl_gene_id", values = all_genes, |
|
|
41 |
mart = ensembl_75) |
|
|
42 |
dict_dt <- as.data.table(dict) |
|
|
43 |
nrow(dict_dt) |
|
|
44 |
t <- biomaRt::getBM(attributes = c("ensembl_gene_id", "ensembl_exon_id"), |
|
|
45 |
filters = "ensembl_gene_id", values = all_genes, |
|
|
46 |
mart = ensembl_75) |
|
|
47 |
t <- as.data.table(t) |
|
|
48 |
t[, exon_count := nrow(.SD), by = ensembl_gene_id] |
|
|
49 |
exon_counts <- unique(t[, c("ensembl_gene_id", "exon_count")]) |
|
|
50 |
dict_dt <- merge(x = dict_dt, y = exon_counts) |
|
|
51 |
fwrite(dict_dt, file = "Data/gene_dictionary.txt", sep = "\t") |
|
|
52 |
|
|
|
53 |
# ==== Find the source of miRNAs ==== |
|
|
54 |
# Download miRNA data from mirBase |
|
|
55 |
download.file(url = "ftp://mirbase.org/pub/mirbase/20/genomes/hsa.gff3", |
|
|
56 |
destfile = "mirbase_data_hsa.gff3") |
|
|
57 |
mirbase <- fread("mirbase_data_hsa.gff3") |
|
|
58 |
# Extract miRNA names |
|
|
59 |
temp <- gsub(pattern = "Derives.*", replacement = "", x = mirbase$V9) |
|
|
60 |
temp <- gsub(pattern = ".*Name=", replacement = "", x = temp) |
|
|
61 |
temp <- gsub(pattern = ";", replacement = "", x = temp) |
|
|
62 |
|
|
|
63 |
mirbase <- |
|
|
64 |
data.table( |
|
|
65 |
chr = mirbase$V1, |
|
|
66 |
start = mirbase$V4, |
|
|
67 |
end = mirbase$V5, |
|
|
68 |
name = temp, |
|
|
69 |
strand = mirbase$V7 |
|
|
70 |
) |
|
|
71 |
|
|
|
72 |
# Create a GRanges object |
|
|
73 |
miRNA_gr <- makeGRangesFromDataFrame( |
|
|
74 |
df = mirbase, |
|
|
75 |
keep.extra.columns = F, |
|
|
76 |
seqnames.field = "chr", |
|
|
77 |
start.field = "start", |
|
|
78 |
end.field = "end", |
|
|
79 |
strand.field = "strand" |
|
|
80 |
) |
|
|
81 |
names(miRNA_gr) <- temp |
|
|
82 |
|
|
|
83 |
# Save |
|
|
84 |
saveRDS(miRNA_gr, "Human_miRNA_GRanges.rds") |