Diff of /R/Gene_Dictionary.R [000000] .. [c3b4f8]

Switch to unified view

a b/R/Gene_Dictionary.R
1
# Gene_Dictionary.R
2
3
if (!require(data.table)) {
4
    install.packages("data.table")
5
    library(data.table)
6
}
7
if (!require(biomaRt)) {
8
    BiocInstaller::biocLite("biomaRt")
9
    library(biomaRt)
10
}
11
if (!require(curl)) {
12
    install.packages("curl")
13
    library(curl)
14
}
15
16
17
# ==== Create a dictionary for all genes ====
18
# Load a gene expression file
19
exp_file <- get(load("Data/TCGA/SummarizedExperiments/Exp/TCGA-ACC_GeneExpression.rdata"))
20
all_genes <- rownames(exp_file)
21
# Use an older version of biomaRt to ensure hg19 coordinates are retrieved
22
ensembl_75 = useMart(
23
    biomart = "ENSEMBL_MART_ENSEMBL",
24
    host = "feb2014.archive.ensembl.org",
25
    path = "/biomart/martservice",
26
    dataset = "hsapiens_gene_ensembl"
27
)
28
atts <- listAttributes(ensembl_75, what = "name")
29
atts[grep(pattern = "gene", x = atts, ignore.case = T)]
30
atts[grep(pattern = "exon", x = atts, ignore.case = T)]
31
atts[grep(pattern = "location", x = atts, ignore.case = T)]
32
atts[grep(pattern = "entrez", x = atts, ignore.case = T)]
33
# Retrieve relevant attributes
34
length(all_genes)
35
dict <- biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene",
36
                                      "external_gene_id",
37
                                      "gene_biotype",
38
                                      'chromosome_name', 'start_position',
39
                                      'end_position', 'strand'),
40
                       filters = "ensembl_gene_id", values = all_genes,
41
                       mart = ensembl_75)
42
dict_dt <- as.data.table(dict)
43
nrow(dict_dt)
44
t <- biomaRt::getBM(attributes = c("ensembl_gene_id", "ensembl_exon_id"),
45
                    filters = "ensembl_gene_id", values = all_genes,
46
                    mart = ensembl_75)
47
t <- as.data.table(t)
48
t[, exon_count := nrow(.SD), by = ensembl_gene_id]
49
exon_counts <- unique(t[, c("ensembl_gene_id", "exon_count")])
50
dict_dt <- merge(x = dict_dt, y = exon_counts)
51
fwrite(dict_dt, file = "Data/gene_dictionary.txt", sep = "\t")
52
53
# ==== Find the source of miRNAs ====
54
# Download miRNA data from mirBase
55
download.file(url = "ftp://mirbase.org/pub/mirbase/20/genomes/hsa.gff3",
56
              destfile = "mirbase_data_hsa.gff3")
57
mirbase <- fread("mirbase_data_hsa.gff3")
58
# Extract miRNA names
59
temp <- gsub(pattern = "Derives.*", replacement = "", x = mirbase$V9)
60
temp <- gsub(pattern = ".*Name=", replacement = "", x = temp)
61
temp <- gsub(pattern = ";", replacement = "", x = temp)
62
63
mirbase <-
64
    data.table(
65
        chr = mirbase$V1,
66
        start = mirbase$V4,
67
        end = mirbase$V5,
68
        name = temp,
69
        strand = mirbase$V7
70
    )
71
72
# Create a GRanges object
73
miRNA_gr <- makeGRangesFromDataFrame(
74
    df = mirbase,
75
    keep.extra.columns = F,
76
    seqnames.field = "chr",
77
    start.field = "start",
78
    end.field = "end",
79
    strand.field = "strand"
80
)
81
names(miRNA_gr) <- temp
82
83
# Save
84
saveRDS(miRNA_gr, "Human_miRNA_GRanges.rds")