# data/data_reading_SEERBOMB.r
1
# ---- Session setup ----
# All relative paths in this script resolve against the current working
# directory of the R session.
options(stringsAsFactors = FALSE)

# list.of.packages <- c("PRROC", "e1071", "randomForest","class", "gmodels", "formula.tools")

# Packages this script expects to be available; any that are missing are
# installed below.
list.of.packages <- c(
  "PRROC", "e1071", "randomForest", "class", "gmodels",
  "SEERaBomb", "miceadds"
)

# system.file(package = ) returns "" when the package is not installed.
# This replaces a lookup through installed.packages(), which reads metadata
# for every installed package and whose help page advises against using it
# to test for a single package.
is_installed <- vapply(
  list.of.packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
new.packages <- list.of.packages[!is_installed]

if (length(new.packages) > 0) {
  install.packages(new.packages)
}
9
10
11
# library("PRROC")
12
# library("e1071")
13
# library("randomForest")
14
# library("class")
15
# library("gmodels")
16
library("SEERaBomb")
17
library("miceadds")
18
19
# ---- Run identifier ----
# How many identifiers to draw.
num_to_return <- 1

# Random tag that is pasted into the output file names further down.
# It is drawn uniformly from 1..N, where N is the current Unix time in whole
# seconds. sample.int() draws straight from that range, so the 1:N sequence
# (over a billion elements) never has to be built, and floor() makes the
# upper bound an explicit whole number instead of relying on `:` to truncate
# the fractional seconds.
exe_num <- sample.int(floor(as.numeric(Sys.time())), num_to_return)
21
22
# Workflow reference (SEERaBomb example script):
# https://rdrr.io/cran/SEERaBomb/src/inst/doc/examples/mkDataBinaries.R

# Field table obtained from the local SEER text-data directory.
df <- getFields(seerHome = "./SEER_1973_2015_CUSTOM_TEXTDATA/")

# Selection returned by pickFields() when no explicit `picks` are given.
rdf <- pickFields(df)
# pickFields(df, picks=c("casenum","reg","race","sex","agedx",  "yrbrth","seqnum","modx","yrdx","histo3",      "ICD9","COD","surv","radiatn","chemo", "tvalue"))
28
29
## Dataset with year of birth

# Fields to extract. Of these, only "tvalue", "yrbrth" and "mvalue" are kept
# in the table that is written out at the end of this section.
rdf_TumorSize_Age <- pickFields(
  df,
  picks = c(
    "casenum", "reg", "race", "sex", "agedx", "yrbrth", "seqnum", "modx",
    "yrdx", "histo3", "ICD9", "COD", "surv", "radiatn", "chemo",
    "tvalue", "mvalue"
  )
)

# outFile is passed explicitly so that the .RData file written here is the
# same file that is loaded by the next statement. Without it mkSEER() writes
# under its default name and the load below looks for a file this call never
# produced.
mkSEER(
  rdf_TumorSize_Age,
  seerHome = "./SEER_1973_2015_CUSTOM_TEXTDATA/",
  outFile = "cancDef_ALL_TumorSize_Metastasis"
)

# Loads the merged data into the object named "data_ALL_TMY".
load.Rdata(
  "./SEER_1973_2015_CUSTOM_TEXTDATA/mrgd/cancDef_ALL_TumorSize_Metastasis.RData",
  "data_ALL_TMY"
)

# Keep the lung cancer records only.
lung_cancer_data <- data_ALL_TMY %>% filter(cancer == "lung")
lung_cancer_dataframe <- as.data.frame(lung_cancer_data)
colnames(lung_cancer_dataframe)

lung_cancer_dataframe_TMY <-
  lung_cancer_dataframe[, c("tvalue", "yrbrth", "mvalue")]

# Removes the NA values
lung_cancer_dataframe_TMY_complete_cases <-
  lung_cancer_dataframe_TMY[complete.cases(lung_cancer_dataframe_TMY), ]

# AJCC 3rd Edition, TNM, and Stage in SEER Data
# https://seer.cancer.gov/seerstat/variables/seer/ajcc-stage/3rd.html

# Let's keep only M0 (code 00) and M1 (code 10)
lung_cancer_dataframe_TMY_noNA_onlyM0M1 <-
  lung_cancer_dataframe_TMY_complete_cases[
    lung_cancer_dataframe_TMY_complete_cases$mvalue %in% c("0", "10"),
  ]

colnames(lung_cancer_dataframe_TMY_noNA_onlyM0M1) <-
  c("TumorSize", "YearOfBirth", "Metastasis")

# NOTE(review): Metastasis is written here with its raw 0/10 coding, while the
# age-at-diagnosis dataset later in this script divides it by 10 to get 0/1.
# Confirm whether this table should be rescaled the same way.
write.table(
  lung_cancer_dataframe_TMY_noNA_onlyM0M1,
  paste0("lung_cancer_dataframe_TMY_noNA_onlyM0M1_time", exe_num, ".csv"),
  col.names = TRUE, row.names = FALSE, sep = ","
)
56
 
57
## Dataset with age at diagnosis
## ("agerec" is among the picked fields; "agedx" is the column kept below)

rdf_TumorSize_AgeB <- pickFields(
  df,
  picks = c(
    "casenum", "reg", "race", "sex", "agedx", "agerec", "seqnum", "modx",
    "yrdx", "histo3", "ICD9", "COD", "surv", "radiatn", "chemo",
    "tvalue", "mvalue"
  )
)

# Write the merged data under the same name that is loaded right after.
mkSEER(
  rdf_TumorSize_AgeB,
  seerHome = "./SEER_1973_2015_CUSTOM_TEXTDATA/",
  outFile = "cancDef_ALL_TumorSize_Age_Metastasis"
)

# Loads the merged data into the object named "data_ALL_TMA".
load.Rdata(
  "./SEER_1973_2015_CUSTOM_TEXTDATA/mrgd/cancDef_ALL_TumorSize_Age_Metastasis.RData",
  "data_ALL_TMA"
)

# Lung cancer records only, as a plain data frame.
lung_cancer_age_data <- data_ALL_TMA %>% filter(cancer == "lung")
lung_cancer_age_dataframe <- as.data.frame(lung_cancer_age_data)
colnames(lung_cancer_age_dataframe)

# Tumor size, age at diagnosis and metastasis code.
lung_cancer_dataframe_TMA <-
  lung_cancer_age_dataframe[, c("tvalue", "agedx", "mvalue")]

# Drop every row that has an NA in any of the three columns.
lung_cancer_dataframe_TMA_complete_cases <-
  lung_cancer_dataframe_TMA[complete.cases(lung_cancer_dataframe_TMA), ]

# AJCC 3rd Edition, TNM, and Stage in SEER Data
# https://seer.cancer.gov/seerstat/variables/seer/ajcc-stage/3rd.html

# Retain only the rows coded M0 (0) or M1 (10).
lung_cancer_dataframe_TMA_noNA_onlyM0M1 <-
  lung_cancer_dataframe_TMA_complete_cases[
    lung_cancer_dataframe_TMA_complete_cases[["mvalue"]] %in% c("0", "10"),
  ]

colnames(lung_cancer_dataframe_TMA_noNA_onlyM0M1) <-
  c("TumorSize", "AgeAtDiagnosis", "Metastasis")

# Rescale the metastasis code from 0/10 to 0/1.
lung_cancer_dataframe_TMA_noNA_onlyM0M1[["Metastasis"]] <-
  lung_cancer_dataframe_TMA_noNA_onlyM0M1[["Metastasis"]] / 10

write.table(
  lung_cancer_dataframe_TMA_noNA_onlyM0M1,
  paste0("lung_cancer_dataframe_TMA_noNA_onlyM0M1_time", exe_num, ".csv"),
  col.names = TRUE, row.names = FALSE, sep = ","
)
85
 
86