# data/data_reading_SEERBOMB.r
# Session setup: install any missing packages, attach the two that are used,
# and draw a random tag for this run.
#
# All paths in this script are relative to the current working directory, so
# run it from the directory that contains "SEER_1973_2015_CUSTOM_TEXTDATA/".
# (The former `setwd(".")` was a no-op and has been dropped.)
options(stringsAsFactors = FALSE)

# list.of.packages <- c("PRROC", "e1071", "randomForest","class", "gmodels", "formula.tools")

# Packages that must be installed before the script continues. Only SEERaBomb
# and miceadds are attached below; the others are installed but not loaded.
list.of.packages <- c(
  "PRROC", "e1071", "randomForest", "class", "gmodels", "SEERaBomb", "miceadds"
)
new.packages <- setdiff(list.of.packages, installed.packages()[, "Package"])
if (length(new.packages) > 0) {
  install.packages(new.packages)
}

# library("PRROC")
# library("e1071")
# library("randomForest")
# library("class")
# library("gmodels")
library("SEERaBomb")
library("miceadds")

# Random tag for this execution; it is appended to the names of the CSV files
# written further down so that separate runs do not overwrite each other.
num_to_return <- 1
# sample.int(n, size) draws from 1..n directly. The previous form,
# sample(1:n, size), first had to build the whole 1..n sequence
# (n is the current epoch time in seconds, i.e. more than 10^9 elements).
exe_num <- sample.int(floor(as.numeric(Sys.time())), num_to_return)
# Field description of the SEER text data, obtained with SEERaBomb's
# getFields(). This follows the package example script:
# https://rdrr.io/cran/SEERaBomb/src/inst/doc/examples/mkDataBinaries.R
df <- getFields(seerHome = "./SEER_1973_2015_CUSTOM_TEXTDATA/")

# Field selection using pickFields() with its default `picks`.
rdf <- pickFields(df)
# Explicit selection kept for reference:
# pickFields(df, picks=c("casenum","reg","race","sex","agedx", "yrbrth","seqnum","modx","yrdx","histo3", "ICD9","COD","surv","radiatn","chemo", "tvalue"))
## Dataset with year of birth ("yrbrth")
#
# Builds a lung-cancer table with tumor size, year of birth and metastasis
# code, and writes it to a CSV file in the working directory.

# Fields to extract; "tvalue", "yrbrth" and "mvalue" are the ones kept below.
rdf_TumorSize_Age <- pickFields(
  df,
  picks = c(
    "casenum", "reg", "race", "sex", "agedx", "yrbrth", "seqnum", "modx",
    "yrdx", "histo3", "ICD9", "COD", "surv", "radiatn", "chemo", "tvalue",
    "mvalue"
  )
)

# Write the merged data under the same file name that is loaded right below.
# Without an explicit `outFile`, mkSEER() uses its default output name, which
# does not match "cancDef_ALL_TumorSize_Metastasis.RData".
mkSEER(
  rdf_TumorSize_Age,
  seerHome = "./SEER_1973_2015_CUSTOM_TEXTDATA/",
  outFile = "cancDef_ALL_TumorSize_Metastasis"
)

# miceadds::load.Rdata() loads the saved object under the given name.
load.Rdata(
  "./SEER_1973_2015_CUSTOM_TEXTDATA/mrgd/cancDef_ALL_TumorSize_Metastasis.RData",
  "data_ALL_TMY"
)

# NOTE(review): `%>%` and filter() are dplyr functions, and this script never
# attaches dplyr itself. Presumably library("SEERaBomb") makes them available;
# confirm.
lung_cancer_data <- data_ALL_TMY %>% filter(cancer == "lung")
lung_cancer_dataframe <- as.data.frame(lung_cancer_data)
colnames(lung_cancer_dataframe)

lung_cancer_dataframe_TMY <-
  lung_cancer_dataframe[, c("tvalue", "yrbrth", "mvalue")]

# Drop every row that has an NA in any of the three columns.
lung_cancer_dataframe_TMY_complete_cases <-
  lung_cancer_dataframe_TMY[complete.cases(lung_cancer_dataframe_TMY), ]

# AJCC 3rd Edition, TNM, and Stage in SEER Data
# https://seer.cancer.gov/seerstat/variables/seer/ajcc-stage/3rd.html

# Keep only M0 (code 00, stored as 0) and M1 (code 10).
lung_cancer_dataframe_TMY_noNA_onlyM0M1 <-
  lung_cancer_dataframe_TMY_complete_cases[
    lung_cancer_dataframe_TMY_complete_cases$mvalue %in% c("0", "10"),
  ]

colnames(lung_cancer_dataframe_TMY_noNA_onlyM0M1) <-
  c("TumorSize", "YearOfBirth", "Metastasis")

# NOTE(review): unlike the AgeAtDiagnosis export further down, Metastasis is
# left coded as 0/10 here (not rescaled to 0/1). Confirm this is intended.
write.table(
  lung_cancer_dataframe_TMY_noNA_onlyM0M1,
  paste0("lung_cancer_dataframe_TMY_noNA_onlyM0M1_time", exe_num, ".csv"),
  col.names = TRUE,
  row.names = FALSE,
  sep = ","
)
## Dataset with "agerec"
#
# Same pipeline as the year-of-birth export, but the field selection carries
# "agerec" instead of "yrbrth". The exported age column is "agedx", renamed
# to AgeAtDiagnosis.

rdf_TumorSize_AgeB <- pickFields(
  df,
  picks = c(
    "casenum", "reg", "race", "sex", "agedx", "agerec", "seqnum", "modx",
    "yrdx", "histo3", "ICD9", "COD", "surv", "radiatn", "chemo", "tvalue",
    "mvalue"
  )
)

# Write the merged data under a dedicated name, then load it back under
# the name data_ALL_TMA.
mkSEER(
  rdf_TumorSize_AgeB,
  seerHome = "./SEER_1973_2015_CUSTOM_TEXTDATA/",
  outFile = "cancDef_ALL_TumorSize_Age_Metastasis"
)

load.Rdata(
  "./SEER_1973_2015_CUSTOM_TEXTDATA/mrgd/cancDef_ALL_TumorSize_Age_Metastasis.RData",
  "data_ALL_TMA"
)

# Restrict to lung cancer and convert to a plain data frame.
lung_cancer_age_data <- data_ALL_TMA %>% filter(cancer == "lung")
lung_cancer_age_dataframe <- as.data.frame(lung_cancer_age_data)
colnames(lung_cancer_age_dataframe)

# Tumor size, age at diagnosis and metastasis code only.
lung_cancer_dataframe_TMA <-
  lung_cancer_age_dataframe[, c("tvalue", "agedx", "mvalue")]

# Rows with an NA in any of the three columns are discarded.
lung_cancer_dataframe_TMA_complete_cases <-
  lung_cancer_dataframe_TMA[complete.cases(lung_cancer_dataframe_TMA), ]

# AJCC 3rd Edition, TNM, and Stage in SEER Data
# https://seer.cancer.gov/seerstat/variables/seer/ajcc-stage/3rd.html

# Only M0 (code 00, stored as 0) and M1 (code 10) are retained.
lung_cancer_dataframe_TMA_noNA_onlyM0M1 <-
  lung_cancer_dataframe_TMA_complete_cases[
    lung_cancer_dataframe_TMA_complete_cases$mvalue %in% c("0", "10"),
  ]

colnames(lung_cancer_dataframe_TMA_noNA_onlyM0M1) <-
  c("TumorSize", "AgeAtDiagnosis", "Metastasis")

# Rescale the metastasis code from 0/10 to 0/1.
lung_cancer_dataframe_TMA_noNA_onlyM0M1$Metastasis <-
  lung_cancer_dataframe_TMA_noNA_onlyM0M1$Metastasis / 10

# One CSV per run; exe_num makes the file name unique to this execution.
write.table(
  lung_cancer_dataframe_TMA_noNA_onlyM0M1,
  paste0("lung_cancer_dataframe_TMA_noNA_onlyM0M1_time", exe_num, ".csv"),
  col.names = TRUE,
  row.names = FALSE,
  sep = ","
)