Diff of /1-all_clin.R [000000] .. [d2c46b]

Switch to side-by-side view

--- a
+++ b/1-all_clin.R
@@ -0,0 +1,585 @@
+# 20210410
+# Pre-processing of samples
+# Get the sample from inside the phs000724, then get the sample with qualified quality and do the grouping of ejection fraction
+# The latter will only do HFpEF analysis
+# Then get the clinical features that we are concerned about more than 90
+# Then get the time, get the exam8, eight years after the disease sample and control
+
+
+library(rlang)
+library(dplyr)
+library("readxl")
+
+#data
+{
+  setwd("H:/dbgap_CHD/ChildStudyConsentSet_phs000724.Framingham.v7.p10.c1.HMB-IRB-MDS/PhenotypeFiles")
+  c1=read.table("phs000724.v7.pht004246.v2.p11.c1.Framingham_DNA_Methylation_Sample_Attributes_I.HMB-IRB-MDS.txt",sep="\t",header = T)
+  setwd("H:/dbgap_CHD/ChildStudyConsentSet_phs000724.Framingham.v7.p10.c2.HMB-IRB-NPU-MDS/PhenotypeFiles")
+  c2=read.table("phs000724.v7.pht004246.v2.p11.c2.Framingham_DNA_Methylation_Sample_Attributes_I.HMB-IRB-NPU-MDS.txt",sep="\t",header = T)
+  data1 <- rbind(c1,c2)
+  data1$SAMPID=strsplit(as.character(data1$SAMPID), "_724")
+  data1 = data1[,c(2,3,8)]
+  data1 <- filter(data1, PACKS_SET !="GEN3")
+  data1 <- filter(data1, LABID !="")
+  data1$SAMPID <- as.character(data1$SAMPID)#2725
+  
+  setwd("H:/dbgap_CHD/ChildStudyConsentSet_phs000724.Framingham.v7.p10.c1.HMB-IRB-MDS/PhenotypeFiles")
+  c1=read.table("phs000724.v7.pht004247.v2.p11.c1.Framingham_DNA_Methylation_Sample_Attributes_II.HMB-IRB-MDS.txt",sep="\t",header = T)
+  setwd("H:/dbgap_CHD/ChildStudyConsentSet_phs000724.Framingham.v7.p10.c2.HMB-IRB-NPU-MDS/PhenotypeFiles")
+  c2=read.table("phs000724.v7.pht004247.v2.p11.c2.Framingham_DNA_Methylation_Sample_Attributes_II.HMB-IRB-NPU-MDS.txt",sep="\t",header = T)
+  data2 <- rbind(c1,c2)
+  data2$SAMPID=strsplit(as.character(data2$SAMPID), "_724")
+  data2 = data2[,c(2,3,5,11)]
+  data2$SAMPID = as.character(data2$SAMPID)#2782
+  colnames(data2)[3] = "Sample_Well"
+  data = merge(data1,data2,by="SAMPID",all.x=TRUE,all.y=TRUE)#2782;data2 have rep, data1 no rep
+}
+#Pre-processing data
+{
+  #qc-17
+  data2_QC = filter(data2,QC_Comment != "")
+  QC = unique(as.character(data2_QC$SAMPID))
+  #rep-29
+  index<-duplicated(data2$SAMPID)
+  data2_index <-data2[index,]
+  rep = unique(as.character(data2_index$SAMPID))
+  data = filter(data, !SAMPID %in% c(rep,QC))#2626
+  data$Sentrix_ID <- unlist(lapply(as.character(data$LABID.y),function(x){strsplit(x,'_')[[1]][[1]]}))
+  #slide: data$Sentrix_ID=gsub("_[[:alnum:]]*", "", data$LABID.y)
+  data$Sentrix_Position <- unlist(lapply(as.character(data$LABID.y),function(x){strsplit(x,'_')[[1]][[2]]}))
+  #arrayPos:data$Sentrix_Position=gsub("[0-9]*_", "", data$LABID.y)
+  data = data[,c(1,3,5,6,7,8)] #2626
+}
+save(data,file="data.Rdata")
+#EF
+{
+  #c1
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  echo <- read_excel("phs000007.v30.pht002572.v6.p11.c1.t_echo_2008_m_0549s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+  echo <- data.frame(echo)
+  echo <- echo[,c(2,38,123)]
+  #c2
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  echo_c2 <- read_excel("phs000007.v30.pht002572.v6.p11.c2.t_echo_2008_m_0549s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+  echo_c2 <- data.frame(echo_c2)
+  echo_c2 <- echo_c2[,c(2,38,123)]
+  #rbind
+  echo <- rbind(echo,echo_c2)
+  colnames(echo)[2] <- c("EF")
+  colnames(echo)[3] <- c("LVSF") 
+  echo_1 <- filter(echo,EF<40)
+  echo_2 <- filter(echo,EF>50)
+  echo_3 <- filter(echo,EF<50 & EF>40)
+}
+HFpEF = merge(echo_2[,1:2],data,by.x="shareid",by.y="SAMPID")#20
+HFrEF = merge(echo_1[,1:2],data,by.x="shareid",by.y="SAMPID")#2334
+HFmrEF = merge(echo_3[,1:2],data,by.x="shareid",by.y="SAMPID")#38
+save(HFpEF,file="HFpEF.Rdata")
+#clinical
+{
+  #cvd
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  cvd_c1 <- read_excel("phs000007.v30.pht003316.v7.p11.c1.vr_survcvd_2014_a_1023s.HMB-IRB-MDS.xlsx",sheet=1, na = "NA", skip = 10)
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  cvd_c2 <- read_excel("phs000007.v30.pht003316.v7.p11.c2.vr_survcvd_2014_a_1023s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "NA", skip = 10)
+  cvd = rbind(cvd_c1,cvd_c2)
+  cvd <- data.frame(cvd)
+  cvd <- cvd[,-c(1,3)]
+  head(cvd)
+  #mi
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  MI_c1 <- read_excel("phs000007.v30.pht000309.v13.p11.c1.vr_soe_2016_a_1073s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  MI_c2 <- read_excel("phs000007.v30.pht000309.v13.p11.c2.vr_soe_2016_a_1073s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+  MI = rbind(MI_c1,MI_c2)
+  MI <- MI[,-c(1,4,5,6,8)]
+  MIyes <-filter(MI , EVENT == "1" | EVENT == "2" | EVENT == "3" )
+  MIyes <- MIyes[,-2]
+  MIyes <- MIyes[!duplicated(MIyes$shareid), ] 
+  MIyes$mi <- rep(1,nrow(MIyes))
+  head(MIyes)
+  colnames(MIyes)[2] ="midate"
+  tmp1=merge(cvd,MIyes,by="shareid",all.x=TRUE,all.y=TRUE)
+  tmp1$mi <- ifelse(is.na(tmp1$mi) == "FALSE" ,"1","0")
+  #diba
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  DIAB_c1 <- read_excel("phs000007.v30.pht000041.v7.p11.c1.vr_diab_ex09_1_1002s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+  DIAB_c1 <- data.frame(DIAB_c1)
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  DIAB_c2 <- read_excel("phs000007.v30.pht000041.v7.p11.c2.vr_diab_ex09_1_1002s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+  DIAB_c2 <- data.frame(DIAB_c2)
+  DIAB = rbind(DIAB_c1,DIAB_c2)
+  head(DIAB)
+  DIAByes <- filter(DIAB,DIAB$CURR_DIAB1 == 1| DIAB$CURR_DIAB2 == 1|DIAB$CURR_DIAB3 == 1|DIAB$CURR_DIAB4 == 1|DIAB$CURR_DIAB5 == 1| DIAB$CURR_DIAB6 == 1| DIAB$CURR_DIAB7 == 1 | DIAB$CURR_DIAB8 == 1)
+  DIAByes <- DIAByes[,c(2,11)]
+  DIAByes[,2] <- "1"
+  
+  DIABno <- filter(DIAB,DIAB$CURR_DIAB1 != 1 & DIAB$CURR_DIAB2 != 1 & DIAB$CURR_DIAB3 != 1  & DIAB$CURR_DIAB4 != 1 & DIAB$CURR_DIAB5 != 1 &  DIAB$CURR_DIAB6 != 1 & DIAB$CURR_DIAB7 != 1 & DIAB$CURR_DIAB8 != 1)
+  DIABno <- DIABno[,c(2,11)]
+  DIABno[,2] <- "0"
+  
+  DIAB <- rbind(DIAByes,DIABno)
+  DIAB <- DIAB[!duplicated(DIAB$shareid), ] 
+  tmp2 = merge(tmp1,DIAB,by="shareid",all.x=TRUE,all.y=TRUE)
+  #afx
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  afx_c1 <- read_excel("phs000007.v30.pht003315.v7.p11.c1.vr_survaf_2014_a_0987s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  afx_c2 <- read_excel("phs000007.v30.pht003315.v7.p11.c2.vr_survaf_2014_a_0987s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+  afx = rbind(afx_c1,afx_c2)
+  afx <- data.frame(afx)
+  afx <- afx[,c(2,4,5)]
+  head(afx)
+  tmp3 = merge(tmp2,afx,by="shareid",all.x=TRUE,all.y=TRUE)
+  #stroke
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  stroke_c1 <- read_excel("phs000007.v30.pht006023.v2.p11.c1.vr_survstk_2014_a_1031s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  stroke_c2 <- read_excel("phs000007.v30.pht006023.v2.p11.c2.vr_survstk_2014_a_1031s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+  stroke = rbind(stroke_c1,stroke_c2)
+  stroke <- data.frame(stroke)
+  stroke <- stroke[,c(2,4,7)]
+  head(stroke)
+  tmp4 = merge(tmp3,stroke,by="shareid",all.x=TRUE,all.y=TRUE)
+  #meta1
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  meta1_c1 <- read_excel("phs000007.v30.pht006027.v2.p11.c1.vr_wkthru_ex09_1_1001s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  meta1_c2 <- read_excel("phs000007.v30.pht006027.v2.p11.c2.vr_wkthru_ex09_1_1001s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+  meta1 = rbind(meta1_c1,meta1_c2)
+  c=c(1,206,207,3,20,29,38,47,56,62,71,80,89,96,105,114,128,137,146,155,161,170,188,197) #extract exam8 all information
+  meta1 <- meta1[,c+1]
+  head(meta1)
+  meta1 <- data.frame(meta1)
+  normalization<-function(x){
+    return((x-min(x))/(max(x)-min(x)))}
+  meta1$SEX <- normalization(meta1$SEX)
+  tmp5 = merge(tmp4,meta1,by="shareid",all.x=TRUE,all.y=TRUE)
+  #meta2
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  meta2_c1 <- read_excel("phs000007.v30.pht000747.v6.p11.c1.ex1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  meta2_c2 <- read_excel("phs000007.v30.pht000747.v6.p11.c2.ex1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+  meta2 = rbind(meta2_c1,meta2_c2)
+  c=c("shareid","H010","H071","H074","H077","H480")
+  meta2 <- meta2[,colnames(meta2) %in% c]
+  names <- c("shareid","aspirin","beer","wine","liquor","sleep")
+  colnames(meta2) <- names
+  tmp6 = merge(tmp5,meta2,by="shareid",all.x=TRUE,all.y=TRUE)
+  #meta3
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  meta3_other2_c1 <- read_excel("phs000007.v30.pht000742.v6.p11.c1.fhslab1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  meta3_other2_c2 <- read_excel("phs000007.v30.pht000742.v6.p11.c2.fhslab1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+  meta3_other2 = rbind(meta3_other2_c1,meta3_other2_c2)
+  meta3_other2 <- meta3_other2[,c(2,10:12)]
+  colnames(meta3_other2)[1] <- c("SAMPID") 
+  colnames(meta3_other2)[2] <- c("Albumin_urine")
+  colnames(meta3_other2)[3] <- c("Creatinine_urine")
+  colnames(meta3_other2)[4] <- c("Hemoglobin_A1c_wholeblood")
+  meta3_other2 <- data.frame(meta3_other2)
+  head(meta3_other2)
+  tmp7 = merge(tmp6,meta3_other2,by.x = "shareid",by.y="SAMPID",all.x=TRUE,all.y=TRUE)
+  #meta4
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  meta3_other1_c1 <- read_excel("phs000007.v30.pht000747.v6.p11.c1.ex1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  meta3_other1_c2 <- read_excel("phs000007.v30.pht000747.v6.p11.c2.ex1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+  meta3_other1 = rbind(meta3_other1_c1,meta3_other1_c2)
+  meta3_other1 <- meta3_other1[,-1]
+  meta3_other1 <- data.frame(meta3_other1)
+  meta3_other1 <- meta3_other1[,c(1,251:292,450,451,453,456,459)]
+  c <- c("ATRIAL_ENLARGEMENT","RVH","LVH","RHEUMATIC","AORTIC_VALVE","MITRAL_VALVE","OTHER_HEART",
+         "ARRHYTHMIA","OTHER_PERIPHERAL_VASCULAR_DISEASE","OTHER_VASCULAR_DIAGNOSIS","DEMENTIA",
+         "PARKINSON","ADULT_SEIZURE_DISORDER","NEUROLOGICAL","THYROID","ENDOCRINE","RENAL",
+         "GYNECOLOGIC","EMPHYSEMA","PNEUMONIA","ASTHMA","PULMONARY","GOUT","DEGENERATIVE","RHEUMATOID_ARTHRITIS",
+         "MUSCULOSKELETAL","GALLBLADDER","GERD","LIVER","GI_DISEASE","HEMATOLOGIC_DISORDER","BLEEDING_DISORDER","EYE",
+         "ENT","SKIN","OTHER","DEPRESSION","ANXIETY","PSYCHOSIS","OTHER2","PROSTATE","INFECTIOUS",
+         "FEVER","PNEUMONIA","CHRONIC_BRONCHITIS","EMPHYSEMA","COPD")
+  colnames(meta3_other1)[2:48] <- tolower(c)
+  meta3_other1[1:10,1:10]
+  colnames(meta3_other1)[1] <- c("SAMPID") 
+  #全部变为0,1的;去除2,3的
+  for(i in 2:ncol(meta3_other1)){
+    meta3_other1[,i] <- ifelse(meta3_other1[,i] == 0 ,0,1)
+  }
+  tmp8 = merge(tmp7,meta3_other1,by.x = "shareid",by.y="SAMPID",all.x=TRUE,all.y=TRUE)
+  #crp
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  CRP_c1 <- read_excel("phs000007.v30.pht002888.v5.p11.c1.l_crp_2008_m_0477s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+  CRP_c1 = CRP_c1[,-5]
+  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+  CRP_c2 <- read_excel("phs000007.v30.pht002888.v5.p11.c2.l_crp_2008_m_0477s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+  CRP = rbind(CRP_c1,CRP_c2)
+  CRP <- CRP[,c(2,4)]
+  head(CRP)
+  tmp9 = merge(tmp8,CRP,by="shareid",all.x=TRUE,all.y=TRUE)
+  #medicine
+  {
+    #FOLIC
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      FOLIC_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      FOLIC_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      FOLIC =  rbind(FOLIC_c1,FOLIC_c2)
+      FOLIC <- data.frame(FOLIC)
+      FOLIC <- FOLIC[,c(2,10:17)]
+      FOLIC <- filter(FOLIC, phrm_gp1 == "VITAMIN B12 AND FOLIC ACID" & chem_gp1 == "Folic acid and derivatives")
+      
+      data <- merge(FOLIC ,tmp9,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "Folic_acid"
+      colnames(data)[3] = "Folic_acid_amount"
+      data$Folic_acid_amount <- ifelse(data$Folic_acid == 0 ,"0",data$Folic_acid_amount)
+    }
+    #aspirin-only cvd realtivel
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      aspirin_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      aspirin_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      aspirin = rbind(aspirin_c1,aspirin_c2)
+      aspirin <- data.frame(aspirin)
+      aspirin <- aspirin[,c(2,10:17)]
+      aspirin <- filter(aspirin, system1 == "BLOOD AND BLOOD FORMING ORGANS" & 
+                          ther_gp1 == "ANTITHROMBOTIC AGENTS" &  
+                          chem_gp1 == "Platelet aggregation inhibitors excl. heparin" & chem_nm1 == "ACETYLSALICYLIC ACID")
+      
+      data <- merge(aspirin ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "aspirin"
+      colnames(data)[3] = "aspirin_amount"
+      data$aspirin_amount <- ifelse(data$aspirin == 0 ,"0",data$aspirin_amount)
+    }
+    #clopidogrel
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      clopidogrel_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      clopidogrel_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      clopidogrel = rbind(clopidogrel_c1,clopidogrel_c2)
+      clopidogrel <- data.frame(clopidogrel)
+      clopidogrel <- clopidogrel[,c(2,10:17)]
+      clopidogrel <- filter(clopidogrel, system1 == "BLOOD AND BLOOD FORMING ORGANS" & 
+                              ther_gp1 == "ANTITHROMBOTIC AGENTS" &  
+                              chem_gp1 == "Platelet aggregation inhibitors excl. heparin" & chem_nm1 == "CLOPIDOGREL")
+      
+      data <- merge(clopidogrel ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "clopidogrel"
+      colnames(data)[3] = "clopidogrel_amount"
+      data$clopidogrel_amount <- ifelse(data$clopidogrel == 0 ,"0",data$clopidogrel_amount)
+    }
+    #warfarin
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      warfarin_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      warfarin_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      warfarin = rbind(warfarin_c1,warfarin_c2)
+      warfarin <- data.frame(warfarin)
+      warfarin <- warfarin[,c(2,10:17)]
+      warfarin <- filter(warfarin, system1 == "BLOOD AND BLOOD FORMING ORGANS" & 
+                           ther_gp1 == "ANTITHROMBOTIC AGENTS" &  
+                           chem_gp1 == "Vitamin K antagonists" )
+      
+      data <- merge(warfarin ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "warfarin"
+      colnames(data)[3] = "warfarin_amount"
+      data$warfarin_amount <- ifelse(data$warfarin == 0 ,"0",data$warfarin_amount)
+    }
+    #ACEI
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      ACEI_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      ACEI_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      ACEI = rbind(ACEI_c1,ACEI_c2)
+      ACEI <- data.frame(ACEI)
+      ACEI <- ACEI[,c(2,10:17)]
+      ACEI <- filter(ACEI, system1 == "CARDIOVASCULAR SYSTEM" & 
+                       ther_gp1 == "AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM" &  
+                       phrm_gp1 == "ACE INHIBITORS, PLAIN" )
+      
+      data <- merge(ACEI ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "ACEI"
+      colnames(data)[3] = "ACEI_amount"
+      data$ACEI_amount <- ifelse(data$ACEI == 0 ,"0",data$ACEI_amount)
+    }
+    #Angiotensin_II_antagonists
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      Angiotensin_II_antagonists_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      Angiotensin_II_antagonists_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      Angiotensin_II_antagonists = rbind(Angiotensin_II_antagonists_c1,Angiotensin_II_antagonists_c2)
+      Angiotensin_II_antagonists <- data.frame(Angiotensin_II_antagonists)
+      Angiotensin_II_antagonists <- Angiotensin_II_antagonists[,c(2,10:17)]
+      Angiotensin_II_antagonists <- filter(Angiotensin_II_antagonists, system1 == "CARDIOVASCULAR SYSTEM" & 
+                                             ther_gp1 == "AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM" &  
+                                             phrm_gp1 == "ANGIOTENSIN II ANTAGONISTS, PLAIN" )
+      
+      data <- merge(Angiotensin_II_antagonists ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "Angiotensin_II_antagonists"
+      colnames(data)[3] = "Angiotensin_II_antagonists_amount"
+      data$Angiotensin_II_antagonists_amount <- ifelse(data$Angiotensin_II_antagonists == 0 ,"0",data$Angiotensin_II_antagonists_amount)
+    }
+    #Beta_blocking
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      Beta_blocking_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      Beta_blocking_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      Beta_blocking = rbind(Beta_blocking_c1,Beta_blocking_c2)
+      Beta_blocking <- data.frame(Beta_blocking)
+      Beta_blocking <- Beta_blocking[,c(2,10:17)]
+      Beta_blocking <- filter(Beta_blocking, system1 == "CARDIOVASCULAR SYSTEM" & 
+                                ther_gp1 == "BETA BLOCKING AGENTS" )
+      
+      data <- merge(Beta_blocking ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "Beta_blocking"
+      colnames(data)[3] = "Beta_blocking_amount"
+      data$Beta_blocking_amount <- ifelse(data$Beta_blocking == 0 ,"0",data$Beta_blocking_amount)
+      
+    }
+    #CO_Q_10
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      CO_Q_10_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      CO_Q_10_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      CO_Q_10 = rbind(CO_Q_10_c1,CO_Q_10_c2)
+      CO_Q_10 <- data.frame(CO_Q_10)
+      CO_Q_10 <- CO_Q_10[,c(2,10:17)]
+      CO_Q_10 <- filter(CO_Q_10, system1 == "CARDIOVASCULAR SYSTEM" & 
+                          ther_gp1 == "CARDIAC THERAPY" &
+                          phrm_gp1 == "OTHER CARDIAC PREPARATIONS")
+      
+      data <- merge(CO_Q_10 ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "CO_Q_10"
+      colnames(data)[3] = "CO_Q_10_amount"
+      data$CO_Q_10_amount <- ifelse(data$CO_Q_10 == 0 ,"0",data$CO_Q_10_amount)
+    }
+    #vasodilators
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      vasodilators_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      vasodilators_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      vasodilators = rbind(vasodilators_c1,vasodilators_c2)
+      vasodilators <- data.frame(vasodilators)
+      vasodilators <- vasodilators[,c(2,10:17)]
+      vasodilators <- filter(vasodilators, system1 == "CARDIOVASCULAR SYSTEM" & 
+                               ther_gp1 == "CARDIAC THERAPY" &
+                               phrm_gp1 == "VASODILATORS USED IN CARDIAC DISEASES")
+      
+      data <- merge(vasodilators ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "vasodilators"
+      colnames(data)[3] = "vasodilators_amount"
+      data$vasodilators_amount <- ifelse(data$vasodilators == 0 ,"0",data$vasodilators_amount)
+    }
+    #amiodarone
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      amiodarone_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      amiodarone_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      amiodarone = rbind(amiodarone_c1,amiodarone_c2)
+      amiodarone <- data.frame(amiodarone)
+      amiodarone <- amiodarone[,c(2,10:17)]
+      amiodarone <- filter(amiodarone, system1 == "CARDIOVASCULAR SYSTEM" & 
+                             ther_gp1 == "CARDIAC THERAPY" &
+                             phrm_gp1 == "ANTIARRHYTHMICS, CLASS I AND III" &
+                             chem_nm1 == "AMIODARONE")
+      
+      data <- merge(amiodarone ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "amiodarone"
+      colnames(data)[3] = "amiodarone_amount"
+      data$amiodarone_amount <- ifelse(data$amiodarone == 0 ,"0",data$amiodarone_amount)
+    }
+    #Aldosterone antagonists
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      Aldosterone_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      Aldosterone_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      Aldosterone = rbind(Aldosterone_c1,Aldosterone_c2)
+      Aldosterone <- data.frame(Aldosterone)
+      Aldosterone <- Aldosterone[,c(2,10:17)]
+      Aldosterone <- filter(Aldosterone, system1 == "CARDIOVASCULAR SYSTEM" & 
+                              ther_gp1 == "DIURETICS" &
+                              chem_gp1 == "Aldosterone antagonists")
+      
+      data <- merge(vasodilators ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "Aldosterone"
+      colnames(data)[3] = "Aldosterone_amount"
+      data$Aldosterone_amount <- ifelse(data$Aldosterone == 0 ,"0",data$Aldosterone_amount)
+      
+    }
+    #potassium_sparing_diuretic
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      potassium_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      potassium_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      potassium = rbind(potassium_c1,potassium_c2)
+      potassium <- data.frame(potassium)
+      potassium <- potassium[,c(2,10:17)]
+      potassium <- filter(potassium, system1 == "CARDIOVASCULAR SYSTEM" & 
+                            ther_gp1 == "DIURETICS" &
+                            chem_gp1 == "Other potassium-sparing agents")
+      
+      data <- merge(potassium ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "potassium"
+      colnames(data)[3] = "potassium_amount"
+      data$potassium_amount <- ifelse(data$potassium == 0 ,"0",data$potassium_amount)
+    }
+    #Sulfonamides
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      Sulfonamides_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      Sulfonamides_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      Sulfonamides = rbind(Sulfonamides_c1,Sulfonamides_c2)
+      Sulfonamides <- data.frame(Sulfonamides)
+      Sulfonamides <- Sulfonamides[,c(2,10:17)]
+      Sulfonamides <- filter(Sulfonamides, system1 == "CARDIOVASCULAR SYSTEM" & 
+                               ther_gp1 == "DIURETICS" &
+                               chem_gp1 == "Sulfonamides, plain")
+      
+      data <- merge(Sulfonamides ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "Sulfonamides"
+      colnames(data)[3] = "Sulfonamides_amount"
+      data$Sulfonamides_amount <- ifelse(data$Sulfonamides == 0 ,"0",data$Sulfonamides_amount)
+      
+    }
+    #Thiazides
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      Thiazides_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      Thiazides_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      Thiazides = rbind(Thiazides_c1,Thiazides_c2)
+      Thiazides <- data.frame(Thiazides)
+      Thiazides <- Thiazides[,c(2,10:17)]
+      Thiazides <- filter(Thiazides, system1 == "CARDIOVASCULAR SYSTEM" & 
+                            ther_gp1 == "DIURETICS" &
+                            chem_gp1 == "Thiazides, plain")
+      
+      data <- merge(Thiazides ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "Thiazides"
+      colnames(data)[3] = "Thiazides_amount"
+      data$Thiazides_amount <- ifelse(data$Thiazides == 0 ,"0",data$Thiazides_amount)
+      
+    }
+    #statin
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      STATIN_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      STATIN_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      STATIN = rbind(STATIN_c1,STATIN_c2)
+      STATIN <- data.frame(STATIN)
+      STATIN <- STATIN[,c(2,10:17)]
+      STATIN1 <- filter(STATIN, system1 == "CARDIOVASCULAR SYSTEM" & 
+                          ther_gp1 == "LIPID MODIFYING AGENTS" &
+                          chem_gp1 == "HMG CoA reductase inhibitors")
+      STATIN2 <- filter(STATIN, system1 == "CARDIOVASCULAR SYSTEM" & 
+                          ther_gp1 == "LIPID MODIFYING AGENTS" &
+                          chem_gp1 == "Other lipid modifying agents" & chem_nm1 == "EZETIMIBE" & MEDNAME == "SIMVASTATIN" |MEDNAME == "SIMVASTATIN)")
+      STATIN <- rbind(STATIN1,STATIN2)
+      data <- merge(STATIN ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "STATIN"
+      colnames(data)[3] = "STATIN_amount"
+      data$STATIN_amount <- ifelse(data$STATIN == 0 ,"0",data$STATIN_amount)
+    }
+    #OMEGA_3
+    {
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      OMEGA_3_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
+      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
+      OMEGA_3_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
+      OMEGA_3 = rbind(OMEGA_3_c1,OMEGA_3_c2)
+      OMEGA_3 <- data.frame(OMEGA_3)
+      OMEGA_3 <- OMEGA_3[,c(2,10:17)]
+      OMEGA_3 <- filter(OMEGA_3, system1 == "CARDIOVASCULAR SYSTEM" & 
+                          ther_gp1 == "LIPID MODIFYING AGENTS" &
+                          chem_gp1 == "Other lipid modifying agents" & chem_nm1 == "OMEGA-3-TRIGLYCERIDES")
+      data <- merge(OMEGA_3 ,data,by="shareid",all.y=TRUE)
+      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
+      data <- data[,-c(2:4,6:8)]
+      colnames(data)[2] = "OMEGA_3"
+      colnames(data)[3] = "OMEGA_3_amount"
+      data$OMEGA_3_amount <- ifelse(data$OMEGA_3 == 0 ,"0",data$OMEGA_3_amount)
+    }
+    dim(data)
+    #15285,125
+  }
+  #duplicated--no among
+  data <- data[!duplicated(data[,-c(3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33)]),]#15154   125
+  save(data,file="E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/all_clin.Rdata")
+  data_exam8 = filter(data,DATE8 != "NA")#3000  125
+  data_exam8$mi = ifelse(data_exam8$mi == 1 & data_exam8$midate<data_exam8$DATE8,1,0)
+  data_exam8$cvd = ifelse(data_exam8$cvd == 1 & data_exam8$cvddate<data_exam8$DATE8,1,0)
+  data_exam8$chd = ifelse(data_exam8$chd == 1 & data_exam8$chddate<data_exam8$DATE8,1,0)
+  data_exam8$afx = ifelse(data_exam8$afx == 1 & data_exam8$afxdate<data_exam8$DATE8,1,0)
+  data_exam8$stroke = ifelse(data_exam8$stroke == 1 & data_exam8$strokedate<data_exam8$DATE8,1,0)
+  save(data_exam8,file="E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/data_exam8.Rdata")
+}
+setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary")
+load(file="data_exam8.Rdata")
+HFpEF_exam8 = merge(HFpEF,data_exam8,by="shareid")#2334  131
+save(HFpEF_exam8,file="HFpEF_exam8.Rdata")
+#time
+{
+  library(dplyr)
+  library(tibble)
+  EHR = HFpEF_exam8
+  
+  #clin_data_control <- filter(EHR, chf == 0 ,(chfdate - DATE8 ) >8*365) 
+  clin_data_control <- filter(EHR, (chfdate - DATE8 ) >8*365) 
+  summary((clin_data_control$chfdate - clin_data_control$DATE8))
+  out1 = filter(EHR, chf == 1 & chfdate < DATE8 )
+  #clin_data_chf <- filter(EHR, chf == 1 & (chfdate >= DATE8))
+  clin_data_chf <- filter(EHR, chf == 1 & (chfdate >= DATE8) & (chfdate - DATE8 ) <8*365)
+  summary((clin_data_chf$chfdate - clin_data_chf$DATE8))
+  out2 = filter(EHR, chf == 0 & (chfdate >= DATE8) & (chfdate - DATE8 ) <=8*365 )
+  # tmp = filter(EHR, chf == 1 & (chfdate - DATE8 <= 5*365) & (chfdate >= DATE8))
+  # summary(tmp$chfdate - tmp$DATE8)
+  clin_data_chf_control <- rbind(clin_data_chf,clin_data_control)#989
+  #20211220 cosoring
+  clin_data_chf_control <- filter(EHR,!EHR$shareid %in% out1$shareid)
+}
+table(clin_data_chf_control$PACKS_SET)
+#GEN3  JHU  UMN 
+#0  171  797 
+setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary")
+save(clin_data_chf_control,file="clin_data_chf_control.Rdata")
\ No newline at end of file