HFmeRisk / Git / [d2c46b] /1-all

Models:
RaymondKing/
HFmeRisk
Downloads: 1
[d2c46b]: / 1-all_clin.R
History
Download this file
585 lines (560 with data), 32.2 kB

# 20210410
# Pre-processing of samples
# Get the sample from inside the phs000724, then get the sample with qualified quality and do the grouping of ejection fraction
# The latter will only do HFpEF analysis
# Then get the clinical features that we are concerned about more than 90
# Then get the time, get the exam8, eight years after the disease sample and control


library(rlang)
library(dplyr)
library("readxl")

#data
{
  setwd("H:/dbgap_CHD/ChildStudyConsentSet_phs000724.Framingham.v7.p10.c1.HMB-IRB-MDS/PhenotypeFiles")
  c1=read.table("phs000724.v7.pht004246.v2.p11.c1.Framingham_DNA_Methylation_Sample_Attributes_I.HMB-IRB-MDS.txt",sep="\t",header = T)
  setwd("H:/dbgap_CHD/ChildStudyConsentSet_phs000724.Framingham.v7.p10.c2.HMB-IRB-NPU-MDS/PhenotypeFiles")
  c2=read.table("phs000724.v7.pht004246.v2.p11.c2.Framingham_DNA_Methylation_Sample_Attributes_I.HMB-IRB-NPU-MDS.txt",sep="\t",header = T)
  data1 <- rbind(c1,c2)
  data1$SAMPID=strsplit(as.character(data1$SAMPID), "_724")
  data1 = data1[,c(2,3,8)]
  data1 <- filter(data1, PACKS_SET !="GEN3")
  data1 <- filter(data1, LABID !="")
  data1$SAMPID <- as.character(data1$SAMPID)#2725
  
  setwd("H:/dbgap_CHD/ChildStudyConsentSet_phs000724.Framingham.v7.p10.c1.HMB-IRB-MDS/PhenotypeFiles")
  c1=read.table("phs000724.v7.pht004247.v2.p11.c1.Framingham_DNA_Methylation_Sample_Attributes_II.HMB-IRB-MDS.txt",sep="\t",header = T)
  setwd("H:/dbgap_CHD/ChildStudyConsentSet_phs000724.Framingham.v7.p10.c2.HMB-IRB-NPU-MDS/PhenotypeFiles")
  c2=read.table("phs000724.v7.pht004247.v2.p11.c2.Framingham_DNA_Methylation_Sample_Attributes_II.HMB-IRB-NPU-MDS.txt",sep="\t",header = T)
  data2 <- rbind(c1,c2)
  data2$SAMPID=strsplit(as.character(data2$SAMPID), "_724")
  data2 = data2[,c(2,3,5,11)]
  data2$SAMPID = as.character(data2$SAMPID)#2782
  colnames(data2)[3] = "Sample_Well"
  data = merge(data1,data2,by="SAMPID",all.x=TRUE,all.y=TRUE)#2782;data2 have rep, data1 no rep
}
#Pre-processing data
{
  #qc-17
  data2_QC = filter(data2,QC_Comment != "")
  QC = unique(as.character(data2_QC$SAMPID))
  #rep-29
  index<-duplicated(data2$SAMPID)
  data2_index <-data2[index,]
  rep = unique(as.character(data2_index$SAMPID))
  data = filter(data, !SAMPID %in% c(rep,QC))#2626
  data$Sentrix_ID <- unlist(lapply(as.character(data$LABID.y),function(x){strsplit(x,'_')[[1]][[1]]}))
  #slide: data$Sentrix_ID=gsub("_[[:alnum:]]*", "", data$LABID.y)
  data$Sentrix_Position <- unlist(lapply(as.character(data$LABID.y),function(x){strsplit(x,'_')[[1]][[2]]}))
  #arrayPos:data$Sentrix_Position=gsub("[0-9]*_", "", data$LABID.y)
  data = data[,c(1,3,5,6,7,8)] #2626
}
save(data,file="data.Rdata")
#EF
{
  #c1
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  echo <- read_excel("phs000007.v30.pht002572.v6.p11.c1.t_echo_2008_m_0549s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
  echo <- data.frame(echo)
  echo <- echo[,c(2,38,123)]
  #c2
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  echo_c2 <- read_excel("phs000007.v30.pht002572.v6.p11.c2.t_echo_2008_m_0549s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
  echo_c2 <- data.frame(echo_c2)
  echo_c2 <- echo_c2[,c(2,38,123)]
  #rbind
  echo <- rbind(echo,echo_c2)
  colnames(echo)[2] <- c("EF")
  colnames(echo)[3] <- c("LVSF") 
  echo_1 <- filter(echo,EF<40)
  echo_2 <- filter(echo,EF>50)
  echo_3 <- filter(echo,EF<50 & EF>40)
}
HFpEF = merge(echo_2[,1:2],data,by.x="shareid",by.y="SAMPID")#20
HFrEF = merge(echo_1[,1:2],data,by.x="shareid",by.y="SAMPID")#2334
HFmrEF = merge(echo_3[,1:2],data,by.x="shareid",by.y="SAMPID")#38
save(HFpEF,file="HFpEF.Rdata")
#clinical
{
  #cvd
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  cvd_c1 <- read_excel("phs000007.v30.pht003316.v7.p11.c1.vr_survcvd_2014_a_1023s.HMB-IRB-MDS.xlsx",sheet=1, na = "NA", skip = 10)
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  cvd_c2 <- read_excel("phs000007.v30.pht003316.v7.p11.c2.vr_survcvd_2014_a_1023s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "NA", skip = 10)
  cvd = rbind(cvd_c1,cvd_c2)
  cvd <- data.frame(cvd)
  cvd <- cvd[,-c(1,3)]
  head(cvd)
  #mi
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  MI_c1 <- read_excel("phs000007.v30.pht000309.v13.p11.c1.vr_soe_2016_a_1073s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  MI_c2 <- read_excel("phs000007.v30.pht000309.v13.p11.c2.vr_soe_2016_a_1073s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
  MI = rbind(MI_c1,MI_c2)
  MI <- MI[,-c(1,4,5,6,8)]
  MIyes <-filter(MI , EVENT == "1" | EVENT == "2" | EVENT == "3" )
  MIyes <- MIyes[,-2]
  MIyes <- MIyes[!duplicated(MIyes$shareid), ] 
  MIyes$mi <- rep(1,nrow(MIyes))
  head(MIyes)
  colnames(MIyes)[2] ="midate"
  tmp1=merge(cvd,MIyes,by="shareid",all.x=TRUE,all.y=TRUE)
  tmp1$mi <- ifelse(is.na(tmp1$mi) == "FALSE" ,"1","0")
  #diba
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  DIAB_c1 <- read_excel("phs000007.v30.pht000041.v7.p11.c1.vr_diab_ex09_1_1002s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
  DIAB_c1 <- data.frame(DIAB_c1)
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  DIAB_c2 <- read_excel("phs000007.v30.pht000041.v7.p11.c2.vr_diab_ex09_1_1002s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
  DIAB_c2 <- data.frame(DIAB_c2)
  DIAB = rbind(DIAB_c1,DIAB_c2)
  head(DIAB)
  DIAByes <- filter(DIAB,DIAB$CURR_DIAB1 == 1| DIAB$CURR_DIAB2 == 1|DIAB$CURR_DIAB3 == 1|DIAB$CURR_DIAB4 == 1|DIAB$CURR_DIAB5 == 1| DIAB$CURR_DIAB6 == 1| DIAB$CURR_DIAB7 == 1 | DIAB$CURR_DIAB8 == 1)
  DIAByes <- DIAByes[,c(2,11)]
  DIAByes[,2] <- "1"
  
  DIABno <- filter(DIAB,DIAB$CURR_DIAB1 != 1 & DIAB$CURR_DIAB2 != 1 & DIAB$CURR_DIAB3 != 1  & DIAB$CURR_DIAB4 != 1 & DIAB$CURR_DIAB5 != 1 &  DIAB$CURR_DIAB6 != 1 & DIAB$CURR_DIAB7 != 1 & DIAB$CURR_DIAB8 != 1)
  DIABno <- DIABno[,c(2,11)]
  DIABno[,2] <- "0"
  
  DIAB <- rbind(DIAByes,DIABno)
  DIAB <- DIAB[!duplicated(DIAB$shareid), ] 
  tmp2 = merge(tmp1,DIAB,by="shareid",all.x=TRUE,all.y=TRUE)
  #afx
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  afx_c1 <- read_excel("phs000007.v30.pht003315.v7.p11.c1.vr_survaf_2014_a_0987s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  afx_c2 <- read_excel("phs000007.v30.pht003315.v7.p11.c2.vr_survaf_2014_a_0987s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
  afx = rbind(afx_c1,afx_c2)
  afx <- data.frame(afx)
  afx <- afx[,c(2,4,5)]
  head(afx)
  tmp3 = merge(tmp2,afx,by="shareid",all.x=TRUE,all.y=TRUE)
  #stroke
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  stroke_c1 <- read_excel("phs000007.v30.pht006023.v2.p11.c1.vr_survstk_2014_a_1031s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  stroke_c2 <- read_excel("phs000007.v30.pht006023.v2.p11.c2.vr_survstk_2014_a_1031s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
  stroke = rbind(stroke_c1,stroke_c2)
  stroke <- data.frame(stroke)
  stroke <- stroke[,c(2,4,7)]
  head(stroke)
  tmp4 = merge(tmp3,stroke,by="shareid",all.x=TRUE,all.y=TRUE)
  #meta1
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  meta1_c1 <- read_excel("phs000007.v30.pht006027.v2.p11.c1.vr_wkthru_ex09_1_1001s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  meta1_c2 <- read_excel("phs000007.v30.pht006027.v2.p11.c2.vr_wkthru_ex09_1_1001s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
  meta1 = rbind(meta1_c1,meta1_c2)
  c=c(1,206,207,3,20,29,38,47,56,62,71,80,89,96,105,114,128,137,146,155,161,170,188,197) #extract exam8 all information
  meta1 <- meta1[,c+1]
  head(meta1)
  meta1 <- data.frame(meta1)
  normalization<-function(x){
    return((x-min(x))/(max(x)-min(x)))}
  meta1$SEX <- normalization(meta1$SEX)
  tmp5 = merge(tmp4,meta1,by="shareid",all.x=TRUE,all.y=TRUE)
  #meta2
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  meta2_c1 <- read_excel("phs000007.v30.pht000747.v6.p11.c1.ex1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  meta2_c2 <- read_excel("phs000007.v30.pht000747.v6.p11.c2.ex1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
  meta2 = rbind(meta2_c1,meta2_c2)
  c=c("shareid","H010","H071","H074","H077","H480")
  meta2 <- meta2[,colnames(meta2) %in% c]
  names <- c("shareid","aspirin","beer","wine","liquor","sleep")
  colnames(meta2) <- names
  tmp6 = merge(tmp5,meta2,by="shareid",all.x=TRUE,all.y=TRUE)
  #meta3
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  meta3_other2_c1 <- read_excel("phs000007.v30.pht000742.v6.p11.c1.fhslab1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  meta3_other2_c2 <- read_excel("phs000007.v30.pht000742.v6.p11.c2.fhslab1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
  meta3_other2 = rbind(meta3_other2_c1,meta3_other2_c2)
  meta3_other2 <- meta3_other2[,c(2,10:12)]
  colnames(meta3_other2)[1] <- c("SAMPID") 
  colnames(meta3_other2)[2] <- c("Albumin_urine")
  colnames(meta3_other2)[3] <- c("Creatinine_urine")
  colnames(meta3_other2)[4] <- c("Hemoglobin_A1c_wholeblood")
  meta3_other2 <- data.frame(meta3_other2)
  head(meta3_other2)
  tmp7 = merge(tmp6,meta3_other2,by.x = "shareid",by.y="SAMPID",all.x=TRUE,all.y=TRUE)
  #meta4
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  meta3_other1_c1 <- read_excel("phs000007.v30.pht000747.v6.p11.c1.ex1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  meta3_other1_c2 <- read_excel("phs000007.v30.pht000747.v6.p11.c2.ex1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
  meta3_other1 = rbind(meta3_other1_c1,meta3_other1_c2)
  meta3_other1 <- meta3_other1[,-1]
  meta3_other1 <- data.frame(meta3_other1)
  meta3_other1 <- meta3_other1[,c(1,251:292,450,451,453,456,459)]
  c <- c("ATRIAL_ENLARGEMENT","RVH","LVH","RHEUMATIC","AORTIC_VALVE","MITRAL_VALVE","OTHER_HEART",
         "ARRHYTHMIA","OTHER_PERIPHERAL_VASCULAR_DISEASE","OTHER_VASCULAR_DIAGNOSIS","DEMENTIA",
         "PARKINSON","ADULT_SEIZURE_DISORDER","NEUROLOGICAL","THYROID","ENDOCRINE","RENAL",
         "GYNECOLOGIC","EMPHYSEMA","PNEUMONIA","ASTHMA","PULMONARY","GOUT","DEGENERATIVE","RHEUMATOID_ARTHRITIS",
         "MUSCULOSKELETAL","GALLBLADDER","GERD","LIVER","GI_DISEASE","HEMATOLOGIC_DISORDER","BLEEDING_DISORDER","EYE",
         "ENT","SKIN","OTHER","DEPRESSION","ANXIETY","PSYCHOSIS","OTHER2","PROSTATE","INFECTIOUS",
         "FEVER","PNEUMONIA","CHRONIC_BRONCHITIS","EMPHYSEMA","COPD")
  colnames(meta3_other1)[2:48] <- tolower(c)
  meta3_other1[1:10,1:10]
  colnames(meta3_other1)[1] <- c("SAMPID") 
  #全部变为0,1的；去除2,3的
  for(i in 2:ncol(meta3_other1)){
    meta3_other1[,i] <- ifelse(meta3_other1[,i] == 0 ,0,1)
  }
  tmp8 = merge(tmp7,meta3_other1,by.x = "shareid",by.y="SAMPID",all.x=TRUE,all.y=TRUE)
  #crp
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  CRP_c1 <- read_excel("phs000007.v30.pht002888.v5.p11.c1.l_crp_2008_m_0477s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
  CRP_c1 = CRP_c1[,-5]
  setwd("E:/workplace/mywork/methy/dbgap/3_clin")
  CRP_c2 <- read_excel("phs000007.v30.pht002888.v5.p11.c2.l_crp_2008_m_0477s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
  CRP = rbind(CRP_c1,CRP_c2)
  CRP <- CRP[,c(2,4)]
  head(CRP)
  tmp9 = merge(tmp8,CRP,by="shareid",all.x=TRUE,all.y=TRUE)
  #medicine
  {
    #FOLIC
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      FOLIC_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      FOLIC_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      FOLIC =  rbind(FOLIC_c1,FOLIC_c2)
      FOLIC <- data.frame(FOLIC)
      FOLIC <- FOLIC[,c(2,10:17)]
      FOLIC <- filter(FOLIC, phrm_gp1 == "VITAMIN B12 AND FOLIC ACID" & chem_gp1 == "Folic acid and derivatives")
      
      data <- merge(FOLIC ,tmp9,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "Folic_acid"
      colnames(data)[3] = "Folic_acid_amount"
      data$Folic_acid_amount <- ifelse(data$Folic_acid == 0 ,"0",data$Folic_acid_amount)
    }
    #aspirin-only cvd realtivel
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      aspirin_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      aspirin_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      aspirin = rbind(aspirin_c1,aspirin_c2)
      aspirin <- data.frame(aspirin)
      aspirin <- aspirin[,c(2,10:17)]
      aspirin <- filter(aspirin, system1 == "BLOOD AND BLOOD FORMING ORGANS" & 
                          ther_gp1 == "ANTITHROMBOTIC AGENTS" &  
                          chem_gp1 == "Platelet aggregation inhibitors excl. heparin" & chem_nm1 == "ACETYLSALICYLIC ACID")
      
      data <- merge(aspirin ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "aspirin"
      colnames(data)[3] = "aspirin_amount"
      data$aspirin_amount <- ifelse(data$aspirin == 0 ,"0",data$aspirin_amount)
    }
    #clopidogrel
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      clopidogrel_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      clopidogrel_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      clopidogrel = rbind(clopidogrel_c1,clopidogrel_c2)
      clopidogrel <- data.frame(clopidogrel)
      clopidogrel <- clopidogrel[,c(2,10:17)]
      clopidogrel <- filter(clopidogrel, system1 == "BLOOD AND BLOOD FORMING ORGANS" & 
                              ther_gp1 == "ANTITHROMBOTIC AGENTS" &  
                              chem_gp1 == "Platelet aggregation inhibitors excl. heparin" & chem_nm1 == "CLOPIDOGREL")
      
      data <- merge(clopidogrel ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "clopidogrel"
      colnames(data)[3] = "clopidogrel_amount"
      data$clopidogrel_amount <- ifelse(data$clopidogrel == 0 ,"0",data$clopidogrel_amount)
    }
    #warfarin
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      warfarin_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      warfarin_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      warfarin = rbind(warfarin_c1,warfarin_c2)
      warfarin <- data.frame(warfarin)
      warfarin <- warfarin[,c(2,10:17)]
      warfarin <- filter(warfarin, system1 == "BLOOD AND BLOOD FORMING ORGANS" & 
                           ther_gp1 == "ANTITHROMBOTIC AGENTS" &  
                           chem_gp1 == "Vitamin K antagonists" )
      
      data <- merge(warfarin ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "warfarin"
      colnames(data)[3] = "warfarin_amount"
      data$warfarin_amount <- ifelse(data$warfarin == 0 ,"0",data$warfarin_amount)
    }
    #ACEI
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      ACEI_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      ACEI_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      ACEI = rbind(ACEI_c1,ACEI_c2)
      ACEI <- data.frame(ACEI)
      ACEI <- ACEI[,c(2,10:17)]
      ACEI <- filter(ACEI, system1 == "CARDIOVASCULAR SYSTEM" & 
                       ther_gp1 == "AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM" &  
                       phrm_gp1 == "ACE INHIBITORS, PLAIN" )
      
      data <- merge(ACEI ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "ACEI"
      colnames(data)[3] = "ACEI_amount"
      data$ACEI_amount <- ifelse(data$ACEI == 0 ,"0",data$ACEI_amount)
    }
    #Angiotensin_II_antagonists
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      Angiotensin_II_antagonists_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      Angiotensin_II_antagonists_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      Angiotensin_II_antagonists = rbind(Angiotensin_II_antagonists_c1,Angiotensin_II_antagonists_c2)
      Angiotensin_II_antagonists <- data.frame(Angiotensin_II_antagonists)
      Angiotensin_II_antagonists <- Angiotensin_II_antagonists[,c(2,10:17)]
      Angiotensin_II_antagonists <- filter(Angiotensin_II_antagonists, system1 == "CARDIOVASCULAR SYSTEM" & 
                                             ther_gp1 == "AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM" &  
                                             phrm_gp1 == "ANGIOTENSIN II ANTAGONISTS, PLAIN" )
      
      data <- merge(Angiotensin_II_antagonists ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "Angiotensin_II_antagonists"
      colnames(data)[3] = "Angiotensin_II_antagonists_amount"
      data$Angiotensin_II_antagonists_amount <- ifelse(data$Angiotensin_II_antagonists == 0 ,"0",data$Angiotensin_II_antagonists_amount)
    }
    #Beta_blocking
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      Beta_blocking_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      Beta_blocking_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      Beta_blocking = rbind(Beta_blocking_c1,Beta_blocking_c2)
      Beta_blocking <- data.frame(Beta_blocking)
      Beta_blocking <- Beta_blocking[,c(2,10:17)]
      Beta_blocking <- filter(Beta_blocking, system1 == "CARDIOVASCULAR SYSTEM" & 
                                ther_gp1 == "BETA BLOCKING AGENTS" )
      
      data <- merge(Beta_blocking ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "Beta_blocking"
      colnames(data)[3] = "Beta_blocking_amount"
      data$Beta_blocking_amount <- ifelse(data$Beta_blocking == 0 ,"0",data$Beta_blocking_amount)
      
    }
    #CO_Q_10
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      CO_Q_10_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      CO_Q_10_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      CO_Q_10 = rbind(CO_Q_10_c1,CO_Q_10_c2)
      CO_Q_10 <- data.frame(CO_Q_10)
      CO_Q_10 <- CO_Q_10[,c(2,10:17)]
      CO_Q_10 <- filter(CO_Q_10, system1 == "CARDIOVASCULAR SYSTEM" & 
                          ther_gp1 == "CARDIAC THERAPY" &
                          phrm_gp1 == "OTHER CARDIAC PREPARATIONS")
      
      data <- merge(CO_Q_10 ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "CO_Q_10"
      colnames(data)[3] = "CO_Q_10_amount"
      data$CO_Q_10_amount <- ifelse(data$CO_Q_10 == 0 ,"0",data$CO_Q_10_amount)
    }
    #vasodilators
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      vasodilators_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      vasodilators_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      vasodilators = rbind(vasodilators_c1,vasodilators_c2)
      vasodilators <- data.frame(vasodilators)
      vasodilators <- vasodilators[,c(2,10:17)]
      vasodilators <- filter(vasodilators, system1 == "CARDIOVASCULAR SYSTEM" & 
                               ther_gp1 == "CARDIAC THERAPY" &
                               phrm_gp1 == "VASODILATORS USED IN CARDIAC DISEASES")
      
      data <- merge(vasodilators ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "vasodilators"
      colnames(data)[3] = "vasodilators_amount"
      data$vasodilators_amount <- ifelse(data$vasodilators == 0 ,"0",data$vasodilators_amount)
    }
    #amiodarone
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      amiodarone_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      amiodarone_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      amiodarone = rbind(amiodarone_c1,amiodarone_c2)
      amiodarone <- data.frame(amiodarone)
      amiodarone <- amiodarone[,c(2,10:17)]
      amiodarone <- filter(amiodarone, system1 == "CARDIOVASCULAR SYSTEM" & 
                             ther_gp1 == "CARDIAC THERAPY" &
                             phrm_gp1 == "ANTIARRHYTHMICS, CLASS I AND III" &
                             chem_nm1 == "AMIODARONE")
      
      data <- merge(amiodarone ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "amiodarone"
      colnames(data)[3] = "amiodarone_amount"
      data$amiodarone_amount <- ifelse(data$amiodarone == 0 ,"0",data$amiodarone_amount)
    }
    #Aldosterone antagonists
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      Aldosterone_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      Aldosterone_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      Aldosterone = rbind(Aldosterone_c1,Aldosterone_c2)
      Aldosterone <- data.frame(Aldosterone)
      Aldosterone <- Aldosterone[,c(2,10:17)]
      Aldosterone <- filter(Aldosterone, system1 == "CARDIOVASCULAR SYSTEM" & 
                              ther_gp1 == "DIURETICS" &
                              chem_gp1 == "Aldosterone antagonists")
      
      data <- merge(vasodilators ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "Aldosterone"
      colnames(data)[3] = "Aldosterone_amount"
      data$Aldosterone_amount <- ifelse(data$Aldosterone == 0 ,"0",data$Aldosterone_amount)
      
    }
    #potassium_sparing_diuretic
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      potassium_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      potassium_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      potassium = rbind(potassium_c1,potassium_c2)
      potassium <- data.frame(potassium)
      potassium <- potassium[,c(2,10:17)]
      potassium <- filter(potassium, system1 == "CARDIOVASCULAR SYSTEM" & 
                            ther_gp1 == "DIURETICS" &
                            chem_gp1 == "Other potassium-sparing agents")
      
      data <- merge(potassium ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "potassium"
      colnames(data)[3] = "potassium_amount"
      data$potassium_amount <- ifelse(data$potassium == 0 ,"0",data$potassium_amount)
    }
    #Sulfonamides
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      Sulfonamides_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      Sulfonamides_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      Sulfonamides = rbind(Sulfonamides_c1,Sulfonamides_c2)
      Sulfonamides <- data.frame(Sulfonamides)
      Sulfonamides <- Sulfonamides[,c(2,10:17)]
      Sulfonamides <- filter(Sulfonamides, system1 == "CARDIOVASCULAR SYSTEM" & 
                               ther_gp1 == "DIURETICS" &
                               chem_gp1 == "Sulfonamides, plain")
      
      data <- merge(Sulfonamides ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "Sulfonamides"
      colnames(data)[3] = "Sulfonamides_amount"
      data$Sulfonamides_amount <- ifelse(data$Sulfonamides == 0 ,"0",data$Sulfonamides_amount)
      
    }
    #Thiazides
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      Thiazides_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      Thiazides_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      Thiazides = rbind(Thiazides_c1,Thiazides_c2)
      Thiazides <- data.frame(Thiazides)
      Thiazides <- Thiazides[,c(2,10:17)]
      Thiazides <- filter(Thiazides, system1 == "CARDIOVASCULAR SYSTEM" & 
                            ther_gp1 == "DIURETICS" &
                            chem_gp1 == "Thiazides, plain")
      
      data <- merge(Thiazides ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "Thiazides"
      colnames(data)[3] = "Thiazides_amount"
      data$Thiazides_amount <- ifelse(data$Thiazides == 0 ,"0",data$Thiazides_amount)
      
    }
    #statin
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      STATIN_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      STATIN_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      STATIN = rbind(STATIN_c1,STATIN_c2)
      STATIN <- data.frame(STATIN)
      STATIN <- STATIN[,c(2,10:17)]
      STATIN1 <- filter(STATIN, system1 == "CARDIOVASCULAR SYSTEM" & 
                          ther_gp1 == "LIPID MODIFYING AGENTS" &
                          chem_gp1 == "HMG CoA reductase inhibitors")
      STATIN2 <- filter(STATIN, system1 == "CARDIOVASCULAR SYSTEM" & 
                          ther_gp1 == "LIPID MODIFYING AGENTS" &
                          chem_gp1 == "Other lipid modifying agents" & chem_nm1 == "EZETIMIBE" & MEDNAME == "SIMVASTATIN" |MEDNAME == "SIMVASTATIN)")
      STATIN <- rbind(STATIN1,STATIN2)
      data <- merge(STATIN ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "STATIN"
      colnames(data)[3] = "STATIN_amount"
      data$STATIN_amount <- ifelse(data$STATIN == 0 ,"0",data$STATIN_amount)
    }
    #OMEGA_3
    {
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      OMEGA_3_c1 <- read_excel("phs000007.v30.pht000828.v6.p11.c1.meds1_8s.HMB-IRB-MDS.xlsx",sheet=1, na = "", skip = 10)
      setwd("E:/workplace/mywork/methy/dbgap/3_clin")
      OMEGA_3_c2 <- read_excel("phs000007.v30.pht000828.v6.p11.c2.meds1_8s.HMB-IRB-NPU-MDS.xlsx",sheet=1, na = "", skip = 10)
      OMEGA_3 = rbind(OMEGA_3_c1,OMEGA_3_c2)
      OMEGA_3 <- data.frame(OMEGA_3)
      OMEGA_3 <- OMEGA_3[,c(2,10:17)]
      OMEGA_3 <- filter(OMEGA_3, system1 == "CARDIOVASCULAR SYSTEM" & 
                          ther_gp1 == "LIPID MODIFYING AGENTS" &
                          chem_gp1 == "Other lipid modifying agents" & chem_nm1 == "OMEGA-3-TRIGLYCERIDES")
      data <- merge(OMEGA_3 ,data,by="shareid",all.y=TRUE)
      data$chem_gp1 <- ifelse(is.na(data$IDTYPE) == "FALSE" ,"1","0")
      data <- data[,-c(2:4,6:8)]
      colnames(data)[2] = "OMEGA_3"
      colnames(data)[3] = "OMEGA_3_amount"
      data$OMEGA_3_amount <- ifelse(data$OMEGA_3 == 0 ,"0",data$OMEGA_3_amount)
    }
    dim(data)
    #15285,125
  }
  #duplicated--no among
  data <- data[!duplicated(data[,-c(3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33)]),]#15154   125
  save(data,file="E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/all_clin.Rdata")
  data_exam8 = filter(data,DATE8 != "NA")#3000  125
  data_exam8$mi = ifelse(data_exam8$mi == 1 & data_exam8$midate<data_exam8$DATE8,1,0)
  data_exam8$cvd = ifelse(data_exam8$cvd == 1 & data_exam8$cvddate<data_exam8$DATE8,1,0)
  data_exam8$chd = ifelse(data_exam8$chd == 1 & data_exam8$chddate<data_exam8$DATE8,1,0)
  data_exam8$afx = ifelse(data_exam8$afx == 1 & data_exam8$afxdate<data_exam8$DATE8,1,0)
  data_exam8$stroke = ifelse(data_exam8$stroke == 1 & data_exam8$strokedate<data_exam8$DATE8,1,0)
  save(data_exam8,file="E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary/data_exam8.Rdata")
}
setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary")
load(file="data_exam8.Rdata")
HFpEF_exam8 = merge(HFpEF,data_exam8,by="shareid")#2334  131
save(HFpEF_exam8,file="HFpEF_exam8.Rdata")
#time
{
  library(dplyr)
  library(tibble)
  EHR = HFpEF_exam8
  
  #clin_data_control <- filter(EHR, chf == 0 ,(chfdate - DATE8 ) >8*365) 
  clin_data_control <- filter(EHR, (chfdate - DATE8 ) >8*365) 
  summary((clin_data_control$chfdate - clin_data_control$DATE8))
  out1 = filter(EHR, chf == 1 & chfdate < DATE8 )
  #clin_data_chf <- filter(EHR, chf == 1 & (chfdate >= DATE8))
  clin_data_chf <- filter(EHR, chf == 1 & (chfdate >= DATE8) & (chfdate - DATE8 ) <8*365)
  summary((clin_data_chf$chfdate - clin_data_chf$DATE8))
  out2 = filter(EHR, chf == 0 & (chfdate >= DATE8) & (chfdate - DATE8 ) <=8*365 )
  # tmp = filter(EHR, chf == 1 & (chfdate - DATE8 <= 5*365) & (chfdate >= DATE8))
  # summary(tmp$chfdate - tmp$DATE8)
  clin_data_chf_control <- rbind(clin_data_chf,clin_data_control)#989
  #20211220 cosoring
  clin_data_chf_control <- filter(EHR,!EHR$shareid %in% out1$shareid)
}
table(clin_data_chf_control$PACKS_SET)
#GEN3  JHU  UMN 
#0  171  797 
setwd("E:\\workplace\\mywork\\methy\\dbgap\\chf\\data_chf_contr\\early_chf\\c1_UMN_JHU\\train_UMN_tset_JHU/1123_dataSummary")
save(clin_data_chf_control,file="clin_data_chf_control.Rdata")