Diff of /Functions-GenerateData.R [000000] .. [c4e594]

Switch to side-by-side view

--- a
+++ b/Functions-GenerateData.R
@@ -0,0 +1,191 @@
+#' Generate one synthetic two-arm dataset with a binary outcome.
+#'
+#' Steps, in this order (the order fixes the sequence of random draws):
+#' 1. draw a Bernoulli(0.5) treatment indicator per example;
+#' 2. draw zero-mean, unit-variance Gaussian covariates in which features of
+#'    the same parity (odd/odd, even/even) share the model's correlation;
+#' 3. build the logit of P(Y = 1) as a prognostic part plus a predictive part
+#'    scaled by `theta_pred` and switched on by treatment;
+#' 4. discretise every covariate into 2, 3, 4 or 5 equal-width bins;
+#' 5. sample the binary labels.
+#'
+#' The logit is computed from the continuous covariates, i.e. before step 4.
+#' `mvrnorm()` (MASS) and `discretize()` (presumably infotheo -- confirm) are
+#' called unqualified, so the caller must have attached those packages.
+#'
+#' @param sample_size Number of examples (rows) to simulate.
+#' @param num_features Number of covariates (columns). Must be at least the
+#'   largest feature index used by `model`.
+#' @param theta_pred Multiplier of the predictive part of the logit.
+#' @param model Model number from 1 to 7; fractional values are truncated.
+#' @return A list with `data` (data frame of discretised covariates),
+#'   `treatment`, `labels`, and the column indices `prog_features`,
+#'   `pred_features` and `irr_features` (columns in neither set).
+Generate.Data <- function(sample_size, num_features, theta_pred, model) {
+  # A numeric switch() has no default branch: previously only model 8 reached
+  # stop(), while 0, 9, ... returned NULL and failed later on a missing
+  # variable. Reject everything outside 1..7 up front with the same message.
+  if (!is.numeric(model) || length(model) != 1L || is.na(model) ||
+      model < 1 || model >= 8) {
+    stop("Wrong model")
+  }
+  model <- as.integer(model)
+
+  # Per-model settings, indexed by model number.
+  correl <- c(0, 0, 0.70, 0.70, 0.70, 0.70, 0.70)[model]
+  prog_features <- list(1:5, 1:5, 1:5, 1:5, 1:4, 1:3, 1:4)[[model]]
+  pred_features <- list(4:8, 6:10, 6:10, 6:10, 5:8, 4:6, 5:8)[[model]]
+  # Only model 1 adds 0.1 to the treatment indicator, so its predictive part
+  # also contributes (with weight 0.1) for untreated examples.
+  treatment_shift <- if (model == 1L) 0.1 else 0
+
+  needed <- max(prog_features, pred_features)
+  if (num_features < needed) {
+    stop("num_features must be at least ", needed, " for model ", model)
+  }
+  irr_features <- setdiff(
+    seq(1, num_features, by = 1),
+    c(prog_features, pred_features)
+  )
+
+  ####
+  #### To generate the treatment
+  ####
+  treatment <- rbinom(sample_size, 1, 0.5)
+
+  ####
+  #### To generate the covariates
+  ####
+  # Always sized by num_features: model 2 used to hard-code 15 here and
+  # crashed with "subscript out of bounds" for fewer than 15 features.
+  odd <- seq(1, num_features, by = 2)
+  even <- seq(2, num_features, by = 2)
+  sigma <- diag(num_features)
+  sigma[odd, odd] <- correl # correlation between odd features
+  sigma[even, even] <- correl # correlation between even features
+  diag(sigma) <- 1
+  covariates <- mvrnorm(sample_size, rep(0, num_features), sigma)
+
+  ####
+  #### To generate the labels each model defines its own logit function
+  ####
+  x <- function(j) covariates[, j] # shorthand for one covariate column
+  if (model <= 3L) {
+    # Models 1-3: each part is the plain sum over its feature set.
+    prog_part <- rowSums(covariates[, prog_features])
+    pred_part <- rowSums(covariates[, pred_features])
+  } else if (model == 4L) {
+    # Model 4: one main effect plus two pairwise products per part.
+    prog_part <- x(1) + x(2) * x(3) + x(4) * x(5)
+    pred_part <- x(6) + x(7) * x(8) + x(9) * x(10)
+  } else if (model == 5L) {
+    # Model 5: multiplicative terms, exponential in the predictive part.
+    prog_part <- x(1) * (x(2) * x(3) - x(4)^2)
+    pred_part <- x(5) * exp(x(6) * x(7) - x(8)^2)
+  } else if (model == 6L) {
+    # Model 6: main effect plus a product of two threshold indicators.
+    prog_part <- x(1) + (x(2) > -0.545) * (x(3) < 0.545)
+    pred_part <- x(4) + (x(5) > -0.545) * (x(6) < 0.545)
+  } else {
+    # Model 7: as model 6 with a third threshold indicator.
+    prog_part <- x(1) + (x(2) > -0.545) * (x(3) < 0.545) * (x(4) > 0)
+    pred_part <- x(5) + (x(6) > -0.545) * (x(7) < 0.545) * (x(8) > 0)
+  }
+  logit_pY1 <- prog_part +
+    (treatment + treatment_shift) * theta_pred * pred_part
+
+  ####
+  #### To discretise the covariates (after the logit has been computed)
+  ####
+  # The number of bins is drawn independently for every feature.
+  for (index_feature in seq_len(num_features)) {
+    covariates[, index_feature] <- t(discretize(
+      covariates[, index_feature],
+      disc = "equalwidth",
+      nbins = sample(c(2, 3, 4, 5), 1)
+    ))
+  }
+  covariates <- as.data.frame.matrix(covariates)
+
+  pY1 <- 1 / (1 + exp(-logit_pY1))
+  labels <- rbinom(sample_size, 1, pY1)
+
+  # Build the result as a list directly; growing it from numeric(0) with `$<-`
+  # raised a "Coercing LHS to a list" warning on every call.
+  list(
+    data = covariates,
+    treatment = treatment,
+    labels = labels,
+    prog_features = prog_features,
+    pred_features = pred_features,
+    irr_features = irr_features
+  )
+}
+
+
+
+