#' Generate a synthetic randomised-trial dataset with a binary outcome
#'
#' Draws a random binary treatment indicator (probability 0.5), samples
#' zero-mean, unit-variance multivariate-normal covariates, builds the logit
#' of the outcome as
#' `prog_part + (treatment + offset) * theta_pred * pred_part`
#' from the *continuous* covariates, and only then discretises every covariate
#' into a randomly chosen number (2 to 5) of equal-width bins. The labels are
#' Bernoulli draws from the logistic transform of that logit.
#'
#' Covariance structure: all odd-indexed features share one pairwise
#' correlation and all even-indexed features share the same one; odd/even
#' pairs are uncorrelated. The correlation is 0 for models 1-2 and 0.70 for
#' models 3-7.
#'
#' Requires `mvrnorm()` (package MASS) and a
#' `discretize(x, disc = , nbins = )` function to be available on the search
#' path (presumably from package infotheo -- confirm against the calling
#' script).
#'
#' @param sample_size Number of examples to generate.
#' @param num_features Total number of covariates. Must be at least the
#'   highest column index the chosen model reads (8, 10, 10, 10, 8, 6, 8 for
#'   models 1 to 7).
#' @param theta_pred Multiplier applied to the treatment-by-covariate term of
#'   the logit.
#' @param model Numeric model selector, 1 to 7 (truncated to an integer, as
#'   `switch()` would do).
#' @return A list with elements `data` (data frame of discretised covariates),
#'   `treatment`, `labels`, `prog_features`, `pred_features` and
#'   `irr_features` (indices of features in neither of the previous two sets).
Generate.Data <- function(sample_size, num_features, theta_pred, model) {
  # Highest covariate column read by each model's formulas.
  min_features <- c(8, 10, 10, 10, 8, 6, 8)

  # Validate the selector explicitly: a positional switch() silently returns
  # NULL for out-of-range values instead of reaching a trailing stop().
  if (!is.numeric(model) || length(model) != 1L || is.na(model) ||
      model < 1 || model >= length(min_features) + 1) {
    stop("Wrong model")
  }
  model_index <- as.integer(model)

  if (!is.numeric(num_features) || length(num_features) != 1L ||
      is.na(num_features) || num_features < min_features[model_index]) {
    stop(
      "Model ", model_index, " needs at least ",
      min_features[model_index], " features",
      call. = FALSE
    )
  }

  #### Treatment assignment
  treatment <- rbinom(sample_size, 1, 0.5)

  #### Covariates: block-correlated multivariate normal
  correl <- if (model_index <= 2L) 0 else 0.70
  odd_idx <- seq(1, num_features, by = 2)
  even_idx <- seq(2, num_features, by = 2)
  sigma <- diag(num_features)
  sigma[odd_idx, odd_idx] <- correl   # correlation between odd features
  sigma[even_idx, even_idx] <- correl # correlation between even features
  diag(sigma) <- 1
  covariates <- mvrnorm(sample_size, rep(0, num_features), sigma)

  #### Model-specific prognostic / predictive parts of the logit
  # Only model 1 shifts the treatment indicator (by 0.1), so its predictive
  # part also contributes in the control arm.
  treatment_offset <- 0
  switch(model_index,
    { #### Model 1: linear, overlapping prognostic and predictive features
      prog_features <- 1:5
      pred_features <- 4:8
      prog_part <- rowSums(covariates[, prog_features])
      pred_part <- rowSums(covariates[, pred_features])
      treatment_offset <- 0.1
    },
    { #### Model 2: linear, disjoint feature sets, uncorrelated covariates
      prog_features <- 1:5
      pred_features <- 6:10
      prog_part <- rowSums(covariates[, prog_features])
      pred_part <- rowSums(covariates[, pred_features])
    },
    { #### Model 3: as model 2 but with correlated covariates
      prog_features <- 1:5
      pred_features <- 6:10
      prog_part <- rowSums(covariates[, prog_features])
      pred_part <- rowSums(covariates[, pred_features])
    },
    { #### Model 4: main effect plus two pairwise interactions
      prog_features <- 1:5
      pred_features <- 6:10
      prog_part <- covariates[, 1] +
        apply(covariates[, c(2, 3)], 1, prod) +
        apply(covariates[, c(4, 5)], 1, prod)
      pred_part <- covariates[, 6] +
        apply(covariates[, c(7, 8)], 1, prod) +
        apply(covariates[, c(9, 10)], 1, prod)
    },
    { #### Model 5: non-linear (product / exponential) effects
      prog_features <- 1:4
      pred_features <- 5:8
      prog_part <- covariates[, 1] *
        (covariates[, 2] * covariates[, 3] - covariates[, 4]^2)
      pred_part <- covariates[, 5] *
        exp(covariates[, 6] * covariates[, 7] - covariates[, 8]^2)
    },
    { #### Model 6: main effect plus a two-feature threshold interaction
      prog_features <- 1:3
      pred_features <- 4:6
      prog_part <- covariates[, 1] +
        (covariates[, 2] > -0.545) * (covariates[, 3] < 0.545)
      pred_part <- covariates[, 4] +
        (covariates[, 5] > -0.545) * (covariates[, 6] < 0.545)
    },
    { #### Model 7: main effect plus a three-feature threshold interaction
      prog_features <- 1:4
      pred_features <- 5:8
      prog_part <- covariates[, 1] +
        (covariates[, 2] > -0.545) * (covariates[, 3] < 0.545) *
        (covariates[, 4] > 0)
      pred_part <- covariates[, 5] +
        (covariates[, 6] > -0.545) * (covariates[, 7] < 0.545) *
        (covariates[, 8] > 0)
    }
  )

  logit_pY1 <- prog_part +
    (treatment + treatment_offset) * theta_pred * pred_part
  irr_features <- setdiff(
    seq(1, num_features, by = 1),
    c(prog_features, pred_features)
  )

  #### Discretise each covariate (after the logit has been computed)
  # The bin count is drawn per feature, in feature order, so the sequence of
  # random draws is the same for every model.
  for (index_feature in seq_len(num_features)) {
    covariates[, index_feature] <- t(discretize(
      covariates[, index_feature],
      disc = "equalwidth",
      nbins = sample(c(2, 3, 4, 5), 1)
    ))
  }
  covariates <- as.data.frame(covariates)

  #### Labels
  pY1 <- 1 / (1 + exp(-logit_pY1))
  labels <- rbinom(sample_size, 1, pY1)

  # Build the result as a list directly; assigning with `$` into numeric(0)
  # works only through a coercion that raises a warning on every call.
  list(
    data = covariates,
    treatment = treatment,
    labels = labels,
    prog_features = prog_features,
    pred_features = pred_features,
    irr_features = irr_features
  )
}