Diff of /Functions-GenerateData.R [000000] .. [c4e594]

Switch to unified view

a b/Functions-GenerateData.R
1
# Generate a synthetic dataset with a binary treatment, a binary outcome and
# discretized covariates, following one of seven simulation models.
#
# Arguments:
#   sample_size  - number of examples to draw (used as `n` for rbinom/mvrnorm).
#   num_features - total number of covariates; must cover every feature index
#                  the chosen model uses.
#   theta_pred   - multiplier applied to the treatment-dependent (predictive)
#                  term of the logit.
#   model        - model number, 1 to 7. Non-integer numeric values are
#                  truncated, as `switch()` did in the previous implementation.
#
# Returns a list with elements:
#   data          - data.frame of discretized covariates (one column/feature),
#   treatment     - 0/1 treatment indicator drawn with probability 0.5,
#   labels        - 0/1 outcome drawn from the logistic model,
#   prog_features - indices of features entering the prognostic part,
#   pred_features - indices of features entering the predictive part,
#   irr_features  - indices of all remaining features.
#
# Requires `mvrnorm` (MASS) and `discretize` to be available on the search
# path. NOTE(review): `discretize(x, disc =, nbins =)` looks like
# infotheo::discretize -- confirm against the calling script.
#
# The order of random draws (treatment, covariates, one bin-count draw per
# feature, labels) is kept stable so that seeded runs stay reproducible.
Generate.Data <- function(sample_size, num_features, theta_pred, model) {

  #### Validate the model id up front: an out-of-range numeric value used to
  #### fall through `switch()` silently and fail later with an unrelated error.
  if (!is.numeric(model) || length(model) != 1L || !(trunc(model) %in% 1:7)) {
    stop("Wrong model")
  }
  model <- as.integer(model)

  #### Per-model settings
  prog_features <- switch(model, 1:5, 1:5, 1:5, 1:5, 1:4, 1:3, 1:4)
  pred_features <- switch(model, 4:8, 6:10, 6:10, 6:10, 5:8, 4:6, 5:8)
  # Models 1 and 2 use uncorrelated features, models 3 to 7 use 0.70.
  correl <- if (model <= 2L) 0 else 0.70
  # Model 1 adds 0.1 to the treatment indicator in the predictive term.
  treatment_offset <- if (model == 1L) 0.1 else 0

  required_features <- max(prog_features, pred_features)
  if (num_features < required_features) {
    stop("num_features must be at least ", required_features,
         " for model ", model)
  }

  ####
  #### To generate the treatment
  ####
  treatment <- rbinom(sample_size, 1, 0.5)

  ####
  #### To generate the continuous covariates
  ####
  # All pairs of odd-indexed features share `correl`, and likewise all pairs
  # of even-indexed features; the diagonal is then reset to unit variance.
  sigma <- diag(num_features)
  odd_idx <- seq(1, num_features, by = 2)
  even_idx <- seq(2, num_features, by = 2)
  sigma[odd_idx, odd_idx] <- correl
  sigma[even_idx, even_idx] <- correl
  diag(sigma) <- 1
  covariates <- mvrnorm(sample_size, rep(0, num_features), sigma) # needs library(MASS)

  ####
  #### To generate the labels: build the logit for the chosen model from the
  #### continuous covariates (before they are discretized)
  ####
  row_prod <- function(cols) apply(covariates[, cols], 1, prod)

  parts <- switch(model,
    { #### Model 1
      list(prog = rowSums(covariates[, 1:5]),
           pred = rowSums(covariates[, 4:8]))
    },
    { #### Model 2
      list(prog = rowSums(covariates[, 1:5]),
           pred = rowSums(covariates[, 6:10]))
    },
    { #### Model 3
      list(prog = rowSums(covariates[, 1:5]),
           pred = rowSums(covariates[, 6:10]))
    },
    { #### Model 4
      list(prog = covariates[, 1] + row_prod(c(2, 3)) + row_prod(c(4, 5)),
           pred = covariates[, 6] + row_prod(c(7, 8)) + row_prod(c(9, 10)))
    },
    { #### Model 5
      list(prog = covariates[, 1] *
             (covariates[, 2] * covariates[, 3] - covariates[, 4]^2),
           pred = covariates[, 5] *
             exp(covariates[, 6] * covariates[, 7] - covariates[, 8]^2))
    },
    { #### Model 6
      list(prog = covariates[, 1] +
             (covariates[, 2] > -0.545) * (covariates[, 3] < 0.545),
           pred = covariates[, 4] +
             (covariates[, 5] > -0.545) * (covariates[, 6] < 0.545))
    },
    { #### Model 7
      list(prog = covariates[, 1] +
             (covariates[, 2] > -0.545) * (covariates[, 3] < 0.545) *
             (covariates[, 4] > 0),
           pred = covariates[, 5] +
             (covariates[, 6] > -0.545) * (covariates[, 7] < 0.545) *
             (covariates[, 8] > 0))
    }
  )

  logit_pY1 <- parts$prog +
    (treatment + treatment_offset) * theta_pred * parts$pred

  irr_features <- setdiff(seq(1, num_features, by = 1),
                          c(prog_features, pred_features))

  ####
  #### Discretize every covariate into a randomly chosen number of
  #### equal-width bins (2 to 5)
  ####
  for (index_feature in seq_len(num_features)) {
    covariates[, index_feature] <- t(discretize(covariates[, index_feature],
                                                disc = "equalwidth",
                                                nbins = sample(c(2, 3, 4, 5), 1)))
  }
  covariates <- as.data.frame.matrix(covariates)

  pY1 <- 1 / (1 + exp(-logit_pY1))
  labels <- rbinom(sample_size, 1, pY1)

  # Build the result as a proper list instead of growing a numeric(0) via `$<-`.
  list(
    data = covariates,
    treatment = treatment,
    labels = labels,
    prog_features = prog_features,
    pred_features = pred_features,
    irr_features = irr_features
  )
}
188
189
190
191