415 lines (414 with data), 14.5 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 27/28\n",
"\n",
" To enable shared memory capabilities, try: install.extras('\n",
"NMF\n",
"')\n",
"\n",
"Loading required package: mclust\n",
"\n",
"Package 'mclust' version 5.4.6\n",
"Type 'citation(\"mclust\")' for citing this R package in publications.\n",
"\n",
"Loading required package: ade4\n",
"\n",
"\n",
"Attaching package: ‘ade4’\n",
"\n",
"\n",
"The following object is masked from ‘package:BiocGenerics’:\n",
"\n",
" score\n",
"\n",
"\n",
"\n",
"Attaching package: ‘GPArotation’\n",
"\n",
"\n",
"The following object is masked from ‘package:NMF’:\n",
"\n",
" entropy\n",
"\n",
"\n",
"\n",
"Attaching package: ‘MOFAtools’\n",
"\n",
"\n",
"The following objects are masked from ‘package:NMF’:\n",
"\n",
" featureNames, featureNames<-, predict, sampleNames, sampleNames<-\n",
"\n",
"\n",
"The following objects are masked from ‘package:Biobase’:\n",
"\n",
" featureNames, featureNames<-, sampleNames, sampleNames<-\n",
"\n",
"\n",
"The following object is masked from ‘package:stats’:\n",
"\n",
" predict\n",
"\n",
"\n",
"Loading required package: JADE\n",
"\n",
"Loading required package: lattice\n",
"\n",
"Loading required package: caTools\n",
"\n",
"Loading required package: gdata\n",
"\n",
"gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.\n",
"\n",
"\n",
"\n",
"gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.\n",
"\n",
"\n",
"Attaching package: ‘gdata’\n",
"\n",
"\n",
"The following object is masked from ‘package:Biobase’:\n",
"\n",
" combine\n",
"\n",
"\n",
"The following object is masked from ‘package:BiocGenerics’:\n",
"\n",
" combine\n",
"\n",
"\n",
"The following object is masked from ‘package:stats’:\n",
"\n",
" nobs\n",
"\n",
"\n",
"The following object is masked from ‘package:utils’:\n",
"\n",
" object.size\n",
"\n",
"\n",
"The following object is masked from ‘package:base’:\n",
"\n",
" startsWith\n",
"\n",
"\n",
"Loading required package: gtools\n",
"\n",
"\n",
"Attaching package: ‘gtools’\n",
"\n",
"\n",
"The following object is masked from ‘package:InterSIM’:\n",
"\n",
" logit\n",
"\n",
"\n",
"Loading required package: gplots\n",
"\n",
"\n",
"Attaching package: ‘gplots’\n",
"\n",
"\n",
"The following object is masked from ‘package:stats’:\n",
"\n",
" lowess\n",
"\n",
"\n"
]
}
],
"source": [
"library(\"InterSIM\", quietly = TRUE)\n",
"source(\"runfactorization.R\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1] \"../data/\"\n",
"[1] \"20210329212141\"\n",
"[1] \"../data/simulations_20210329212141/\"\n",
"[1] \"../results20210329212141/\"\n"
]
}
],
"source": [
"# Base folder for data\n",
"data_folder <- \"../data/\"\n",
"# Label to identify current run\n",
"tag <- format(Sys.time(), \"%Y%m%d%H%M%S\")\n",
"# Folder containing simulated data\n",
"simul_folder <- paste0(data_folder, \"simulations_\", tag, \"/\") \n",
"# Folder for comparison results\n",
"results_folder <- paste0(\"../results\", tag, \"/\")\n",
"print(data_folder)\n",
"print(tag)\n",
"print(simul_folder)\n",
"print(results_folder)\n",
"dir.create(data_folder, showWarnings = FALSE)\n",
"dir.create(simul_folder, showWarnings = FALSE)\n",
"dir.create(results_folder, showWarnings = FALSE)\n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"list_clusters <- seq(5,15,5)\n",
"list_distrib <- c(\"heterogeneous\",\"equal\")\n",
"\n",
"# For a given number of clusters\n",
"for(size in list_distrib) {\n",
" thisdir1 <- paste(simul_folder,size, sep=\"\")\n",
" dir.create(thisdir1, showWarnings = FALSE)\n",
" # Data distribution among clusters will either be heterogeneous or equal \n",
" for (num.clusters in list_clusters) {\n",
" thisdir2 <- paste(thisdir1,\"/\",num.clusters, sep=\"\")\n",
" dir.create(thisdir2, showWarnings = FALSE)\n",
" }\n",
" }\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"## Simulate data\n",
"## INPUTS:\n",
"# folder = location where the simulated data should be saved\n",
"# num.clusters = number of clusters to be imposed on the data\n",
"# size = heterogeneous for heterogeneous clusters, equal for equally-sized clusters\n",
"## OUPUTS: matrices of simulated data are saved to file in folder\n",
"##模拟数据\n",
"##输入:\n",
"#folder=应保存模拟数据的位置\n",
"#num.clusters =要施加在数据上的集群数\n",
"#size =对于异构集群,heterogeneous,对于相等大小的集群,equal\n",
"## OUPUTS:模拟数据矩阵保存到文件夹中的文件中\n",
"simulated_data_generation <- function(out.folder, num.clusters, size=\"heterogeneous\", predefined=TRUE) {\n",
" \n",
" # Number of clusters\n",
" num.clusters <- as.numeric(num.clusters)\n",
" # Size of the effect\n",
" effect <- 2.5\n",
" # Sample proportions per clusters defined here are those used for the paper\n",
" #此处定义的每类样本比例是本文使用的比例\n",
" prop_predefined <- list(\n",
" \"heterogeneous\" = list(\n",
" \"5\" = c(0.35, 0.13, 0.19, 0.08, 0.25),\n",
" \"10\" = c(0.20, 0.10, 0.07, 0.10, 0.15, 0.13, 0.10, 0.08, 0.05, 0.02),\n",
" \"15\" = c(0.10,0.08,0.04,0.03,0.12,0.03,0.10,0.03,0.05,0.02,0.1,0.2,0.03,0.02,0.05)\n",
" ),\n",
" \"equal\" = list(\n",
" \"5\" = c(0.25,0.2,0.2,0.2,0.15),\n",
" \"10\" = c(0.15,0.1,0.1,0.1,0.1,0.1,0.05,0.1,0.1,0.1),\n",
" \"15\" = c(0.07,0.07,0.07,0.06,0.07,0.07,0.07,0.06,0.07,0.06,0.07,0.06,0.07,0.06,0.07)\n",
" )\n",
" )\n",
"\n",
" # Check provided parameter (size) against allowed values\n",
" if(! size %in% names(prop_predefined)) {\n",
" print(paste0(\"ERROR: size can only assume value : \", \n",
" paste0(names(prop_predefined), collapse=\",\"),\n",
" \" found : \", size))\n",
" }\n",
"\n",
" # If article proportions are to be used\n",
" if(predefined) {\n",
" # Check provided parameter (number of clusters) against allowed values\n",
" if(! as.character(num.clusters) %in% names(prop_predefined[[size]])) {\n",
" print(paste0(\"ERROR: num.clusters can only assume value : \", \n",
" paste0(names(prop_predefined[[size]]), collapse=\",\"),\n",
" \" found : \",\n",
" num.clusters))\n",
" }\n",
" prop <- prop_predefined[[size]][[as.character(num.clusters)]]\n",
" prop[1] <- 1-sum(prop[-1])\n",
" }\n",
" # Otherwise\n",
" else {\n",
" if(size == \"equal\") {\n",
" # Could be simplified! Only necessary because InterSIM is \"easily offended\" :\n",
" # ensure same object type as in the heterogeneous case, and that not all \n",
" # values are exactly the same (should not impact the number of samples per group)\n",
" # - same type\n",
" equals <- rep(1, num.clusters)\n",
" prop <- equals/sum(equals)\n",
" # - slightly imbalanced\n",
" delta <- 0.05*prop[1]\n",
" prop[1] <- prop[1]+delta\n",
" prop[num.clusters] <- prop[num.clusters]-delta\n",
" # - sum is 1\n",
" prop <- round(prop, digits = 10)\n",
" prop[1] <- 1-sum(prop[-1])\n",
" }\n",
" else {\n",
" random <- runif(n = num.clusters, min = 0, max = 1)\n",
" prop <- random/sum(random)\n",
" }\n",
" }\n",
"\n",
" # Simulate data based on provided parameters\n",
" print(prop)\n",
" print(sum(prop))\n",
" print(sum(prop)==1)\n",
" sim.D <- InterSIM(n.sample=100, cluster.sample.prop=prop, \n",
" delta.methyl=effect, delta.expr=effect, delta.protein=effect, \n",
" p.DMP=0.25, p.DEG=NULL, p.DEP=NULL,\n",
" do.plot=FALSE, sample.cluster=TRUE, feature.cluster=TRUE)\n",
" \n",
" \n",
" thisdir <- paste(out.folder,size,\"/\",num.clusters, sep=\"\")\n",
" print(thisdir)\n",
" #dir.create(paste(out.folder,size,num.clusters, sep=\"/\"), showWarnings = FALSE)\n",
" #dir.create(paste(out.folder,size,num.clusters, sep=\"/\"), showWarnings = FALSE)\n",
" #dir.create(paste(out.folder,size,num.clusters, sep=\"/\"), showWarnings = FALSE)\n",
"\n",
" \n",
"\n",
" # Export simulations as tables\n",
" write.table(sim.D$clustering.assignment, paste(thisdir, \"clusters.txt\", sep=\"/\"), sep=\"\\t\")\n",
" write_table_with_index_header(t(sim.D$dat.methyl), paste(thisdir, \"omics1.txt\", sep=\"/\"))\n",
" write_table_with_index_header(t(sim.D$dat.expr), paste(thisdir, \"omics2.txt\", sep=\"/\"))\n",
" write_table_with_index_header(t(sim.D$dat.protein), paste(thisdir, \"omics3.txt\", sep=\"/\"))\n",
"\n",
" return(\"data saved in folder\")\n",
"}\n",
"\n",
"## Support function\n",
"write_table_with_index_header <- function(data, file, sep=\"\\t\") {\n",
" write.table(cbind(probe=row.names(data),data), file, sep = sep, \n",
" append = FALSE, quote = FALSE, row.names = FALSE, col.names = TRUE)\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1] \"##########\"\n",
"[1] \"-> Distribution: heterogeneous, Nb clusters: 5\"\n",
"[1] \"-> Simulating data...\"\n",
"[1] 0.35 0.13 0.19 0.08 0.25\n",
"[1] 1\n",
"[1] TRUE\n",
"[1] \"../data/simulations_20210329212141/heterogeneous/5\"\n",
"[1] \"-> Done.\"\n",
"[1] \"##########\"\n",
"[1] \"-> Distribution: equal, Nb clusters: 5\"\n",
"[1] \"-> Simulating data...\"\n",
"[1] 0.25 0.20 0.20 0.20 0.15\n",
"[1] 1\n",
"[1] TRUE\n",
"[1] \"../data/simulations_20210329212141/equal/5\"\n",
"[1] \"-> Done.\"\n",
"[1] \"##########\"\n",
"[1] \"-> Distribution: heterogeneous, Nb clusters: 10\"\n",
"[1] \"-> Simulating data...\"\n",
" [1] 0.20 0.10 0.07 0.10 0.15 0.13 0.10 0.08 0.05 0.02\n",
"[1] 1\n",
"[1] TRUE\n",
"[1] \"../data/simulations_20210329212141/heterogeneous/10\"\n",
"[1] \"-> Done.\"\n",
"[1] \"##########\"\n",
"[1] \"-> Distribution: equal, Nb clusters: 10\"\n",
"[1] \"-> Simulating data...\"\n",
" [1] 0.15 0.10 0.10 0.10 0.10 0.10 0.05 0.10 0.10 0.10\n",
"[1] 1\n",
"[1] TRUE\n",
"[1] \"../data/simulations_20210329212141/equal/10\"\n",
"[1] \"-> Done.\"\n",
"[1] \"##########\"\n",
"[1] \"-> Distribution: heterogeneous, Nb clusters: 15\"\n",
"[1] \"-> Simulating data...\"\n",
" [1] 0.10 0.08 0.04 0.03 0.12 0.03 0.10 0.03 0.05 0.02 0.10 0.20 0.03 0.02 0.05\n",
"[1] 1\n",
"[1] TRUE\n",
"[1] \"../data/simulations_20210329212141/heterogeneous/15\"\n",
"[1] \"-> Done.\"\n",
"[1] \"##########\"\n",
"[1] \"-> Distribution: equal, Nb clusters: 15\"\n",
"[1] \"-> Simulating data...\"\n",
" [1] 0.07 0.07 0.07 0.06 0.07 0.07 0.07 0.06 0.07 0.06 0.07 0.06 0.07 0.06 0.07\n",
"[1] 1\n",
"[1] TRUE\n",
"[1] \"../data/simulations_20210329212141/equal/15\"\n",
"[1] \"-> Done.\"\n"
]
}
],
"source": [
"## Simulate data, factorize and compare the results\n",
"\n",
"list_clusters <- seq(5,15,5)\n",
"list_distrib <- c(\"heterogeneous\",\"equal\")\n",
"\n",
"# For a given number of clusters\n",
"for(num.clusters in list_clusters) {\n",
" # Data distribution among clusters will either be heterogeneous or equal \n",
" for (size in list_distrib) {\n",
" \n",
" print(\"##########\")\n",
" print(paste0(\"-> Distribution: \", size, \", Nb clusters: \", num.clusters))\n",
" \n",
" # Make simulated data\n",
" print(\"-> Simulating data...\")\n",
" simulated_data_generation(simul_folder, num.clusters, size, predefined=TRUE)\n",
" \n",
" print(\"-> Done.\")\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}