[b33e61]: / R-scripts / scripts / .ipynb_checkpoints / 数据模拟-checkpoint.ipynb

Download this file

415 lines (414 with data), 14.5 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 27/28\n",
      "\n",
      "  To enable shared memory capabilities, try: install.extras('\n",
      "NMF\n",
      "')\n",
      "\n",
      "Loading required package: mclust\n",
      "\n",
      "Package 'mclust' version 5.4.6\n",
      "Type 'citation(\"mclust\")' for citing this R package in publications.\n",
      "\n",
      "Loading required package: ade4\n",
      "\n",
      "\n",
      "Attaching package: ‘ade4’\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:BiocGenerics’:\n",
      "\n",
      "    score\n",
      "\n",
      "\n",
      "\n",
      "Attaching package: ‘GPArotation’\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:NMF’:\n",
      "\n",
      "    entropy\n",
      "\n",
      "\n",
      "\n",
      "Attaching package: ‘MOFAtools’\n",
      "\n",
      "\n",
      "The following objects are masked from ‘package:NMF’:\n",
      "\n",
      "    featureNames, featureNames<-, predict, sampleNames, sampleNames<-\n",
      "\n",
      "\n",
      "The following objects are masked from ‘package:Biobase’:\n",
      "\n",
      "    featureNames, featureNames<-, sampleNames, sampleNames<-\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:stats’:\n",
      "\n",
      "    predict\n",
      "\n",
      "\n",
      "Loading required package: JADE\n",
      "\n",
      "Loading required package: lattice\n",
      "\n",
      "Loading required package: caTools\n",
      "\n",
      "Loading required package: gdata\n",
      "\n",
      "gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.\n",
      "\n",
      "\n",
      "\n",
      "gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.\n",
      "\n",
      "\n",
      "Attaching package: ‘gdata’\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:Biobase’:\n",
      "\n",
      "    combine\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:BiocGenerics’:\n",
      "\n",
      "    combine\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:stats’:\n",
      "\n",
      "    nobs\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:utils’:\n",
      "\n",
      "    object.size\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:base’:\n",
      "\n",
      "    startsWith\n",
      "\n",
      "\n",
      "Loading required package: gtools\n",
      "\n",
      "\n",
      "Attaching package: ‘gtools’\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:InterSIM’:\n",
      "\n",
      "    logit\n",
      "\n",
      "\n",
      "Loading required package: gplots\n",
      "\n",
      "\n",
      "Attaching package: ‘gplots’\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:stats’:\n",
      "\n",
      "    lowess\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "library(\"InterSIM\", quietly = TRUE)\n",
    "source(\"runfactorization.R\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] \"../data/\"\n",
      "[1] \"20210329212141\"\n",
      "[1] \"../data/simulations_20210329212141/\"\n",
      "[1] \"../results20210329212141/\"\n"
     ]
    }
   ],
   "source": [
    "# Base folder for data\n",
    "data_folder <- \"../data/\"\n",
    "# Label to identify current run\n",
    "tag <- format(Sys.time(), \"%Y%m%d%H%M%S\")\n",
    "# Folder containing simulated data\n",
    "simul_folder <- paste0(data_folder, \"simulations_\", tag, \"/\") \n",
    "# Folder for comparison results\n",
    "results_folder <- paste0(\"../results\", tag, \"/\")\n",
    "print(data_folder)\n",
    "print(tag)\n",
    "print(simul_folder)\n",
    "print(results_folder)\n",
    "dir.create(data_folder, showWarnings = FALSE)\n",
    "dir.create(simul_folder, showWarnings = FALSE)\n",
    "dir.create(results_folder, showWarnings = FALSE)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_clusters <- seq(5,15,5)\n",
    "list_distrib <-  c(\"heterogeneous\",\"equal\")\n",
    "\n",
    "# For a given number of clusters\n",
    "for(size in list_distrib) {\n",
    "    thisdir1 <- paste(simul_folder,size, sep=\"\")\n",
    "    dir.create(thisdir1, showWarnings = FALSE)\n",
    "    # Data distribution among clusters will either be heterogeneous or equal \n",
    "    for (num.clusters in list_clusters) {\n",
    "        thisdir2 <- paste(thisdir1,\"/\",num.clusters, sep=\"\")\n",
    "        dir.create(thisdir2, showWarnings = FALSE)\n",
    "        }\n",
    "    }\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Simulate data\n",
    "## INPUTS:\n",
    "# folder = location  where the simulated data should be saved\n",
    "# num.clusters = number of clusters to be imposed on the data\n",
    "# size = heterogeneous for heterogeneous clusters, equal for equally-sized clusters\n",
    "## OUPUTS: matrices of simulated data are saved to file in folder\n",
    "##模拟数据\n",
    "##输入:\n",
    "#folder=应保存模拟数据的位置\n",
    "#num.clusters =要施加在数据上的集群数\n",
    "#size =对于异构集群,heterogeneous,对于相等大小的集群,equal\n",
    "## OUPUTS:模拟数据矩阵保存到文件夹中的文件中\n",
    "simulated_data_generation <- function(out.folder, num.clusters, size=\"heterogeneous\", predefined=TRUE) {\n",
    "    \n",
    "    # Number of clusters\n",
    "    num.clusters <- as.numeric(num.clusters)\n",
    "    # Size of the effect\n",
    "    effect <- 2.5\n",
    "    # Sample proportions per clusters defined here are those used for the paper\n",
    "    #此处定义的每类样本比例是本文使用的比例\n",
    "    prop_predefined <- list(\n",
    "        \"heterogeneous\" = list(\n",
    "            \"5\" = c(0.35, 0.13, 0.19, 0.08, 0.25),\n",
    "            \"10\" = c(0.20, 0.10, 0.07, 0.10, 0.15, 0.13, 0.10, 0.08, 0.05, 0.02),\n",
    "            \"15\" = c(0.10,0.08,0.04,0.03,0.12,0.03,0.10,0.03,0.05,0.02,0.1,0.2,0.03,0.02,0.05)\n",
    "        ),\n",
    "        \"equal\" = list(\n",
    "            \"5\" = c(0.25,0.2,0.2,0.2,0.15),\n",
    "            \"10\" = c(0.15,0.1,0.1,0.1,0.1,0.1,0.05,0.1,0.1,0.1),\n",
    "            \"15\" = c(0.07,0.07,0.07,0.06,0.07,0.07,0.07,0.06,0.07,0.06,0.07,0.06,0.07,0.06,0.07)\n",
    "        )\n",
    "    )\n",
    "\n",
    "    # Check provided parameter (size) against allowed values\n",
    "    if(! size %in% names(prop_predefined)) {\n",
    "        print(paste0(\"ERROR: size can only assume value : \", \n",
    "                     paste0(names(prop_predefined), collapse=\",\"),\n",
    "                     \" found : \", size))\n",
    "    }\n",
    "\n",
    "    # If article proportions are to be used\n",
    "    if(predefined) {\n",
    "        # Check provided parameter (number of clusters) against allowed values\n",
    "        if(! as.character(num.clusters) %in% names(prop_predefined[[size]])) {\n",
    "            print(paste0(\"ERROR: num.clusters can only assume value : \", \n",
    "                         paste0(names(prop_predefined[[size]]), collapse=\",\"),\n",
    "                         \" found : \",\n",
    "                         num.clusters))\n",
    "        }\n",
    "        prop <- prop_predefined[[size]][[as.character(num.clusters)]]\n",
    "        prop[1] <- 1-sum(prop[-1])\n",
    "    }\n",
    "    # Otherwise\n",
    "    else {\n",
    "        if(size == \"equal\") {\n",
    "            # Could be simplified! Only necessary because InterSIM is \"easily offended\" :\n",
    "            # ensure same object type as in the heterogeneous case, and that not all \n",
    "            # values are exactly the same (should not impact the number of samples per group)\n",
    "            # - same type\n",
    "            equals <- rep(1, num.clusters)\n",
    "            prop <- equals/sum(equals)\n",
    "            # - slightly imbalanced\n",
    "            delta <- 0.05*prop[1]\n",
    "            prop[1] <- prop[1]+delta\n",
    "            prop[num.clusters] <- prop[num.clusters]-delta\n",
    "            # - sum is 1\n",
    "            prop <- round(prop, digits = 10)\n",
    "            prop[1] <- 1-sum(prop[-1])\n",
    "        }\n",
    "        else {\n",
    "            random <- runif(n = num.clusters, min = 0, max = 1)\n",
    "            prop <- random/sum(random)\n",
    "        }\n",
    "    }\n",
    "\n",
    "    # Simulate data based on provided parameters\n",
    "    print(prop)\n",
    "    print(sum(prop))\n",
    "    print(sum(prop)==1)\n",
    "    sim.D <- InterSIM(n.sample=100, cluster.sample.prop=prop, \n",
    "                      delta.methyl=effect, delta.expr=effect, delta.protein=effect, \n",
    "                      p.DMP=0.25, p.DEG=NULL, p.DEP=NULL,\n",
    "                      do.plot=FALSE, sample.cluster=TRUE, feature.cluster=TRUE)\n",
    "                      \n",
    "    \n",
    "    thisdir <- paste(out.folder,size,\"/\",num.clusters, sep=\"\")\n",
    "    print(thisdir)\n",
    "    #dir.create(paste(out.folder,size,num.clusters, sep=\"/\"), showWarnings = FALSE)\n",
    "    #dir.create(paste(out.folder,size,num.clusters, sep=\"/\"), showWarnings = FALSE)\n",
    "    #dir.create(paste(out.folder,size,num.clusters, sep=\"/\"), showWarnings = FALSE)\n",
    "\n",
    "                \n",
    "\n",
    "    # Export simulations as tables\n",
    "    write.table(sim.D$clustering.assignment, paste(thisdir, \"clusters.txt\", sep=\"/\"), sep=\"\\t\")\n",
    "    write_table_with_index_header(t(sim.D$dat.methyl), paste(thisdir, \"omics1.txt\", sep=\"/\"))\n",
    "    write_table_with_index_header(t(sim.D$dat.expr), paste(thisdir, \"omics2.txt\", sep=\"/\"))\n",
    "    write_table_with_index_header(t(sim.D$dat.protein), paste(thisdir, \"omics3.txt\", sep=\"/\"))\n",
    "\n",
    "    return(\"data saved in folder\")\n",
    "}\n",
    "\n",
    "## Support function\n",
    "write_table_with_index_header <- function(data, file, sep=\"\\t\") {\n",
    "    write.table(cbind(probe=row.names(data),data), file, sep = sep, \n",
    "                append = FALSE, quote = FALSE, row.names = FALSE, col.names = TRUE)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] \"##########\"\n",
      "[1] \"-> Distribution: heterogeneous, Nb clusters: 5\"\n",
      "[1] \"-> Simulating data...\"\n",
      "[1] 0.35 0.13 0.19 0.08 0.25\n",
      "[1] 1\n",
      "[1] TRUE\n",
      "[1] \"../data/simulations_20210329212141/heterogeneous/5\"\n",
      "[1] \"-> Done.\"\n",
      "[1] \"##########\"\n",
      "[1] \"-> Distribution: equal, Nb clusters: 5\"\n",
      "[1] \"-> Simulating data...\"\n",
      "[1] 0.25 0.20 0.20 0.20 0.15\n",
      "[1] 1\n",
      "[1] TRUE\n",
      "[1] \"../data/simulations_20210329212141/equal/5\"\n",
      "[1] \"-> Done.\"\n",
      "[1] \"##########\"\n",
      "[1] \"-> Distribution: heterogeneous, Nb clusters: 10\"\n",
      "[1] \"-> Simulating data...\"\n",
      " [1] 0.20 0.10 0.07 0.10 0.15 0.13 0.10 0.08 0.05 0.02\n",
      "[1] 1\n",
      "[1] TRUE\n",
      "[1] \"../data/simulations_20210329212141/heterogeneous/10\"\n",
      "[1] \"-> Done.\"\n",
      "[1] \"##########\"\n",
      "[1] \"-> Distribution: equal, Nb clusters: 10\"\n",
      "[1] \"-> Simulating data...\"\n",
      " [1] 0.15 0.10 0.10 0.10 0.10 0.10 0.05 0.10 0.10 0.10\n",
      "[1] 1\n",
      "[1] TRUE\n",
      "[1] \"../data/simulations_20210329212141/equal/10\"\n",
      "[1] \"-> Done.\"\n",
      "[1] \"##########\"\n",
      "[1] \"-> Distribution: heterogeneous, Nb clusters: 15\"\n",
      "[1] \"-> Simulating data...\"\n",
      " [1] 0.10 0.08 0.04 0.03 0.12 0.03 0.10 0.03 0.05 0.02 0.10 0.20 0.03 0.02 0.05\n",
      "[1] 1\n",
      "[1] TRUE\n",
      "[1] \"../data/simulations_20210329212141/heterogeneous/15\"\n",
      "[1] \"-> Done.\"\n",
      "[1] \"##########\"\n",
      "[1] \"-> Distribution: equal, Nb clusters: 15\"\n",
      "[1] \"-> Simulating data...\"\n",
      " [1] 0.07 0.07 0.07 0.06 0.07 0.07 0.07 0.06 0.07 0.06 0.07 0.06 0.07 0.06 0.07\n",
      "[1] 1\n",
      "[1] TRUE\n",
      "[1] \"../data/simulations_20210329212141/equal/15\"\n",
      "[1] \"-> Done.\"\n"
     ]
    }
   ],
   "source": [
    "## Simulate data, factorize and compare the results\n",
    "\n",
    "list_clusters <- seq(5,15,5)\n",
    "list_distrib <-  c(\"heterogeneous\",\"equal\")\n",
    "\n",
    "# For a given number of clusters\n",
    "for(num.clusters in list_clusters) {\n",
    "    # Data distribution among clusters will either be heterogeneous or equal \n",
    "    for (size in list_distrib) {\n",
    "        \n",
    "        print(\"##########\")\n",
    "        print(paste0(\"-> Distribution: \", size, \", Nb clusters: \", num.clusters))\n",
    "        \n",
    "        # Make simulated data\n",
    "        print(\"-> Simulating data...\")\n",
    "        simulated_data_generation(simul_folder, num.clusters, size, predefined=TRUE)\n",
    "        \n",
    "        print(\"-> Done.\")\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}