[214c6e]: / 00_preparation / 01_cage_downloads.ipynb

Download this file

258 lines (257 with data), 17.4 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Downloads for CAGE binding prediction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If necessary, install the prerequisites first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uncomment whichever lines are needed to set up a fresh environment.\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "#!conda install --yes r-base\n",
    "#%pip install rpy2\n",
    "#%pip install tzlocal\n",
    "#%pip install pandas\n",
    "#!conda install --yes -c bioconda bedtools samtools pybedtools\n",
    "#!conda install --yes r-stringi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "from pybedtools import BedTool\n",
    "%load_ext rpy2.ipython"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Everything this notebook downloads or derives is stored under this directory.\n",
    "output = \"../data\"\n",
    "os.makedirs(name=output, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Downloading the data for CAGE prediction\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ENCODE redirects each download to a pre-signed S3 URL, and wget's default\n",
    "# verbose log echoes that URL (including its temporary access token) into the\n",
    "# saved notebook. `-q --show-progress` prints only the progress bar.\n",
    "\n",
    "# HepG2, Dnase, paired-end\n",
    "!wget -q --show-progress https://www.encodeproject.org/files/ENCFF591XCX/@@download/ENCFF591XCX.bam -O {output}/dnase.hepg2.bam\n",
    "!samtools index {output}/dnase.hepg2.bam\n",
    "\n",
    "# HepG2, H3K4me3\n",
    "!wget -q --show-progress https://www.encodeproject.org/files/ENCFF736LHE/@@download/ENCFF736LHE.bigWig -O {output}/h3k4me3.hepg2.bigWig\n",
    "\n",
    "# GENCODE v29 gene annotation, decompressed in place\n",
    "!wget -q --show-progress ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz -O {output}/gencode.v29.annotation.gtf.gz\n",
    "!gunzip -f {output}/gencode.v29.annotation.gtf.gz\n",
    "\n",
    "# CAGE, HepG2, rep1\n",
    "!wget -q --show-progress https://www.encodeproject.org/files/ENCFF177HHM/@@download/ENCFF177HHM.bam -O {output}/cage.hepg2.rep1.bam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the BAM index for the CAGE alignments downloaded above.\n",
    "cage_bam = os.path.join(output, \"cage.hepg2.rep1.bam\")\n",
    "!samtools index {cage_bam}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths handed to the R cells below: the decompressed GENCODE annotation (read)\n",
    "# and the TSS-window annotation derived from it (written).\n",
    "inputannotation, outputannotation = (\n",
    "    os.path.join(output, filename)\n",
    "    for filename in (\"gencode.v29.annotation.gtf\", \"gencode.v29.tss.gtf\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "%R -n library(stringi)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We only need the TSSs of protein-coding genes. Each gene interval is replaced by the 200 bp window directly upstream of its TSS."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i inputannotation\n",
    "\n",
    "# Replace every protein-coding gene interval by the window of `flank` bp\n",
    "# directly upstream of its TSS (strand-aware). The result stays in `df`.\n",
    "# `flank` is an integer so that coordinates remain integers; doubles may be\n",
    "# written in scientific notation by write.table.\n",
    "flank = 200L\n",
    "\n",
    "# quote=\"\" keeps the attribute column verbatim: with read.table's default\n",
    "# quoting the double quotes around attribute values are consumed and a stray\n",
    "# apostrophe can swallow the lines that follow it.\n",
    "df = read.table(inputannotation, header=FALSE, sep=\"\\t\", quote=\"\",\n",
    "                stringsAsFactors=FALSE)\n",
    "\n",
    "df = subset(df, V3 == \"gene\")\n",
    "df = df[stri_detect_fixed(df$V9, \"protein_coding\"), ]\n",
    "\n",
    "is_plus = df$V7 == \"+\"\n",
    "is_minus = df$V7 == \"-\"\n",
    "\n",
    "# + strand: the TSS is the gene start, window = [start - flank, start].\n",
    "# The lower bound is clamped at 1 because GTF coordinates are 1-based.\n",
    "# V5 must be set before V4 is overwritten.\n",
    "df$V5[is_plus] = as.integer(df$V4[is_plus])\n",
    "df$V4[is_plus] = as.integer(pmax(df$V4[is_plus] - flank, 1L))\n",
    "\n",
    "# - strand: the TSS is the gene end, window = [end, end + flank].\n",
    "# The upper bound is not clamped; chromosome lengths are not available here.\n",
    "df$V4[is_minus] = as.integer(df$V5[is_minus])\n",
    "df$V5[is_minus] = as.integer(df$V5[is_minus] + flank)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i outputannotation\n",
    "# Write the TSS windows as a headerless, tab-separated, unquoted table.\n",
    "write.table(df,\n",
    "            file = outputannotation,\n",
    "            sep = \"\\t\",\n",
    "            quote = FALSE,\n",
    "            row.names = FALSE,\n",
    "            col.names = FALSE)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}