{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Downloads for CAGE binding prediction"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If necessary, download prerequisites first."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"#!conda install --yes r-base\n",
"#!pip install rpy2\n",
"#!pip install tzlocal\n",
"#!conda install --yes -c bioconda bedtools samtools \n",
"#!conda install --yes r-stringi"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"from pybedtools import BedTool\n",
"%load_ext rpy2.ipython"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"output = '../data'\n",
"os.makedirs(output, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Downloading the data for CAGE prediction\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2019-11-07 22:32:03-- https://www.encodeproject.org/files/ENCFF591XCX/@@download/ENCFF591XCX.bam\n",
"Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144\n",
"Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.\n",
"HTTP request sent, awaiting response... 307 Temporary Redirect\n",
"Location: https://encode-public.s3.amazonaws.com/2017/08/24/ac0d9343-0435-490b-aa5d-2f14e8275a9e/ENCFF591XCX.bam?response-content-disposition=attachment%3B%20filename%3DENCFF591XCX.bam&Signature=jmijhfrSNqpgHeqcJfMGh2gdvo0%3D&Expires=1573291923&AWSAccessKeyId=ASIATGZNGCNXYJFYFYZZ&x-amz-security-token=IQoJb3JpZ2luX2VjEH0aCXVzLXdlc3QtMiJHMEUCIADRZwNNtlxpuTIEzjOg1IKZkI1CsGztb7yLNxcHl5%2FzAiEAtSQFSUPnjORUcIz5d5qIg95ep9sGLSalxxwmeucUQDIq2gIIlv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgwyMjA3NDg3MTQ4NjMiDAPjrqM8CxMwi5LNYyquArkCTa8a4F6afLeNoxbBpgT3cTvSsmr7bS7HD%2BmxlwMMjWtzxoTEo6M%2BiANFNyKUhYbjMhKIfdRxdWW%2BEzf2iehVPGEWy4cAlSoasOAOuwthxDBsHr3sokbyBb0zYEVdEY2%2BI%2BAX8eDj93lJbdvpfUY%2BmfUPXIcpC1Nd2E2Ue3SacNbnikcijMdo0fTORxqd7kWX7Zq0Srz4wp5NJwQt%2FRzLZnYTNNpct3sVHKa7ulB6rfiq64eYJms23GzYNeHoaLY5ZgsWRQj8hzffa4A%2Bs8fcX0kXmWW%2FCpJkUONtDFn2KPy3wP36wo2%2FXQQ%2BhUA%2FnIwSI5ZsBA7%2F%2FRDrcl88GjyPgjZ4cDbbWp1%2BkUY1Ehh2Ct5xIsmYbf5au6N4fOdxq7ZB8DmLTkBAzC0VFz17MPuFku4FOs4CuiFA4TAuUPER2ymeG%2BQbdAV0ksdj8FtrovkKC%2BLpwrLLSh%2BUvDVlJfl6a1d3xb3JUIeUrMTpxdnroiIeeUr1psluKJEYkpdi%2F%2BOP6d1t6OmA%2FGMWCwKAJdgMhhgQzYFU2x0OVHrpb7XFKG6QH13atSiZuoP3oYjV5INYPyXrixcnTTYvptJbYZbvni4Lrpq1txRDLPxTs6BPrCbkb86n9L1%2FlpZMkHs8lRbb8%2FJcBe8I8vnNVvTQkXwU5WWCXRvtbGyG6mN9vITeTCTVfVePtyVrb3uhcjVwGk1Dj%2ButivowQ0EJ4EhHmhFYKqm%2BiyvErjtxmsK5%2F2zwcY6XXkl%2FVmhcYZLwHPLW5vPJFJ0GjyTULAldXumir87Ul1AZud%2FO5U0DFF%2BQjL9tvhkmJhqDi1pt4JJ96mdOJdxzk4%2BNA5vRQ9ll6DGG937C9SSF%2Fg%3D%3D [following]\n",
"--2019-11-07 22:32:03-- https://encode-public.s3.amazonaws.com/2017/08/24/ac0d9343-0435-490b-aa5d-2f14e8275a9e/ENCFF591XCX.bam?response-content-disposition=attachment%3B%20filename%3DENCFF591XCX.bam&Signature=jmijhfrSNqpgHeqcJfMGh2gdvo0%3D&Expires=1573291923&AWSAccessKeyId=ASIATGZNGCNXYJFYFYZZ&x-amz-security-token=IQoJb3JpZ2luX2VjEH0aCXVzLXdlc3QtMiJHMEUCIADRZwNNtlxpuTIEzjOg1IKZkI1CsGztb7yLNxcHl5%2FzAiEAtSQFSUPnjORUcIz5d5qIg95ep9sGLSalxxwmeucUQDIq2gIIlv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgwyMjA3NDg3MTQ4NjMiDAPjrqM8CxMwi5LNYyquArkCTa8a4F6afLeNoxbBpgT3cTvSsmr7bS7HD%2BmxlwMMjWtzxoTEo6M%2BiANFNyKUhYbjMhKIfdRxdWW%2BEzf2iehVPGEWy4cAlSoasOAOuwthxDBsHr3sokbyBb0zYEVdEY2%2BI%2BAX8eDj93lJbdvpfUY%2BmfUPXIcpC1Nd2E2Ue3SacNbnikcijMdo0fTORxqd7kWX7Zq0Srz4wp5NJwQt%2FRzLZnYTNNpct3sVHKa7ulB6rfiq64eYJms23GzYNeHoaLY5ZgsWRQj8hzffa4A%2Bs8fcX0kXmWW%2FCpJkUONtDFn2KPy3wP36wo2%2FXQQ%2BhUA%2FnIwSI5ZsBA7%2F%2FRDrcl88GjyPgjZ4cDbbWp1%2BkUY1Ehh2Ct5xIsmYbf5au6N4fOdxq7ZB8DmLTkBAzC0VFz17MPuFku4FOs4CuiFA4TAuUPER2ymeG%2BQbdAV0ksdj8FtrovkKC%2BLpwrLLSh%2BUvDVlJfl6a1d3xb3JUIeUrMTpxdnroiIeeUr1psluKJEYkpdi%2F%2BOP6d1t6OmA%2FGMWCwKAJdgMhhgQzYFU2x0OVHrpb7XFKG6QH13atSiZuoP3oYjV5INYPyXrixcnTTYvptJbYZbvni4Lrpq1txRDLPxTs6BPrCbkb86n9L1%2FlpZMkHs8lRbb8%2FJcBe8I8vnNVvTQkXwU5WWCXRvtbGyG6mN9vITeTCTVfVePtyVrb3uhcjVwGk1Dj%2ButivowQ0EJ4EhHmhFYKqm%2BiyvErjtxmsK5%2F2zwcY6XXkl%2FVmhcYZLwHPLW5vPJFJ0GjyTULAldXumir87Ul1AZud%2FO5U0DFF%2BQjL9tvhkmJhqDi1pt4JJ96mdOJdxzk4%2BNA5vRQ9ll6DGG937C9SSF%2Fg%3D%3D\n",
"Resolving encode-public.s3.amazonaws.com (encode-public.s3.amazonaws.com)... 52.218.237.147\n",
"Connecting to encode-public.s3.amazonaws.com (encode-public.s3.amazonaws.com)|52.218.237.147|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 3537306756 (3,3G) [binary/octet-stream]\n",
"Saving to: ‘../data/dnase.hepg2.bam’\n",
"\n",
"../data/dnase.hepg2 100%[===================>] 3,29G 2,42MB/s in 25m 0s \n",
"\n",
"2019-11-07 22:57:04 (2,25 MB/s) - ‘../data/dnase.hepg2.bam’ saved [3537306756/3537306756]\n",
"\n",
"--2019-11-07 22:57:44-- https://www.encodeproject.org/files/ENCFF736LHE/@@download/ENCFF736LHE.bigWig\n",
"Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144\n",
"Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.\n",
"HTTP request sent, awaiting response... 307 Temporary Redirect\n",
"Location: https://encode-public.s3.amazonaws.com/2017/02/08/59f52ec5-17b9-4de6-b4e3-46aadca158b2/ENCFF736LHE.bigWig?response-content-disposition=attachment%3B%20filename%3DENCFF736LHE.bigWig&Expires=1573293465&Signature=QCRBaPCQA39xHVAd7duEEhGWZks%3D&AWSAccessKeyId=ASIATGZNGCNXYJFYFYZZ&x-amz-security-token=IQoJb3JpZ2luX2VjEH0aCXVzLXdlc3QtMiJHMEUCIADRZwNNtlxpuTIEzjOg1IKZkI1CsGztb7yLNxcHl5%2FzAiEAtSQFSUPnjORUcIz5d5qIg95ep9sGLSalxxwmeucUQDIq2gIIlv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgwyMjA3NDg3MTQ4NjMiDAPjrqM8CxMwi5LNYyquArkCTa8a4F6afLeNoxbBpgT3cTvSsmr7bS7HD%2BmxlwMMjWtzxoTEo6M%2BiANFNyKUhYbjMhKIfdRxdWW%2BEzf2iehVPGEWy4cAlSoasOAOuwthxDBsHr3sokbyBb0zYEVdEY2%2BI%2BAX8eDj93lJbdvpfUY%2BmfUPXIcpC1Nd2E2Ue3SacNbnikcijMdo0fTORxqd7kWX7Zq0Srz4wp5NJwQt%2FRzLZnYTNNpct3sVHKa7ulB6rfiq64eYJms23GzYNeHoaLY5ZgsWRQj8hzffa4A%2Bs8fcX0kXmWW%2FCpJkUONtDFn2KPy3wP36wo2%2FXQQ%2BhUA%2FnIwSI5ZsBA7%2F%2FRDrcl88GjyPgjZ4cDbbWp1%2BkUY1Ehh2Ct5xIsmYbf5au6N4fOdxq7ZB8DmLTkBAzC0VFz17MPuFku4FOs4CuiFA4TAuUPER2ymeG%2BQbdAV0ksdj8FtrovkKC%2BLpwrLLSh%2BUvDVlJfl6a1d3xb3JUIeUrMTpxdnroiIeeUr1psluKJEYkpdi%2F%2BOP6d1t6OmA%2FGMWCwKAJdgMhhgQzYFU2x0OVHrpb7XFKG6QH13atSiZuoP3oYjV5INYPyXrixcnTTYvptJbYZbvni4Lrpq1txRDLPxTs6BPrCbkb86n9L1%2FlpZMkHs8lRbb8%2FJcBe8I8vnNVvTQkXwU5WWCXRvtbGyG6mN9vITeTCTVfVePtyVrb3uhcjVwGk1Dj%2ButivowQ0EJ4EhHmhFYKqm%2BiyvErjtxmsK5%2F2zwcY6XXkl%2FVmhcYZLwHPLW5vPJFJ0GjyTULAldXumir87Ul1AZud%2FO5U0DFF%2BQjL9tvhkmJhqDi1pt4JJ96mdOJdxzk4%2BNA5vRQ9ll6DGG937C9SSF%2Fg%3D%3D [following]\n",
"--2019-11-07 22:57:45-- https://encode-public.s3.amazonaws.com/2017/02/08/59f52ec5-17b9-4de6-b4e3-46aadca158b2/ENCFF736LHE.bigWig?response-content-disposition=attachment%3B%20filename%3DENCFF736LHE.bigWig&Expires=1573293465&Signature=QCRBaPCQA39xHVAd7duEEhGWZks%3D&AWSAccessKeyId=ASIATGZNGCNXYJFYFYZZ&x-amz-security-token=IQoJb3JpZ2luX2VjEH0aCXVzLXdlc3QtMiJHMEUCIADRZwNNtlxpuTIEzjOg1IKZkI1CsGztb7yLNxcHl5%2FzAiEAtSQFSUPnjORUcIz5d5qIg95ep9sGLSalxxwmeucUQDIq2gIIlv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgwyMjA3NDg3MTQ4NjMiDAPjrqM8CxMwi5LNYyquArkCTa8a4F6afLeNoxbBpgT3cTvSsmr7bS7HD%2BmxlwMMjWtzxoTEo6M%2BiANFNyKUhYbjMhKIfdRxdWW%2BEzf2iehVPGEWy4cAlSoasOAOuwthxDBsHr3sokbyBb0zYEVdEY2%2BI%2BAX8eDj93lJbdvpfUY%2BmfUPXIcpC1Nd2E2Ue3SacNbnikcijMdo0fTORxqd7kWX7Zq0Srz4wp5NJwQt%2FRzLZnYTNNpct3sVHKa7ulB6rfiq64eYJms23GzYNeHoaLY5ZgsWRQj8hzffa4A%2Bs8fcX0kXmWW%2FCpJkUONtDFn2KPy3wP36wo2%2FXQQ%2BhUA%2FnIwSI5ZsBA7%2F%2FRDrcl88GjyPgjZ4cDbbWp1%2BkUY1Ehh2Ct5xIsmYbf5au6N4fOdxq7ZB8DmLTkBAzC0VFz17MPuFku4FOs4CuiFA4TAuUPER2ymeG%2BQbdAV0ksdj8FtrovkKC%2BLpwrLLSh%2BUvDVlJfl6a1d3xb3JUIeUrMTpxdnroiIeeUr1psluKJEYkpdi%2F%2BOP6d1t6OmA%2FGMWCwKAJdgMhhgQzYFU2x0OVHrpb7XFKG6QH13atSiZuoP3oYjV5INYPyXrixcnTTYvptJbYZbvni4Lrpq1txRDLPxTs6BPrCbkb86n9L1%2FlpZMkHs8lRbb8%2FJcBe8I8vnNVvTQkXwU5WWCXRvtbGyG6mN9vITeTCTVfVePtyVrb3uhcjVwGk1Dj%2ButivowQ0EJ4EhHmhFYKqm%2BiyvErjtxmsK5%2F2zwcY6XXkl%2FVmhcYZLwHPLW5vPJFJ0GjyTULAldXumir87Ul1AZud%2FO5U0DFF%2BQjL9tvhkmJhqDi1pt4JJ96mdOJdxzk4%2BNA5vRQ9ll6DGG937C9SSF%2Fg%3D%3D\n",
"Resolving encode-public.s3.amazonaws.com (encode-public.s3.amazonaws.com)... 52.218.208.50\n",
"Connecting to encode-public.s3.amazonaws.com (encode-public.s3.amazonaws.com)|52.218.208.50|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1206836202 (1,1G) [binary/octet-stream]\n",
"Saving to: ‘../data/h3k4me3.hepg2.bigWig’\n",
"\n",
"../data/h3k4me3.hep 100%[===================>] 1,12G 2,00MB/s in 8m 19s \n",
"\n",
"2019-11-07 23:06:05 (2,31 MB/s) - ‘../data/h3k4me3.hepg2.bigWig’ saved [1206836202/1206836202]\n",
"\n",
"--2019-11-07 23:06:05-- ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz\n",
" => ‘../data/gencode.v29.annotation.gtf.gz’\n",
"Resolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.197.74\n",
"Connecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.197.74|:21... connected.\n",
"Logging in as anonymous ... Logged in!\n",
"==> SYST ... done. ==> PWD ... done.\n",
"==> TYPE I ... done. ==> CWD (1) /pub/databases/gencode/Gencode_human/release_29 ... done.\n",
"==> SIZE gencode.v29.annotation.gtf.gz ... 39387922\n",
"==> PASV ... done. ==> RETR gencode.v29.annotation.gtf.gz ... done.\n",
"Length: 39387922 (38M) (unauthoritative)\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"gencode.v29.annotat 100%[===================>] 37,56M 2,50MB/s in 14s \n",
"\n",
"2019-11-07 23:06:20 (2,66 MB/s) - ‘../data/gencode.v29.annotation.gtf.gz’ saved [39387922]\n",
"\n",
"--2019-11-07 23:06:33-- https://www.encodeproject.org/files/ENCFF177HHM/@@download/ENCFF177HHM.bam\n",
"Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144\n",
"Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.\n",
"HTTP request sent, awaiting response... 307 Temporary Redirect\n",
"Location: https://encode-public.s3.amazonaws.com/2016/08/04/4d16576e-cc0f-4dc2-942e-d0fe8b473276/ENCFF177HHM.bam?response-content-disposition=attachment%3B%20filename%3DENCFF177HHM.bam&x-amz-security-token=IQoJb3JpZ2luX2VjEH0aCXVzLXdlc3QtMiJHMEUCIADRZwNNtlxpuTIEzjOg1IKZkI1CsGztb7yLNxcHl5%2FzAiEAtSQFSUPnjORUcIz5d5qIg95ep9sGLSalxxwmeucUQDIq2gIIlv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgwyMjA3NDg3MTQ4NjMiDAPjrqM8CxMwi5LNYyquArkCTa8a4F6afLeNoxbBpgT3cTvSsmr7bS7HD%2BmxlwMMjWtzxoTEo6M%2BiANFNyKUhYbjMhKIfdRxdWW%2BEzf2iehVPGEWy4cAlSoasOAOuwthxDBsHr3sokbyBb0zYEVdEY2%2BI%2BAX8eDj93lJbdvpfUY%2BmfUPXIcpC1Nd2E2Ue3SacNbnikcijMdo0fTORxqd7kWX7Zq0Srz4wp5NJwQt%2FRzLZnYTNNpct3sVHKa7ulB6rfiq64eYJms23GzYNeHoaLY5ZgsWRQj8hzffa4A%2Bs8fcX0kXmWW%2FCpJkUONtDFn2KPy3wP36wo2%2FXQQ%2BhUA%2FnIwSI5ZsBA7%2F%2FRDrcl88GjyPgjZ4cDbbWp1%2BkUY1Ehh2Ct5xIsmYbf5au6N4fOdxq7ZB8DmLTkBAzC0VFz17MPuFku4FOs4CuiFA4TAuUPER2ymeG%2BQbdAV0ksdj8FtrovkKC%2BLpwrLLSh%2BUvDVlJfl6a1d3xb3JUIeUrMTpxdnroiIeeUr1psluKJEYkpdi%2F%2BOP6d1t6OmA%2FGMWCwKAJdgMhhgQzYFU2x0OVHrpb7XFKG6QH13atSiZuoP3oYjV5INYPyXrixcnTTYvptJbYZbvni4Lrpq1txRDLPxTs6BPrCbkb86n9L1%2FlpZMkHs8lRbb8%2FJcBe8I8vnNVvTQkXwU5WWCXRvtbGyG6mN9vITeTCTVfVePtyVrb3uhcjVwGk1Dj%2ButivowQ0EJ4EhHmhFYKqm%2BiyvErjtxmsK5%2F2zwcY6XXkl%2FVmhcYZLwHPLW5vPJFJ0GjyTULAldXumir87Ul1AZud%2FO5U0DFF%2BQjL9tvhkmJhqDi1pt4JJ96mdOJdxzk4%2BNA5vRQ9ll6DGG937C9SSF%2Fg%3D%3D&AWSAccessKeyId=ASIATGZNGCNXYJFYFYZZ&Expires=1573293994&Signature=eOZqgUWGDpJiJOf12nkrP1ysTUM%3D [following]\n",
"--2019-11-07 23:06:34-- https://encode-public.s3.amazonaws.com/2016/08/04/4d16576e-cc0f-4dc2-942e-d0fe8b473276/ENCFF177HHM.bam?response-content-disposition=attachment%3B%20filename%3DENCFF177HHM.bam&x-amz-security-token=IQoJb3JpZ2luX2VjEH0aCXVzLXdlc3QtMiJHMEUCIADRZwNNtlxpuTIEzjOg1IKZkI1CsGztb7yLNxcHl5%2FzAiEAtSQFSUPnjORUcIz5d5qIg95ep9sGLSalxxwmeucUQDIq2gIIlv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgwyMjA3NDg3MTQ4NjMiDAPjrqM8CxMwi5LNYyquArkCTa8a4F6afLeNoxbBpgT3cTvSsmr7bS7HD%2BmxlwMMjWtzxoTEo6M%2BiANFNyKUhYbjMhKIfdRxdWW%2BEzf2iehVPGEWy4cAlSoasOAOuwthxDBsHr3sokbyBb0zYEVdEY2%2BI%2BAX8eDj93lJbdvpfUY%2BmfUPXIcpC1Nd2E2Ue3SacNbnikcijMdo0fTORxqd7kWX7Zq0Srz4wp5NJwQt%2FRzLZnYTNNpct3sVHKa7ulB6rfiq64eYJms23GzYNeHoaLY5ZgsWRQj8hzffa4A%2Bs8fcX0kXmWW%2FCpJkUONtDFn2KPy3wP36wo2%2FXQQ%2BhUA%2FnIwSI5ZsBA7%2F%2FRDrcl88GjyPgjZ4cDbbWp1%2BkUY1Ehh2Ct5xIsmYbf5au6N4fOdxq7ZB8DmLTkBAzC0VFz17MPuFku4FOs4CuiFA4TAuUPER2ymeG%2BQbdAV0ksdj8FtrovkKC%2BLpwrLLSh%2BUvDVlJfl6a1d3xb3JUIeUrMTpxdnroiIeeUr1psluKJEYkpdi%2F%2BOP6d1t6OmA%2FGMWCwKAJdgMhhgQzYFU2x0OVHrpb7XFKG6QH13atSiZuoP3oYjV5INYPyXrixcnTTYvptJbYZbvni4Lrpq1txRDLPxTs6BPrCbkb86n9L1%2FlpZMkHs8lRbb8%2FJcBe8I8vnNVvTQkXwU5WWCXRvtbGyG6mN9vITeTCTVfVePtyVrb3uhcjVwGk1Dj%2ButivowQ0EJ4EhHmhFYKqm%2BiyvErjtxmsK5%2F2zwcY6XXkl%2FVmhcYZLwHPLW5vPJFJ0GjyTULAldXumir87Ul1AZud%2FO5U0DFF%2BQjL9tvhkmJhqDi1pt4JJ96mdOJdxzk4%2BNA5vRQ9ll6DGG937C9SSF%2Fg%3D%3D&AWSAccessKeyId=ASIATGZNGCNXYJFYFYZZ&Expires=1573293994&Signature=eOZqgUWGDpJiJOf12nkrP1ysTUM%3D\n",
"Resolving encode-public.s3.amazonaws.com (encode-public.s3.amazonaws.com)... 52.218.204.122\n",
"Connecting to encode-public.s3.amazonaws.com (encode-public.s3.amazonaws.com)|52.218.204.122|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1286351250 (1,2G) [binary/octet-stream]\n",
"Saving to: ‘../data/cage.hepg2.rep1.bam’\n",
"\n",
"../data/cage.hepg2. 100%[===================>] 1,20G 2,45MB/s in 9m 4s \n",
"\n",
"2019-11-07 23:15:39 (2,26 MB/s) - ‘../data/cage.hepg2.rep1.bam’ saved [1286351250/1286351250]\n",
"\n"
]
}
],
"source": [
"# HepG2, Dnase, paired-end\n",
"!wget https://www.encodeproject.org/files/ENCFF591XCX/@@download/ENCFF591XCX.bam -O {output}/dnase.hepg2.bam;\n",
"!samtools index {output}/dnase.hepg2.bam\n",
"\n",
"# HepG2, H3K4me3\n",
"!wget https://www.encodeproject.org/files/ENCFF736LHE/@@download/ENCFF736LHE.bigWig -O {output}/h3k4me3.hepg2.bigWig;\n",
"\n",
"!wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz -O {output}/gencode.v29.annotation.gtf.gz\n",
"!gunzip -f {output}/gencode.v29.annotation.gtf.gz\n",
"\n",
"#CAGE, HepG2, rep1\n",
"!wget https://www.encodeproject.org/files/ENCFF177HHM/@@download/ENCFF177HHM.bam -O {output}/cage.hepg2.rep1.bam\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"!samtools index {output}/cage.hepg2.rep1.bam"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"inputannotation = os.path.join(output, \"gencode.v29.annotation.gtf\")\n",
"outputannotation = os.path.join(output, \"gencode.v29.tss.gtf\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['stringi', 'tools', 'stats', 'graphics', 'grDevices', 'utils',\n",
" 'datasets', 'methods', 'base'], dtype='<U9')"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%R library(stringi)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We only need TSSs from protein coding genes"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"%%R -i inputannotation\n",
"\n",
"df = read.table(inputannotation, header=F, sep=\"\\t\", stringsAsFactor=FALSE)\n",
"\n",
"df = subset(df, V3==\"gene\")\n",
"\n",
"df = df[stri_detect_fixed( df$V9, \"protein_coding\"), ]\n",
"\n",
"df[df$V7 == \"+\",]$V5 = as.integer(df[df$V7 == \"+\",]$V4)\n",
"df[df$V7 == \"+\",]$V4 = as.integer(df[df$V7 == \"+\",]$V4 - 200)\n",
"\n",
"df[df$V7 == \"-\",]$V4 = as.integer(df[df$V7 == \"-\",]$V5)\n",
"df[df$V7 == \"-\",]$V5 = as.integer(df[df$V7 == \"-\",]$V5 + 200)\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"%%R -i outputannotation\n",
"write.table(df, outputannotation, quote=F, sep=\"\\t\", col.names=F, row.names=F)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}