[548210]: / tests / test_sequences.py

Download this file

104 lines (76 with data), 3.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
import dask.dataframe as dd
from openomics.database.sequence import GENCODE, MirBase
from .test_multiomics import *
@pytest.fixture
def generate_GENCODE():
    """Provide a GENCODE database object built from the release 32 FTP path.

    The lncRNA GTF annotation file and the lncRNA transcript FASTA file are
    requested through ``file_resources``. After construction, ``data`` is
    replaced by the result of ``sample(frac=0.01)``; no seed is passed, so
    the sampled rows are not fixed between runs.

    Returns:
        The ``GENCODE`` instance with its ``data`` attribute down-sampled.
    """
    # NOTE(review): the files are presumably fetched from the FTP server when
    # the object is constructed -- confirm against the GENCODE class.
    resources = {
        "long_noncoding_RNAs.gtf": "gencode.v32.long_noncoding_RNAs.gtf.gz",
        "lncRNA_transcripts.fa": "gencode.v32.lncRNA_transcripts.fa.gz",
    }
    database = GENCODE(
        path="ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/",
        file_resources=resources,
    )
    subsample = database.data.sample(frac=0.01)
    database.data = subsample
    return database
@pytest.fixture
def generate_GENCODE_dask():
    """Provide a GENCODE database object constructed with ``blocksize='10MB'``.

    Uses the same release 32 FTP path and the same two lncRNA resources as
    the plain GENCODE fixture, and likewise replaces ``data`` with the result
    of an unseeded ``sample(frac=0.01)``.

    Returns:
        The ``GENCODE`` instance with its ``data`` attribute down-sampled.
    """
    # NOTE(review): passing a blocksize presumably switches the loader to a
    # dask DataFrame (test_import_dask_GENCODE asserts that type) -- confirm
    # against the GENCODE class.
    resources = {
        "long_noncoding_RNAs.gtf": "gencode.v32.long_noncoding_RNAs.gtf.gz",
        "lncRNA_transcripts.fa": "gencode.v32.lncRNA_transcripts.fa.gz",
    }
    database = GENCODE(
        path="ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/",
        file_resources=resources,
        blocksize='10MB',
    )
    subsample = database.data.sample(frac=0.01)
    database.data = subsample
    return database
@pytest.fixture
def generate_MirBase_ftp():
    """Provide a MirBase database object built from the CURRENT FTP path.

    After construction, ``data`` is replaced by the result of an unseeded
    ``sample(frac=0.01)``.

    Returns:
        The ``MirBase`` instance with its ``data`` attribute down-sampled.
    """
    database = MirBase(path="ftp://mirbase.org/pub/mirbase/CURRENT/")
    subsample = database.data.sample(frac=0.01)
    database.data = subsample
    return database
def test_import_mirbase_db(generate_MirBase_ftp):
    """Check that the MirBase object reports the FTP path it was built with.

    Args:
        generate_MirBase_ftp: MirBase database fixture.
    """
    expected_path = "ftp://mirbase.org/pub/mirbase/CURRENT/"
    assert generate_MirBase_ftp.data_path == expected_path
def test_import_GENCODE(generate_GENCODE):
    """Check that the GENCODE object reports the FTP path it was built with.

    Args:
        generate_GENCODE: GENCODE database fixture.
    """
    expected_path = 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/'
    assert generate_GENCODE.data_path == expected_path
def test_import_dask_GENCODE(generate_GENCODE_dask):
    """Check that the blocksize-configured GENCODE fixture holds a dask DataFrame.

    Args:
        generate_GENCODE_dask: GENCODE database fixture built with a blocksize.
    """
    loaded = generate_GENCODE_dask.data
    assert isinstance(loaded, dd.DataFrame)
def test_annotate_GENCODE(generate_TCGA_LUAD, generate_GENCODE):
    """Annotate the LncRNA omic from GENCODE and check the resulting columns.

    Two annotation passes are made: first ``gene_name`` joined on
    ``gene_id``, then ``transcript_id`` joined on ``gene_name`` with
    ``agg="concat"``. Both column names must then appear in the LncRNA
    annotations.

    Args:
        generate_TCGA_LUAD: multi-omics dataset fixture exposing ``LncRNA``.
        generate_GENCODE: GENCODE database fixture.
    """
    lncrna = generate_TCGA_LUAD.LncRNA

    lncrna.annotate_attributes(
        generate_GENCODE,
        on="gene_id",
        columns=['gene_name'],
    )

    # Second pass joins on gene_name, a column added by the first pass
    # (the "off-index" join case).
    lncrna.annotate_attributes(
        generate_GENCODE,
        on="gene_name",
        columns=['transcript_id'],
        agg="concat",
    )

    required_columns = {'gene_name', 'transcript_id'}
    assert required_columns.issubset(lncrna.annotations.columns)
def test_annotate_dask_GENCODE(generate_TCGA_LUAD, generate_GENCODE_dask):
    """Annotate the LncRNA omic from the blocksize-configured GENCODE fixture.

    Rows of the GENCODE data whose ``gene_id`` is null are dropped first.
    Then ``gene_name`` is joined on ``gene_id`` and ``transcript_id`` is
    joined on ``gene_name``, both with ``agg="concat"``. Both column names
    must then appear in the LncRNA annotations.

    Args:
        generate_TCGA_LUAD: multi-omics dataset fixture exposing ``LncRNA``.
        generate_GENCODE_dask: GENCODE database fixture built with a blocksize.
    """
    gencode_rows = generate_GENCODE_dask.data
    has_gene_id = gencode_rows["gene_id"].notnull()
    generate_GENCODE_dask.data = gencode_rows[has_gene_id]

    lncrna = generate_TCGA_LUAD.LncRNA

    # Order matters: the index-column join (gene_id) runs first, then the
    # off-index join on gene_name.
    join_steps = (
        ("gene_id", ['gene_name']),
        ("gene_name", ['transcript_id']),
    )
    for join_key, wanted_columns in join_steps:
        lncrna.annotate_attributes(
            generate_GENCODE_dask,
            on=join_key,
            columns=wanted_columns,
            agg="concat",
        )

    required_columns = {'gene_name', 'transcript_id'}
    assert required_columns.issubset(lncrna.annotations.columns)
def test_annotate_sequence_GENCODE(generate_TCGA_LUAD, generate_GENCODE):
    """Annotate LncRNA sequences from GENCODE and check the column is non-empty.

    Sequences are joined on ``gene_id`` with ``agg="longest"``. The test
    passes as long as the ``sequence`` annotation column is not ``empty``;
    the values themselves are not inspected.

    Args:
        generate_TCGA_LUAD: multi-omics dataset fixture exposing ``LncRNA``.
        generate_GENCODE: GENCODE database fixture.
    """
    lncrna = generate_TCGA_LUAD.LncRNA
    lncrna.annotate_sequences(
        generate_GENCODE,
        on="gene_id",
        omic="LncRNA",
        agg="longest",
    )

    sequences = lncrna.annotations["sequence"]
    assert not sequences.empty