Diff of /tests/test_sequences.py [000000] .. [548210]

Switch to unified view

a b/tests/test_sequences.py
1
import dask.dataframe as dd
2
3
from openomics.database.sequence import GENCODE, MirBase
4
from .test_multiomics import *
5
6
7
@pytest.fixture
def generate_GENCODE():
    """Build a GENCODE (human release 32) lncRNA database for the tests.

    Returns:
        The ``GENCODE`` instance, with its ``data`` attribute replaced by a
        1% row sample (``data.sample(frac=0.01)``).
    """
    release_url = "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/"
    resources = {
        "long_noncoding_RNAs.gtf": "gencode.v32.long_noncoding_RNAs.gtf.gz",
        "lncRNA_transcripts.fa": "gencode.v32.lncRNA_transcripts.fa.gz",
    }
    database = GENCODE(path=release_url, file_resources=resources)

    # Keep only a 1% sample of the rows -- presumably to keep dependent tests
    # fast. The sample is unseeded, so the selected rows differ between runs.
    database.data = database.data.sample(frac=0.01)
    return database
15
16
17
@pytest.fixture
def generate_GENCODE_dask():
    """Build the GENCODE (human release 32) lncRNA database with a blocksize.

    Same resources as ``generate_GENCODE``, but ``blocksize='10MB'`` is passed
    to the constructor; ``test_import_dask_GENCODE`` expects the resulting
    ``data`` attribute to be a dask DataFrame.

    Returns:
        The ``GENCODE`` instance, with its ``data`` attribute replaced by a
        1% sample (``data.sample(frac=0.01)``).
    """
    release_url = "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/"
    resources = {
        "long_noncoding_RNAs.gtf": "gencode.v32.long_noncoding_RNAs.gtf.gz",
        "lncRNA_transcripts.fa": "gencode.v32.lncRNA_transcripts.fa.gz",
    }
    database = GENCODE(path=release_url,
                       file_resources=resources,
                       blocksize='10MB')

    # Unseeded 1% sample: the retained rows differ between runs.
    database.data = database.data.sample(frac=0.01)
    return database
26
27
28
@pytest.fixture
def generate_MirBase_ftp():
    """Build a MirBase database from the miRBase FTP ``CURRENT`` directory.

    Returns:
        The ``MirBase`` instance, with its ``data`` attribute replaced by a
        1% row sample (``data.sample(frac=0.01)``).
    """
    ftp_url = "ftp://mirbase.org/pub/mirbase/CURRENT/"
    database = MirBase(path=ftp_url)

    # Unseeded 1% sample: the retained rows differ between runs.
    database.data = database.data.sample(frac=0.01)
    return database
33
34
35
def test_import_mirbase_db(generate_MirBase_ftp):
    """Check that the MirBase database records the FTP path it was built from.

    Args:
        generate_MirBase_ftp: the ``MirBase`` instance produced by the
            fixture of the same name.
    """
    expected_path = "ftp://mirbase.org/pub/mirbase/CURRENT/"
    assert generate_MirBase_ftp.data_path == expected_path
41
42
43
def test_import_GENCODE(generate_GENCODE):
    """Check that the GENCODE database records the FTP path it was built from.

    Args:
        generate_GENCODE: the ``GENCODE`` instance produced by the fixture of
            the same name.
    """
    expected_path = 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/'
    assert generate_GENCODE.data_path == expected_path
49
50
51
def test_import_dask_GENCODE(generate_GENCODE_dask):
    """Check that the blocksize-configured GENCODE exposes a dask DataFrame.

    Args:
        generate_GENCODE_dask: the ``GENCODE`` instance produced by the
            fixture of the same name.
    """
    table = generate_GENCODE_dask.data
    assert isinstance(table, dd.DataFrame)
53
54
55
def test_annotate_GENCODE(generate_TCGA_LUAD, generate_GENCODE):
    """Annotate the LncRNA omic from GENCODE and check the resulting columns.

    Args:
        generate_TCGA_LUAD: fixture object exposing a ``LncRNA`` omic
            (defined outside this module).
        generate_GENCODE: the ``GENCODE`` instance produced by the fixture of
            the same name.
    """
    lncrna = generate_TCGA_LUAD.LncRNA

    # First pass: join on "gene_id" to bring in the gene names.
    lncrna.annotate_attributes(generate_GENCODE, on="gene_id", columns=['gene_name'])

    # Test join on off-index
    lncrna.annotate_attributes(
        generate_GENCODE,
        on="gene_name",
        columns=['transcript_id'],
        agg="concat",
    )

    expected_columns = {'gene_name', 'transcript_id'}
    assert expected_columns.issubset(generate_TCGA_LUAD.LncRNA.annotations.columns)
71
72
73
def test_annotate_dask_GENCODE(generate_TCGA_LUAD, generate_GENCODE_dask):
    """Annotate the LncRNA omic from the blocksize-configured GENCODE database.

    Args:
        generate_TCGA_LUAD: fixture object exposing a ``LncRNA`` omic
            (defined outside this module).
        generate_GENCODE_dask: the ``GENCODE`` instance produced by the
            fixture of the same name.
    """
    # Drop rows without a gene_id before joining on that column.
    gencode_table = generate_GENCODE_dask.data
    has_gene_id = gencode_table["gene_id"].notnull()
    generate_GENCODE_dask.data = gencode_table[has_gene_id]

    lncrna = generate_TCGA_LUAD.LncRNA

    # Test join on index column
    lncrna.annotate_attributes(
        generate_GENCODE_dask,
        on="gene_id",
        columns=['gene_name'],
        agg="concat",
    )

    # Test join on off-index
    lncrna.annotate_attributes(
        generate_GENCODE_dask,
        on="gene_name",
        columns=['transcript_id'],
        agg="concat",
    )

    expected_columns = {'gene_name', 'transcript_id'}
    assert expected_columns.issubset(generate_TCGA_LUAD.LncRNA.annotations.columns)
95
96
97
def test_annotate_sequence_GENCODE(generate_TCGA_LUAD, generate_GENCODE):
    """Annotate the LncRNA omic with GENCODE sequences and check the column.

    Args:
        generate_TCGA_LUAD: fixture object exposing a ``LncRNA`` omic
            (defined outside this module).
        generate_GENCODE: the ``GENCODE`` instance produced by the fixture of
            the same name.
    """
    lncrna = generate_TCGA_LUAD.LncRNA
    lncrna.annotate_sequences(
        generate_GENCODE,
        on="gene_id",
        omic="LncRNA",
        agg="longest",
    )

    # NOTE(review): if ``annotations["sequence"]`` is a pandas Series, ``.empty``
    # is only True when it has zero rows, so this passes even when every
    # sequence is null -- confirm whether a stricter check is wanted.
    sequences = generate_TCGA_LUAD.LncRNA.annotations["sequence"]
    assert not sequences.empty