a b/faigen/data/alignment.py
1
from fastai import *
2
from fastai.text import *
3
from Bio import Seq
4
from Bio.Seq import Seq
5
from Bio import SeqIO
6
from Bio.SeqRecord import SeqRecord
7
from Bio.SeqFeature import FeatureLocation, CompoundLocation
8
9
10
""" 
11
Sample 
12
data = (AlignmentsItemList.from_folder(bam_sam_folder)
13
        #Where are the sequences? -> in ```bam_sam_folder``` and its subfolders
14
        .generate_missing_index(),
15
        #Alignment files need indexes. Run samtools to generate index if missing
16
        .take_by_name(list_of_named_alignments),
17
        #select metachondria, chromosome, etc.
18
        .toFasta(),
19
        #generate fasta sequence from alignment
20
        .do_not_label(),
21
        #create empty labels for unsupervised learning tasks
22
        .databunch(bs=16, collate_fn=bb_pad_collate))
23
        #Finally we convert to a DataBunch, use a batch size of 16,
24
        # and we use bb_pad_collate to collate the data into a mini-batch
25
"""
26
27
28
##=====================================
29
## ItemBase classes
30
##=====================================
31
32
class AlignmentIndexBase(ItemBase):
    "Item type for an alignment index; currently adds nothing on top of `ItemBase`."
34
35
class AlignmentItemBase(ItemBase):
    """An alignment item: one alignment record, tracked in the BAI index file as `n_ref`."""

    def __init__(self, n_ref:str, seq:str, qual:str, cigar:object, meta:dict):
        """Store the fields of one alignment record on the instance.

        NOTE(review): `ItemBase.__init__` is not invoked here, so whatever the base
        class normally initialises (fastai sets `data`/`obj` there) stays unset --
        confirm which field should back it.
        """
        # `qual` used to be missing from the right-hand side, so the five-way
        # unpack raised ValueError on every construction.
        self.n_ref, self.seq, self.qual, self.cigar, self.meta = n_ref, seq, qual, cigar, meta

    def loadAlighment(self, file):
        """Load an alignment from `file`. Stub: does nothing and returns None."""
        pass

    # Correctly spelled alias; the misspelled name is kept for existing callers.
    loadAlignment = loadAlighment

    def loadAlignmentIndex(self, aignmentFileName):
        """Load the index belonging to the alignment file `aignmentFileName`.

        Raises:
            NotImplementedError: always; the method has not been written yet.
        """
        # The draft body was two unfinished statements (`with open(aignmentFileName) as`
        # and `if self.generate_index`), which made the module a SyntaxError.
        # TODO: open the index for `aignmentFileName`; the draft also consulted
        # `self.generate_index`, which nothing in this class sets.
        raise NotImplementedError("loadAlignmentIndex is not implemented yet")

    def _getNamedPart(self, part:str):
        """Select a named part, e.g. chromosome, mitochondria etc. Stub: returns None."""

    def toFasta(self, item:Collection[str], position:slice) -> str:
        """Stub: returns None. Presumably renders `item` at `position` as FASTA text -- TODO confirm."""
        pass

    def toVariants(self, item:Collection[str], position:slice) -> Collection[str]:
        """Stub: returns None. Presumably lists the variants of `item` at `position` -- TODO confirm."""
        pass
55
56
57
class AlignmentFileProcessor(PreProcessor):
    "`PreProcessor` that opens the filenames and reads alignment files."

    def process_one(self, item):
        "Read `item` with `bam_reader` when it is a `Path`; hand anything else back untouched."
        if not isinstance(item, Path):
            return item
        return bam_reader(item)
61
62
63
##=====================================
## DataBunch
##=====================================

# NOTE: `AlignmentDataBunch` is defined ahead of `AlignmentItemList` because the item
# list binds it as the class attribute `_bunch`, which is evaluated as soon as the
# class body executes; the previous order raised NameError at import time.
class AlignmentDataBunch(DataBunch):
    "DataBunch suitable for generic sequence processing."

    @classmethod
    def from_folder(cls, path:PathOrStr, train:str='train', valid:str='valid', test:Optional[str]=None,
                    classes:Collection[Any]=None, tokenizer:Tokenizer=None, vocab:Vocab=None, chunksize:int=10000, max_vocab:int=60000,
                    min_freq:int=2, mark_fields:bool=False, include_bos:bool=True, include_eos:bool=False, **kwargs):
        """Create an `AlignmentDataBunch` from alignment files in folders.

        `path` is made absolute; its `train` and `valid` subfolders provide the split
        and, when `test` is given, `path/test` is added as the test folder. The
        tokenizer/vocab arguments are forwarded to fastai's `_get_processor`, and
        `kwargs` go to `databunch`. Items are left unlabeled when this is called on
        `UnlabeledAlignementDataBunch`; otherwise they are labeled with
        `label_from_folder(classes=classes)`.
        """
        # `_get_processor` is private, so `from fastai.text import *` does not export it.
        from fastai.text.data import _get_processor

        path = Path(path).absolute()
        text_processors = _get_processor(tokenizer=tokenizer, vocab=vocab, chunksize=chunksize, max_vocab=max_vocab,
                                         min_freq=min_freq, mark_fields=mark_fields, include_bos=include_bos,
                                         include_eos=include_eos)
        # Parenthesis-free single expression: the draft ended a line on a bare `+`.
        processor = [AlignmentFileProcessor()] + text_processors
        src = (AlignmentItemList.from_folder(path, processor=processor)
                       .split_by_folder(train=train, valid=valid))
        # Labelling methods are looked up on the item lists, so the unlabeled case must
        # use `do_not_label` (defined on `AlignmentItemList`), not a data-bunch method.
        if cls==UnlabeledAlignementDataBunch: src = src.do_not_label()
        else: src = src.label_from_folder(classes=classes)
        if test is not None: src.add_test_folder(path/test)
        return src.databunch(**kwargs)


##=====================================
## ItemList
##=====================================

class AlignmentItemList(ItemList):
    "Special `ItemList` for BAM/SAM alignment files"
    _bunch = AlignmentDataBunch
    # NOTE(review): these three processor classes are not defined in the visible part
    # of this module -- confirm where they are supposed to come from.
    _sequencer,_indexer, _cygar_processor =  SequenceProcessor, IndexProcessor, CygarProcessor
    _is_compressed = True
    _in_memory = False

    def from_file(self, file):
        "Build the list from a single alignment `file`. Not implemented yet."
        # The draft left this body empty, which is a SyntaxError.
        raise NotImplementedError("from_file is not implemented yet")

    def do_not_label(self, **kwargs):
        "A special labelling method for unsupervised learning"
        # Switch this list to the unlabeled variant, then attach empty labels.
        self.__class__ = UnlabeledAlignemtList
        kwargs['label_cls'] = UnlabeledAlignemtList
        return self.label_empty(**kwargs)

    def named_parts(self):
        """Retrieve named parts from the alignment index. Stub: returns None."""
        pass
110
111
112
##=====================================
113
## Unlabeled Alignment Data Bunch
114
##=====================================
115
116
# The base class was spelled `AlignementItemList`, a name that does not exist
# (NameError at import); the item list in this module is `AlignmentItemList`.
class UnlabeledAlignemtList(AlignmentItemList):
    "Special `ItemList` for unlabeled alignment data (used for unsupervised learning)."
    _bunch = AlignmentDataBunch
119
120
121
class UnlabeledAlignementDataBunch(AlignmentDataBunch):
    "DataBunch suitable for unsupervised learning over alignment data"

    def label_for_clustering(self, **kwargs):
        "A special labelling method for unsupervised learning"
        # NOTE(review): this reassigns `self.__class__` to an item-list class although
        # `self` is a data bunch here, and the same body exists as
        # `AlignmentItemList.do_not_label` -- confirm which class this method belongs on.
        unlabeled_cls = UnlabeledAlignemtList
        self.__class__ = unlabeled_cls
        kwargs['label_cls'] = unlabeled_cls
        return self.label_empty(**kwargs)