|
a |
|
b/faigen/data/alignment.py |
|
|
1 |
from fastai import * |
|
|
2 |
from fastai.text import * |
|
|
3 |
from Bio import Seq |
|
|
4 |
from Bio.Seq import Seq |
|
|
5 |
from Bio import SeqIO |
|
|
6 |
from Bio.SeqRecord import SeqRecord |
|
|
7 |
from Bio.SeqFeature import FeatureLocation, CompoundLocation |
|
|
8 |
|
|
|
9 |
|
|
|
10 |
""" |
|
|
11 |
Sample |
|
|
12 |
data = (AlignmentsItemList.from_folder(bam_sam_folder) |
|
|
13 |
#Where are the sequences? -> in ```bam_sam_folder``` and its subfolders |
|
|
14 |
.generate_missing_index(), |
|
|
15 |
#Alignment files need indexes. Run samtools to generate index if missing |
|
|
16 |
.take_by_name(list_of_named_alignments), |
|
|
17 |
#select mitochondria, chromosome, etc. |
|
|
18 |
.toFasta(), |
|
|
19 |
#generate fasta sequence from alignment |
|
|
20 |
.do_not_label(), |
|
|
21 |
#create empty labels for unsupervised learning tasks |
|
|
22 |
.databunch(bs=16, collate_fn=bb_pad_collate)) |
|
|
23 |
#Finally we convert to a DataBunch, use a batch size of 16, |
|
|
24 |
# and we use bb_pad_collate to collate the data into a mini-batch |
|
|
25 |
""" |
|
|
26 |
|
|
|
27 |
|
|
|
28 |
##===================================== |
|
|
29 |
## ItemBase classes |
|
|
30 |
##===================================== |
|
|
31 |
|
|
|
32 |
class AlignmentIndexBase(ItemBase):
    "Placeholder subclass of `ItemBase`; it adds no attributes or methods yet."
|
|
34 |
|
|
|
35 |
class AlignmentItemBase(ItemBase):
    """An alignment item contains an alignment record tracked in bai index file as n_ref."""

    def __init__(self, n_ref:str, seq:str, qual:str, cigar:object, meta:dict):
        """Store the fields of one alignment record on the instance.

        Each argument is kept verbatim under the attribute of the same name
        (`n_ref`, `seq`, `qual`, `cigar`, `meta`).
        """
        # All five values must appear on the right-hand side so that the tuple
        # unpacking matches the five targets on the left.
        self.n_ref, self.seq, self.qual, self.cigar, self.meta = n_ref, seq, qual, cigar, meta

    def loadAlighment(self, file):
        "Not implemented yet: currently ignores `file` and returns None."
        pass

    def loadAlignmentIndex(self, aignmentFileName):
        """Load the index that belongs to the alignment file `aignmentFileName`.

        Raises:
            NotImplementedError: always, until the loading logic is written.
        """
        # TODO: implement. The unfinished draft opened `aignmentFileName` and
        # branched on `self.generate_index`; presumably the index should be
        # generated when it is missing - confirm the intended behaviour.
        raise NotImplementedError("loadAlignmentIndex is not implemented yet")

    def _getNamedPart(self, part:str):
        """Select a named part, e.g. chromosome, mitochondria etc.

        Not implemented yet: currently ignores `part` and returns None.
        """

    def toFasta(self, item:Collection[str], position:slice) -> str:
        "Not implemented yet: currently ignores its arguments and returns None."
        pass

    def toVariants(self, item:Collection[str], position:slice) -> Collection[str]:
        "Not implemented yet: currently ignores its arguments and returns None."
        pass
|
|
55 |
|
|
|
56 |
|
|
|
57 |
class AlignmentFileProcessor(PreProcessor):
    "`PreProcessor` that opens the filenames and reads alignment files."

    def process_one(self, item):
        "Pass a `Path` item through `bam_reader`; return any other item unchanged."
        # NOTE(review): `bam_reader` is not defined in the visible part of this
        # module - confirm it is in scope at call time.
        if not isinstance(item, Path):
            return item
        return bam_reader(item)
|
|
61 |
|
|
|
62 |
|
|
|
63 |
class AlignmentItemList(ItemList):
    "Special `ItemList` for BAM/SAM alignment files"
    # NOTE(review): `AlignmentDataBunch` is defined further down in this module
    # and `SequenceProcessor`, `IndexProcessor`, `CygarProcessor` are not defined
    # in the visible part of the file - confirm all four names are in scope when
    # this class body executes.
    _bunch = AlignmentDataBunch
    _sequencer, _indexer, _cygar_processor = SequenceProcessor, IndexProcessor, CygarProcessor
    _is_compressed = True
    _in_memory = False

    def from_file(self, file):
        """Build the list from a single alignment `file`.

        Raises:
            NotImplementedError: always, until the loading logic is written.
        """
        # TODO: implement; the method previously had no body at all.
        raise NotImplementedError("from_file is not implemented yet")

    def do_not_label(self, **kwargs):
        "A special labelling method for unsupervised learning"
        # Rebind this list to the unlabeled variant and use that same class for
        # the (empty) labels created by `label_empty`.
        self.__class__ = UnlabeledAlignemtList
        kwargs['label_cls'] = UnlabeledAlignemtList
        return self.label_empty(**kwargs)

    def named_parts(self):
        """Retrieve named parts from alignment index.

        Not implemented yet: currently returns None.
        """
        pass
|
|
82 |
|
|
|
83 |
|
|
|
84 |
|
|
|
85 |
|
|
|
86 |
|
|
|
87 |
##===================================== |
|
|
88 |
## DataBunch |
|
|
89 |
##===================================== |
|
|
90 |
|
|
|
91 |
|
|
|
92 |
class AlignmentDataBunch(DataBunch):
    "DataBunch suitable for generic sequence processing."

    @classmethod
    def from_folder(cls, path:PathOrStr, train:str='train', valid:str='valid', test:Optional[str]=None,
                    classes:Collection[Any]=None, tokenizer:Tokenizer=None, vocab:Vocab=None, chunksize:int=10000,
                    max_vocab:int=60000, min_freq:int=2, mark_fields:bool=False, include_bos:bool=True,
                    include_eos:bool=False, **kwargs):
        """Create an `AlignmentDataBunch` from alignment files in folders.

        `path` is made absolute, its `train` and `valid` sub-folders define the
        split, and `test` (when given) is added as a test folder under `path`.
        The tokenizer/vocabulary arguments are forwarded to fastai's
        `_get_processor`; the remaining `kwargs` go to `databunch`.
        When called on `UnlabeledAlignementDataBunch` the items get empty
        labels, otherwise they are labelled from their folder names using
        `classes`.
        """
        # `_get_processor` is private, so `from fastai.text import *` does not
        # bring it into scope; import it explicitly.
        from fastai.text.data import _get_processor

        path = Path(path).absolute()
        # Parenthesized so the concatenation can span several lines.
        processor = ([AlignmentFileProcessor()] +
                     _get_processor(tokenizer=tokenizer, vocab=vocab, chunksize=chunksize, max_vocab=max_vocab,
                                    min_freq=min_freq, mark_fields=mark_fields, include_bos=include_bos,
                                    include_eos=include_eos))
        src = (AlignmentItemList.from_folder(path, processor=processor)
               .split_by_folder(train=train, valid=valid))
        if cls == UnlabeledAlignementDataBunch:
            # The item list exposes empty labelling as `do_not_label`;
            # `label_for_clustering` is not defined on any item list.
            src = src.do_not_label()
        else:
            src = src.label_from_folder(classes=classes)
        if test is not None: src.add_test_folder(path/test)
        return src.databunch(**kwargs)
|
|
110 |
|
|
|
111 |
|
|
|
112 |
##===================================== |
|
|
113 |
## Unlabeled Alignment Data Bunch |
|
|
114 |
##===================================== |
|
|
115 |
|
|
|
116 |
class UnlabeledAlignemtList(AlignmentItemList):
    "Special `AlignmentItemList` used as the list and label class for unlabeled (unsupervised) alignment data."
    _bunch = AlignmentDataBunch
|
|
119 |
|
|
|
120 |
|
|
|
121 |
class UnlabeledAlignementDataBunch(AlignmentDataBunch):
    "DataBunch suitable for unsupervised learning over alignment data"

    def label_for_clustering(self, **kwargs):
        "Attach empty labels so the data can be used for unsupervised learning."
        # NOTE(review): this rebinds a DataBunch instance to an item-list class
        # and then calls `label_empty`; it looks like it was meant to live on the
        # item list - confirm.
        unlabeled_cls = UnlabeledAlignemtList
        self.__class__ = unlabeled_cls
        kwargs['label_cls'] = unlabeled_cls
        return self.label_empty(**kwargs)