[98867e]: / faigen / data / alignment.py

Download this file

129 lines (94 with data), 4.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from fastai import *
from fastai.text import *
from Bio import Seq
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation, CompoundLocation
"""
Sample usage:
data = (AlignmentItemList.from_folder(bam_sam_folder)
#Where are the sequences? -> in ```bam_sam_folder``` and its subfolders
.generate_missing_index()
#Alignment files need indexes. Run samtools to generate the index if it is missing
.take_by_name(list_of_named_alignments)
#select mitochondria, chromosomes, etc.
.toFasta()
#generate FASTA sequences from the alignments
.do_not_label()
#create empty labels for unsupervised learning tasks
.databunch(bs=16, collate_fn=bb_pad_collate))
#Finally we convert to a DataBunch, use a batch size of 16,
# and we use bb_pad_collate to collate the data into a mini-batch
"""
##=====================================
## ItemBase classes
##=====================================
class AlignmentIndexBase(ItemBase):
    "Placeholder item type for alignment indexes; adds nothing to `ItemBase` yet."
class AlignmentItemBase(ItemBase):
    """An alignment item: one alignment record, tracked in the BAI index file as `n_ref`.

    Only the constructor is functional; the remaining methods are stubs.
    """

    def __init__(self, n_ref:str, seq:str, qual:str, cigar:object, meta:dict):
        """Store the record fields on the instance.

        Args:
            n_ref: reference name of the record.
            seq: sequence string.
            qual: quality string.
            cigar: CIGAR information for the record.
            meta: additional metadata.
        """
        # The original unpacked five targets from four values (`qual` was
        # missing on the right-hand side), which raises ValueError.
        self.n_ref, self.seq, self.qual, self.cigar, self.meta = n_ref, seq, qual, cigar, meta

    def loadAlighment(self, file):
        "Stub: takes an alignment `file`, does nothing and returns None."
        pass

    def loadAlignmentIndex(self, aignmentFileName):
        """Load the index for the alignment file `aignmentFileName`.

        Raises:
            NotImplementedError: always; the method has not been written yet.
        """
        # TODO: the original body was an unfinished fragment (an `open()` of the
        # file name and a check of `self.generate_index`). The intended logic
        # cannot be recovered from it, so fail loudly instead of guessing.
        raise NotImplementedError("loadAlignmentIndex is not implemented yet")

    def _getNamedPart(self, part:str):
        "Select a named part, e.g. chromosome, mitochondria, etc. Stub: returns None."
        # The original wrapped this description in backticks, which is not
        # valid Python; it is now a real docstring.

    def toFasta(self, item:Collection[str], position:slice) -> str:
        "Stub: annotated to return `str`, currently does nothing and returns None."
        pass

    def toVariants(self, item:Collection[str], position:slice) -> Collection[str]:
        "Stub: annotated to return `Collection[str]`, currently does nothing and returns None."
        pass
class AlignmentFileProcessor(PreProcessor):
    "`PreProcessor` that opens file names and reads the alignment files."

    def process_one(self, item):
        "Read `item` with `bam_reader` when it is a `Path`; pass anything else through unchanged."
        # NOTE(review): `bam_reader` is not defined or imported in the visible
        # part of this file — confirm where it comes from.
        if isinstance(item, Path):
            return bam_reader(item)
        return item
class AlignmentItemList(ItemList):
    "Special `ItemList` for BAM/SAM alignment files"
    # NOTE(review): SequenceProcessor, IndexProcessor and CygarProcessor are not
    # defined or imported in the visible part of this file — confirm where they
    # come from, otherwise this line raises NameError.
    _sequencer, _indexer, _cygar_processor = SequenceProcessor, IndexProcessor, CygarProcessor
    _is_compressed = True
    _in_memory = False

    @property
    def _bunch(self):
        "`DataBunch` class used when this list is turned into a data bunch."
        # `AlignmentDataBunch` is defined further down in the file, so naming it
        # directly in the class body raised NameError while this class was being
        # created. Resolving it on access avoids the forward reference.
        # Assumes `_bunch` is only read from instances — TODO confirm against
        # fastai's `LabelLists.databunch`.
        return AlignmentDataBunch

    def from_file(self, file):
        """Presumably meant to build the list from a single alignment `file`.

        Raises:
            NotImplementedError: always; the method has not been written yet.
        """
        # The original had a signature with no body at all (a syntax error).
        raise NotImplementedError("from_file is not implemented yet")

    def do_not_label(self, **kwargs):
        "A special labelling method for unsupervised learning"
        # Switch this list to the unlabeled variant and use the same class for
        # the (empty) labels.
        self.__class__ = UnlabeledAlignemtList
        kwargs['label_cls'] = UnlabeledAlignemtList
        return self.label_empty(**kwargs)

    def named_parts(self):
        """Retrieve named parts from the alignment index. Stub: returns None."""
        pass
##=====================================
## DataBunch
##=====================================
class AlignmentDataBunch(DataBunch):
    "DataBunch suitable for generic sequence processing."

    @classmethod
    def from_folder(cls, path:PathOrStr, train:str='train', valid:str='valid', test:Optional[str]=None,
                    classes:Collection[Any]=None, tokenizer:Tokenizer=None, vocab:Vocab=None, chunksize:int=10000,
                    max_vocab:int=60000, min_freq:int=2, mark_fields:bool=False, include_bos:bool=True,
                    include_eos:bool=False, **kwargs):
        """Create an `AlignmentDataBunch` from alignment files in folders.

        Args:
            path: root folder; made absolute before use.
            train, valid: sub-folder names passed to `split_by_folder`.
            test: optional sub-folder of `path` added as the test set.
            classes: passed to `label_from_folder` for the labelled case.
            tokenizer, vocab, chunksize, max_vocab, min_freq, mark_fields,
            include_bos, include_eos: forwarded unchanged to `_get_processor`.
            **kwargs: forwarded to `databunch`.

        Returns:
            Whatever `src.databunch(**kwargs)` returns.
        """
        # `_get_processor` is a private helper of fastai.text.data; as an
        # underscore-prefixed name it is not expected to come in through
        # `from fastai.text import *`, so import it explicitly here.
        from fastai.text.data import _get_processor

        path = Path(path).absolute()
        # The original left a dangling `+` at the end of a line (a syntax
        # error); the continuation is now inside the call's parentheses.
        processor = [AlignmentFileProcessor()] + _get_processor(
            tokenizer=tokenizer, vocab=vocab, chunksize=chunksize, max_vocab=max_vocab,
            min_freq=min_freq, mark_fields=mark_fields, include_bos=include_bos, include_eos=include_eos)
        src = (AlignmentItemList.from_folder(path, processor=processor)
               .split_by_folder(train=train, valid=valid))
        if cls == UnlabeledAlignementDataBunch:
            # The original called `src.label_for_clustering()`, but that method
            # is defined on the data bunch class, not on the item lists.
            # `AlignmentItemList.do_not_label` has the same body and is
            # reachable from `src`.
            src = src.do_not_label()
        else:
            src = src.label_from_folder(classes=classes)
        if test is not None: src.add_test_folder(path/test)
        return src.databunch(**kwargs)
##=====================================
## Unlabeled Alignment Data Bunch
##=====================================
# The base class was spelled `AlignementItemList`, a name that does not exist
# in this file (NameError); the class defined above is `AlignmentItemList`.
class UnlabeledAlignemtList(AlignmentItemList):
    "`AlignmentItemList` variant used for unlabeled data; also serves as its own label class."
    _bunch = AlignmentDataBunch
class UnlabeledAlignementDataBunch(AlignmentDataBunch):
    "DataBunch suitable for unsupervised learning over alignment data"

    def label_for_clustering(self, **kwargs):
        "Switch `self` to the unlabeled list class and attach empty labels."
        # NOTE(review): this rebinds a data bunch's class to an item-list class
        # and then calls `label_empty`; it looks like it was meant to live on
        # the item list (compare `AlignmentItemList.do_not_label`) — confirm.
        unlabeled_cls = UnlabeledAlignemtList
        self.__class__ = unlabeled_cls
        kwargs['label_cls'] = unlabeled_cls
        return self.label_empty(**kwargs)