|
a |
|
b/neusomatic/include/SeqanUtils.h |
|
|
1 |
#ifndef SEQAN_UTILS_HPP |
|
|
2 |
#define SEQAN_UTILS_HPP |
|
|
3 |
|
|
|
4 |
#include <vector> |
|
|
5 |
#include<seqan/seq_io.h> |
|
|
6 |
#include<seqan/misc/interval_tree.h> |
|
|
7 |
#include<seqan/find.h> |
|
|
8 |
|
|
|
9 |
namespace neusomatic{ namespace bio{ |
|
|
10 |
|
|
|
11 |
template<typename Base> |
|
|
12 |
std::string Base2String(const Base& base, const std::string& gap_symbol){ |
|
|
13 |
|
|
|
14 |
//std::cout<<static_cast<unsigned char>(base)<<" :static cast: "; |
|
|
15 |
//std::cout<<(unsigned)seqan::ordValue(base)<<std::endl; |
|
|
16 |
|
|
|
17 |
switch((unsigned)seqan::ordValue(base)) |
|
|
18 |
{ |
|
|
19 |
case 0: return "A"; |
|
|
20 |
case 1: return "C"; |
|
|
21 |
case 2: return "G"; |
|
|
22 |
case 3: return "T"; |
|
|
23 |
default: return gap_symbol; |
|
|
24 |
} |
|
|
25 |
//should never reach here |
|
|
26 |
return ""; |
|
|
27 |
|
|
|
28 |
} |
|
|
29 |
|
|
|
30 |
template<> |
|
|
31 |
std::string Base2String<char>(const char& base, const std::string& gap_symbol) { |
|
|
32 |
switch(base) { |
|
|
33 |
case 'a': |
|
|
34 |
case 'A': |
|
|
35 |
case 0: |
|
|
36 |
return "A"; |
|
|
37 |
|
|
|
38 |
case 'c': |
|
|
39 |
case 'C': |
|
|
40 |
case 1: |
|
|
41 |
return "C"; |
|
|
42 |
|
|
|
43 |
case 'g': |
|
|
44 |
case 'G': |
|
|
45 |
case 2: |
|
|
46 |
return "G"; |
|
|
47 |
case 't': |
|
|
48 |
case 'T': |
|
|
49 |
case 3: |
|
|
50 |
return "T"; |
|
|
51 |
default: |
|
|
52 |
return gap_symbol; |
|
|
53 |
} |
|
|
54 |
} |
|
|
55 |
|
|
|
56 |
template<typename Itr> |
|
|
57 |
std::string to_string(Itr b, Itr e, const std::string& gap_symbol = "N"){ |
|
|
58 |
std::string result; |
|
|
59 |
auto it = b; |
|
|
60 |
for(; it < e; ++it){ |
|
|
61 |
result += Base2String(*it, gap_symbol); |
|
|
62 |
} |
|
|
63 |
return result; |
|
|
64 |
} |
|
|
65 |
|
|
|
66 |
inline std::string to_string(const seqan::Dna5String& seq, const std::string& gap_symbol = "N"){ |
|
|
67 |
std::string result; |
|
|
68 |
for (unsigned int i = 0; i < seqan::length(seq); ++i) { |
|
|
69 |
result += Base2String(seq[i], gap_symbol); |
|
|
70 |
} |
|
|
71 |
return result; |
|
|
72 |
} |
|
|
73 |
|
|
|
74 |
template<typename TInterval> |
|
|
75 |
inline decltype(auto) ToSeqanIntervals(const std::vector<TInterval>& invs){ |
|
|
76 |
|
|
|
77 |
typedef seqan::IntervalAndCargo<int, TInterval> TIntervalAndCargo; |
|
|
78 |
std::vector<TIntervalAndCargo> seqan_intervals; |
|
|
79 |
for (auto it = invs.cbegin(); it != invs.cend(); ++it) { |
|
|
80 |
TIntervalAndCargo i(it->left(), it->right(), *it); |
|
|
81 |
seqan_intervals.push_back(i); |
|
|
82 |
} |
|
|
83 |
return seqan_intervals; |
|
|
84 |
} |
|
|
85 |
|
|
|
86 |
inline std::vector<seqan::CharString> SeqanSplit(seqan::CharString target, std::string splitter) { |
|
|
87 |
/* |
|
|
88 |
Return a vec of size 1 containing the intact target if splitter is not found in the target. |
|
|
89 |
*/ |
|
|
90 |
if (seqan::length(target) == 0 || seqan::length(splitter) == 0) { |
|
|
91 |
throw std::runtime_error("Invalid string and pattern to split."); |
|
|
92 |
} |
|
|
93 |
using TPos = typename seqan::Position<seqan::CharString>::Type; |
|
|
94 |
|
|
|
95 |
std::vector<seqan::CharString> result; |
|
|
96 |
seqan::Finder<seqan::CharString> finder(target); |
|
|
97 |
seqan::Pattern<seqan::CharString, seqan::Horspool> pattern(splitter); |
|
|
98 |
std::vector<std::pair<TPos, TPos>> searches; |
|
|
99 |
while (seqan::find(finder, pattern)) { |
|
|
100 |
searches.emplace_back(seqan::beginPosition(finder), seqan::endPosition(finder)); |
|
|
101 |
} |
|
|
102 |
|
|
|
103 |
TPos lastp = 0; |
|
|
104 |
for (auto const& p: searches) { |
|
|
105 |
seqan::CharString unit = seqan::infix(target, lastp, p.first); |
|
|
106 |
if (!seqan::empty(unit)) result.push_back(unit); |
|
|
107 |
lastp = p.second; |
|
|
108 |
} |
|
|
109 |
if (lastp < seqan::length(target) - 1) { |
|
|
110 |
seqan::CharString unit = seqan::infix(target, lastp, seqan::length(target)); |
|
|
111 |
result.push_back(unit); |
|
|
112 |
} |
|
|
113 |
return result; |
|
|
114 |
} |
|
|
115 |
|
|
|
116 |
}// namespace neusomatic |
|
|
117 |
}// namespace bio |
|
|
118 |
|
|
|
119 |
#endif /* SEQAN_UTILS_HPP */ |