comparison snp_caller/src/SnipDriver.h @ 0:0fd352f62446 draft default tip

planemo upload for repository https://github.com/ChrisD11/Duplicon commit 3ee0594c692faac542ffa58f4339d79b9b8aefbd-dirty
author chrisd
date Sun, 21 Feb 2016 06:05:24 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0fd352f62446
1 #ifndef SNIP_DRIVER_H
2 #define SNIP_DRIVER_H
3
4 #include "Alignment.h"
5 #include "FastaRecord.h"
6 #include "args.h"
7
8 #include <iostream>
9 #include <string>
10 #include <fstream>
11 #include <algorithm>
12 #include <set>
13
14 typedef decltype(nullptr) nullptr_t;
15 typedef std::vector<std::pair<int,char>> cigar_str;
16
17 std::set<std::string> read_pair_snips;
18
19 struct CompareSnips {
20 bool operator()(const std::string &first, const std::string &second) {
21 return first.length() < second.length();
22 }
23 };
24
25
26 Alignment *get_mate_pair(std::vector<Alignment> &alignments, const std::string &qname, const int pnext) {
27 for(auto &alignment : alignments) {
28 bool mate_pair = alignment.second_in_pair();
29 std::string mate_qname = alignment.qname();
30 int mate_pos = alignment.pos();
31
32 if( (mate_pair == 1) && (mate_qname == qname) && (mate_pos == pnext)) {
33 return &alignment;
34 }
35 }
36 return nullptr;
37 }
38
39 void parse_cigar(FastaRecord &record, Alignment &alignment) {
40 cigar_str cigar = alignment.get_cigar();
41
42 int occurrence;
43 char operation;
44
45 int gene_pos = alignment.pos()-1;
46 int read_pos = 0;
47
48 char base_in_ref, base_in_read;
49
50 std::string read = alignment.seq();
51 std::string qname = alignment.qname();
52
53 std::string gene = record.gene();
54 std::string gene_id = record.gene_id();
55
56 std::string snip_pattern;
57
58 for(int i = 0; i < cigar.size(); i++) {
59 occurrence = cigar[i].first;
60 operation = cigar[i].second;
61
62 if(operation == 'M' || operation == '=') {
63 for(int k = 0; k < occurrence; k++) {
64 if(gene_pos >= gene.length() || read_pos >= read.length()) {
65 break;
66 }
67
68 base_in_ref = gene[gene_pos];
69 base_in_read = read[read_pos];
70
71 if(base_in_ref != base_in_read) {
72 snip_pattern = std::to_string(gene_pos) +
73 std::string(1,base_in_ref) + "->" +
74 std::string(1, base_in_read) + ":";
75 read_pair_snips.insert(snip_pattern);
76 }
77
78 ++gene_pos;
79 ++read_pos;
80 }
81 }
82 else if(operation == 'I') {
83 read_pos += occurrence;
84 gene_pos += occurrence;
85 }
86 else if(operation == 'S') {
87 read_pos += occurrence;
88 }
89 else if(operation == 'N') {
90 gene_pos += occurrence;
91 }
92 else if(operation == 'P') {
93 read_pos += occurrence;
94 gene_pos += occurrence;
95 }
96 else if(operation == 'D') {
97 gene_pos += occurrence;
98 }
99 else if(operation == 'X') {
100 read_pos += occurrence;
101 gene_pos += occurrence;
102 }
103 else {
104 std::cout << "Operation " << operation << " not handled." << std::endl;
105 }
106 }
107 }
108
109 std::string convert_snips_to_pattern(const std::set<std::string> &read_pair_snips) {
110 std::vector<std::string> snips;
111 snips.assign(read_pair_snips.begin(), read_pair_snips.end());
112
113 CompareSnips comp;
114 sort(snips.begin(), snips.end(), comp);
115
116 std::string pattern;
117 for(const auto &snp : snips) {
118 pattern+=snp;
119 }
120 return pattern;
121 }
122
123 bool find_se_snips(std::vector<FastaRecord> &records,
124 Alignment &read_pair) {
125
126 int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname());
127 if(fasta_record_idx == records.size()) {
128 return false;
129 }
130
131 parse_cigar(records[fasta_record_idx], read_pair);
132 std::string snp = convert_snips_to_pattern(read_pair_snips);
133
134 if(!snp.empty()) {
135 records[fasta_record_idx].snip_database[snp]++;
136 }
137
138 read_pair_snips.clear();
139
140 return true;
141 }
142
143 bool find_pe_snips(std::vector<FastaRecord> &records,
144 std::vector<Alignment> &alignments,
145 Alignment &read_pair) {
146
147 if(read_pair.first_in_pair()) {
148 std::string qname = read_pair.qname();
149 int mate_start_pos = read_pair.pnext();
150
151 Alignment *mate_pair = get_mate_pair(alignments, qname, mate_start_pos);
152 if(mate_pair == nullptr) {
153 return false;
154 }
155 if(mate_pair->rname () != read_pair.rname()) {
156 if(mate_pair->alternate_hits.size() == 0) {
157 return false;
158 }
159
160 bool found_alt_hit = false;
161 for(const auto &alternative: mate_pair->alternate_hits) {
162 if(alternative.rname == read_pair.rname()) {
163 found_alt_hit = true;
164 mate_pair->set_rname(alternative.rname);
165 mate_pair->set_cigar(alternative.cigar);
166 mate_pair->set_pos(alternative.pos);
167 break;
168 }
169 }
170 if(!found_alt_hit) {
171 return false;
172 }
173 }
174
175
176 int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname());
177 if(fasta_record_idx == records.size()) {
178 return false;
179 }
180
181 parse_cigar(records[fasta_record_idx], read_pair);
182 parse_cigar(records[fasta_record_idx], *mate_pair);
183
184 std::string snp = convert_snips_to_pattern(read_pair_snips);
185
186 if(!snp.empty()) {
187 records[fasta_record_idx].snip_database[snp]++;
188 }
189
190
191 read_pair_snips.clear();
192 }
193
194 return true;
195 }
196
197 void run(std::vector<FastaRecord> &records, std::vector<Alignment> &alignments, cmd_args arg) {
198 if(arg.samse) {
199 for(int i = 0; i < alignments.size(); i++) {
200 find_se_snips(records, alignments[i]);
201 }
202 }
203 else {
204 for(int i = 0; i < alignments.size(); i++) {
205 find_pe_snips(records, alignments, alignments[i]);
206 }
207 }
208 }
209
210 void write_header(const std::string &out_fp) {
211 std::ofstream out(out_fp.c_str());
212 out << "Chr,Haplotype,Frequency" << '\n';
213 }
214
215 void write_snips(std::vector<FastaRecord> &records, const std::string &out_fp) {
216 write_header(out_fp);
217 std::ofstream out(out_fp.c_str(), std::ofstream::app);
218 for(const auto &record : records) {
219 for(const auto &snip : record.snip_database) {
220 out << record.gene_id() << "," << snip.first << "," << snip.second << '\n';
221 }
222 }
223 }
224
225
226 #endif // SNIP_DRIVER_H