Mercurial > repos > chrisd > testshed
comparison snp_caller/src/SnipDriver.h @ 0:0fd352f62446 draft default tip
planemo upload for repository https://github.com/ChrisD11/Duplicon commit 3ee0594c692faac542ffa58f4339d79b9b8aefbd-dirty
| author | chrisd |
|---|---|
| date | Sun, 21 Feb 2016 06:05:24 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0fd352f62446 |
|---|---|
| 1 #ifndef SNIP_DRIVER_H | |
| 2 #define SNIP_DRIVER_H | |
| 3 | |
| 4 #include "Alignment.h" | |
| 5 #include "FastaRecord.h" | |
| 6 #include "args.h" | |
| 7 | |
| 8 #include <iostream> | |
| 9 #include <string> | |
| 10 #include <fstream> | |
| 11 #include <algorithm> | |
| 12 #include <set> | |
| 13 | |
| 14 typedef decltype(nullptr) nullptr_t; | |
| 15 typedef std::vector<std::pair<int,char>> cigar_str; | |
| 16 | |
| 17 std::set<std::string> read_pair_snips; | |
| 18 | |
| 19 struct CompareSnips { | |
| 20 bool operator()(const std::string &first, const std::string &second) { | |
| 21 return first.length() < second.length(); | |
| 22 } | |
| 23 }; | |
| 24 | |
| 25 | |
| 26 Alignment *get_mate_pair(std::vector<Alignment> &alignments, const std::string &qname, const int pnext) { | |
| 27 for(auto &alignment : alignments) { | |
| 28 bool mate_pair = alignment.second_in_pair(); | |
| 29 std::string mate_qname = alignment.qname(); | |
| 30 int mate_pos = alignment.pos(); | |
| 31 | |
| 32 if( (mate_pair == 1) && (mate_qname == qname) && (mate_pos == pnext)) { | |
| 33 return &alignment; | |
| 34 } | |
| 35 } | |
| 36 return nullptr; | |
| 37 } | |
| 38 | |
| 39 void parse_cigar(FastaRecord &record, Alignment &alignment) { | |
| 40 cigar_str cigar = alignment.get_cigar(); | |
| 41 | |
| 42 int occurrence; | |
| 43 char operation; | |
| 44 | |
| 45 int gene_pos = alignment.pos()-1; | |
| 46 int read_pos = 0; | |
| 47 | |
| 48 char base_in_ref, base_in_read; | |
| 49 | |
| 50 std::string read = alignment.seq(); | |
| 51 std::string qname = alignment.qname(); | |
| 52 | |
| 53 std::string gene = record.gene(); | |
| 54 std::string gene_id = record.gene_id(); | |
| 55 | |
| 56 std::string snip_pattern; | |
| 57 | |
| 58 for(int i = 0; i < cigar.size(); i++) { | |
| 59 occurrence = cigar[i].first; | |
| 60 operation = cigar[i].second; | |
| 61 | |
| 62 if(operation == 'M' || operation == '=') { | |
| 63 for(int k = 0; k < occurrence; k++) { | |
| 64 if(gene_pos >= gene.length() || read_pos >= read.length()) { | |
| 65 break; | |
| 66 } | |
| 67 | |
| 68 base_in_ref = gene[gene_pos]; | |
| 69 base_in_read = read[read_pos]; | |
| 70 | |
| 71 if(base_in_ref != base_in_read) { | |
| 72 snip_pattern = std::to_string(gene_pos) + | |
| 73 std::string(1,base_in_ref) + "->" + | |
| 74 std::string(1, base_in_read) + ":"; | |
| 75 read_pair_snips.insert(snip_pattern); | |
| 76 } | |
| 77 | |
| 78 ++gene_pos; | |
| 79 ++read_pos; | |
| 80 } | |
| 81 } | |
| 82 else if(operation == 'I') { | |
| 83 read_pos += occurrence; | |
| 84 gene_pos += occurrence; | |
| 85 } | |
| 86 else if(operation == 'S') { | |
| 87 read_pos += occurrence; | |
| 88 } | |
| 89 else if(operation == 'N') { | |
| 90 gene_pos += occurrence; | |
| 91 } | |
| 92 else if(operation == 'P') { | |
| 93 read_pos += occurrence; | |
| 94 gene_pos += occurrence; | |
| 95 } | |
| 96 else if(operation == 'D') { | |
| 97 gene_pos += occurrence; | |
| 98 } | |
| 99 else if(operation == 'X') { | |
| 100 read_pos += occurrence; | |
| 101 gene_pos += occurrence; | |
| 102 } | |
| 103 else { | |
| 104 std::cout << "Operation " << operation << " not handled." << std::endl; | |
| 105 } | |
| 106 } | |
| 107 } | |
| 108 | |
| 109 std::string convert_snips_to_pattern(const std::set<std::string> &read_pair_snips) { | |
| 110 std::vector<std::string> snips; | |
| 111 snips.assign(read_pair_snips.begin(), read_pair_snips.end()); | |
| 112 | |
| 113 CompareSnips comp; | |
| 114 sort(snips.begin(), snips.end(), comp); | |
| 115 | |
| 116 std::string pattern; | |
| 117 for(const auto &snp : snips) { | |
| 118 pattern+=snp; | |
| 119 } | |
| 120 return pattern; | |
| 121 } | |
| 122 | |
| 123 bool find_se_snips(std::vector<FastaRecord> &records, | |
| 124 Alignment &read_pair) { | |
| 125 | |
| 126 int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname()); | |
| 127 if(fasta_record_idx == records.size()) { | |
| 128 return false; | |
| 129 } | |
| 130 | |
| 131 parse_cigar(records[fasta_record_idx], read_pair); | |
| 132 std::string snp = convert_snips_to_pattern(read_pair_snips); | |
| 133 | |
| 134 if(!snp.empty()) { | |
| 135 records[fasta_record_idx].snip_database[snp]++; | |
| 136 } | |
| 137 | |
| 138 read_pair_snips.clear(); | |
| 139 | |
| 140 return true; | |
| 141 } | |
| 142 | |
| 143 bool find_pe_snips(std::vector<FastaRecord> &records, | |
| 144 std::vector<Alignment> &alignments, | |
| 145 Alignment &read_pair) { | |
| 146 | |
| 147 if(read_pair.first_in_pair()) { | |
| 148 std::string qname = read_pair.qname(); | |
| 149 int mate_start_pos = read_pair.pnext(); | |
| 150 | |
| 151 Alignment *mate_pair = get_mate_pair(alignments, qname, mate_start_pos); | |
| 152 if(mate_pair == nullptr) { | |
| 153 return false; | |
| 154 } | |
| 155 if(mate_pair->rname () != read_pair.rname()) { | |
| 156 if(mate_pair->alternate_hits.size() == 0) { | |
| 157 return false; | |
| 158 } | |
| 159 | |
| 160 bool found_alt_hit = false; | |
| 161 for(const auto &alternative: mate_pair->alternate_hits) { | |
| 162 if(alternative.rname == read_pair.rname()) { | |
| 163 found_alt_hit = true; | |
| 164 mate_pair->set_rname(alternative.rname); | |
| 165 mate_pair->set_cigar(alternative.cigar); | |
| 166 mate_pair->set_pos(alternative.pos); | |
| 167 break; | |
| 168 } | |
| 169 } | |
| 170 if(!found_alt_hit) { | |
| 171 return false; | |
| 172 } | |
| 173 } | |
| 174 | |
| 175 | |
| 176 int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname()); | |
| 177 if(fasta_record_idx == records.size()) { | |
| 178 return false; | |
| 179 } | |
| 180 | |
| 181 parse_cigar(records[fasta_record_idx], read_pair); | |
| 182 parse_cigar(records[fasta_record_idx], *mate_pair); | |
| 183 | |
| 184 std::string snp = convert_snips_to_pattern(read_pair_snips); | |
| 185 | |
| 186 if(!snp.empty()) { | |
| 187 records[fasta_record_idx].snip_database[snp]++; | |
| 188 } | |
| 189 | |
| 190 | |
| 191 read_pair_snips.clear(); | |
| 192 } | |
| 193 | |
| 194 return true; | |
| 195 } | |
| 196 | |
| 197 void run(std::vector<FastaRecord> &records, std::vector<Alignment> &alignments, cmd_args arg) { | |
| 198 if(arg.samse) { | |
| 199 for(int i = 0; i < alignments.size(); i++) { | |
| 200 find_se_snips(records, alignments[i]); | |
| 201 } | |
| 202 } | |
| 203 else { | |
| 204 for(int i = 0; i < alignments.size(); i++) { | |
| 205 find_pe_snips(records, alignments, alignments[i]); | |
| 206 } | |
| 207 } | |
| 208 } | |
| 209 | |
| 210 void write_header(const std::string &out_fp) { | |
| 211 std::ofstream out(out_fp.c_str()); | |
| 212 out << "Chr,Haplotype,Frequency" << '\n'; | |
| 213 } | |
| 214 | |
| 215 void write_snips(std::vector<FastaRecord> &records, const std::string &out_fp) { | |
| 216 write_header(out_fp); | |
| 217 std::ofstream out(out_fp.c_str(), std::ofstream::app); | |
| 218 for(const auto &record : records) { | |
| 219 for(const auto &snip : record.snip_database) { | |
| 220 out << record.gene_id() << "," << snip.first << "," << snip.second << '\n'; | |
| 221 } | |
| 222 } | |
| 223 } | |
| 224 | |
| 225 | |
| 226 #endif // SNIP_DRIVER_H |
