Mercurial > repos > chrisd > testshed
diff snp_caller/src/SnipDriver.h @ 0:0fd352f62446 draft default tip
planemo upload for repository https://github.com/ChrisD11/Duplicon commit 3ee0594c692faac542ffa58f4339d79b9b8aefbd-dirty
author | chrisd |
---|---|
date | Sun, 21 Feb 2016 06:05:24 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/SnipDriver.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,226 @@ +#ifndef SNIP_DRIVER_H +#define SNIP_DRIVER_H + +#include "Alignment.h" +#include "FastaRecord.h" +#include "args.h" + +#include <iostream> +#include <string> +#include <fstream> +#include <algorithm> +#include <set> + +typedef decltype(nullptr) nullptr_t; +typedef std::vector<std::pair<int,char>> cigar_str; + +std::set<std::string> read_pair_snips; + +struct CompareSnips { + bool operator()(const std::string &first, const std::string &second) { + return first.length() < second.length(); + } +}; + + +Alignment *get_mate_pair(std::vector<Alignment> &alignments, const std::string &qname, const int pnext) { + for(auto &alignment : alignments) { + bool mate_pair = alignment.second_in_pair(); + std::string mate_qname = alignment.qname(); + int mate_pos = alignment.pos(); + + if( (mate_pair == 1) && (mate_qname == qname) && (mate_pos == pnext)) { + return &alignment; + } + } + return nullptr; +} + +void parse_cigar(FastaRecord &record, Alignment &alignment) { + cigar_str cigar = alignment.get_cigar(); + + int occurrence; + char operation; + + int gene_pos = alignment.pos()-1; + int read_pos = 0; + + char base_in_ref, base_in_read; + + std::string read = alignment.seq(); + std::string qname = alignment.qname(); + + std::string gene = record.gene(); + std::string gene_id = record.gene_id(); + + std::string snip_pattern; + + for(int i = 0; i < cigar.size(); i++) { + occurrence = cigar[i].first; + operation = cigar[i].second; + + if(operation == 'M' || operation == '=') { + for(int k = 0; k < occurrence; k++) { + if(gene_pos >= gene.length() || read_pos >= read.length()) { + break; + } + + base_in_ref = gene[gene_pos]; + base_in_read = read[read_pos]; + + if(base_in_ref != base_in_read) { + snip_pattern = std::to_string(gene_pos) + + std::string(1,base_in_ref) + "->" + + std::string(1, base_in_read) + ":"; + read_pair_snips.insert(snip_pattern); + } + + ++gene_pos; + ++read_pos; + } + } + else if(operation == 'I') { + read_pos += occurrence; + gene_pos += occurrence; + } + else if(operation == 'S') { + read_pos += occurrence; + } + else if(operation == 'N') { + gene_pos += occurrence; + } + else if(operation == 'P') { + read_pos += occurrence; + gene_pos += occurrence; + } + else if(operation == 'D') { + gene_pos += occurrence; + } + else if(operation == 'X') { + read_pos += occurrence; + gene_pos += occurrence; + } + else { + std::cout << "Operation " << operation << " not handled." << std::endl; + } + } +} + +std::string convert_snips_to_pattern(const std::set<std::string> &read_pair_snips) { + std::vector<std::string> snips; + snips.assign(read_pair_snips.begin(), read_pair_snips.end()); + + CompareSnips comp; + sort(snips.begin(), snips.end(), comp); + + std::string pattern; + for(const auto &snp : snips) { + pattern+=snp; + } + return pattern; +} + +bool find_se_snips(std::vector<FastaRecord> &records, + Alignment &read_pair) { + + int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname()); + if(fasta_record_idx == records.size()) { + return false; + } + + parse_cigar(records[fasta_record_idx], read_pair); + std::string snp = convert_snips_to_pattern(read_pair_snips); + + if(!snp.empty()) { + records[fasta_record_idx].snip_database[snp]++; + } + + read_pair_snips.clear(); + + return true; +} + +bool find_pe_snips(std::vector<FastaRecord> &records, + std::vector<Alignment> &alignments, + Alignment &read_pair) { + + if(read_pair.first_in_pair()) { + std::string qname = read_pair.qname(); + int mate_start_pos = read_pair.pnext(); + + Alignment *mate_pair = get_mate_pair(alignments, qname, mate_start_pos); + if(mate_pair == nullptr) { + return false; + } + if(mate_pair->rname () != read_pair.rname()) { + if(mate_pair->alternate_hits.size() == 0) { + return false; + } + + bool found_alt_hit = false; + for(const auto &alternative: mate_pair->alternate_hits) { + if(alternative.rname == read_pair.rname()) { + found_alt_hit = true; + mate_pair->set_rname(alternative.rname); + mate_pair->set_cigar(alternative.cigar); + mate_pair->set_pos(alternative.pos); + break; + } + } + if(!found_alt_hit) { + return false; + } + } + + + int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname()); + if(fasta_record_idx == records.size()) { + return false; + } + + parse_cigar(records[fasta_record_idx], read_pair); + parse_cigar(records[fasta_record_idx], *mate_pair); + + std::string snp = convert_snips_to_pattern(read_pair_snips); + + if(!snp.empty()) { + records[fasta_record_idx].snip_database[snp]++; + } + + + read_pair_snips.clear(); + } + + return true; +} + +void run(std::vector<FastaRecord> &records, std::vector<Alignment> &alignments, cmd_args arg) { + if(arg.samse) { + for(int i = 0; i < alignments.size(); i++) { + find_se_snips(records, alignments[i]); + } + } + else { + for(int i = 0; i < alignments.size(); i++) { + find_pe_snips(records, alignments, alignments[i]); + } + } +} + +void write_header(const std::string &out_fp) { + std::ofstream out(out_fp.c_str()); + out << "Chr,Haplotype,Frequency" << '\n'; +} + +void write_snips(std::vector<FastaRecord> &records, const std::string &out_fp) { + write_header(out_fp); + std::ofstream out(out_fp.c_str(), std::ofstream::app); + for(const auto &record : records) { + for(const auto &snip : record.snip_database) { + out << record.gene_id() << "," << snip.first << "," << snip.second << '\n'; + } + } +} + + +#endif // SNIP_DRIVER_H