Mercurial > repos > chrisd > testshed
view snp_caller/src/SnipDriver.h @ 0:0fd352f62446 draft default tip
planemo upload for repository https://github.com/ChrisD11/Duplicon commit 3ee0594c692faac542ffa58f4339d79b9b8aefbd-dirty
author | chrisd |
---|---|
date | Sun, 21 Feb 2016 06:05:24 -0500 |
parents | |
children |
line wrap: on
line source
#ifndef SNIP_DRIVER_H #define SNIP_DRIVER_H #include "Alignment.h" #include "FastaRecord.h" #include "args.h" #include <iostream> #include <string> #include <fstream> #include <algorithm> #include <set> typedef decltype(nullptr) nullptr_t; typedef std::vector<std::pair<int,char>> cigar_str; std::set<std::string> read_pair_snips; struct CompareSnips { bool operator()(const std::string &first, const std::string &second) { return first.length() < second.length(); } }; Alignment *get_mate_pair(std::vector<Alignment> &alignments, const std::string &qname, const int pnext) { for(auto &alignment : alignments) { bool mate_pair = alignment.second_in_pair(); std::string mate_qname = alignment.qname(); int mate_pos = alignment.pos(); if( (mate_pair == 1) && (mate_qname == qname) && (mate_pos == pnext)) { return &alignment; } } return nullptr; } void parse_cigar(FastaRecord &record, Alignment &alignment) { cigar_str cigar = alignment.get_cigar(); int occurrence; char operation; int gene_pos = alignment.pos()-1; int read_pos = 0; char base_in_ref, base_in_read; std::string read = alignment.seq(); std::string qname = alignment.qname(); std::string gene = record.gene(); std::string gene_id = record.gene_id(); std::string snip_pattern; for(int i = 0; i < cigar.size(); i++) { occurrence = cigar[i].first; operation = cigar[i].second; if(operation == 'M' || operation == '=') { for(int k = 0; k < occurrence; k++) { if(gene_pos >= gene.length() || read_pos >= read.length()) { break; } base_in_ref = gene[gene_pos]; base_in_read = read[read_pos]; if(base_in_ref != base_in_read) { snip_pattern = std::to_string(gene_pos) + std::string(1,base_in_ref) + "->" + std::string(1, base_in_read) + ":"; read_pair_snips.insert(snip_pattern); } ++gene_pos; ++read_pos; } } else if(operation == 'I') { read_pos += occurrence; gene_pos += occurrence; } else if(operation == 'S') { read_pos += occurrence; } else if(operation == 'N') { gene_pos += occurrence; } else if(operation == 'P') { read_pos += occurrence; gene_pos += occurrence; } else if(operation == 'D') { gene_pos += occurrence; } else if(operation == 'X') { read_pos += occurrence; gene_pos += occurrence; } else { std::cout << "Operation " << operation << " not handled." << std::endl; } } } std::string convert_snips_to_pattern(const std::set<std::string> &read_pair_snips) { std::vector<std::string> snips; snips.assign(read_pair_snips.begin(), read_pair_snips.end()); CompareSnips comp; sort(snips.begin(), snips.end(), comp); std::string pattern; for(const auto &snp : snips) { pattern+=snp; } return pattern; } bool find_se_snips(std::vector<FastaRecord> &records, Alignment &read_pair) { int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname()); if(fasta_record_idx == records.size()) { return false; } parse_cigar(records[fasta_record_idx], read_pair); std::string snp = convert_snips_to_pattern(read_pair_snips); if(!snp.empty()) { records[fasta_record_idx].snip_database[snp]++; } read_pair_snips.clear(); return true; } bool find_pe_snips(std::vector<FastaRecord> &records, std::vector<Alignment> &alignments, Alignment &read_pair) { if(read_pair.first_in_pair()) { std::string qname = read_pair.qname(); int mate_start_pos = read_pair.pnext(); Alignment *mate_pair = get_mate_pair(alignments, qname, mate_start_pos); if(mate_pair == nullptr) { return false; } if(mate_pair->rname () != read_pair.rname()) { if(mate_pair->alternate_hits.size() == 0) { return false; } bool found_alt_hit = false; for(const auto &alternative: mate_pair->alternate_hits) { if(alternative.rname == read_pair.rname()) { found_alt_hit = true; mate_pair->set_rname(alternative.rname); mate_pair->set_cigar(alternative.cigar); mate_pair->set_pos(alternative.pos); break; } } if(!found_alt_hit) { return false; } } int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname()); if(fasta_record_idx == records.size()) { return false; } parse_cigar(records[fasta_record_idx], read_pair); parse_cigar(records[fasta_record_idx], *mate_pair); std::string snp = convert_snips_to_pattern(read_pair_snips); if(!snp.empty()) { records[fasta_record_idx].snip_database[snp]++; } read_pair_snips.clear(); } return true; } void run(std::vector<FastaRecord> &records, std::vector<Alignment> &alignments, cmd_args arg) { if(arg.samse) { for(int i = 0; i < alignments.size(); i++) { find_se_snips(records, alignments[i]); } } else { for(int i = 0; i < alignments.size(); i++) { find_pe_snips(records, alignments, alignments[i]); } } } void write_header(const std::string &out_fp) { std::ofstream out(out_fp.c_str()); out << "Chr,Haplotype,Frequency" << '\n'; } void write_snips(std::vector<FastaRecord> &records, const std::string &out_fp) { write_header(out_fp); std::ofstream out(out_fp.c_str(), std::ofstream::app); for(const auto &record : records) { for(const auto &snip : record.snip_database) { out << record.gene_id() << "," << snip.first << "," << snip.second << '\n'; } } } #endif // SNIP_DRIVER_H