diff snp_caller/src/SnipDriver.h @ 0:0fd352f62446 draft default tip

planemo upload for repository https://github.com/ChrisD11/Duplicon commit 3ee0594c692faac542ffa58f4339d79b9b8aefbd-dirty
author chrisd
date Sun, 21 Feb 2016 06:05:24 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/SnipDriver.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,226 @@
+#ifndef SNIP_DRIVER_H
+#define SNIP_DRIVER_H
+
+#include "Alignment.h"
+#include "FastaRecord.h"
+#include "args.h"
+
+#include <iostream>
+#include <string>
+#include <fstream>
+#include <algorithm>
+#include <set>
+
+typedef decltype(nullptr) nullptr_t;
+typedef std::vector<std::pair<int,char>> cigar_str;
+
+std::set<std::string> read_pair_snips;
+
+struct CompareSnips {
+	bool operator()(const std::string &first, const std::string &second) {
+		return first.length() < second.length();
+	}
+};
+
+
+Alignment *get_mate_pair(std::vector<Alignment> &alignments, const std::string &qname, const int pnext) {
+	for(auto &alignment : alignments) {
+		bool mate_pair = alignment.second_in_pair();
+		std::string mate_qname = alignment.qname();
+		int mate_pos = alignment.pos();
+
+		if( (mate_pair == 1) && (mate_qname == qname) && (mate_pos == pnext)) {
+			return &alignment;	
+		}
+	}
+	return nullptr;	
+}
+
+void parse_cigar(FastaRecord &record, Alignment &alignment) {
+	cigar_str cigar = alignment.get_cigar();
+
+	int occurrence;
+	char operation;
+
+	int gene_pos = alignment.pos()-1;
+	int read_pos = 0;
+
+	char base_in_ref, base_in_read;
+
+	std::string read = alignment.seq();
+	std::string qname = alignment.qname();
+
+	std::string gene = record.gene();
+	std::string gene_id = record.gene_id();
+
+	std::string snip_pattern;
+
+	for(int i = 0; i < cigar.size(); i++) {
+		occurrence = cigar[i].first;
+		operation = cigar[i].second;
+
+		if(operation == 'M' || operation == '=') {
+			for(int k = 0; k < occurrence; k++) {
+				if(gene_pos >= gene.length() || read_pos >= read.length()) {
+					break;
+				}
+
+				base_in_ref = gene[gene_pos];
+				base_in_read = read[read_pos];
+
+				if(base_in_ref != base_in_read) {
+					snip_pattern = std::to_string(gene_pos) +
+						       std::string(1,base_in_ref) + "->" +
+						       std::string(1, base_in_read) + ":"; 
+					read_pair_snips.insert(snip_pattern);			
+				}
+
+				++gene_pos;
+				++read_pos;
+			}
+		}
+		else if(operation == 'I') {
+			read_pos += occurrence;
+			gene_pos += occurrence;
+		}
+		else if(operation == 'S') {
+			read_pos += occurrence;
+		}
+		else if(operation == 'N') {
+			gene_pos += occurrence;
+		}
+		else if(operation == 'P') {
+			read_pos += occurrence;
+			gene_pos += occurrence;
+		}
+		else if(operation == 'D') {
+			gene_pos += occurrence;
+		}
+		else if(operation == 'X') {
+			read_pos += occurrence;
+			gene_pos += occurrence;
+		}
+		else {
+			std::cout << "Operation " << operation << " not handled." << std::endl;			
+		}
+	}
+}
+
+std::string convert_snips_to_pattern(const std::set<std::string> &read_pair_snips) {
+	std::vector<std::string> snips;
+	snips.assign(read_pair_snips.begin(), read_pair_snips.end());
+
+	CompareSnips comp;
+	sort(snips.begin(), snips.end(), comp);
+
+	std::string pattern;
+	for(const auto &snp : snips) {
+		pattern+=snp;
+	}
+	return pattern;
+}
+
+bool find_se_snips(std::vector<FastaRecord> &records,
+		   Alignment &read_pair) {
+	
+	int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname());
+	if(fasta_record_idx == records.size()) {
+		return false;
+	}
+
+	parse_cigar(records[fasta_record_idx], read_pair);
+	std::string snp = convert_snips_to_pattern(read_pair_snips);
+
+	if(!snp.empty()) {
+		records[fasta_record_idx].snip_database[snp]++;
+	}
+
+	read_pair_snips.clear();
+
+	return true;
+}
+
+bool find_pe_snips(std::vector<FastaRecord> &records, 
+		   std::vector<Alignment> &alignments,
+		   Alignment &read_pair) {
+
+	if(read_pair.first_in_pair()) {
+		std::string qname = read_pair.qname();
+		int mate_start_pos = read_pair.pnext();
+
+		Alignment *mate_pair = get_mate_pair(alignments, qname, mate_start_pos);
+		if(mate_pair == nullptr) {
+			return false;
+		}
+		if(mate_pair->rname () != read_pair.rname()) {
+			if(mate_pair->alternate_hits.size() == 0) {
+				return false;
+			}
+
+			bool found_alt_hit = false;
+			for(const auto &alternative: mate_pair->alternate_hits) {
+				if(alternative.rname == read_pair.rname()) {
+					found_alt_hit = true;
+					mate_pair->set_rname(alternative.rname);
+					mate_pair->set_cigar(alternative.cigar);
+					mate_pair->set_pos(alternative.pos);
+					break;
+				}
+			}
+			if(!found_alt_hit) {
+				return false;
+			}
+		}
+
+
+		int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname());
+		if(fasta_record_idx == records.size()) {
+			return false;
+		}
+
+		parse_cigar(records[fasta_record_idx], read_pair);
+		parse_cigar(records[fasta_record_idx], *mate_pair);
+
+		std::string snp = convert_snips_to_pattern(read_pair_snips);
+
+		if(!snp.empty()) {
+			records[fasta_record_idx].snip_database[snp]++;
+		}
+
+	
+		read_pair_snips.clear();
+	}
+
+	return true;
+}
+
+void run(std::vector<FastaRecord> &records, std::vector<Alignment> &alignments, cmd_args arg) {
+	if(arg.samse) {
+		for(int i = 0; i < alignments.size(); i++) {
+			find_se_snips(records, alignments[i]);
+		}
+	}
+	else {
+		for(int i = 0; i < alignments.size(); i++) {
+			find_pe_snips(records, alignments, alignments[i]);
+		}	
+	}
+}
+
+void write_header(const std::string &out_fp) {
+	std::ofstream out(out_fp.c_str());
+	out << "Chr,Haplotype,Frequency" << '\n';
+}
+
+void write_snips(std::vector<FastaRecord> &records, const std::string &out_fp) {
+	write_header(out_fp);
+	std::ofstream out(out_fp.c_str(), std::ofstream::app);
+	for(const auto &record : records) {
+		for(const auto &snip : record.snip_database) {
+			out << record.gene_id() << "," << snip.first << "," << snip.second << '\n';
+		}
+	}
+}
+
+
+#endif // SNIP_DRIVER_H