view snp_caller/src/SnipDriver.h @ 0:0fd352f62446 draft default tip

planemo upload for repository https://github.com/ChrisD11/Duplicon commit 3ee0594c692faac542ffa58f4339d79b9b8aefbd-dirty
author chrisd
date Sun, 21 Feb 2016 06:05:24 -0500
parents
children
line wrap: on
line source

#ifndef SNIP_DRIVER_H
#define SNIP_DRIVER_H

#include "Alignment.h"
#include "FastaRecord.h"
#include "args.h"

#include <iostream>
#include <string>
#include <fstream>
#include <algorithm>
#include <set>

typedef decltype(nullptr) nullptr_t;
typedef std::vector<std::pair<int,char>> cigar_str;

std::set<std::string> read_pair_snips;

struct CompareSnips {
	bool operator()(const std::string &first, const std::string &second) {
		return first.length() < second.length();
	}
};


Alignment *get_mate_pair(std::vector<Alignment> &alignments, const std::string &qname, const int pnext) {
	for(auto &alignment : alignments) {
		bool mate_pair = alignment.second_in_pair();
		std::string mate_qname = alignment.qname();
		int mate_pos = alignment.pos();

		if( (mate_pair == 1) && (mate_qname == qname) && (mate_pos == pnext)) {
			return &alignment;	
		}
	}
	return nullptr;	
}

void parse_cigar(FastaRecord &record, Alignment &alignment) {
	cigar_str cigar = alignment.get_cigar();

	int occurrence;
	char operation;

	int gene_pos = alignment.pos()-1;
	int read_pos = 0;

	char base_in_ref, base_in_read;

	std::string read = alignment.seq();
	std::string qname = alignment.qname();

	std::string gene = record.gene();
	std::string gene_id = record.gene_id();

	std::string snip_pattern;

	for(int i = 0; i < cigar.size(); i++) {
		occurrence = cigar[i].first;
		operation = cigar[i].second;

		if(operation == 'M' || operation == '=') {
			for(int k = 0; k < occurrence; k++) {
				if(gene_pos >= gene.length() || read_pos >= read.length()) {
					break;
				}

				base_in_ref = gene[gene_pos];
				base_in_read = read[read_pos];

				if(base_in_ref != base_in_read) {
					snip_pattern = std::to_string(gene_pos) +
						       std::string(1,base_in_ref) + "->" +
						       std::string(1, base_in_read) + ":"; 
					read_pair_snips.insert(snip_pattern);			
				}

				++gene_pos;
				++read_pos;
			}
		}
		else if(operation == 'I') {
			read_pos += occurrence;
			gene_pos += occurrence;
		}
		else if(operation == 'S') {
			read_pos += occurrence;
		}
		else if(operation == 'N') {
			gene_pos += occurrence;
		}
		else if(operation == 'P') {
			read_pos += occurrence;
			gene_pos += occurrence;
		}
		else if(operation == 'D') {
			gene_pos += occurrence;
		}
		else if(operation == 'X') {
			read_pos += occurrence;
			gene_pos += occurrence;
		}
		else {
			std::cout << "Operation " << operation << " not handled." << std::endl;			
		}
	}
}

std::string convert_snips_to_pattern(const std::set<std::string> &read_pair_snips) {
	std::vector<std::string> snips;
	snips.assign(read_pair_snips.begin(), read_pair_snips.end());

	CompareSnips comp;
	sort(snips.begin(), snips.end(), comp);

	std::string pattern;
	for(const auto &snp : snips) {
		pattern+=snp;
	}
	return pattern;
}

bool find_se_snips(std::vector<FastaRecord> &records,
		   Alignment &read_pair) {
	
	int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname());
	if(fasta_record_idx == records.size()) {
		return false;
	}

	parse_cigar(records[fasta_record_idx], read_pair);
	std::string snp = convert_snips_to_pattern(read_pair_snips);

	if(!snp.empty()) {
		records[fasta_record_idx].snip_database[snp]++;
	}

	read_pair_snips.clear();

	return true;
}

bool find_pe_snips(std::vector<FastaRecord> &records, 
		   std::vector<Alignment> &alignments,
		   Alignment &read_pair) {

	if(read_pair.first_in_pair()) {
		std::string qname = read_pair.qname();
		int mate_start_pos = read_pair.pnext();

		Alignment *mate_pair = get_mate_pair(alignments, qname, mate_start_pos);
		if(mate_pair == nullptr) {
			return false;
		}
		if(mate_pair->rname () != read_pair.rname()) {
			if(mate_pair->alternate_hits.size() == 0) {
				return false;
			}

			bool found_alt_hit = false;
			for(const auto &alternative: mate_pair->alternate_hits) {
				if(alternative.rname == read_pair.rname()) {
					found_alt_hit = true;
					mate_pair->set_rname(alternative.rname);
					mate_pair->set_cigar(alternative.cigar);
					mate_pair->set_pos(alternative.pos);
					break;
				}
			}
			if(!found_alt_hit) {
				return false;
			}
		}


		int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname());
		if(fasta_record_idx == records.size()) {
			return false;
		}

		parse_cigar(records[fasta_record_idx], read_pair);
		parse_cigar(records[fasta_record_idx], *mate_pair);

		std::string snp = convert_snips_to_pattern(read_pair_snips);

		if(!snp.empty()) {
			records[fasta_record_idx].snip_database[snp]++;
		}

	
		read_pair_snips.clear();
	}

	return true;
}

void run(std::vector<FastaRecord> &records, std::vector<Alignment> &alignments, cmd_args arg) {
	if(arg.samse) {
		for(int i = 0; i < alignments.size(); i++) {
			find_se_snips(records, alignments[i]);
		}
	}
	else {
		for(int i = 0; i < alignments.size(); i++) {
			find_pe_snips(records, alignments, alignments[i]);
		}	
	}
}

void write_header(const std::string &out_fp) {
	std::ofstream out(out_fp.c_str());
	out << "Chr,Haplotype,Frequency" << '\n';
}

void write_snips(std::vector<FastaRecord> &records, const std::string &out_fp) {
	write_header(out_fp);
	std::ofstream out(out_fp.c_str(), std::ofstream::app);
	for(const auto &record : records) {
		for(const auto &snip : record.snip_database) {
			out << record.gene_id() << "," << snip.first << "," << snip.second << '\n';
		}
	}
}


#endif // SNIP_DRIVER_H