changeset 0:0fd352f62446 draft default tip

planemo upload for repository https://github.com/ChrisD11/Duplicon commit 3ee0594c692faac542ffa58f4339d79b9b8aefbd-dirty
author chrisd
date Sun, 21 Feb 2016 06:05:24 -0500
parents
children
files csa gene_fraction.xml gene_fraction/src/Alignments.cpp gene_fraction/src/Alignments.h gene_fraction/src/Alignments.o gene_fraction/src/Fasta.cpp gene_fraction/src/Fasta.h gene_fraction/src/Fasta.o gene_fraction/src/FastaRecord.cpp gene_fraction/src/FastaRecord.h gene_fraction/src/FastaRecord.o gene_fraction/src/Makefile gene_fraction/src/Sam.cpp gene_fraction/src/Sam.h gene_fraction/src/Sam.o gene_fraction/src/SamRatio.h gene_fraction/src/alignment_util.h gene_fraction/src/args.h gene_fraction/src/dir_util.h gene_fraction/src/int_util.h gene_fraction/src/main.cpp gene_fraction/src/main.o res.bam sam snp snp_caller.xml snp_caller/src/Alignment.cpp snp_caller/src/Alignment.h snp_caller/src/Alignment.o snp_caller/src/Fasta.cpp snp_caller/src/Fasta.h snp_caller/src/Fasta.o snp_caller/src/FastaRecord.cpp snp_caller/src/FastaRecord.h snp_caller/src/FastaRecord.o snp_caller/src/Makefile snp_caller/src/Sam.cpp snp_caller/src/Sam.h snp_caller/src/Sam.o snp_caller/src/SnipDriver.h snp_caller/src/alignment_util.h snp_caller/src/args.h snp_caller/src/int_util.h snp_caller/src/main.cpp snp_caller/src/main.o
diffstat 45 files changed, 1853 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file csa has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction.xml	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,20 @@
+<tool id="gene_fraction" name="Gene Fraction" version="0.1.0">
+  <description>for single-end or paired-end data</description>
+  <command>$__tool_directory__/csa -amr_fp $amr_fp -sam_fp $sam_fp -t $threshold -min $min -max $max -skip $skip -samples $samples -out_fp $out_fp</command>
+  <inputs>
+    <param format="fasta" name="amr_fp" type="data" label="Input fasta file"/>
+    <param format="sam" name ="sam_fp" type="data" label="Input SAM file"/>
+    <param name="threshold" size="3" type="integer" min="0" max="100" value="1" label="Threshold"/>
+    <param name="min" size="3" type="integer" min="1" max="100" value="1" label="Minimum level"/>
+    <param name="max" size="3" type="integer" min="1" max="100" value="1" label="Maximum level"/>
+    <param name="skip" size="3" type="integer" min="1" max="100" value="1" label="Skip pattern"/>
+    <param name="samples" size="3" type="integer" min="1" max="100" value="1" label="Samples per level"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_fp" />
+  </outputs>
+  <help>
+This tool calculates the amount of genes that are covered by atleast one read.
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Alignments.cpp	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,35 @@
+#include "Alignments.h"
+#include "int_util.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include <iostream>
+#include <sstream>
+
+Alignments::Alignments(std::string alignment) : _alignment(alignment) {
+	fill_alignment_fields(alignment);
+}
+
+void Alignments::fill_alignment_fields(const std::string &alignment) {
+	std::istringstream ss(alignment);
+	ss >> field.QNAME >> field.FLAG >> field.RNAME >> field.POS >>
+	      field.MAPQ >> field.CIGAR >> field.RNEXT >> field.PNEXT >>
+	      field.TLEN >> field.SEQ >> field.QUAL;
+}
+
+std::vector<std::pair<int,char>> Alignments::cigar() {
+	return get_cigar_operations(field.CIGAR);
+}
+
+std::vector<std::pair<int,char>> Alignments::get_cigar_operations(const std::string &cigar) {
+	std::vector<std::pair<int,char>> p;
+    	int count;
+    	char operation;
+
+    	std::istringstream ss(cigar);
+    	while(ss >> count >> operation) {
+        	p.push_back(std::make_pair(count, operation));
+    	}
+
+    	return p;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Alignments.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,68 @@
+#ifndef ALIGNMENTS_H
+#define ALIGNMENTS_H
+
+#include <string>
+#include <vector>
+
+/**
+ * Stores information about an alignment
+ */
+struct alignment_fields {
+    std::string QNAME;
+    int FLAG;
+    std::string RNAME;
+    int POS;
+    int MAPQ;
+    std::string CIGAR;
+    std::string RNEXT;
+    int PNEXT;
+    int TLEN;
+    std::string SEQ;
+    std::string QUAL;
+};
+
+/**
+ * Class for dealing with alignments
+ */
+class Alignments {
+public:
+	/**
+ 	 * Ctor that initializes alignment
+ 	 */ 
+	Alignments(std::string alignment);
+
+	/**
+ 	 * Stores information about each of the eleven
+ 	 * required alignment fields
+ 	 */ 
+    	void fill_alignment_fields(const std::string &alignment);
+
+    	std::vector<std::pair<int,char>> cigar();
+
+	inline std::string alignment() const	    { return _alignment;  };
+
+    	inline std::string qname() const            { return field.QNAME; };
+    	inline std::string rname() const            { return field.RNAME; };
+    	inline std::string cigar() const            { return field.CIGAR; };
+    	inline std::string rnext() const            { return field.RNEXT; };
+    	inline std::string seq() const              { return field.SEQ;   };
+    	inline std::string qual() const             { return field.QUAL;  };
+
+    	inline int flag() const                     { return field.FLAG;  };
+    	inline int pos() const                      { return field.POS;   };
+    	inline int mapq() const                     { return field.MAPQ;  };
+    	inline int pnext() const                    { return field.PNEXT; };
+    	inline int tlen() const			    { return field.TLEN;  };
+
+private:
+	/**
+         * Returns a pair of cigar operations as (occurrence, operation)
+         * Ex: 10M5I -> (10, M), (5, I)
+         */
+    	std::vector<std::pair<int,char>> get_cigar_operations(const std::string &cigar);
+
+    	std::string _alignment;
+    	alignment_fields field;
+};
+
+#endif /* ALIGNMENTS_H */
Binary file gene_fraction/src/Alignments.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Fasta.cpp	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,38 @@
+#include "Fasta.h"
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+
+Fasta::Fasta(std::string amr_fp) : _amr_fp(amr_fp) {}
+
+void Fasta::read_fasta(const std::string &amr_fp) {
+	std::ifstream in(amr_fp.c_str());
+	if(!in) {
+		std::cerr << "Could not open fasta file " << amr_fp << std::endl;
+		exit(EXIT_FAILURE);
+	}
+
+	std::string gene_id, gene, line;
+	while(std::getline(in, line)) {
+		std::size_t gene_idx = line.find(" ");
+		
+		if(gene_idx != std::string::npos)
+			gene_id = line.substr(1, gene_idx-1);
+		else
+			gene_id = line.substr(1, line.length());
+
+		std::getline(in, gene);
+		records.push_back(FastaRecord(gene_id, gene));
+	}
+	in.close();
+	
+	FastaRecord::sort_by_gene_id(records);
+}
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Fasta.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,28 @@
+#ifndef FASTA_H
+#define FASTA_H
+
+#include <string>
+#include <vector>
+#include "FastaRecord.h"
+
+/**
+ * Class for dealing with fasta files
+ */
+class Fasta {
+public:
+	/** 
+ 	 * Constructor that sets amr file path 
+ 	 */
+	Fasta(std::string amr_fp);
+
+	/**
+ 	 * Reads fasta file from file path 
+ 	 */
+	void read_fasta(const std::string &amr_fp);
+	
+	std::vector<FastaRecord> records;
+private:
+	std::string _amr_fp;
+};
+
+#endif /* FASTA_H */
Binary file gene_fraction/src/Fasta.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/FastaRecord.cpp	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,62 @@
+#include "FastaRecord.h"
+
+#include <algorithm>
+
+FastaRecord::FastaRecord(std::string gene_id, std::string gene) :
+	_gene_id(gene_id), _gene(gene), _base_hits(_gene.length(), 0),
+	_gene_hits(0) {}
+
+std::string FastaRecord::gene_id() const { return _gene_id; }
+
+std::string FastaRecord::gene() const { return _gene; }
+
+void FastaRecord::update_gene_hits() {
+	_gene_hits++;
+}
+
+int FastaRecord::gene_hits() const {
+	return _gene_hits;
+}
+
+int FastaRecord::get_base_hits() const {
+	return static_cast<int>(count(_base_hits.begin(), _base_hits.end(), 1));
+}
+
+int FastaRecord::find_gene(const std::vector<FastaRecord> &records,
+			   const std::string &gene_id, std::string seq) {
+	int gene_index;
+
+	std::vector<FastaRecord>::const_iterator low;
+	// binary search for fasta record index
+	low = std::lower_bound(records.begin(), records.end(), FastaRecord(gene_id, seq),
+                                   [](const FastaRecord &a, const FastaRecord &b)
+                                   { return a.gene_id() < b.gene_id(); });
+	gene_index = (low - records.begin());
+
+	return gene_index;
+}
+
+void FastaRecord::sort_by_gene_id(std::vector<FastaRecord> &records) {
+	// sort records by gene id
+	sort(records.begin(), records.end(), [](const FastaRecord &a, const FastaRecord &b) { return a.gene_id() < b.gene_id(); });
+}
+
+void FastaRecord::reset_base_hits(std::vector<FastaRecord> &records) {
+	for_each(records.begin(), records.end(), [](FastaRecord &a) { std::fill(a.base_hits().begin(), a.base_hits().end(), 0); });
+}
+
+void FastaRecord::reset_gene_hits(std::vector<FastaRecord> &records) {
+	for_each(records.begin(), records.end(), [](FastaRecord &a) { a._gene_hits = 0; });
+}
+
+std::vector<int> &FastaRecord::base_hits() {
+        return _base_hits;
+}
+
+
+
+
+
+
+	
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/FastaRecord.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,86 @@
+#ifndef FASTA_RECORD_H
+#define FASTA_RECORD_H
+
+#include <string>
+#include <vector>
+
+/**
+ * Class for dealing with fasta records
+ */
+class FastaRecord {
+public:
+	/**
+ 	 * Ctor that initializes gene id and gene
+ 	 */ 
+	FastaRecord(std::string gene_id, std::string gene);
+
+	/**
+ 	 * Returns a string gene id
+ 	 */
+	std::string gene_id() const;
+
+	/**
+ 	 * Returns the gene associated with gene id
+ 	 */
+	std::string gene() const;
+
+	/**
+ 	 * Returns the total base hits for a gene
+ 	 */ 
+	int get_base_hits() const;
+
+	/**
+ 	 * Returns the amount of genes that were hit
+ 	 * during the gene fraction calculation
+ 	 */ 
+	int gene_hits() const;
+	
+	/**
+ 	 *
+ 	 */ 
+	void update_base_hits(const int &index);
+
+	/**
+ 	 *
+ 	 */ 
+	void update_gene_hits();
+
+	/**
+ 	 * Searches for a fasta record corresponding
+ 	 * to gene id
+ 	 */ 
+	static int find_gene(const std::vector<FastaRecord> &records, 
+			     const std::string &gene_id, 
+                             std::string seq = "");
+
+	/**
+ 	 * Sorts fasta records by gene id
+ 	 */ 
+	static void sort_by_gene_id(std::vector<FastaRecord> &records);
+
+	/**
+ 	 * Resets base hits vector to 0's. 
+ 	 * This occurs after each sample is processed
+ 	 */ 
+	static void reset_base_hits(std::vector<FastaRecord> &records);
+
+	/**
+ 	 * Resets gene hits primitive to 0.
+ 	 * This happens after each sample is processed 
+ 	 */
+	static void reset_gene_hits(std::vector<FastaRecord> &records);
+
+	std::vector<int> &base_hits();
+
+	std::string _gene_id;
+	std::string _gene;
+	std::vector<int> _base_hits;
+
+private:
+	int _gene_hits;
+};
+
+
+
+
+#endif /* FASTA_RECORD_H */
Binary file gene_fraction/src/FastaRecord.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Makefile	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,14 @@
+output: main.o Sam.o Alignments.o Fasta.o FastaRecord.o
+	g++ -std=c++11 main.o Sam.o Alignments.o Fasta.o FastaRecord.o -o csa
+main.o: main.cpp
+	g++ -c -std=c++11 main.cpp
+Sam.o: Sam.cpp
+	g++ -c -std=c++11 Sam.cpp
+Alignments.o: Alignments.cpp
+	g++ -c -std=c++11 Alignments.cpp
+Fasta.o: Fasta.cpp
+	g++ -c -std=c++11 Fasta.cpp
+FastaRecord.o: FastaRecord.cpp
+	g++ -c -std=c++11 FastaRecord.cpp
+clean:
+	rm *.o csa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Sam.cpp	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,40 @@
+#include "Sam.h"
+#include "dir_util.h"
+#include "alignment_util.h"
+
+#include <iostream>
+#include <fstream>
+
+Sam::Sam(std::string sam_fp) : _sam_fp(sam_fp) {}
+
+void Sam::read_sam(cmd_args args) {
+	if(args.bam_stream) read_from_stdin();
+	else read_from_file(args.sam_fp);
+}
+
+void Sam::read_from_stdin() {
+	std::string line;
+	while(std::getline(std::cin, line)) {
+		if(line[0] == '@') continue;
+		alignment.push_back(line);
+	}
+}
+
+void Sam::read_from_file(const std::string &sam_fp) {
+	std::ifstream in(sam_fp.c_str());
+	if(!in) {
+		std::cerr << "Could not open sam file " << sam_fp << std::endl;
+		exit(EXIT_FAILURE);
+	}
+
+	std::string line;
+	while(getline(in, line)) {
+		if(line[0] == '@') continue;
+		if(is_good_alignment(line))
+			alignment.push_back(line);
+	}
+
+	in.close();
+}
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Sam.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,48 @@
+#ifndef SAM_H
+#define SAM_H
+
+#include <string>
+#include <vector>
+
+#include "args.h"
+#include "Alignments.h"
+
+/**
+ * Class for dealing with sam files
+ */
+
+class Sam {
+public:
+	/**
+ 	 * Ctor initializes sam file path
+ 	 */ 
+	Sam(std::string sam_fp);
+	void read_sam(cmd_args args);
+
+	/**
+ 	 * Reads sam file from stdin 
+ 	 */ 
+	void read_from_stdin();
+
+	/**
+         * Reads sam file from directory or file path
+         */	 
+	void read_from_file(const std::string &sam_fp);
+
+	/**
+ 	 *
+ 	 */	 
+	void read_from_dir(const std::string &sam_dir_fp);
+
+	std::vector<Alignments> alignment;
+
+private:
+	std::string _sam_fp;
+};
+
+
+
+
+
+#endif /* SAM_H */
+
Binary file gene_fraction/src/Sam.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/SamRatio.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,248 @@
+#ifndef SAM_RATIO_H
+#define SAM_RATIO_H
+
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+
+#include "FastaRecord.h"
+#include "Alignments.h"
+#include "args.h"
+
+typedef std::vector<std::pair<int,char>> cigar_str;
+
+/**
+ *
+ */
+struct header {
+	std::string level =             "Level";
+        std::string iteration =         "Iteration";
+        std::string gene_id =           "Gene id";
+        std::string gene_fraction =     "Gene Fraction";
+        std::string hits =              "Hits";
+};
+
+/**
+ * Reports the total number of bases that were touched for each
+ * gene. This is largely calculated using the positional and seq
+ * information found in fields four and ten of each alignment
+ */
+void analyze_coverage(FastaRecord &record, Alignments &alignment) {
+	record.update_gene_hits();
+
+	cigar_str cigar = alignment.cigar();
+
+	int len;
+	char op;
+
+	int occurrence;
+	int pos_in_gene = alignment.pos();
+
+	int start, stop;
+	int base_hits = record._base_hits.size(); // func this
+	int read_length = alignment.seq().length(); //func this
+
+	if(pos_in_gene == 1) {
+		occurrence = 0;
+		for(int i = 0; i < cigar.size(); i++) {
+			len = cigar[i].first;
+			op = cigar[i].second;
+
+			switch(op) {
+				case 'M':
+					occurrence += len;
+					break;
+				case 'I':
+					occurrence += len;
+					break;
+				default:
+					break;
+			}
+		}
+
+		start = read_length - occurrence;
+		stop = start + read_length;
+
+		for(int i = start; i < base_hits; i++) {
+			if(i == stop) break;
+			record._base_hits[i] = 1;
+		}
+	}
+	else {
+		start = pos_in_gene - 1;
+		stop = start + read_length;
+
+		for(int i = start; i < base_hits; i++) {
+			if(i == stop) break;
+			record._base_hits[i] = 1;
+		}
+	}
+}
+
+/**
+ * Returns gene fraction of fasta record
+ * Returns -1 if gene fraction is not greater than threshold
+ */
+double coverage(const FastaRecord &record, const int &threshold) {
+        double gene_coverage;
+
+        int base_hits, gene_size;
+
+        base_hits = record.get_base_hits();
+        gene_size = record.gene().length();
+
+        gene_coverage = (static_cast<double>(base_hits)/static_cast<double>(gene_size))*100;
+
+        if(gene_coverage > threshold)
+                return gene_coverage;
+        return -1;
+}
+
+/**
+ * Writes header to output file when
+ * reading from stdin
+ */
+void bam_stream_header() {
+	header h;
+	char sep = ',';
+
+	std::cout << h.level << sep << h.iteration << sep 
+                  << h.gene_id << sep << h.gene_fraction << sep 
+                  << h.hits << '\n';
+}
+
+/**
+ * Writes header to output file when
+ * reading from sam file
+ */
+void file_header(const std::string &out_fp, const std::string &sam_fp) {
+	header h;
+	std::ofstream out(out_fp.c_str(), std::ofstream::app );
+	char sep = ',';
+
+	out << "@" << sam_fp << '\n';
+	out << h.level << sep << h.iteration << sep 
+            << h.gene_id << sep << h.gene_fraction << sep 
+            << h.hits << '\n';
+	out.close();
+}
+
+/**
+ *
+ */
+void create_header(cmd_args &args) {
+	if(args.bam_stream) {
+		bam_stream_header();
+	}
+	else {
+		file_header(args.out_fp, args.sam_fp);
+	}
+}
+
+/**
+ * Writes results to output file when reading from
+ * stdin
+ */
+void bam_stream_results(std::vector<FastaRecord> &records,
+                        const int &level, const int &iteration,
+                        cmd_args &args) {
+
+	double gene_fraction;
+	int hits_seen;
+	std::string id;
+	char sep = ',';
+
+	for(auto &rec : records) {
+		gene_fraction = coverage(rec, args.threshold);
+		hits_seen = rec.gene_hits();
+		id = rec.gene_id();
+
+		if(gene_fraction > 0) {
+			std::cout << level << sep << iteration << sep
+			          << id << sep << gene_fraction << sep
+			          << hits_seen << '\n';			
+		}
+	}
+}
+
+/**
+ * Write results when reading sam file from
+ * path
+ */
+void file_results(std::vector<FastaRecord> &records,
+                  const int level, const int &iteration,
+                  cmd_args &args) {
+
+	std::ofstream out(args.out_fp.c_str(), std::ofstream::app);
+	
+	double gene_fraction;
+	int hits_seen;
+	std::string id;
+	char sep = ',';
+
+	for(auto &rec : records) {
+		gene_fraction = coverage(rec, args.threshold);
+		hits_seen = rec.gene_hits();
+		id = rec.gene_id();
+
+		if(gene_fraction > 0) {
+			out << level << sep << iteration << sep
+			    << id << sep << gene_fraction << sep
+                            << hits_seen << '\n';
+		}
+	}
+	out.close();
+}
+
+/**
+ *
+ */
+void report_results(std::vector<FastaRecord> &records,
+		    const int level, const int &iteration,
+		    cmd_args &args) {
+
+	if(args.bam_stream) {
+		bam_stream_results(records,level,iteration,args);
+	}
+	else {
+		file_results(records,level,iteration,args);
+	}
+}
+
+/**
+ * Generates a sequence of samples from sam file specified
+ * by the starting level, max level, and skip pattern
+ */
+void generate_samples(std::vector<FastaRecord> &records,
+                      std::vector<Alignments> &alignments,
+		      cmd_args &args) {
+
+	int read_count = alignments.size();
+	int sample_size;
+
+	srand(unsigned(time(0)));
+
+	std::vector<int> sequence(read_count);
+	iota(sequence.begin(), sequence.end(), 0);
+
+	create_header(args);	
+
+	for(int level = args.min; level <= args.max; level += args.skip) {
+		for(int sample = 0; sample < args.s_per_lev; sample++) {
+			random_shuffle(sequence.begin(), sequence.end(), randomize);
+			sample_size = round(((static_cast<double>(level)/100)*read_count));
+			std::vector<int> chosen(sequence.begin(), sequence.begin()+sample_size);
+
+			for(int a_idx = 0; a_idx < chosen.size(); a_idx++) {
+				std::string rname = alignments[chosen[a_idx]].rname();
+				int gene_idx = FastaRecord::find_gene(records, rname);	
+				analyze_coverage(records[gene_idx], alignments[chosen[a_idx]]);
+			}
+			report_results(records,level,sample+1,args);
+			FastaRecord::reset_base_hits(records);
+			FastaRecord::reset_gene_hits(records);
+		}
+	}
+}
+
+#endif /* SAM_RATIO_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/alignment_util.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,96 @@
+#ifndef ALIGNMENT_UTIL_H
+#define ALIGNMENT_UTIL_H
+
+#include <string>
+#include <vector>
+
+#include <boost/algorithm/string.hpp>
+
+#include "int_util.h"
+
+// macro to check if read mapped
+#define READ_UNMAPPED 4
+
+/**
+ * Splits alignment into separate parts
+ */
+std::vector<std::string> split_alignment(std::string &alignment) {
+    	std::vector<std::string> parts;
+
+    	boost::trim_if(alignment, boost::is_any_of("\t "));
+	// split on tab delimeter
+    	boost::split(parts, alignment, boost::is_any_of("\t "), boost::token_compress_on);
+	
+    	return parts;
+}
+
+/**
+ * Validates bit flag
+ */
+bool is_good_flag(const int &bit_flag) {
+    	if( (bit_flag & READ_UNMAPPED) > 0) return false;
+    	return true;
+}
+
+/**
+ * Validates rname
+ */
+bool is_good_rname(const std::string &rname) {
+    	return rname.compare("*") != 0;
+}
+
+/**
+ * Validates pos
+ */
+bool is_good_pos(const int &pos) {
+    	return pos > 0;
+}
+
+/**
+ * Validates cigar
+ */
+bool is_good_cigar(const std::string &cigar) {
+    	return cigar.compare("*") != 0;
+}
+
+/**
+ * Validates seq
+ */
+bool is_good_seq(const std::string &seq) {
+    	return seq.compare("*") != 0;
+}
+
+/**
+ * Validates alignment fields
+ */
+bool fields_are_good(std::vector<std::string> &parts) {
+    	int bit_flag = s_to_i(parts[1]);
+    	int pos = s_to_i(parts[3]);
+
+    	std::string rname = parts[2];
+    	std::string cigar = parts[5];
+    	std::string seq = parts[9];
+
+    	if(!(is_good_flag(bit_flag))) return false;
+    	if(!(is_good_pos(pos))) return false;
+    	if(!(is_good_rname(rname))) return false;
+    	if(!(is_good_cigar(cigar))) return false;
+    	if(!(is_good_seq(seq))) return false;
+
+    	return true;
+}
+
+/**
+ * Stores alignments that pass validity checks
+ */
+bool is_good_alignment(std::string &alignment) {
+    	std::vector<std::string> alignment_parts;
+
+    	alignment_parts = split_alignment(alignment);
+
+    	if(!(fields_are_good(alignment_parts)))
+        	return false;
+    	return true;
+}
+
+#endif /* ALIGNMENT_UTIL_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/args.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,68 @@
+#ifndef ARGS_H
+#define ARGS_H
+
+#include "int_util.h"
+
+#include <string>
+#include <vector>
+#include <list>
+
+/**
+ * Encapsulates information input
+ * from the command line.
+ */
+struct cmd_args {
+	std::string amr_fp;
+	std::string sam_fp;
+	std::list<std::string> sam_dir_fp;
+	std::string out_fp;
+	
+	int threshold;
+	int min;
+	int max;
+	int skip;
+	int s_per_lev;
+
+	bool sam_dir = false;		/* This will be set to true when parsing a
+                                           directory of sam files. */
+	bool bam_stream = false;	/* This will be set to true when executing
+                                           samtools view -h example.bam | csa > output
+                                           from the command line. */
+};
+
+/**
+ * Returns a struct of command line arguments.
+ */
+static inline cmd_args
+parse_command_line(int argc, char *argv[]) {
+	std::vector<std::string> args(argv, argv + argc);
+
+	cmd_args arg;
+
+	for(int i = 1; i < argc; i++) {
+		if(args[i].compare("-amr_fp") == 0)
+			arg.amr_fp = args[++i];
+		else if(args[i].compare("-sam_fp") == 0) 
+			arg.sam_fp = args[++i];
+		else if(args[i].compare("-out_fp") == 0)
+			arg.out_fp = args[++i];
+		else if(args[i].compare("-t") == 0)
+			arg.threshold = s_to_i(args[++i]);
+		else if(args[i].compare("-min") == 0)
+			arg.min = s_to_i(args[++i]);
+		else if(args[i].compare("-max") == 0)
+			arg.max = s_to_i(args[++i]);
+		else if(args[i].compare("-skip") == 0)
+			arg.skip = s_to_i(args[++i]);
+		else if(args[i].compare("-samples") == 0)
+			arg.s_per_lev = s_to_i(args[++i]);
+		else if(args[i].compare("-d") == 0)
+			arg.sam_dir = true;
+		else if(args[i].compare("-bam") == 0)
+			arg.bam_stream = true;
+	}
+
+	return arg;
+}
+
+#endif /* ARGS_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/dir_util.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,43 @@
+#ifndef DIR_UTIL_H
+#define DIR_UTIL_H
+
+#include <list>
+#include <iostream>
+#include <string>
+#include <dirent.h>
+
+/*
+ * Parses a directory of sam files
+ */
+static inline std::list<std::string>
+parse_sam_dir(const std::string &directory) {
+        DIR *dir;
+        struct dirent *ent;
+        std::string dir_path = directory;
+
+        dir = opendir(dir_path.c_str());
+	// is dir open/valid?
+        if(dir == NULL) {
+                std::cout << "Not a valid directory" << std::endl;
+                exit(EXIT_FAILURE);
+        }
+
+        std::list<std::string> sam_files;
+        std::string fn;
+        std::string ext;
+        std::string file_type = ".sam";
+
+	// get all files with a .sam file extension
+        while((ent = readdir(dir)) != NULL) {
+                fn = dir_path + std::string(ent->d_name);
+                ext = fn.substr(fn.length()-4, fn.length());
+                if(ext.compare(file_type) == 0) {
+                        sam_files.push_back(fn);
+                }
+        }
+        closedir(dir);
+
+        return sam_files;
+}
+
+#endif /* DIR_UTIL_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/int_util.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,29 @@
+#ifndef INT_UTIL_H
+#define INT_UTIL_H
+
+#include <string>
+#include <sstream>
+
+/**
+ * Given a string, return its integer.
+ */
+static inline int
+s_to_i(const std::string &s) {
+	std::istringstream ss(s);
+	int i;
+	ss >> i;
+	return i;
+}
+
+/**
+ * Given an integer, return a random number
+ * between 0 and i.
+ */
+static inline int
+randomize(const int &i) {
+	return rand() % i;
+}
+
+#endif /*INT_UTIL_H */
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/main.cpp	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,37 @@
+#include <string>
+#include <iostream>
+#include <vector>
+
+#include "int_util.h"
+#include "dir_util.h"
+#include "args.h"
+#include "Fasta.h"
+#include "Sam.h"
+#include "SamRatio.h"
+
+using namespace std;
+
+int main(int argc, char *argv[]) {
+	cmd_args args;
+	args = parse_command_line(argc, argv);
+
+	Fasta f(args.amr_fp);
+	f.read_fasta(args.amr_fp);	
+
+	if(args.sam_dir) {
+		list<string> sam_files = parse_sam_dir(args.sam_fp);
+		for(auto &fn : sam_files) {
+			args.sam_fp = fn;
+			Sam s(args.sam_fp);
+			s.read_sam(args);
+			generate_samples(f.records, s.alignment, args);
+		}
+	}
+	else {
+		Sam s(args.sam_fp);
+		s.read_sam(args);
+		generate_samples(f.records, s.alignment, args);
+	}
+
+	return 0;
+}
Binary file gene_fraction/src/main.o has changed
Binary file res.bam has changed
Binary file sam has changed
Binary file snp has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller.xml	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,23 @@
+<tool id="snp_caller" name="Identify Snips" version="0.1.0">
+  <description>for single-end or paired-end data</description>
+  <command>$__tool_directory__/snp -amr_fp $amr_fp -samse $samse -out_fp $out_fp</command>
+  <inputs>
+    <param format="fasta" name="amr_fp" type="data" label="Input fasta file"/>
+    <param format="sam" name ="samse" type="data" label="Input SAM file"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_fp" />
+  </outputs>
+
+<tests>
+  <test>
+    <param name="amr_fp" value="amr.fa" />
+    <param name="samse" value="samse.sam" />
+    <param name="out_fp" file="res" />
+  </test>
+</tests>
+  <help>
+This tool identifies single nucleotide polymorphisms from both single-end and paired-end data.
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/Alignment.cpp	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,96 @@
+#include "Alignment.h"
+#include "int_util.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+
+#define READ_PAIRED                         1
+#define READ_MAPPED_IN_PROPER_PAIR          2
+#define READ_UNMAPPED                       4
+#define MATE_UNMAPPED                       8
+#define READ_REVERSE_STRAND                 16
+#define MATE_REVERSE_STRAND                 32
+#define FIRST_IN_PAIR                       64
+#define SECOND_IN_PAIR                      128
+#define NOT_PRIMARY_ALIGNMENT               256
+#define READ_FAILS_VENDOR_QUALITY_CHECKS    512
+#define READ_IS_PCR_OR_OPTICAL_DUPLICATE    1024
+#define SUPPLEMENTARY_ALIGNMENT             2048
+
+Alignment::Alignment(std::string alignment) : _alignment(alignment) {
+	fill_alignment_fields(alignment);
+	fill_bit_flag(field.FLAG);
+	fill_xa_field(alignment);
+}
+
+void Alignment::fill_alignment_fields(const std::string &alignment) {
+	std::istringstream ss(alignment);
+	ss >> field.QNAME >> field.FLAG >> field.RNAME >> field.POS >>
+              field.MAPQ >> field.CIGAR >> field.RNEXT >> field.PNEXT >>
+              field.TLEN >> field.SEQ >> field.QUAL;
+}
+
+void Alignment::fill_bit_flag(const int &flag) {
+    if( (flag & READ_PAIRED) == 1)                     b_flag.read_paired = true;
+    if( (flag & READ_MAPPED_IN_PROPER_PAIR) > 1)       b_flag.read_mapped_in_proper_pair = true;
+    if( (flag & READ_UNMAPPED) > 1)                    b_flag.read_unmapped = true;
+    if( (flag & MATE_UNMAPPED) > 1)                    b_flag.mate_unmapped = true;
+    if( (flag & READ_REVERSE_STRAND) > 1)              b_flag.read_reverse_strand = true;
+    if( (flag & MATE_REVERSE_STRAND) > 1)              b_flag.mate_reverse_strand = true;
+    if( (flag & FIRST_IN_PAIR) > 1)                    b_flag.first_in_pair = true;
+    if( (flag & SECOND_IN_PAIR) > 1)                   b_flag.second_in_pair = true;
+}
+
+// XA:Z:mef(A)_10_AF376746,+81,92M,2;
+bool Alignment::fill_xa_field(const std::string &alignment) {
+	xa_fields hits;
+	std::string alt_fields;
+	int offset = 5;
+	std::size_t alt_index = alignment.find("XA:Z:");
+	
+	if(alt_index != std::string::npos) {
+		alt_fields = alignment.substr(alt_index+offset, alignment.length());		
+	}
+	else {
+		return false;
+	}
+
+	std::string field;
+	std::istringstream ss(alt_fields);
+
+	while(std::getline(ss, field, ',')) {
+		hits.rname = field;
+		//std::cout << hits.rname << std::endl;
+		getline(ss, field, ',');
+		hits.pos = strand_to_i(field);
+		//std::cout << hits.pos << std::endl;
+		getline(ss, field, ',');
+		hits.cigar = field;
+		//std::cout << hits.cigar << std::endl;
+		getline(ss, field, ';');
+		hits.edit = 0;
+		alternate_hits.push_back(hits);
+	}
+		
+	return true;	
+}
+
+std::vector<std::pair<int,char> > Alignment::get_cigar() {
+	return get_cigar_operations(field.CIGAR);
+}
+
+std::vector<std::pair<int,char> > Alignment::get_cigar_operations(const std::string &cigar) {
+	std::vector<std::pair<int,char>> p;
+	int count;
+	char operation;
+	
+	std::istringstream ss(cigar);
+	while(ss >> count >> operation) {
+		p.push_back(std::make_pair(count,operation));
+	}
+
+	return p;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/Alignment.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,91 @@
+#ifndef ALIGNMENT_H
+#define ALIGNMENT_H
+
+#include <string>
+#include <vector>
+#include <algorithm>
+
+struct bit_flag {
+    /* SAM flags in English */
+    bool read_paired =                              false;
+    bool read_mapped_in_proper_pair =               false;
+    bool read_unmapped =                            false;
+    bool mate_unmapped =                            false;
+    bool read_reverse_strand =                      false;
+    bool mate_reverse_strand =                      false;
+    bool first_in_pair =                            false;
+    bool second_in_pair =                           false;
+};
+
+struct alignment_fields {
+	std::string QNAME;
+	int FLAG;
+	std::string RNAME;
+	int POS;
+	std::string MAPQ;
+	std::string CIGAR;
+	std::string RNEXT;
+	int PNEXT;
+	int TLEN;
+	std::string SEQ;
+	std::string QUAL;
+};
+
+class Alignment {
+public:
+	Alignment(std::string alignment);
+	void fill_alignment_fields(const std::string &alignment);
+	void fill_bit_flag(const int &flag);
+	bool fill_xa_field(const std::string &alignment);
+	std::vector<std::pair<int,char> > get_cigar();
+
+	inline std::string alignment() const	{ return _alignment; };
+	inline std::string qname() const	{ return field.QNAME; };
+	inline std::string rname() const	{ return field.RNAME; };
+	inline std::string mapq() const		{ return field.MAPQ; };
+	inline std::string cigar() const	{ return field.CIGAR; };
+	inline std::string seq() const		{ return field.SEQ; };
+	inline std::string rnext() const	{ return field.RNEXT; };
+	inline int flag() const			{ return field.FLAG; };
+	inline int pos() const			{ return field.POS; };
+	inline int pnext() const		{ return field.PNEXT; };
+	inline int tlen() const			{ return field.TLEN; };
+
+    	inline bool read_paired() const                         { return b_flag.read_paired; };
+    	inline bool read_mapped_in_proper_pair() const          { return b_flag.read_mapped_in_proper_pair; };
+    	inline bool read_unmapped() const                       { return b_flag.read_unmapped; };
+    	inline bool mate_unmapped() const                       { return b_flag.mate_unmapped; };
+    	inline bool read_reverse_strand() const                 { return b_flag.read_reverse_strand; }
+    	inline bool mate_reverse_strand() const                 { return b_flag.mate_reverse_strand; };
+    	inline bool first_in_pair() const                       { return b_flag.first_in_pair; };
+    	inline bool second_in_pair() const                      { return b_flag.second_in_pair; };
+
+	inline void set_rname(std::string rname) {
+		field.RNAME = rname;
+	}
+	inline void set_cigar(std::string cigar) {
+		field.CIGAR = cigar;
+	}
+	inline void set_pos(int pos) {
+		field.POS = pos;
+	}
+
+	struct xa_fields {
+        	std::string rname;
+        	std::string cigar;
+        	int pos;
+        	int edit;
+	};
+
+	std::vector<xa_fields> alternate_hits;
+private:
+	std::vector<std::pair<int,char> > get_cigar_operations(const std::string &cigar);
+
+	std::string _alignment;
+
+	alignment_fields field;
+
+	bit_flag b_flag;
+};
+
+#endif // ALIGNMENT_H
Binary file snp_caller/src/Alignment.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/Fasta.cpp	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,33 @@
+#include "Fasta.h"
+
+#include <string>
+#include <fstream>
+#include <vector>
+#include <iostream>
+
+
+Fasta::Fasta(std::string amr_fp) : _amr_fp(amr_fp) {}
+
+void Fasta::read_fasta(const std::string &amr_fp) {
+        std::ifstream in(amr_fp.c_str());
+        if(!in) {
+                std::cerr << "Could not open fasta file " << amr_fp << std::endl;
+                exit(EXIT_FAILURE);
+        }
+
+        std::string gene_id, gene, line;
+        while(std::getline(in, line)) {
+                std::size_t gene_idx = line.find(" ");
+
+                if(gene_idx != std::string::npos)
+                        gene_id = line.substr(1, gene_idx-1);
+                else
+                        gene_id = line.substr(1, line.length());
+
+                std::getline(in, gene);
+                records.push_back(FastaRecord(gene_id, gene));
+        }
+        in.close();
+
+        FastaRecord::sort_by_gene_id(records);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/Fasta.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,13 @@
+#include "FastaRecord.h"
+#include <vector>
+#include <string>
+
+class Fasta {
+public:
+	Fasta(std::string amr_fp);
+	void read_fasta(const std::string &amr_fp);
+
+	std::vector<FastaRecord> records;	
+private:
+	std::string _amr_fp;
+};
Binary file snp_caller/src/Fasta.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/FastaRecord.cpp	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,27 @@
+#include <algorithm>
+
+#include "FastaRecord.h"
+
+FastaRecord::FastaRecord(std::string gene_id, std::string gene) :
+	_gene_id(gene_id), _gene(gene) {}
+
+int FastaRecord::find_gene(const std::vector<FastaRecord> & records,
+			   const std::string &gene_id,
+			   const std::string seq) {
+	int gene_index;
+	
+	std::vector<FastaRecord>::const_iterator low;
+	low = std::lower_bound(records.begin(), records.end(),
+			       FastaRecord(gene_id, seq),
+			       [](const FastaRecord &a, const FastaRecord &b)
+			       { return a.gene_id() < b.gene_id(); });
+	gene_index = (low - records.begin());
+
+	return gene_index;
+}
+
+void FastaRecord::sort_by_gene_id(std::vector<FastaRecord> &records) {
+	sort(records.begin(), records.end(),
+             [](const FastaRecord &a, const FastaRecord &b)
+	     { return a.gene_id() < b.gene_id(); });
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/FastaRecord.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,27 @@
+#ifndef FASTA_RECORD_H
+#define FASTA_RECORD_H
+
+#include <string>
+#include <vector>
+#include <map>
+
+class FastaRecord {
+public:
+	FastaRecord(std::string gene_id, std::string gene);
+	
+	inline std::string gene_id() const	{ return _gene_id; };
+	inline std::string gene() const		{ return _gene;	};
+
+	static int find_gene(const std::vector<FastaRecord> &records,
+			     const std::string &gene_id,
+			     std::string seq = "");
+	static void sort_by_gene_id(std::vector<FastaRecord> &records);
+
+	std::map<std::string, int> snip_database;
+private:
+	std::string _gene_id;
+	std::string _gene;
+};
+
+
+#endif //FASTA_RECORD_H
Binary file snp_caller/src/FastaRecord.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/Makefile	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,14 @@
+output: main.o Fasta.o FastaRecord.o Alignment.o Sam.o
+	g++ -std=c++11 main.o Fasta.o FastaRecord.o Alignment.o Sam.o -o snp
+main.o: main.cpp
+	g++ -c -std=c++11 main.cpp
+Fasta.o: Fasta.cpp
+	g++ -c -std=c++11 Fasta.cpp
+FastaRecord.o: FastaRecord.cpp
+	g++ -c -std=c++11 FastaRecord.cpp
+Alignment.o: Alignment.cpp
+	g++ -c -std=c++11 Alignment.cpp
+Sam.o: Sam.cpp
+	g++ -c -std=c++11 Sam.cpp
+clean:
+	rm *.o snp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/Sam.cpp	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,57 @@
+#include "Sam.h"
+#include "alignment_util.h"
+
+#include <string>
+#include <vector>
+#include <fstream>
+#include <iostream>
+
+Sam::Sam(std::string sam_fp) : _sam_fp(sam_fp) {}
+
+void Sam::read_se_sam(const std::string &sam_fp, bool best) {
+	std::ifstream in(sam_fp.c_str());
+	if(!in) {
+		std::cerr << "Could not open sam file " << sam_fp << std::endl;
+		exit(EXIT_FAILURE);
+	}
+
+	std::string alignment;
+	while(getline(in, alignment)) {
+		if(alignment[0] == '@')
+			continue;
+		//std::vector<std::string> parts = split_alignment(alignment);
+		if(se_fields_are_good(alignment, best)) {
+			alignments.push_back(alignment);
+		}
+	}
+	sort_by_qname(alignments);
+}
+
+void Sam::read_pe_sam(const std::string &sam_fp, bool best) {
+	std::ifstream in(sam_fp.c_str());
+	if(!in)	{
+		std::cerr << "Could not open sam file " << sam_fp << std::endl;
+		exit(EXIT_FAILURE);
+	}
+
+	std::string alignment;
+	while(getline(in, alignment)) {
+		if(alignment[0] == '@')
+			continue;
+		//std::vector<std::string> parts = split_alignment(alignment);
+		if(pe_fields_are_good(alignment, best)) {
+			alignments.push_back(alignment);
+		}
+	}
+	sort_by_qname(alignments);
+}
+
+void Sam::sort_by_qname(std::vector<Alignment> &v) {
+	sort(v.begin(), v.end(),
+	     [](const Alignment &a, const Alignment &b)
+	     { return a.qname() < b.qname(); });
+}
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/Sam.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,23 @@
+#ifndef SAM_H
+#define SAM_H
+
+#include "Alignment.h"
+
+class Sam {
+public:
+	Sam(std::string sam_fp);
+	void read_se_sam(const std::string &sam_fp, bool best);
+	void read_pe_sam(const std::string &sam_fp, bool best);
+	static void sort_by_qname(std::vector<Alignment> &v);
+
+	std::vector<Alignment> alignments;
+private:
+	std::string _sam_fp;
+
+};
+
+
+
+#endif //SAM_H
+
+
Binary file snp_caller/src/Sam.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/SnipDriver.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,226 @@
+#ifndef SNIP_DRIVER_H
+#define SNIP_DRIVER_H
+
+#include "Alignment.h"
+#include "FastaRecord.h"
+#include "args.h"
+
+#include <iostream>
+#include <string>
+#include <fstream>
+#include <algorithm>
+#include <set>
+
+typedef decltype(nullptr) nullptr_t;
+typedef std::vector<std::pair<int,char>> cigar_str;
+
+std::set<std::string> read_pair_snips;
+
+struct CompareSnips {
+	bool operator()(const std::string &first, const std::string &second) {
+		return first.length() < second.length();
+	}
+};
+
+
+Alignment *get_mate_pair(std::vector<Alignment> &alignments, const std::string &qname, const int pnext) {
+	for(auto &alignment : alignments) {
+		bool mate_pair = alignment.second_in_pair();
+		std::string mate_qname = alignment.qname();
+		int mate_pos = alignment.pos();
+
+		if( (mate_pair == 1) && (mate_qname == qname) && (mate_pos == pnext)) {
+			return &alignment;	
+		}
+	}
+	return nullptr;	
+}
+
+void parse_cigar(FastaRecord &record, Alignment &alignment) {
+	cigar_str cigar = alignment.get_cigar();
+
+	int occurrence;
+	char operation;
+
+	int gene_pos = alignment.pos()-1;
+	int read_pos = 0;
+
+	char base_in_ref, base_in_read;
+
+	std::string read = alignment.seq();
+	std::string qname = alignment.qname();
+
+	std::string gene = record.gene();
+	std::string gene_id = record.gene_id();
+
+	std::string snip_pattern;
+
+	for(int i = 0; i < cigar.size(); i++) {
+		occurrence = cigar[i].first;
+		operation = cigar[i].second;
+
+		if(operation == 'M' || operation == '=') {
+			for(int k = 0; k < occurrence; k++) {
+				if(gene_pos >= gene.length() || read_pos >= read.length()) {
+					break;
+				}
+
+				base_in_ref = gene[gene_pos];
+				base_in_read = read[read_pos];
+
+				if(base_in_ref != base_in_read) {
+					snip_pattern = std::to_string(gene_pos) +
+						       std::string(1,base_in_ref) + "->" +
+						       std::string(1, base_in_read) + ":"; 
+					read_pair_snips.insert(snip_pattern);			
+				}
+
+				++gene_pos;
+				++read_pos;
+			}
+		}
+		else if(operation == 'I') {
+			read_pos += occurrence;
+			gene_pos += occurrence;
+		}
+		else if(operation == 'S') {
+			read_pos += occurrence;
+		}
+		else if(operation == 'N') {
+			gene_pos += occurrence;
+		}
+		else if(operation == 'P') {
+			read_pos += occurrence;
+			gene_pos += occurrence;
+		}
+		else if(operation == 'D') {
+			gene_pos += occurrence;
+		}
+		else if(operation == 'X') {
+			read_pos += occurrence;
+			gene_pos += occurrence;
+		}
+		else {
+			std::cout << "Operation " << operation << " not handled." << std::endl;			
+		}
+	}
+}
+
+std::string convert_snips_to_pattern(const std::set<std::string> &read_pair_snips) {
+	std::vector<std::string> snips;
+	snips.assign(read_pair_snips.begin(), read_pair_snips.end());
+
+	CompareSnips comp;
+	sort(snips.begin(), snips.end(), comp);
+
+	std::string pattern;
+	for(const auto &snp : snips) {
+		pattern+=snp;
+	}
+	return pattern;
+}
+
+bool find_se_snips(std::vector<FastaRecord> &records,
+		   Alignment &read_pair) {
+	
+	int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname());
+	if(fasta_record_idx == records.size()) {
+		return false;
+	}
+
+	parse_cigar(records[fasta_record_idx], read_pair);
+	std::string snp = convert_snips_to_pattern(read_pair_snips);
+
+	if(!snp.empty()) {
+		records[fasta_record_idx].snip_database[snp]++;
+	}
+
+	read_pair_snips.clear();
+
+	return true;
+}
+
+bool find_pe_snips(std::vector<FastaRecord> &records, 
+		   std::vector<Alignment> &alignments,
+		   Alignment &read_pair) {
+
+	if(read_pair.first_in_pair()) {
+		std::string qname = read_pair.qname();
+		int mate_start_pos = read_pair.pnext();
+
+		Alignment *mate_pair = get_mate_pair(alignments, qname, mate_start_pos);
+		if(mate_pair == nullptr) {
+			return false;
+		}
+		if(mate_pair->rname () != read_pair.rname()) {
+			if(mate_pair->alternate_hits.size() == 0) {
+				return false;
+			}
+
+			bool found_alt_hit = false;
+			for(const auto &alternative: mate_pair->alternate_hits) {
+				if(alternative.rname == read_pair.rname()) {
+					found_alt_hit = true;
+					mate_pair->set_rname(alternative.rname);
+					mate_pair->set_cigar(alternative.cigar);
+					mate_pair->set_pos(alternative.pos);
+					break;
+				}
+			}
+			if(!found_alt_hit) {
+				return false;
+			}
+		}
+
+
+		int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname());
+		if(fasta_record_idx == records.size()) {
+			return false;
+		}
+
+		parse_cigar(records[fasta_record_idx], read_pair);
+		parse_cigar(records[fasta_record_idx], *mate_pair);
+
+		std::string snp = convert_snips_to_pattern(read_pair_snips);
+
+		if(!snp.empty()) {
+			records[fasta_record_idx].snip_database[snp]++;
+		}
+
+	
+		read_pair_snips.clear();
+	}
+
+	return true;
+}
+
+void run(std::vector<FastaRecord> &records, std::vector<Alignment> &alignments, cmd_args arg) {
+	if(arg.samse) {
+		for(int i = 0; i < alignments.size(); i++) {
+			find_se_snips(records, alignments[i]);
+		}
+	}
+	else {
+		for(int i = 0; i < alignments.size(); i++) {
+			find_pe_snips(records, alignments, alignments[i]);
+		}	
+	}
+}
+
+void write_header(const std::string &out_fp) {
+	std::ofstream out(out_fp.c_str());
+	out << "Chr,Haplotype,Frequency" << '\n';
+}
+
+void write_snips(std::vector<FastaRecord> &records, const std::string &out_fp) {
+	write_header(out_fp);
+	std::ofstream out(out_fp.c_str(), std::ofstream::app);
+	for(const auto &record : records) {
+		for(const auto &snip : record.snip_database) {
+			out << record.gene_id() << "," << snip.first << "," << snip.second << '\n';
+		}
+	}
+}
+
+
+#endif // SNIP_DRIVER_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/alignment_util.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,130 @@
+#ifndef ALIGNMENT_UTIL_H
+#define ALIGNMENT_UTIL_H
+
+#include "int_util.h"
+
+#include <string>
+#include <vector>
+
+#include <boost/algorithm/string.hpp>
+
+#define READ_PAIRED   1
+#define PROPER_PAIR   2
+#define READ_UNMAPPED 4
+#define MATE_UNMAPPED 8
+
+std::vector<std::string> split_alignment(std::string &alignment) {
+	std::vector<std::string> parts;
+	boost::trim_if(alignment, boost::is_any_of("\t "));
+	boost::split(parts, alignment, boost::is_any_of("\t "), boost::token_compress_on);
+	return parts;
+}
+
+bool is_good_se_flag(const int &flag) {
+	if( (flag & READ_UNMAPPED) > 0) {
+		return false;
+	}
+	else {
+		return true;
+	}
+}
+
+bool is_good_pe_flag(const int &flag) {
+	if( (flag & READ_UNMAPPED) > 0 || (flag & MATE_UNMAPPED) > 0 ||
+            (flag & READ_PAIRED) == 0 || (flag & PROPER_PAIR == 0) ) {
+		return false;
+	}
+	else {
+		return true;
+	}
+}
+
+bool is_good_rname(const std::string &rname) {
+	return rname.compare("*") != 0;
+}
+
+bool is_good_pos(const int &pos) {
+	return pos > 0;
+}
+
+bool is_good_pnext(const int &pnext) {
+	return pnext > 0;
+}
+
+bool is_good_cigar(const std::string &cigar) {
+	return cigar.compare("*") != 0;
+}
+
+bool is_good_seq(const std::string &seq) {
+	return seq.compare("*") != 0;
+}
+
+bool is_alignment_unique(const std::string &alignment) {
+	if(alignment.find("XT:A:U") != std::string::npos) {
+		return true;
+	}
+	else {
+		return false;
+	}
+}
+
+/*bool is_good_alignment(std::string &alignment) {
+	std::vector<std::string> alignment_parts;
+
+	alignment_parts = split_alignment(alignment);
+
+	if(!(fields_are_good(alignment_parts)))
+		return false;
+	else
+		return true;
+}*/
+
+bool se_fields_are_good(std::string &alignment, bool best) {
+	std::vector<std::string> parts = split_alignment(alignment);
+	int flag = s_to_i(parts[1]);
+	int pos = s_to_i(parts[3]);
+	
+	std::string rname = parts[2];
+	std::string cigar = parts[5];
+	std::string seq = parts[9];
+
+	if(!(is_good_se_flag(flag))) return false;
+	if(!(is_good_pos(pos))) return false;
+	if(!(is_good_rname(rname))) return false;
+	if(!(is_good_cigar(cigar))) return false;
+	if(!(is_good_seq(seq))) return false;
+	if(best) {
+		if(!(is_alignment_unique(alignment))) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
+bool pe_fields_are_good(std::string &alignment, bool best) {
+	std::vector<std::string> parts = split_alignment(alignment);
+	int flag = s_to_i(parts[1]);
+        int pos = s_to_i(parts[3]);
+	int pnext = s_to_i(parts[7]);
+
+        std::string rname = parts[2];
+        std::string cigar = parts[5];
+        std::string seq = parts[9];
+
+        if(!(is_good_pe_flag(flag))) return false;
+        if(!(is_good_pos(pos))) return false;
+        if(!(is_good_rname(rname))) return false;
+        if(!(is_good_cigar(cigar))) return false;
+	if(!(is_good_pnext(pnext))) return false;
+        if(!(is_good_seq(seq))) return false;
+	if(best) {
+		if(!(is_alignment_unique(alignment))) {
+			return false;
+		}
+	}
+
+        return true;		
+}
+
+#endif // ALIGNMENT_UTIL_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/args.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,62 @@
+#ifndef ARGS_H
+#define ARGS_H
+
+#include <string>
+#include <vector>
+
+void static usage() {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Program: SNP Finder \n");
+        fprintf(stderr, "Contact: Chris Dean <cdean11@rams.colostate.edu>\n\n");
+        fprintf(stderr, "Usage: snp [options] <fasta_file> <sam_file> <best> <output_file>\n\n");
+        fprintf(stderr, "Options:\n\n");
+        fprintf(stderr, "   -amr_fp STRING             amr database\n");
+        fprintf(stderr, "   -samse STRING           single-end alignments\n");
+        fprintf(stderr, "   -sampe STRING           paired-end alignments\n");
+        fprintf(stderr, "   -b BOOLEAN              filter on unique alignments\n");
+        fprintf(stderr, "   -out_fp STRING               output path or(file)\n\n");
+}
+
+struct cmd_args {
+	std::string amr_fp;
+	std::string sam_fp;
+	std::string out_fp;
+
+	bool best = false;
+	bool samse = false;
+	bool sampe = false;
+};
+
+static inline cmd_args
+parse_command_line(int argc, const char *argv[]) {
+	std::vector<std::string> args(argv, argv+argc);
+	
+	cmd_args arg;
+
+	for(int i = 1; i < argc; i++) {
+		if(args[i] == "-amr_fp") {
+			arg.amr_fp = args[++i];
+		}
+		else if(args[i] == "-samse") {
+			arg.sam_fp = argv[++i];		
+			arg.samse = true;
+		}
+		else if(args[i] == "-sampe") {
+			arg.sam_fp = argv[++i];
+			arg.sampe = true;
+		}
+		else if(args[i] == "-out_fp") {
+			arg.out_fp = argv[++i];
+		}
+		else if(args[i] == "-b") {
+			arg.best = true;		
+		}
+		else {
+			usage();
+			exit(EXIT_FAILURE);						
+		}
+	}
+	return arg;
+}
+
+#endif //ARGS_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/int_util.h	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,25 @@
+#ifndef INT_UTIL_H
+#define INT_UTIL_H
+
+#include <string>
+#include <sstream>
+
+inline
+int s_to_i(const std::string &pos) {
+	std::istringstream ss(pos);
+	int i;
+	ss >> i;
+	return i;
+}
+
+inline
+int strand_to_i(const std::string &pos) {
+	std::string s = pos.substr(1, pos.length());
+	std::istringstream ss(s);
+	int i;
+	ss >> i;
+	return i;
+}
+
+
+#endif // INT_UTIL_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_caller/src/main.cpp	Sun Feb 21 06:05:24 2016 -0500
@@ -0,0 +1,46 @@
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "args.h"
+#include "Fasta.h"
+#include "FastaRecord.h"
+#include "Sam.h"
+#include "SnipDriver.h"
+
+using namespace std;
+
+void print_fasta(const Fasta &f) {
+	for(const auto &record: f.records) {
+		cout << record.gene_id() << endl;
+		cout << record.gene() << endl;
+	}
+}
+
+void print_sam(const Sam &s) {
+	for(const auto &alignment: s.alignments) {
+		cout << alignment.alignment() << endl;
+	}
+}
+
+int main(int argc, const char *argv[]) {
+	cmd_args arg;
+	arg = parse_command_line(argc, argv);		
+
+	Fasta f(arg.amr_fp);
+	f.read_fasta(arg.amr_fp);	
+
+	Sam s(arg.sam_fp);
+	if(arg.samse) {
+		s.read_se_sam(arg.sam_fp, arg.best);
+	}
+	else {
+		s.read_pe_sam(arg.sam_fp, arg.best);
+	}
+
+	run(f.records, s.alignments, arg);
+	write_snips(f.records, arg.out_fp);
+
+
+	return 0;
+}
Binary file snp_caller/src/main.o has changed