Mercurial > repos > chrisd > testshed
changeset 0:0fd352f62446 draft default tip
planemo upload for repository https://github.com/ChrisD11/Duplicon commit 3ee0594c692faac542ffa58f4339d79b9b8aefbd-dirty
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction.xml Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,20 @@ +<tool id="gene_fraction" name="Gene Fraction" version="0.1.0"> + <description>for single-end or paired-end data</description> + <command>$__tool_directory__/csa -amr_fp $amr_fp -sam_fp $sam_fp -t $threshold -min $min -max $max -skip $skip -samples $samples -out_fp $out_fp</command> + <inputs> + <param format="fasta" name="amr_fp" type="data" label="Input fasta file"/> + <param format="sam" name ="sam_fp" type="data" label="Input SAM file"/> + <param name="threshold" size="3" type="integer" min="0" max="100" value="1" label="Threshold"/> + <param name="min" size="3" type="integer" min="1" max="100" value="1" label="Minimum level"/> + <param name="max" size="3" type="integer" min="1" max="100" value="1" label="Maximum level"/> + <param name="skip" size="3" type="integer" min="1" max="100" value="1" label="Skip pattern"/> + <param name="samples" size="3" type="integer" min="1" max="100" value="1" label="Samples per level"/> + </inputs> + <outputs> + <data format="tabular" name="out_fp" /> + </outputs> + <help> +This tool calculates the amount of genes that are covered by atleast one read. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Alignments.cpp Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,35 @@ +#include "Alignments.h" +#include "int_util.h" + +#include <boost/algorithm/string.hpp> + +#include <iostream> +#include <sstream> + +Alignments::Alignments(std::string alignment) : _alignment(alignment) { + fill_alignment_fields(alignment); +} + +void Alignments::fill_alignment_fields(const std::string &alignment) { + std::istringstream ss(alignment); + ss >> field.QNAME >> field.FLAG >> field.RNAME >> field.POS >> + field.MAPQ >> field.CIGAR >> field.RNEXT >> field.PNEXT >> + field.TLEN >> field.SEQ >> field.QUAL; +} + +std::vector<std::pair<int,char>> Alignments::cigar() { + return get_cigar_operations(field.CIGAR); +} + +std::vector<std::pair<int,char>> Alignments::get_cigar_operations(const std::string &cigar) { + std::vector<std::pair<int,char>> p; + int count; + char operation; + + std::istringstream ss(cigar); + while(ss >> count >> operation) { + p.push_back(std::make_pair(count, operation)); + } + + return p; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Alignments.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,68 @@ +#ifndef ALIGNMENTS_H +#define ALIGNMENTS_H + +#include <string> +#include <vector> + +/** + * Stores information about an alignment + */ +struct alignment_fields { + std::string QNAME; + int FLAG; + std::string RNAME; + int POS; + int MAPQ; + std::string CIGAR; + std::string RNEXT; + int PNEXT; + int TLEN; + std::string SEQ; + std::string QUAL; +}; + +/** + * Class for dealing with alignments + */ +class Alignments { +public: + /** + * Ctor that initializes alignment + */ + Alignments(std::string alignment); + + /** + * Stores information about each of the eleven + * required alignment fields + */ + void fill_alignment_fields(const std::string &alignment); + + std::vector<std::pair<int,char>> cigar(); + + inline std::string alignment() const { return _alignment; }; + + inline std::string qname() const { return field.QNAME; }; + inline std::string rname() const { return field.RNAME; }; + inline std::string cigar() const { return field.CIGAR; }; + inline std::string rnext() const { return field.RNEXT; }; + inline std::string seq() const { return field.SEQ; }; + inline std::string qual() const { return field.QUAL; }; + + inline int flag() const { return field.FLAG; }; + inline int pos() const { return field.POS; }; + inline int mapq() const { return field.MAPQ; }; + inline int pnext() const { return field.PNEXT; }; + inline int tlen() const { return field.TLEN; }; + +private: + /** + * Returns a pair of cigar operations as (occurrence, operation) + * Ex: 10M5I -> (10, M), (5, I) + */ + std::vector<std::pair<int,char>> get_cigar_operations(const std::string &cigar); + + std::string _alignment; + alignment_fields field; +}; + +#endif /* ALIGNMENTS_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Fasta.cpp Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,38 @@ +#include "Fasta.h" + +#include <iostream> +#include <fstream> +#include <vector> +#include <string> + +Fasta::Fasta(std::string amr_fp) : _amr_fp(amr_fp) {} + +void Fasta::read_fasta(const std::string &amr_fp) { + std::ifstream in(amr_fp.c_str()); + if(!in) { + std::cerr << "Could not open fasta file " << amr_fp << std::endl; + exit(EXIT_FAILURE); + } + + std::string gene_id, gene, line; + while(std::getline(in, line)) { + std::size_t gene_idx = line.find(" "); + + if(gene_idx != std::string::npos) + gene_id = line.substr(1, gene_idx-1); + else + gene_id = line.substr(1, line.length()); + + std::getline(in, gene); + records.push_back(FastaRecord(gene_id, gene)); + } + in.close(); + + FastaRecord::sort_by_gene_id(records); +} + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Fasta.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,28 @@ +#ifndef FASTA_H +#define FASTA_H + +#include <string> +#include <vector> +#include "FastaRecord.h" + +/** + * Class for dealing with fasta files + */ +class Fasta { +public: + /** + * Constructor that sets amr file path + */ + Fasta(std::string amr_fp); + + /** + * Reads fasta file from file path + */ + void read_fasta(const std::string &amr_fp); + + std::vector<FastaRecord> records; +private: + std::string _amr_fp; +}; + +#endif /* FASTA_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/FastaRecord.cpp Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,62 @@ +#include "FastaRecord.h" + +#include <algorithm> + +FastaRecord::FastaRecord(std::string gene_id, std::string gene) : + _gene_id(gene_id), _gene(gene), _base_hits(_gene.length(), 0), + _gene_hits(0) {} + +std::string FastaRecord::gene_id() const { return _gene_id; } + +std::string FastaRecord::gene() const { return _gene; } + +void FastaRecord::update_gene_hits() { + _gene_hits++; +} + +int FastaRecord::gene_hits() const { + return _gene_hits; +} + +int FastaRecord::get_base_hits() const { + return static_cast<int>(count(_base_hits.begin(), _base_hits.end(), 1)); +} + +int FastaRecord::find_gene(const std::vector<FastaRecord> &records, + const std::string &gene_id, std::string seq) { + int gene_index; + + std::vector<FastaRecord>::const_iterator low; + // binary search for fasta record index + low = std::lower_bound(records.begin(), records.end(), FastaRecord(gene_id, seq), + [](const FastaRecord &a, const FastaRecord &b) + { return a.gene_id() < b.gene_id(); }); + gene_index = (low - records.begin()); + + return gene_index; +} + +void FastaRecord::sort_by_gene_id(std::vector<FastaRecord> &records) { + // sort records by gene id + sort(records.begin(), records.end(), [](const FastaRecord &a, const FastaRecord &b) { return a.gene_id() < b.gene_id(); }); +} + +void FastaRecord::reset_base_hits(std::vector<FastaRecord> &records) { + for_each(records.begin(), records.end(), [](FastaRecord &a) { std::fill(a.base_hits().begin(), a.base_hits().end(), 0); }); +} + +void FastaRecord::reset_gene_hits(std::vector<FastaRecord> &records) { + for_each(records.begin(), records.end(), [](FastaRecord &a) { a._gene_hits = 0; }); +} + +std::vector<int> &FastaRecord::base_hits() { + return _base_hits; +} + + + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/FastaRecord.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,86 @@ +#ifndef FASTA_RECORD_H +#define FASTA_RECORD_H + +#include <string> +#include <vector> + +/** + * Class for dealing with fasta records + */ +class FastaRecord { +public: + /** + * Ctor that initializes gene id and gene + */ + FastaRecord(std::string gene_id, std::string gene); + + /** + * Returns a string gene id + */ + std::string gene_id() const; + + /** + * Returns the gene associated with gene id + */ + std::string gene() const; + + /** + * Returns the total base hits for a gene + */ + int get_base_hits() const; + + /** + * Returns the amount of genes that were hit + * during the gene fraction calculation + */ + int gene_hits() const; + + /** + * + */ + void update_base_hits(const int &index); + + /** + * + */ + void update_gene_hits(); + + /** + * Searches for a fasta record corresponding + * to gene id + */ + static int find_gene(const std::vector<FastaRecord> &records, + const std::string &gene_id, + std::string seq = ""); + + /** + * Sorts fasta records by gene id + */ + static void sort_by_gene_id(std::vector<FastaRecord> &records); + + /** + * Resets base hits vector to 0's. + * This occurs after each sample is processed + */ + static void reset_base_hits(std::vector<FastaRecord> &records); + + /** + * Resets gene hits primitive to 0. + * This happens after each sample is processed + */ + static void reset_gene_hits(std::vector<FastaRecord> &records); + + std::vector<int> &base_hits(); + + std::string _gene_id; + std::string _gene; + std::vector<int> _base_hits; + +private: + int _gene_hits; +}; + + + + +#endif /* FASTA_RECORD_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Makefile Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,14 @@ +output: main.o Sam.o Alignments.o Fasta.o FastaRecord.o + g++ -std=c++11 main.o Sam.o Alignments.o Fasta.o FastaRecord.o -o csa +main.o: main.cpp + g++ -c -std=c++11 main.cpp +Sam.o: Sam.cpp + g++ -c -std=c++11 Sam.cpp +Alignments.o: Alignments.cpp + g++ -c -std=c++11 Alignments.cpp +Fasta.o: Fasta.cpp + g++ -c -std=c++11 Fasta.cpp +FastaRecord.o: FastaRecord.cpp + g++ -c -std=c++11 FastaRecord.cpp +clean: + rm *.o csa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Sam.cpp Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,40 @@ +#include "Sam.h" +#include "dir_util.h" +#include "alignment_util.h" + +#include <iostream> +#include <fstream> + +Sam::Sam(std::string sam_fp) : _sam_fp(sam_fp) {} + +void Sam::read_sam(cmd_args args) { + if(args.bam_stream) read_from_stdin(); + else read_from_file(args.sam_fp); +} + +void Sam::read_from_stdin() { + std::string line; + while(std::getline(std::cin, line)) { + if(line[0] == '@') continue; + alignment.push_back(line); + } +} + +void Sam::read_from_file(const std::string &sam_fp) { + std::ifstream in(sam_fp.c_str()); + if(!in) { + std::cerr << "Could not open sam file " << sam_fp << std::endl; + exit(EXIT_FAILURE); + } + + std::string line; + while(getline(in, line)) { + if(line[0] == '@') continue; + if(is_good_alignment(line)) + alignment.push_back(line); + } + + in.close(); +} + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Sam.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,48 @@ +#ifndef SAM_H +#define SAM_H + +#include <string> +#include <vector> + +#include "args.h" +#include "Alignments.h" + +/** + * Class for dealing with sam files + */ + +class Sam { +public: + /** + * Ctor initializes sam file path + */ + Sam(std::string sam_fp); + void read_sam(cmd_args args); + + /** + * Reads sam file from stdin + */ + void read_from_stdin(); + + /** + * Reads sam file from directory or file path + */ + void read_from_file(const std::string &sam_fp); + + /** + * + */ + void read_from_dir(const std::string &sam_dir_fp); + + std::vector<Alignments> alignment; + +private: + std::string _sam_fp; +}; + + + + + +#endif /* SAM_H */ +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/SamRatio.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,248 @@ +#ifndef SAM_RATIO_H +#define SAM_RATIO_H + +#include <iostream> +#include <fstream> +#include <algorithm> + +#include "FastaRecord.h" +#include "Alignments.h" +#include "args.h" + +typedef std::vector<std::pair<int,char>> cigar_str; + +/** + * + */ +struct header { + std::string level = "Level"; + std::string iteration = "Iteration"; + std::string gene_id = "Gene id"; + std::string gene_fraction = "Gene Fraction"; + std::string hits = "Hits"; +}; + +/** + * Reports the total number of bases that were touched for each + * gene. This is largely calculated using the positional and seq + * information found in fields four and ten of each alignment + */ +void analyze_coverage(FastaRecord &record, Alignments &alignment) { + record.update_gene_hits(); + + cigar_str cigar = alignment.cigar(); + + int len; + char op; + + int occurrence; + int pos_in_gene = alignment.pos(); + + int start, stop; + int base_hits = record._base_hits.size(); // func this + int read_length = alignment.seq().length(); //func this + + if(pos_in_gene == 1) { + occurrence = 0; + for(int i = 0; i < cigar.size(); i++) { + len = cigar[i].first; + op = cigar[i].second; + + switch(op) { + case 'M': + occurrence += len; + break; + case 'I': + occurrence += len; + break; + default: + break; + } + } + + start = read_length - occurrence; + stop = start + read_length; + + for(int i = start; i < base_hits; i++) { + if(i == stop) break; + record._base_hits[i] = 1; + } + } + else { + start = pos_in_gene - 1; + stop = start + read_length; + + for(int i = start; i < base_hits; i++) { + if(i == stop) break; + record._base_hits[i] = 1; + } + } +} + +/** + * Returns gene fraction of fasta record + * Returns -1 if gene fraction is not greater than threshold + */ +double coverage(const FastaRecord &record, const int &threshold) { + double gene_coverage; + + int base_hits, gene_size; + + base_hits = record.get_base_hits(); + gene_size = record.gene().length(); + + gene_coverage = (static_cast<double>(base_hits)/static_cast<double>(gene_size))*100; + + if(gene_coverage > threshold) + return gene_coverage; + return -1; +} + +/** + * Writes header to output file when + * reading from stdin + */ +void bam_stream_header() { + header h; + char sep = ','; + + std::cout << h.level << sep << h.iteration << sep + << h.gene_id << sep << h.gene_fraction << sep + << h.hits << '\n'; +} + +/** + * Writes header to output file when + * reading from sam file + */ +void file_header(const std::string &out_fp, const std::string &sam_fp) { + header h; + std::ofstream out(out_fp.c_str(), std::ofstream::app ); + char sep = ','; + + out << "@" << sam_fp << '\n'; + out << h.level << sep << h.iteration << sep + << h.gene_id << sep << h.gene_fraction << sep + << h.hits << '\n'; + out.close(); +} + +/** + * + */ +void create_header(cmd_args &args) { + if(args.bam_stream) { + bam_stream_header(); + } + else { + file_header(args.out_fp, args.sam_fp); + } +} + +/** + * Writes results to output file when reading from + * stdin + */ +void bam_stream_results(std::vector<FastaRecord> &records, + const int &level, const int &iteration, + cmd_args &args) { + + double gene_fraction; + int hits_seen; + std::string id; + char sep = ','; + + for(auto &rec : records) { + gene_fraction = coverage(rec, args.threshold); + hits_seen = rec.gene_hits(); + id = rec.gene_id(); + + if(gene_fraction > 0) { + std::cout << level << sep << iteration << sep + << id << sep << gene_fraction << sep + << hits_seen << '\n'; + } + } +} + +/** + * Write results when reading sam file from + * path + */ +void file_results(std::vector<FastaRecord> &records, + const int level, const int &iteration, + cmd_args &args) { + + std::ofstream out(args.out_fp.c_str(), std::ofstream::app); + + double gene_fraction; + int hits_seen; + std::string id; + char sep = ','; + + for(auto &rec : records) { + gene_fraction = coverage(rec, args.threshold); + hits_seen = rec.gene_hits(); + id = rec.gene_id(); + + if(gene_fraction > 0) { + out << level << sep << iteration << sep + << id << sep << gene_fraction << sep + << hits_seen << '\n'; + } + } + out.close(); +} + +/** + * + */ +void report_results(std::vector<FastaRecord> &records, + const int level, const int &iteration, + cmd_args &args) { + + if(args.bam_stream) { + bam_stream_results(records,level,iteration,args); + } + else { + file_results(records,level,iteration,args); + } +} + +/** + * Generates a sequence of samples from sam file specified + * by the starting level, max level, and skip pattern + */ +void generate_samples(std::vector<FastaRecord> &records, + std::vector<Alignments> &alignments, + cmd_args &args) { + + int read_count = alignments.size(); + int sample_size; + + srand(unsigned(time(0))); + + std::vector<int> sequence(read_count); + iota(sequence.begin(), sequence.end(), 0); + + create_header(args); + + for(int level = args.min; level <= args.max; level += args.skip) { + for(int sample = 0; sample < args.s_per_lev; sample++) { + random_shuffle(sequence.begin(), sequence.end(), randomize); + sample_size = round(((static_cast<double>(level)/100)*read_count)); + std::vector<int> chosen(sequence.begin(), sequence.begin()+sample_size); + + for(int a_idx = 0; a_idx < chosen.size(); a_idx++) { + std::string rname = alignments[chosen[a_idx]].rname(); + int gene_idx = FastaRecord::find_gene(records, rname); + analyze_coverage(records[gene_idx], alignments[chosen[a_idx]]); + } + report_results(records,level,sample+1,args); + FastaRecord::reset_base_hits(records); + FastaRecord::reset_gene_hits(records); + } + } +} + +#endif /* SAM_RATIO_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/alignment_util.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,96 @@ +#ifndef ALIGNMENT_UTIL_H +#define ALIGNMENT_UTIL_H + +#include <string> +#include <vector> + +#include <boost/algorithm/string.hpp> + +#include "int_util.h" + +// macro to check if read mapped +#define READ_UNMAPPED 4 + +/** + * Splits alignment into separate parts + */ +std::vector<std::string> split_alignment(std::string &alignment) { + std::vector<std::string> parts; + + boost::trim_if(alignment, boost::is_any_of("\t ")); + // split on tab delimeter + boost::split(parts, alignment, boost::is_any_of("\t "), boost::token_compress_on); + + return parts; +} + +/** + * Validates bit flag + */ +bool is_good_flag(const int &bit_flag) { + if( (bit_flag & READ_UNMAPPED) > 0) return false; + return true; +} + +/** + * Validates rname + */ +bool is_good_rname(const std::string &rname) { + return rname.compare("*") != 0; +} + +/** + * Validates pos + */ +bool is_good_pos(const int &pos) { + return pos > 0; +} + +/** + * Validates cigar + */ +bool is_good_cigar(const std::string &cigar) { + return cigar.compare("*") != 0; +} + +/** + * Validates seq + */ +bool is_good_seq(const std::string &seq) { + return seq.compare("*") != 0; +} + +/** + * Validates alignment fields + */ +bool fields_are_good(std::vector<std::string> &parts) { + int bit_flag = s_to_i(parts[1]); + int pos = s_to_i(parts[3]); + + std::string rname = parts[2]; + std::string cigar = parts[5]; + std::string seq = parts[9]; + + if(!(is_good_flag(bit_flag))) return false; + if(!(is_good_pos(pos))) return false; + if(!(is_good_rname(rname))) return false; + if(!(is_good_cigar(cigar))) return false; + if(!(is_good_seq(seq))) return false; + + return true; +} + +/** + * Stores alignments that pass validity checks + */ +bool is_good_alignment(std::string &alignment) { + std::vector<std::string> alignment_parts; + + alignment_parts = split_alignment(alignment); + + if(!(fields_are_good(alignment_parts))) + return false; + return true; +} + +#endif /* ALIGNMENT_UTIL_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/args.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,68 @@ +#ifndef ARGS_H +#define ARGS_H + +#include "int_util.h" + +#include <string> +#include <vector> +#include <list> + +/** + * Encapsulates information input + * from the command line. + */ +struct cmd_args { + std::string amr_fp; + std::string sam_fp; + std::list<std::string> sam_dir_fp; + std::string out_fp; + + int threshold; + int min; + int max; + int skip; + int s_per_lev; + + bool sam_dir = false; /* This will be set to true when parsing a + directory of sam files. */ + bool bam_stream = false; /* This will be set to true when executing + samtools view -h example.bam | csa > output + from the command line. */ +}; + +/** + * Returns a struct of command line arguments. + */ +static inline cmd_args +parse_command_line(int argc, char *argv[]) { + std::vector<std::string> args(argv, argv + argc); + + cmd_args arg; + + for(int i = 1; i < argc; i++) { + if(args[i].compare("-amr_fp") == 0) + arg.amr_fp = args[++i]; + else if(args[i].compare("-sam_fp") == 0) + arg.sam_fp = args[++i]; + else if(args[i].compare("-out_fp") == 0) + arg.out_fp = args[++i]; + else if(args[i].compare("-t") == 0) + arg.threshold = s_to_i(args[++i]); + else if(args[i].compare("-min") == 0) + arg.min = s_to_i(args[++i]); + else if(args[i].compare("-max") == 0) + arg.max = s_to_i(args[++i]); + else if(args[i].compare("-skip") == 0) + arg.skip = s_to_i(args[++i]); + else if(args[i].compare("-samples") == 0) + arg.s_per_lev = s_to_i(args[++i]); + else if(args[i].compare("-d") == 0) + arg.sam_dir = true; + else if(args[i].compare("-bam") == 0) + arg.bam_stream = true; + } + + return arg; +} + +#endif /* ARGS_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/dir_util.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,43 @@ +#ifndef DIR_UTIL_H +#define DIR_UTIL_H + +#include <list> +#include <iostream> +#include <string> +#include <dirent.h> + +/* + * Parses a directory of sam files + */ +static inline std::list<std::string> +parse_sam_dir(const std::string &directory) { + DIR *dir; + struct dirent *ent; + std::string dir_path = directory; + + dir = opendir(dir_path.c_str()); + // is dir open/valid? + if(dir == NULL) { + std::cout << "Not a valid directory" << std::endl; + exit(EXIT_FAILURE); + } + + std::list<std::string> sam_files; + std::string fn; + std::string ext; + std::string file_type = ".sam"; + + // get all files with a .sam file extension + while((ent = readdir(dir)) != NULL) { + fn = dir_path + std::string(ent->d_name); + ext = fn.substr(fn.length()-4, fn.length()); + if(ext.compare(file_type) == 0) { + sam_files.push_back(fn); + } + } + closedir(dir); + + return sam_files; +} + +#endif /* DIR_UTIL_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/int_util.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,29 @@ +#ifndef INT_UTIL_H +#define INT_UTIL_H + +#include <string> +#include <sstream> + +/** + * Given a string, return its integer. + */ +static inline int +s_to_i(const std::string &s) { + std::istringstream ss(s); + int i; + ss >> i; + return i; +} + +/** + * Given an integer, return a random number + * between 0 and i. + */ +static inline int +randomize(const int &i) { + return rand() % i; +} + +#endif /*INT_UTIL_H */ + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/main.cpp Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,37 @@ +#include <string> +#include <iostream> +#include <vector> + +#include "int_util.h" +#include "dir_util.h" +#include "args.h" +#include "Fasta.h" +#include "Sam.h" +#include "SamRatio.h" + +using namespace std; + +int main(int argc, char *argv[]) { + cmd_args args; + args = parse_command_line(argc, argv); + + Fasta f(args.amr_fp); + f.read_fasta(args.amr_fp); + + if(args.sam_dir) { + list<string> sam_files = parse_sam_dir(args.sam_fp); + for(auto &fn : sam_files) { + args.sam_fp = fn; + Sam s(args.sam_fp); + s.read_sam(args); + generate_samples(f.records, s.alignment, args); + } + } + else { + Sam s(args.sam_fp); + s.read_sam(args); + generate_samples(f.records, s.alignment, args); + } + + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller.xml Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,23 @@ +<tool id="snp_caller" name="Identify Snips" version="0.1.0"> + <description>for single-end or paired-end data</description> + <command>$__tool_directory__/snp -amr_fp $amr_fp -samse $samse -out_fp $out_fp</command> + <inputs> + <param format="fasta" name="amr_fp" type="data" label="Input fasta file"/> + <param format="sam" name ="samse" type="data" label="Input SAM file"/> + </inputs> + <outputs> + <data format="tabular" name="out_fp" /> + </outputs> + +<tests> + <test> + <param name="amr_fp" value="amr.fa" /> + <param name="samse" value="samse.sam" /> + <param name="out_fp" file="res" /> + </test> +</tests> + <help> +This tool identifies single nucleotide polymorphisms from both single-end and paired-end data. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/Alignment.cpp Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,96 @@ +#include "Alignment.h" +#include "int_util.h" + +#include <boost/algorithm/string.hpp> + +#include <iostream> +#include <sstream> +#include <algorithm> + +#define READ_PAIRED 1 +#define READ_MAPPED_IN_PROPER_PAIR 2 +#define READ_UNMAPPED 4 +#define MATE_UNMAPPED 8 +#define READ_REVERSE_STRAND 16 +#define MATE_REVERSE_STRAND 32 +#define FIRST_IN_PAIR 64 +#define SECOND_IN_PAIR 128 +#define NOT_PRIMARY_ALIGNMENT 256 +#define READ_FAILS_VENDOR_QUALITY_CHECKS 512 +#define READ_IS_PCR_OR_OPTICAL_DUPLICATE 1024 +#define SUPPLEMENTARY_ALIGNMENT 2048 + +Alignment::Alignment(std::string alignment) : _alignment(alignment) { + fill_alignment_fields(alignment); + fill_bit_flag(field.FLAG); + fill_xa_field(alignment); +} + +void Alignment::fill_alignment_fields(const std::string &alignment) { + std::istringstream ss(alignment); + ss >> field.QNAME >> field.FLAG >> field.RNAME >> field.POS >> + field.MAPQ >> field.CIGAR >> field.RNEXT >> field.PNEXT >> + field.TLEN >> field.SEQ >> field.QUAL; +} + +void Alignment::fill_bit_flag(const int &flag) { + if( (flag & READ_PAIRED) == 1) b_flag.read_paired = true; + if( (flag & READ_MAPPED_IN_PROPER_PAIR) > 1) b_flag.read_mapped_in_proper_pair = true; + if( (flag & READ_UNMAPPED) > 1) b_flag.read_unmapped = true; + if( (flag & MATE_UNMAPPED) > 1) b_flag.mate_unmapped = true; + if( (flag & READ_REVERSE_STRAND) > 1) b_flag.read_reverse_strand = true; + if( (flag & MATE_REVERSE_STRAND) > 1) b_flag.mate_reverse_strand = true; + if( (flag & FIRST_IN_PAIR) > 1) b_flag.first_in_pair = true; + if( (flag & SECOND_IN_PAIR) > 1) b_flag.second_in_pair = true; +} + +// XA:Z:mef(A)_10_AF376746,+81,92M,2; +bool Alignment::fill_xa_field(const std::string &alignment) { + xa_fields hits; + std::string alt_fields; + int offset = 5; + std::size_t alt_index = alignment.find("XA:Z:"); + + if(alt_index != std::string::npos) { + alt_fields = alignment.substr(alt_index+offset, alignment.length()); + } + else { + return false; + } + + std::string field; + std::istringstream ss(alt_fields); + + while(std::getline(ss, field, ',')) { + hits.rname = field; + //std::cout << hits.rname << std::endl; + getline(ss, field, ','); + hits.pos = strand_to_i(field); + //std::cout << hits.pos << std::endl; + getline(ss, field, ','); + hits.cigar = field; + //std::cout << hits.cigar << std::endl; + getline(ss, field, ';'); + hits.edit = 0; + alternate_hits.push_back(hits); + } + + return true; +} + +std::vector<std::pair<int,char> > Alignment::get_cigar() { + return get_cigar_operations(field.CIGAR); +} + +std::vector<std::pair<int,char> > Alignment::get_cigar_operations(const std::string &cigar) { + std::vector<std::pair<int,char>> p; + int count; + char operation; + + std::istringstream ss(cigar); + while(ss >> count >> operation) { + p.push_back(std::make_pair(count,operation)); + } + + return p; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/Alignment.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,91 @@ +#ifndef ALIGNMENT_H +#define ALIGNMENT_H + +#include <string> +#include <vector> +#include <algorithm> + +struct bit_flag { + /* SAM flags in English */ + bool read_paired = false; + bool read_mapped_in_proper_pair = false; + bool read_unmapped = false; + bool mate_unmapped = false; + bool read_reverse_strand = false; + bool mate_reverse_strand = false; + bool first_in_pair = false; + bool second_in_pair = false; +}; + +struct alignment_fields { + std::string QNAME; + int FLAG; + std::string RNAME; + int POS; + std::string MAPQ; + std::string CIGAR; + std::string RNEXT; + int PNEXT; + int TLEN; + std::string SEQ; + std::string QUAL; +}; + +class Alignment { +public: + Alignment(std::string alignment); + void fill_alignment_fields(const std::string &alignment); + void fill_bit_flag(const int &flag); + bool fill_xa_field(const std::string &alignment); + std::vector<std::pair<int,char> > get_cigar(); + + inline std::string alignment() const { return _alignment; }; + inline std::string qname() const { return field.QNAME; }; + inline std::string rname() const { return field.RNAME; }; + inline std::string mapq() const { return field.MAPQ; }; + inline std::string cigar() const { return field.CIGAR; }; + inline std::string seq() const { return field.SEQ; }; + inline std::string rnext() const { return field.RNEXT; }; + inline int flag() const { return field.FLAG; }; + inline int pos() const { return field.POS; }; + inline int pnext() const { return field.PNEXT; }; + inline int tlen() const { return field.TLEN; }; + + inline bool read_paired() const { return b_flag.read_paired; }; + inline bool read_mapped_in_proper_pair() const { return b_flag.read_mapped_in_proper_pair; }; + inline bool read_unmapped() const { return b_flag.read_unmapped; }; + inline bool mate_unmapped() const { return b_flag.mate_unmapped; }; + inline bool read_reverse_strand() const { return b_flag.read_reverse_strand; } + inline bool mate_reverse_strand() const { return b_flag.mate_reverse_strand; }; + inline bool first_in_pair() const { return b_flag.first_in_pair; }; + inline bool second_in_pair() const { return b_flag.second_in_pair; }; + + inline void set_rname(std::string rname) { + field.RNAME = rname; + } + inline void set_cigar(std::string cigar) { + field.CIGAR = cigar; + } + inline void set_pos(int pos) { + field.POS = pos; + } + + struct xa_fields { + std::string rname; + std::string cigar; + int pos; + int edit; + }; + + std::vector<xa_fields> alternate_hits; +private: + std::vector<std::pair<int,char> > get_cigar_operations(const std::string &cigar); + + std::string _alignment; + + alignment_fields field; + + bit_flag b_flag; +}; + +#endif // ALIGNMENT_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/Fasta.cpp Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,33 @@ +#include "Fasta.h" + +#include <string> +#include <fstream> +#include <vector> +#include <iostream> + + +Fasta::Fasta(std::string amr_fp) : _amr_fp(amr_fp) {} + +void Fasta::read_fasta(const std::string &amr_fp) { + std::ifstream in(amr_fp.c_str()); + if(!in) { + std::cerr << "Could not open fasta file " << amr_fp << std::endl; + exit(EXIT_FAILURE); + } + + std::string gene_id, gene, line; + while(std::getline(in, line)) { + std::size_t gene_idx = line.find(" "); + + if(gene_idx != std::string::npos) + gene_id = line.substr(1, gene_idx-1); + else + gene_id = line.substr(1, line.length()); + + std::getline(in, gene); + records.push_back(FastaRecord(gene_id, gene)); + } + in.close(); + + FastaRecord::sort_by_gene_id(records); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/Fasta.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,13 @@ +#include "FastaRecord.h" +#include <vector> +#include <string> + +class Fasta { +public: + Fasta(std::string amr_fp); + void read_fasta(const std::string &amr_fp); + + std::vector<FastaRecord> records; +private: + std::string _amr_fp; +};
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/FastaRecord.cpp Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,27 @@ +#include <algorithm> + +#include "FastaRecord.h" + +FastaRecord::FastaRecord(std::string gene_id, std::string gene) : + _gene_id(gene_id), _gene(gene) {} + +int FastaRecord::find_gene(const std::vector<FastaRecord> & records, + const std::string &gene_id, + const std::string seq) { + int gene_index; + + std::vector<FastaRecord>::const_iterator low; + low = std::lower_bound(records.begin(), records.end(), + FastaRecord(gene_id, seq), + [](const FastaRecord &a, const FastaRecord &b) + { return a.gene_id() < b.gene_id(); }); + gene_index = (low - records.begin()); + + return gene_index; +} + +void FastaRecord::sort_by_gene_id(std::vector<FastaRecord> &records) { + sort(records.begin(), records.end(), + [](const FastaRecord &a, const FastaRecord &b) + { return a.gene_id() < b.gene_id(); }); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/FastaRecord.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,27 @@ +#ifndef FASTA_RECORD_H +#define FASTA_RECORD_H + +#include <string> +#include <vector> +#include <map> + +class FastaRecord { +public: + FastaRecord(std::string gene_id, std::string gene); + + inline std::string gene_id() const { return _gene_id; }; + inline std::string gene() const { return _gene; }; + + static int find_gene(const std::vector<FastaRecord> &records, + const std::string &gene_id, + std::string seq = ""); + static void sort_by_gene_id(std::vector<FastaRecord> &records); + + std::map<std::string, int> snip_database; +private: + std::string _gene_id; + std::string _gene; +}; + + +#endif //FASTA_RECORD_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/Makefile Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,14 @@ +output: main.o Fasta.o FastaRecord.o Alignment.o Sam.o + g++ -std=c++11 main.o Fasta.o FastaRecord.o Alignment.o Sam.o -o snp +main.o: main.cpp + g++ -c -std=c++11 main.cpp +Fasta.o: Fasta.cpp + g++ -c -std=c++11 Fasta.cpp +FastaRecord.o: FastaRecord.cpp + g++ -c -std=c++11 FastaRecord.cpp +Alignment.o: Alignment.cpp + g++ -c -std=c++11 Alignment.cpp +Sam.o: Sam.cpp + g++ -c -std=c++11 Sam.cpp +clean: + rm *.o snp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/Sam.cpp Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,57 @@ +#include "Sam.h" +#include "alignment_util.h" + +#include <string> +#include <vector> +#include <fstream> +#include <iostream> + +Sam::Sam(std::string sam_fp) : _sam_fp(sam_fp) {} + +void Sam::read_se_sam(const std::string &sam_fp, bool best) { + std::ifstream in(sam_fp.c_str()); + if(!in) { + std::cerr << "Could not open sam file " << sam_fp << std::endl; + exit(EXIT_FAILURE); + } + + std::string alignment; + while(getline(in, alignment)) { + if(alignment[0] == '@') + continue; + //std::vector<std::string> parts = split_alignment(alignment); + if(se_fields_are_good(alignment, best)) { + alignments.push_back(alignment); + } + } + sort_by_qname(alignments); +} + +void Sam::read_pe_sam(const std::string &sam_fp, bool best) { + std::ifstream in(sam_fp.c_str()); + if(!in) { + std::cerr << "Could not open sam file " << sam_fp << std::endl; + exit(EXIT_FAILURE); + } + + std::string alignment; + while(getline(in, alignment)) { + if(alignment[0] == '@') + continue; + //std::vector<std::string> parts = split_alignment(alignment); + if(pe_fields_are_good(alignment, best)) { + alignments.push_back(alignment); + } + } + sort_by_qname(alignments); +} + +void Sam::sort_by_qname(std::vector<Alignment> &v) { + sort(v.begin(), v.end(), + [](const Alignment &a, const Alignment &b) + { return a.qname() < b.qname(); }); +} + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/Sam.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,23 @@ +#ifndef SAM_H +#define SAM_H + +#include "Alignment.h" + +class Sam { +public: + Sam(std::string sam_fp); + void read_se_sam(const std::string &sam_fp, bool best); + void read_pe_sam(const std::string &sam_fp, bool best); + static void sort_by_qname(std::vector<Alignment> &v); + + std::vector<Alignment> alignments; +private: + std::string _sam_fp; + +}; + + + +#endif //SAM_H + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/SnipDriver.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,226 @@ +#ifndef SNIP_DRIVER_H +#define SNIP_DRIVER_H + +#include "Alignment.h" +#include "FastaRecord.h" +#include "args.h" + +#include <iostream> +#include <string> +#include <fstream> +#include <algorithm> +#include <set> + +typedef decltype(nullptr) nullptr_t; +typedef std::vector<std::pair<int,char>> cigar_str; + +std::set<std::string> read_pair_snips; + +struct CompareSnips { + bool operator()(const std::string &first, const std::string &second) { + return first.length() < second.length(); + } +}; + + +Alignment *get_mate_pair(std::vector<Alignment> &alignments, const std::string &qname, const int pnext) { + for(auto &alignment : alignments) { + bool mate_pair = alignment.second_in_pair(); + std::string mate_qname = alignment.qname(); + int mate_pos = alignment.pos(); + + if( (mate_pair == 1) && (mate_qname == qname) && (mate_pos == pnext)) { + return &alignment; + } + } + return nullptr; +} + +void parse_cigar(FastaRecord &record, Alignment &alignment) { + cigar_str cigar = alignment.get_cigar(); + + int occurrence; + char operation; + + int gene_pos = alignment.pos()-1; + int read_pos = 0; + + char base_in_ref, base_in_read; + + std::string read = alignment.seq(); + std::string qname = alignment.qname(); + + std::string gene = record.gene(); + std::string gene_id = record.gene_id(); + + std::string snip_pattern; + + for(int i = 0; i < cigar.size(); i++) { + occurrence = cigar[i].first; + operation = cigar[i].second; + + if(operation == 'M' || operation == '=') { + for(int k = 0; k < occurrence; k++) { + if(gene_pos >= gene.length() || read_pos >= read.length()) { + break; + } + + base_in_ref = gene[gene_pos]; + base_in_read = read[read_pos]; + + if(base_in_ref != base_in_read) { + snip_pattern = std::to_string(gene_pos) + + std::string(1,base_in_ref) + "->" + + std::string(1, base_in_read) + ":"; + read_pair_snips.insert(snip_pattern); + } + + ++gene_pos; + ++read_pos; + } + } + else if(operation == 'I') { + read_pos += occurrence; + gene_pos += occurrence; + } + else if(operation == 'S') { + read_pos += occurrence; + } + else if(operation == 'N') { + gene_pos += occurrence; + } + else if(operation == 'P') { + read_pos += occurrence; + gene_pos += occurrence; + } + else if(operation == 'D') { + gene_pos += occurrence; + } + else if(operation == 'X') { + read_pos += occurrence; + gene_pos += occurrence; + } + else { + std::cout << "Operation " << operation << " not handled." << std::endl; + } + } +} + +std::string convert_snips_to_pattern(const std::set<std::string> &read_pair_snips) { + std::vector<std::string> snips; + snips.assign(read_pair_snips.begin(), read_pair_snips.end()); + + CompareSnips comp; + sort(snips.begin(), snips.end(), comp); + + std::string pattern; + for(const auto &snp : snips) { + pattern+=snp; + } + return pattern; +} + +bool find_se_snips(std::vector<FastaRecord> &records, + Alignment &read_pair) { + + int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname()); + if(fasta_record_idx == records.size()) { + return false; + } + + parse_cigar(records[fasta_record_idx], read_pair); + std::string snp = convert_snips_to_pattern(read_pair_snips); + + if(!snp.empty()) { + records[fasta_record_idx].snip_database[snp]++; + } + + read_pair_snips.clear(); + + return true; +} + +bool find_pe_snips(std::vector<FastaRecord> &records, + std::vector<Alignment> &alignments, + Alignment &read_pair) { + + if(read_pair.first_in_pair()) { + std::string qname = read_pair.qname(); + int mate_start_pos = read_pair.pnext(); + + Alignment *mate_pair = get_mate_pair(alignments, qname, mate_start_pos); + if(mate_pair == nullptr) { + return false; + } + if(mate_pair->rname () != read_pair.rname()) { + if(mate_pair->alternate_hits.size() == 0) { + return false; + } + + bool found_alt_hit = false; + for(const auto &alternative: mate_pair->alternate_hits) { + if(alternative.rname == read_pair.rname()) { + found_alt_hit = true; + mate_pair->set_rname(alternative.rname); + mate_pair->set_cigar(alternative.cigar); + mate_pair->set_pos(alternative.pos); + break; + } + } + if(!found_alt_hit) { + return false; + } + } + + + int fasta_record_idx = FastaRecord::find_gene(records, read_pair.rname()); + if(fasta_record_idx == records.size()) { + return false; + } + + parse_cigar(records[fasta_record_idx], read_pair); + parse_cigar(records[fasta_record_idx], *mate_pair); + + std::string snp = convert_snips_to_pattern(read_pair_snips); + + if(!snp.empty()) { + records[fasta_record_idx].snip_database[snp]++; + } + + + read_pair_snips.clear(); + } + + return true; +} + +void run(std::vector<FastaRecord> &records, std::vector<Alignment> &alignments, cmd_args arg) { + if(arg.samse) { + for(int i = 0; i < alignments.size(); i++) { + find_se_snips(records, alignments[i]); + } + } + else { + for(int i = 0; i < alignments.size(); i++) { + find_pe_snips(records, alignments, alignments[i]); + } + } +} + +void write_header(const std::string &out_fp) { + std::ofstream out(out_fp.c_str()); + out << "Chr,Haplotype,Frequency" << '\n'; +} + +void write_snips(std::vector<FastaRecord> &records, const std::string &out_fp) { + write_header(out_fp); + std::ofstream out(out_fp.c_str(), std::ofstream::app); + for(const auto &record : records) { + for(const auto &snip : record.snip_database) { + out << record.gene_id() << "," << snip.first << "," << snip.second << '\n'; + } + } +} + + +#endif // SNIP_DRIVER_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/alignment_util.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,130 @@ +#ifndef ALIGNMENT_UTIL_H +#define ALIGNMENT_UTIL_H + +#include "int_util.h" + +#include <string> +#include <vector> + +#include <boost/algorithm/string.hpp> + +#define READ_PAIRED 1 +#define PROPER_PAIR 2 +#define READ_UNMAPPED 4 +#define MATE_UNMAPPED 8 + +std::vector<std::string> split_alignment(std::string &alignment) { + std::vector<std::string> parts; + boost::trim_if(alignment, boost::is_any_of("\t ")); + boost::split(parts, alignment, boost::is_any_of("\t "), boost::token_compress_on); + return parts; +} + +bool is_good_se_flag(const int &flag) { + if( (flag & READ_UNMAPPED) > 0) { + return false; + } + else { + return true; + } +} + +bool is_good_pe_flag(const int &flag) { + if( (flag & READ_UNMAPPED) > 0 || (flag & MATE_UNMAPPED) > 0 || + (flag & READ_PAIRED) == 0 || (flag & PROPER_PAIR == 0) ) { + return false; + } + else { + return true; + } +} + +bool is_good_rname(const std::string &rname) { + return rname.compare("*") != 0; +} + +bool is_good_pos(const int &pos) { + return pos > 0; +} + +bool is_good_pnext(const int &pnext) { + return pnext > 0; +} + +bool is_good_cigar(const std::string &cigar) { + return cigar.compare("*") != 0; +} + +bool is_good_seq(const std::string &seq) { + return seq.compare("*") != 0; +} + +bool is_alignment_unique(const std::string &alignment) { + if(alignment.find("XT:A:U") != std::string::npos) { + return true; + } + else { + return false; + } +} + +/*bool is_good_alignment(std::string &alignment) { + std::vector<std::string> alignment_parts; + + alignment_parts = split_alignment(alignment); + + if(!(fields_are_good(alignment_parts))) + return false; + else + return true; +}*/ + +bool se_fields_are_good(std::string &alignment, bool best) { + std::vector<std::string> parts = split_alignment(alignment); + int flag = s_to_i(parts[1]); + int pos = s_to_i(parts[3]); + + std::string rname = parts[2]; + std::string cigar = parts[5]; + std::string seq = parts[9]; + + if(!(is_good_se_flag(flag))) return false; + if(!(is_good_pos(pos))) return false; + if(!(is_good_rname(rname))) return false; + if(!(is_good_cigar(cigar))) return false; + if(!(is_good_seq(seq))) return false; + if(best) { + if(!(is_alignment_unique(alignment))) { + return false; + } + } + + return true; +} + +bool pe_fields_are_good(std::string &alignment, bool best) { + std::vector<std::string> parts = split_alignment(alignment); + int flag = s_to_i(parts[1]); + int pos = s_to_i(parts[3]); + int pnext = s_to_i(parts[7]); + + std::string rname = parts[2]; + std::string cigar = parts[5]; + std::string seq = parts[9]; + + if(!(is_good_pe_flag(flag))) return false; + if(!(is_good_pos(pos))) return false; + if(!(is_good_rname(rname))) return false; + if(!(is_good_cigar(cigar))) return false; + if(!(is_good_pnext(pnext))) return false; + if(!(is_good_seq(seq))) return false; + if(best) { + if(!(is_alignment_unique(alignment))) { + return false; + } + } + + return true; +} + +#endif // ALIGNMENT_UTIL_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/args.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,62 @@ +#ifndef ARGS_H +#define ARGS_H + +#include <string> +#include <vector> + +void static usage() { + fprintf(stderr, "\n"); + fprintf(stderr, "Program: SNP Finder \n"); + fprintf(stderr, "Contact: Chris Dean <cdean11@rams.colostate.edu>\n\n"); + fprintf(stderr, "Usage: snp [options] <fasta_file> <sam_file> <best> <output_file>\n\n"); + fprintf(stderr, "Options:\n\n"); + fprintf(stderr, " -amr_fp STRING amr database\n"); + fprintf(stderr, " -samse STRING single-end alignments\n"); + fprintf(stderr, " -sampe STRING paired-end alignments\n"); + fprintf(stderr, " -b BOOLEAN filter on unique alignments\n"); + fprintf(stderr, " -out_fp STRING output path or(file)\n\n"); +} + +struct cmd_args { + std::string amr_fp; + std::string sam_fp; + std::string out_fp; + + bool best = false; + bool samse = false; + bool sampe = false; +}; + +static inline cmd_args +parse_command_line(int argc, const char *argv[]) { + std::vector<std::string> args(argv, argv+argc); + + cmd_args arg; + + for(int i = 1; i < argc; i++) { + if(args[i] == "-amr_fp") { + arg.amr_fp = args[++i]; + } + else if(args[i] == "-samse") { + arg.sam_fp = argv[++i]; + arg.samse = true; + } + else if(args[i] == "-sampe") { + arg.sam_fp = argv[++i]; + arg.sampe = true; + } + else if(args[i] == "-out_fp") { + arg.out_fp = argv[++i]; + } + else if(args[i] == "-b") { + arg.best = true; + } + else { + usage(); + exit(EXIT_FAILURE); + } + } + return arg; +} + +#endif //ARGS_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/int_util.h Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,25 @@ +#ifndef INT_UTIL_H +#define INT_UTIL_H + +#include <string> +#include <sstream> + +inline +int s_to_i(const std::string &pos) { + std::istringstream ss(pos); + int i; + ss >> i; + return i; +} + +inline +int strand_to_i(const std::string &pos) { + std::string s = pos.substr(1, pos.length()); + std::istringstream ss(s); + int i; + ss >> i; + return i; +} + + +#endif // INT_UTIL_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_caller/src/main.cpp Sun Feb 21 06:05:24 2016 -0500 @@ -0,0 +1,46 @@ +#include <iostream> +#include <string> +#include <vector> + +#include "args.h" +#include "Fasta.h" +#include "FastaRecord.h" +#include "Sam.h" +#include "SnipDriver.h" + +using namespace std; + +void print_fasta(const Fasta &f) { + for(const auto &record: f.records) { + cout << record.gene_id() << endl; + cout << record.gene() << endl; + } +} + +void print_sam(const Sam &s) { + for(const auto &alignment: s.alignments) { + cout << alignment.alignment() << endl; + } +} + +int main(int argc, const char *argv[]) { + cmd_args arg; + arg = parse_command_line(argc, argv); + + Fasta f(arg.amr_fp); + f.read_fasta(arg.amr_fp); + + Sam s(arg.sam_fp); + if(arg.samse) { + s.read_se_sam(arg.sam_fp, arg.best); + } + else { + s.read_pe_sam(arg.sam_fp, arg.best); + } + + run(f.records, s.alignments, arg); + write_snips(f.records, arg.out_fp); + + + return 0; +}