Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/utils/Fasta/Fasta.cpp @ 1:bec36315bd12 default tip
Deleted selected files
author | aaronquinlan |
---|---|
date | Sat, 19 Nov 2011 14:17:03 -0500 |
parents | dfcd8b6c1bda |
children |
line wrap: on
line diff
--- a/BEDTools-Version-2.14.3/src/utils/Fasta/Fasta.cpp Thu Nov 03 10:25:04 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,325 +0,0 @@ -// *************************************************************************** -// FastaIndex.cpp (c) 2010 Erik Garrison <erik.garrison@bc.edu> -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 9 February 2010 (EG) -// --------------------------------------------------------------------------- - -#include "Fasta.h" - -FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len) - : name(name) - , length(length) - , offset(offset) - , line_blen(line_blen) - , line_len(line_len) -{} - -FastaIndexEntry::FastaIndexEntry(void) // empty constructor -{ clear(); } - -FastaIndexEntry::~FastaIndexEntry(void) -{} - -void FastaIndexEntry::clear(void) -{ - name = ""; - length = NULL; - offset = -1; // no real offset will ever be below 0, so this allows us to - // check if we have already recorded a real offset - line_blen = NULL; - line_len = NULL; -} - -ostream& operator<<(ostream& output, const FastaIndexEntry& e) { - // just write the first component of the name, for compliance with other tools - output << split(e.name, ' ').at(0) << "\t" << e.length << "\t" << e.offset << "\t" << - e.line_blen << "\t" << e.line_len; - return output; // for multiple << operators. -} - -FastaIndex::FastaIndex(void) -{} - -void FastaIndex::readIndexFile(string fname) { - string line; - long long linenum = 0; - indexFile.open(fname.c_str(), ifstream::in); - if (indexFile.is_open()) { - while (getline (indexFile, line)) { - ++linenum; - // the fai format defined in samtools is tab-delimited, every line being: - // fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len - vector<string> fields = split(line, '\t'); - if (fields.size() == 5) { // if we don't get enough fields then there is a problem with the file - // note that fields[0] is the sequence name - char* end; - string name = split(fields[0], " \t").at(0); // key by first token of name - sequenceNames.push_back(name); - this->insert(make_pair(name, FastaIndexEntry(fields[0], atoi(fields[1].c_str()), - strtoll(fields[2].c_str(), &end, 10), - atoi(fields[3].c_str()), - atoi(fields[4].c_str())))); - } else { - cerr << "Warning: malformed fasta index file " << fname << - "does not have enough fields @ line " << linenum << endl; - cerr << line << endl; - exit(1); - } - } - } else { - cerr << "could not open index file " << fname << endl; - exit(1); - } -} - -// for consistency this should be a class method -bool fastaIndexEntryCompare ( FastaIndexEntry a, FastaIndexEntry b) { return (a.offset<b.offset); } - -ostream& operator<<(ostream& output, FastaIndex& fastaIndex) { - vector<FastaIndexEntry> sortedIndex; - for(vector<string>::const_iterator it = fastaIndex.sequenceNames.begin(); it != fastaIndex.sequenceNames.end(); ++it) - { - sortedIndex.push_back(fastaIndex[*it]); - } - sort(sortedIndex.begin(), sortedIndex.end(), fastaIndexEntryCompare); - for( vector<FastaIndexEntry>::iterator fit = sortedIndex.begin(); fit != sortedIndex.end(); ++fit) { - output << *fit << endl; - } - return output; -} - -void FastaIndex::indexReference(string refname) { - // overview: - // for line in the reference fasta file - // track byte offset from the start of the file - // if line is a fasta header, take the name and dump the last sequnece to the index - // if line is a sequence, add it to the current sequence - //cerr << "indexing fasta reference " << refname << endl; - string line; - FastaIndexEntry entry; // an entry buffer used in processing - entry.clear(); - int line_length = 0; - long long offset = 0; // byte offset from start of file - long long line_number = 0; // current line number - bool mismatchedLineLengths = false; // flag to indicate if our line length changes mid-file - // this will be used to raise an error - // if we have a line length change at - // any line other than the last line in - // the sequence - bool emptyLine = false; // flag to catch empty lines, which we allow for - // index generation only on the last line of the sequence - ifstream refFile; - refFile.open(refname.c_str()); - if (refFile.is_open()) { - while (getline(refFile, line)) { - ++line_number; - line_length = line.length(); - if (line[0] == ';') { - // fasta comment, skip - } else if (line[0] == '+') { - // fastq quality header - getline(refFile, line); - line_length = line.length(); - offset += line_length + 1; - // get and don't handle the quality line - getline(refFile, line); - line_length = line.length(); - } else if (line[0] == '>' || line[0] == '@') { // fasta /fastq header - // if we aren't on the first entry, push the last sequence into the index - if (entry.name != "") { - mismatchedLineLengths = false; // reset line length error tracker for every new sequence - emptyLine = false; - flushEntryToIndex(entry); - entry.clear(); - } - entry.name = line.substr(1, line_length - 1); - } else { // we assume we have found a sequence line - if (entry.offset == -1) // NB initially the offset is -1 - entry.offset = offset; - entry.length += line_length; - if (entry.line_len) { - //entry.line_len = entry.line_len ? entry.line_len : line_length + 1; - if (mismatchedLineLengths || emptyLine) { - if (line_length == 0) { - emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence - } else { - if (emptyLine) { - cerr << "ERROR: embedded newline"; - } else { - cerr << "ERROR: mismatched line lengths"; - } - cerr << " at line " << line_number << " within sequence " << entry.name << - endl << "File not suitable for fasta index generation." << endl; - exit(1); - } - } - // this flag is set here and checked on the next line - // because we may have reached the end of the sequence, in - // which case a mismatched line length is OK - if (entry.line_len != line_length + 1) { - mismatchedLineLengths = true; - if (line_length == 0) { - emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence - } - } - } else { - entry.line_len = line_length + 1; // first line - } - entry.line_blen = entry.line_len - 1; - } - offset += line_length + 1; - } - // we've hit the end of the fasta file! - // flush the last entry - flushEntryToIndex(entry); - } else { - cerr << "could not open reference file " << refname << " for indexing!" << endl; - exit(1); - } -} - -void FastaIndex::flushEntryToIndex(FastaIndexEntry& entry) { - string name = split(entry.name, " \t").at(0); // key by first token of name - sequenceNames.push_back(name); - this->insert(make_pair(name, FastaIndexEntry(entry.name, entry.length, - entry.offset, entry.line_blen, - entry.line_len))); - -} - -void FastaIndex::writeIndexFile(string fname) { - //cerr << "writing fasta index file " << fname << endl; - ofstream file; - file.open(fname.c_str()); - if (file.is_open()) { - file << *this; - } else { - cerr << "could not open index file " << fname << " for writing!" << endl; - exit(1); - } -} - -FastaIndex::~FastaIndex(void) { - indexFile.close(); -} - -FastaIndexEntry FastaIndex::entry(string name) { - FastaIndex::iterator e = this->find(name); - if (e == this->end()) { - cerr << "unable to find FASTA index entry for '" << name << "'" << endl; - exit(1); - } else { - return e->second; - } -} - -string FastaIndex::indexFileExtension() { return ".fai"; } - -void FastaReference::open(string reffilename, bool usemmap) { - filename = reffilename; - if (!(file = fopen(filename.c_str(), "r"))) { - cerr << "could not open " << filename << endl; - exit(1); - } - index = new FastaIndex(); - struct stat stFileInfo; - string indexFileName = filename + index->indexFileExtension(); - // if we can find an index file, use it - if(stat(indexFileName.c_str(), &stFileInfo) == 0) { - index->readIndexFile(indexFileName); - } else { // otherwise, read the reference and generate the index file in the cwd - cerr << "index file " << indexFileName << " not found, generating..." << endl; - index->indexReference(filename); - index->writeIndexFile(indexFileName); - } - if (usemmap) { - usingmmap = true; - int fd = fileno(file); - struct stat sb; - if (fstat(fd, &sb) == -1) - cerr << "could not stat file" << filename << endl; - filesize = sb.st_size; - // map the whole file - filemm = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); - } -} - -FastaReference::~FastaReference(void) { - fclose(file); - if (usingmmap) { - munmap(filemm, filesize); - } - delete index; -} - -string FastaReference::getSequence(string seqname) { - FastaIndexEntry entry = index->entry(seqname); - int newlines_in_sequence = entry.length / entry.line_blen; - int seqlen = newlines_in_sequence + entry.length; - char* seq = (char*) calloc (seqlen + 1, sizeof(char)); - if (usingmmap) { - memcpy(seq, (char*) filemm + entry.offset, seqlen); - } else { - fseek64(file, entry.offset, SEEK_SET); - fread(seq, sizeof(char), seqlen, file); - } - seq[seqlen] = '\0'; - char* pbegin = seq; - char* pend = seq + (seqlen/sizeof(char)); - pend = remove(pbegin, pend, '\n'); - pend = remove(pbegin, pend, '\0'); - string s = seq; - free(seq); - s.resize((pend - pbegin)/sizeof(char)); - return s; -} - -// TODO cleanup; odd function. use a map -string FastaReference::sequenceNameStartingWith(string seqnameStart) { - try { - return (*index)[seqnameStart].name; - } catch (exception& e) { - cerr << e.what() << ": unable to find index entry for " << seqnameStart << endl; - exit(1); - } -} - -string FastaReference::getSubSequence(string seqname, int start, int length) { - FastaIndexEntry entry = index->entry(seqname); - if (start < 0 || length < 1) { - cerr << "Error: cannot construct subsequence with negative offset or length < 1" << endl; - exit(1); - } - // we have to handle newlines - // approach: count newlines before start - // count newlines by end of read - // subtracting newlines before start find count of embedded newlines - int newlines_before = start > 0 ? (start - 1) / entry.line_blen : 0; - int newlines_by_end = (start + length - 1) / entry.line_blen; - int newlines_inside = newlines_by_end - newlines_before; - int seqlen = length + newlines_inside; - char* seq = (char*) calloc (seqlen + 1, sizeof(char)); - if (usingmmap) { - memcpy(seq, (char*) filemm + entry.offset + newlines_before + start, seqlen); - } else { - fseek64(file, (off_t) (entry.offset + newlines_before + start), SEEK_SET); - fread(seq, sizeof(char), (off_t) seqlen, file); - } - seq[seqlen] = '\0'; - char* pbegin = seq; - char* pend = seq + (seqlen/sizeof(char)); - pend = remove(pbegin, pend, '\n'); - pend = remove(pbegin, pend, '\0'); - string s = seq; - free(seq); - s.resize((pend - pbegin)/sizeof(char)); - return s; -} - -long unsigned int FastaReference::sequenceLength(string seqname) { - FastaIndexEntry entry = index->entry(seqname); - return entry.length; -} -