Mercurial > repos > zzhou > spp_phantompeak
diff spp/src/BamAlignment.cpp @ 6:ce08b0efa3fd draft
Uploaded
author | zzhou |
---|---|
date | Tue, 27 Nov 2012 16:11:40 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamAlignment.cpp Tue Nov 27 16:11:40 2012 -0500 @@ -0,0 +1,696 @@ +// *************************************************************************** +// BamAlignment.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 13 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the BamAlignment data structure +// *************************************************************************** + +#include <BamAlignment.h> +using namespace BamTools; + +#include <cctype> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <exception> +#include <map> +#include <utility> +using namespace std; + +// default ctor +BamAlignment::BamAlignment(void) + : RefID(-1) + , Position(-1) + , MateRefID(-1) + , MatePosition(-1) + , InsertSize(0) +{ } + +// copy ctor +BamAlignment::BamAlignment(const BamAlignment& other) + : Name(other.Name) + , Length(other.Length) + , QueryBases(other.QueryBases) + , AlignedBases(other.AlignedBases) + , Qualities(other.Qualities) + , TagData(other.TagData) + , RefID(other.RefID) + , Position(other.Position) + , Bin(other.Bin) + , MapQuality(other.MapQuality) + , AlignmentFlag(other.AlignmentFlag) + , CigarData(other.CigarData) + , MateRefID(other.MateRefID) + , MatePosition(other.MatePosition) + , InsertSize(other.InsertSize) + , SupportData(other.SupportData) +{ } + +// dtor +BamAlignment::~BamAlignment(void) { } + +// Queries against alignment flags +bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } +bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } +bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } +bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } +bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } +bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } +bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } +bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } +bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } +bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } +bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } + +// Manipulate alignment flags +void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } +void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } +void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; } +void BamAlignment::SetIsMapped(bool ok) { SetIsUnmapped(!ok); } +void BamAlignment::SetIsMateMapped(bool ok) { SetIsMateUnmapped(!ok); } +void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; } +void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; } +void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; } +void BamAlignment::SetIsPrimaryAlignment(bool ok) { SetIsSecondaryAlignment(!ok); } +void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; } +void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; } +void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; } +void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; } +void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } + +// calculates alignment end position, based on starting position and CIGAR operations +int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { + + // initialize alignment end to starting position + int alignEnd = Position; + + // iterate over cigar operations + vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); + vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + const char cigarType = (*cigarIter).Type; + if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) + alignEnd += (*cigarIter).Length; + else if ( usePadded && cigarType == 'I' ) + alignEnd += (*cigarIter).Length; + } + + // adjust for zeroBased, if necessary + if (zeroBased) + return alignEnd - 1; + else + return alignEnd; +} + +bool BamAlignment::AddTag(const string& tag, const string& type, const string& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type != "Z" && type != "H" ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, copy tag data to temp buffer + string newTag = tag + type + value; + const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +bool BamAlignment::AddTag(const string& tag, const string& type, const uint32_t& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "f" || type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, convert value to string + union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; + un.value = value; + + // copy original tag data to temp buffer + string newTag = tag + type; + const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); + memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +bool BamAlignment::AddTag(const string& tag, const string& type, const int32_t& value) { + return AddTag(tag, type, (const uint32_t&)value); +} + +bool BamAlignment::AddTag(const string& tag, const string& type, const float& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, convert value to string + union { float value; char valueBuffer[sizeof(float)]; } un; + un.value = value; + + // copy original tag data to temp buffer + string newTag = tag + type; + const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); + memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +bool BamAlignment::EditTag(const string& tag, const string& type, const string& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type != "Z" && type != "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + value.size()]; + + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + const unsigned int dataLength = strlen(value.c_str()); + memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1; + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} + +bool BamAlignment::EditTag(const string& tag, const string& type, const uint32_t& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "f" || type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + sizeof(value)]; + + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; + un.value = value; + memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int)); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} + +bool BamAlignment::EditTag(const string& tag, const string& type, const int32_t& value) { + return EditTag(tag, type, (const uint32_t&)value); +} + +bool BamAlignment::EditTag(const string& tag, const string& type, const float& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + sizeof(value)]; + + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + union { float value; char valueBuffer[sizeof(float)]; } un; + un.value = value; + memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} + +// get "NM" tag data - originally contributed by Aaron Quinlan +// stores data in 'editDistance', returns success/fail +bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { + return GetTag("NM", (uint32_t&)editDistance); +} + +// get "RG" tag data +// stores data in 'readGroup', returns success/fail +bool BamAlignment::GetReadGroup(string& readGroup) const { + return GetTag("RG", readGroup); +} + +bool BamAlignment::GetTag(const string& tag, string& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + const unsigned int dataLength = strlen(pTagData); + destination.clear(); + destination.resize(dataLength); + memcpy( (char*)destination.data(), pTagData, dataLength ); + return true; + } + + // tag not found, return failure + return false; +} + +bool BamAlignment::GetTag(const string& tag, uint32_t& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found, determine data byte-length, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + + // determine data byte-length + const char type = *(pTagData - 1); + int destinationLength = 0; + switch (type) { + + // 1 byte data + case 'A': + case 'c': + case 'C': + destinationLength = 1; + break; + + // 2 byte data + case 's': + case 'S': + destinationLength = 2; + break; + + // 4 byte data + case 'i': + case 'I': + destinationLength = 4; + break; + + // unsupported type for integer destination (float or var-length strings) + case 'f': + case 'Z': + case 'H': + fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); + return false; + + // unknown tag type + default: + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); + return false; + } + + // store in destination + destination = 0; + memcpy(&destination, pTagData, destinationLength); + return true; + } + + // tag not found, return failure + return false; +} + +bool BamAlignment::GetTag(const string& tag, int32_t& destination) const { + return GetTag(tag, (uint32_t&)destination); +} + +bool BamAlignment::GetTag(const string& tag, float& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found, determine data byte-length, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + + // determine data byte-length + const char type = *(pTagData - 1); + int destinationLength = 0; + switch(type) { + + // 1 byte data + case 'A': + case 'c': + case 'C': + destinationLength = 1; + break; + + // 2 byte data + case 's': + case 'S': + destinationLength = 2; + break; + + // 4 byte data + case 'f': + case 'i': + case 'I': + destinationLength = 4; + break; + + // unsupported type (var-length strings) + case 'Z': + case 'H': + fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); + return false; + + // unknown tag type + default: + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); + return false; + } + + // store in destination + destination = 0.0; + memcpy(&destination, pTagData, destinationLength); + return true; + } + + // tag not found, return failure + return false; +} + +bool BamAlignment::GetTagType(const string& tag, char& type) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // lookup tag + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + + // retrieve tag type code + type = *(pTagData - 1); + + // validate that type is a proper BAM tag type + switch(type) { + case 'A': + case 'c': + case 'C': + case 's': + case 'S': + case 'f': + case 'i': + case 'I': + case 'Z': + case 'H': + return true; + + // unknown tag type + default: + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); + return false; + } + } + + // tag not found, return failure + return false; +} + +bool BamAlignment::RemoveTag(const string& tag) { + + // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed + // also, return false if no data present to remove + if ( SupportData.HasCoreOnly || TagData.empty() ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + char newTagData[originalTagDataLength]; + + // copy original tag data up til desired tag + pTagData -= 3; + numBytesParsed -= 3; + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); + + // save new tag data + TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); + return true; + } + + // tag not found, no removal - return failure + return false; +} + +bool BamAlignment::FindTag(const string& tag, + char* &pTagData, + const unsigned int& tagDataLength, + unsigned int& numBytesParsed) +{ + + while ( numBytesParsed < tagDataLength ) { + + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // check the current tag, return true on match + if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) + return true; + + // get the storage class and find the next tag + if ( *pTagStorageType == '\0' ) return false; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; + if ( *pTagData == '\0' ) return false; + } + + // checked all tags, none match + return false; +} + +bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { + + switch(storageType) { + + case 'A': + case 'c': + case 'C': + ++numBytesParsed; + ++pTagData; + break; + + case 's': + case 'S': + numBytesParsed += 2; + pTagData += 2; + break; + + case 'f': + case 'i': + case 'I': + numBytesParsed += 4; + pTagData += 4; + break; + + case 'Z': + case 'H': + while(*pTagData) { + ++numBytesParsed; + ++pTagData; + } + // increment for null-terminator + ++numBytesParsed; + ++pTagData; + break; + + default: + // error case + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType); + return false; + } + + // return success + return true; +}