diff spp/src/BamWriter_p.cpp @ 15:e689b83b0257 draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:15:21 -0500
parents ce08b0efa3fd
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/spp/src/BamWriter_p.cpp	Tue Nov 27 16:15:21 2012 -0500
@@ -0,0 +1,379 @@
+// ***************************************************************************
+// BamWriter_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include <BamAlignment.h>
+#include <BamWriter_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+using namespace std;
+
+BamWriterPrivate::BamWriterPrivate(void) {
+    IsBigEndian = SystemIsBigEndian();
+}
+
+BamWriterPrivate::~BamWriterPrivate(void) {
+    mBGZF.Close();
+}
+
+// closes the alignment archive
+void BamWriterPrivate::Close(void) {
+    mBGZF.Close();
+}
+
+// calculates minimum bin for a BAM alignment interval
+const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
+    --end;
+    if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
+    if( (begin >> 17) == (end >> 17) ) return  585 + (begin >> 17);
+    if( (begin >> 20) == (end >> 20) ) return   73 + (begin >> 20);
+    if( (begin >> 23) == (end >> 23) ) return    9 + (begin >> 23);
+    if( (begin >> 26) == (end >> 26) ) return    1 + (begin >> 26);
+    return 0;
+}
+
+// creates a cigar string from the supplied alignment
+void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
+
+    // initialize
+    const unsigned int numCigarOperations = cigarOperations.size();
+    packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
+
+    // pack the cigar data into the string
+    unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
+
+    unsigned int cigarOp;
+    vector<CigarOp>::const_iterator coIter;
+    for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
+
+	switch(coIter->Type) {
+	    case 'M':
+		  cigarOp = BAM_CMATCH;
+		  break;
+	    case 'I':
+		  cigarOp = BAM_CINS;
+		  break;
+	    case 'D':
+		  cigarOp = BAM_CDEL;
+		  break;
+	    case 'N':
+		  cigarOp = BAM_CREF_SKIP;
+		  break;
+	    case 'S':
+		  cigarOp = BAM_CSOFT_CLIP;
+		  break;
+	    case 'H':
+		  cigarOp = BAM_CHARD_CLIP;
+		  break;
+	    case 'P':
+		  cigarOp = BAM_CPAD;
+		  break;
+	    default:
+		  fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
+		  exit(1);
+	}
+
+	*pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
+	pPackedCigar++;
+    }
+}
+
+// encodes the supplied query sequence into 4-bit notation
+void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
+
+    // prepare the encoded query string
+    const unsigned int queryLen = query.size();
+    const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
+    encodedQuery.resize(encodedQueryLen);
+    char* pEncodedQuery = (char*)encodedQuery.data();
+    const char* pQuery = (const char*)query.data();
+
+    unsigned char nucleotideCode;
+    bool useHighWord = true;
+
+    while(*pQuery) {
+
+	switch(*pQuery) {
+
+	    case '=':
+		nucleotideCode = 0;
+		break;
+
+	    case 'A':
+		nucleotideCode = 1;
+		break;
+
+	    case 'C':
+		nucleotideCode = 2;
+		break;
+
+	    case 'G':
+		nucleotideCode = 4;
+		break;
+
+	    case 'T':
+		nucleotideCode = 8;
+		break;
+
+	    case 'N':
+		nucleotideCode = 15;
+		break;
+
+	    default:
+		fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
+		exit(1);
+	}
+
+	// pack the nucleotide code
+	if(useHighWord) {
+	    *pEncodedQuery = nucleotideCode << 4;
+	    useHighWord = false;
+	} else {
+	    *pEncodedQuery |= nucleotideCode;
+	    pEncodedQuery++;
+	    useHighWord = true;
+	}
+
+	// increment the query position
+	pQuery++;
+    }
+}
+
+// opens the alignment archive
+bool BamWriterPrivate::Open(const string& filename,
+			    const string& samHeader,
+			    const RefVector& referenceSequences,
+			    bool isWriteUncompressed)
+{
+    // open the BGZF file for writing, return failure if error
+    if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
+	return false;
+
+    // ================
+    // write the header
+    // ================
+
+    // write the BAM signature
+    const unsigned char SIGNATURE_LENGTH = 4;
+    const char* BAM_SIGNATURE = "BAM\1";
+    mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
+
+    // write the SAM header text length
+    uint32_t samHeaderLen = samHeader.size();
+    if (IsBigEndian) SwapEndian_32(samHeaderLen);
+    mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);
+
+    // write the SAM header text
+    if(samHeaderLen > 0)
+	mBGZF.Write(samHeader.data(), samHeaderLen);
+
+    // write the number of reference sequences
+    uint32_t numReferenceSequences = referenceSequences.size();
+    if (IsBigEndian) SwapEndian_32(numReferenceSequences);
+    mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
+
+    // =============================
+    // write the sequence dictionary
+    // =============================
+
+    RefVector::const_iterator rsIter;
+    for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
+
+	// write the reference sequence name length
+	uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
+	if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
+	mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
+
+	// write the reference sequence name
+	mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
+
+	// write the reference sequence length
+	int32_t referenceLength = rsIter->RefLength;
+	if (IsBigEndian) SwapEndian_32(referenceLength);
+	mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
+    }
+
+    // return success
+    return true;
+}
+
+// saves the alignment to the alignment archive
+void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
+
+    // if BamAlignment contains only the core data and a raw char data buffer
+    // (as a result of BamReader::GetNextAlignmentCore())
+    if ( al.SupportData.HasCoreOnly ) {
+
+	// write the block size
+	unsigned int blockSize = al.SupportData.BlockLength;
+	if (IsBigEndian) SwapEndian_32(blockSize);
+	mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+	// assign the BAM core data
+	uint32_t buffer[8];
+	buffer[0] = al.RefID;
+	buffer[1] = al.Position;
+	buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
+	buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
+	buffer[4] = al.SupportData.QuerySequenceLength;
+	buffer[5] = al.MateRefID;
+	buffer[6] = al.MatePosition;
+	buffer[7] = al.InsertSize;
+
+	// swap BAM core endian-ness, if necessary
+	if ( IsBigEndian ) {
+	    for ( int i = 0; i < 8; ++i )
+		SwapEndian_32(buffer[i]);
+	}
+
+	// write the BAM core
+	mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+	// write the raw char data
+	mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
+    }
+
+    // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
+    // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
+    else {
+
+	// calculate char lengths
+	const unsigned int nameLength         = al.Name.size() + 1;
+	const unsigned int numCigarOperations = al.CigarData.size();
+	const unsigned int queryLength        = al.QueryBases.size();
+	const unsigned int tagDataLength      = al.TagData.size();
+
+	// no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
+	// force calculation of Bin before storing
+	const int endPosition = al.GetEndPosition();
+	const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
+
+	// create our packed cigar string
+	string packedCigar;
+	CreatePackedCigar(al.CigarData, packedCigar);
+	const unsigned int packedCigarLength = packedCigar.size();
+
+	// encode the query
+	string encodedQuery;
+	EncodeQuerySequence(al.QueryBases, encodedQuery);
+	const unsigned int encodedQueryLength = encodedQuery.size();
+
+	// write the block size
+	const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;
+	unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
+	if (IsBigEndian) SwapEndian_32(blockSize);
+	mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+	// assign the BAM core data
+	uint32_t buffer[8];
+	buffer[0] = al.RefID;
+	buffer[1] = al.Position;
+	buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
+	buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
+	buffer[4] = queryLength;
+	buffer[5] = al.MateRefID;
+	buffer[6] = al.MatePosition;
+	buffer[7] = al.InsertSize;
+
+	// swap BAM core endian-ness, if necessary
+	if ( IsBigEndian ) {
+	    for ( int i = 0; i < 8; ++i )
+		SwapEndian_32(buffer[i]);
+	}
+
+	// write the BAM core
+	mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+	// write the query name
+	mBGZF.Write(al.Name.c_str(), nameLength);
+
+	// write the packed cigar
+	if ( IsBigEndian ) {
+
+	    char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
+	    memcpy(cigarData, packedCigar.data(), packedCigarLength);
+
+	    for (unsigned int i = 0; i < packedCigarLength; ++i) {
+		if ( IsBigEndian )
+		  SwapEndian_32p(&cigarData[i]);
+	    }
+
+	    mBGZF.Write(cigarData, packedCigarLength);
+	    free(cigarData);
+	}
+	else
+	    mBGZF.Write(packedCigar.data(), packedCigarLength);
+
+	// write the encoded query sequence
+	mBGZF.Write(encodedQuery.data(), encodedQueryLength);
+
+	// write the base qualities
+	string baseQualities(al.Qualities);
+	char* pBaseQualities = (char*)al.Qualities.data();
+	for(unsigned int i = 0; i < queryLength; i++) {
+	    pBaseQualities[i] -= 33;
+	}
+	mBGZF.Write(pBaseQualities, queryLength);
+
+	// write the read group tag
+	if ( IsBigEndian ) {
+
+	    char* tagData = (char*)calloc(sizeof(char), tagDataLength);
+	    memcpy(tagData, al.TagData.data(), tagDataLength);
+
+	    int i = 0;
+	    while ( (unsigned int)i < tagDataLength ) {
+
+		i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)
+		uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning
+		++i;                                    // skip value type
+
+		switch (type) {
+
+		    case('A') :
+		    case('C') :
+			++i;
+			break;
+
+		    case('S') :
+			SwapEndian_16p(&tagData[i]);
+			i+=2; // sizeof(uint16_t)
+			break;
+
+		    case('F') :
+		    case('I') :
+			SwapEndian_32p(&tagData[i]);
+			i+=4; // sizeof(uint32_t)
+			break;
+
+		    case('D') :
+			SwapEndian_64p(&tagData[i]);
+			i+=8; // sizeof(uint64_t)
+			break;
+
+		    case('H') :
+		    case('Z') :
+			while (tagData[i]) { ++i; }
+			++i; // increment one more for null terminator
+			break;
+
+		    default :
+			fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
+			free(tagData);
+			exit(1);
+		}
+	    }
+
+	    mBGZF.Write(tagData, tagDataLength);
+	    free(tagData);
+	}
+	else
+	    mBGZF.Write(al.TagData.data(), tagDataLength);
+    }
+}