Mercurial > repos > zzhou > spp_phantompeak
diff spp/src/BamStandardIndex_p.cpp @ 15:e689b83b0257 draft
Uploaded
author | zzhou |
---|---|
date | Tue, 27 Nov 2012 16:15:21 -0500 |
parents | ce08b0efa3fd |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamStandardIndex_p.cpp Tue Nov 27 16:15:21 2012 -0500 @@ -0,0 +1,910 @@ +// *************************************************************************** +// BamStandardIndex.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the standardized BAM index format (".bai") +// *************************************************************************** + +#include <BamAlignment.h> +#include <BamReader.h> +#include <BGZF.h> +#include <BamStandardIndex_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstdio> +#include <cstdlib> +#include <algorithm> +#include <iostream> +#include <map> +using namespace std; + +BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader) + : BamIndex(bgzf, reader) + , m_dataBeginOffset(0) + , m_hasFullDataCache(false) +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +BamStandardIndex::~BamStandardIndex(void) { + ClearAllData(); +} + +// calculate bins that overlap region +int BamStandardIndex::BinsFromRegion(const BamRegion& region, + const bool isRightBoundSpecified, + uint16_t bins[MAX_BIN]) +{ + // get region boundaries + uint32_t begin = (unsigned int)region.LeftPosition; + uint32_t end; + + // if right bound specified AND left&right bounds are on same reference + // OK to use right bound position + if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) + end = (unsigned int)region.RightPosition; + + // otherwise, use end of left bound reference as cutoff + else + end = (unsigned int)m_references.at(region.LeftRefID).RefLength - 1; + + // initialize list, bin '0' always a valid bin + int i = 0; + bins[i++] = 0; + + // get rest of bins that contain this region + unsigned int k; + for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; } + for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; } + for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; } + for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; } + for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; } + + // return number of bins stored + return i; +} + +// creates index data (in-memory) from current reader data +bool BamStandardIndex::Build(void) { + + // be sure reader & BGZF file are valid & open for reading + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + return false; + + // move file pointer to beginning of alignments + m_reader->Rewind(); + + // get reference count, reserve index space + const int numReferences = (int)m_references.size(); + m_indexData.clear(); + m_hasFullDataCache = false; + SetReferenceCount(numReferences); + + // sets default constant for bin, ID, offset, coordinate variables + const uint32_t defaultValue = 0xffffffffu; + + // bin data + uint32_t saveBin(defaultValue); + uint32_t lastBin(defaultValue); + + // reference ID data + int32_t saveRefID(defaultValue); + int32_t lastRefID(defaultValue); + + // offset data + uint64_t saveOffset = m_BGZF->Tell(); + uint64_t lastOffset = saveOffset; + + // coordinate data + int32_t lastCoordinate = defaultValue; + + BamAlignment bAlignment; + while ( m_reader->GetNextAlignmentCore(bAlignment) ) { + + // change of chromosome, save ID, reset bin + if ( lastRefID != bAlignment.RefID ) { + lastRefID = bAlignment.RefID; + lastBin = defaultValue; + } + + // if lastCoordinate greater than BAM position - file not sorted properly + else if ( lastCoordinate > bAlignment.Position ) { + fprintf(stderr, "BAM file not properly sorted:\n"); + fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), + lastCoordinate, bAlignment.Position, bAlignment.RefID); + exit(1); + } + + // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) + if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { + + // save linear offset entry (matched to BAM entry refID) + BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID); + if ( indexIter == m_indexData.end() ) return false; // error + ReferenceIndex& refIndex = (*indexIter).second; + LinearOffsetVector& offsets = refIndex.Offsets; + SaveLinearOffset(offsets, bAlignment, lastOffset); + } + + // if current BamAlignment bin != lastBin, "then possibly write the binning index" + if ( bAlignment.Bin != lastBin ) { + + // if not first time through + if ( saveBin != defaultValue ) { + + // save Bam bin entry + BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); + if ( indexIter == m_indexData.end() ) return false; // error + ReferenceIndex& refIndex = (*indexIter).second; + BamBinMap& binMap = refIndex.Bins; + SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); + } + + // update saveOffset + saveOffset = lastOffset; + + // update bin values + saveBin = bAlignment.Bin; + lastBin = bAlignment.Bin; + + // update saveRefID + saveRefID = bAlignment.RefID; + + // if invalid RefID, break out + if ( saveRefID < 0 ) break; + } + + // make sure that current file pointer is beyond lastOffset + if ( m_BGZF->Tell() <= (int64_t)lastOffset ) { + fprintf(stderr, "Error in BGZF offsets.\n"); + exit(1); + } + + // update lastOffset + lastOffset = m_BGZF->Tell(); + + // update lastCoordinate + lastCoordinate = bAlignment.Position; + } + + // save any leftover BAM data (as long as refID is valid) + if ( saveRefID >= 0 ) { + // save Bam bin entry + BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); + if ( indexIter == m_indexData.end() ) return false; // error + ReferenceIndex& refIndex = (*indexIter).second; + BamBinMap& binMap = refIndex.Bins; + SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); + } + + // simplify index by merging chunks + MergeChunks(); + + // iterate through references in index + // sort offsets in linear offset vector + BamStandardIndexData::iterator indexIter = m_indexData.begin(); + BamStandardIndexData::iterator indexEnd = m_indexData.end(); + for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { + + // get reference index data + ReferenceIndex& refIndex = (*indexIter).second; + LinearOffsetVector& offsets = refIndex.Offsets; + + // sort linear offsets + sort(offsets.begin(), offsets.end()); + } + + // rewind file pointer to beginning of alignments, return success/fail + return m_reader->Rewind(); +} + +// check index file magic number, return true if OK +bool BamStandardIndex::CheckMagicNumber(void) { + + // read in magic number + char magic[4]; + size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream); + + // compare to expected value + if ( strncmp(magic, "BAI\1", 4) != 0 ) { + fprintf(stderr, "Problem with index file - invalid format.\n"); + fclose(m_indexStream); + return false; + } + + // return success/failure of load + return (elementsRead == 4); +} + +// clear all current index offset data in memory +void BamStandardIndex::ClearAllData(void) { + BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); + BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + const int& refId = (*indexIter).first; + ClearReferenceOffsets(refId); + } +} + +// clear all index offset data for desired reference +void BamStandardIndex::ClearReferenceOffsets(const int& refId) { + + // look up refId, skip if not found + BamStandardIndexData::iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end() ) return ; + + // clear reference data + ReferenceIndex& refEntry = (*indexIter).second; + refEntry.Bins.clear(); + refEntry.Offsets.clear(); + + // set flag + m_hasFullDataCache = false; +} + +// return file position after header metadata +const off_t BamStandardIndex::DataBeginOffset(void) const { + return m_dataBeginOffset; +} + +// calculates offset(s) for a given region +bool BamStandardIndex::GetOffsets(const BamRegion& region, + const bool isRightBoundSpecified, + vector<int64_t>& offsets, + bool* hasAlignmentsInRegion) +{ + // return false if leftBound refID is not found in index data + if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) + return false; + + // load index data for region if not already cached + if ( !IsDataLoaded(region.LeftRefID) ) { + bool loadedOk = true; + loadedOk &= SkipToReference(region.LeftRefID); + loadedOk &= LoadReference(region.LeftRefID); + if ( !loadedOk ) return false; + } + + // calculate which bins overlap this region + uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); + int numBins = BinsFromRegion(region, isRightBoundSpecified, bins); + + // get bins for this reference + BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); + if ( indexIter == m_indexData.end() ) return false; // error + const ReferenceIndex& refIndex = (*indexIter).second; + const BamBinMap& binMap = refIndex.Bins; + + // get minimum offset to consider + const LinearOffsetVector& linearOffsets = refIndex.Offsets; + const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) + ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); + + // store all alignment 'chunk' starts (file offsets) for bins in this region + for ( int i = 0; i < numBins; ++i ) { + + const uint16_t binKey = bins[i]; + map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey); + if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { + + // iterate over chunks + const ChunkVector& chunks = (*binIter).second; + std::vector<Chunk>::const_iterator chunksIter = chunks.begin(); + std::vector<Chunk>::const_iterator chunksEnd = chunks.end(); + for ( ; chunksIter != chunksEnd; ++chunksIter) { + + // if valid chunk found, store its file offset + const Chunk& chunk = (*chunksIter); + if ( chunk.Stop > minOffset ) + offsets.push_back( chunk.Start ); + } + } + } + + // clean up memory + free(bins); + + // sort the offsets before returning + sort(offsets.begin(), offsets.end()); + + // set flag & return success + *hasAlignmentsInRegion = (offsets.size() != 0 ); + + // if cache mode set to none, dump the data we just loaded + if (m_cacheMode == BamIndex::NoIndexCaching ) + ClearReferenceOffsets(region.LeftRefID); + + // return succes + return true; +} + +// returns whether reference has alignments or no +bool BamStandardIndex::HasAlignments(const int& refId) const { + BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end() ) return false; // error + const ReferenceIndex& refEntry = (*indexIter).second; + return refEntry.HasAlignments; +} + +// return true if all index data is cached +bool BamStandardIndex::HasFullDataCache(void) const { + return m_hasFullDataCache; +} + +// returns true if index cache has data for desired reference +bool BamStandardIndex::IsDataLoaded(const int& refId) const { + + // look up refId, return false if not found + BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end() ) return false; + + // see if reference has alignments + // if not, it's not a problem to have no offset data + const ReferenceIndex& refEntry = (*indexIter).second; + if ( !refEntry.HasAlignments ) return true; + + // return whether bin map contains data + return ( !refEntry.Bins.empty() ); +} + +// attempts to use index to jump to region; returns success/fail +bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { + + // be sure reader & BGZF file are valid & open for reading + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + return false; + + // make sure left-bound position is valid + if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) + return false; + + // calculate offsets for this region + // if failed, print message, set flag, and return failure + vector<int64_t> offsets; + if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) { + fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n"); + *hasAlignmentsInRegion = false; + return false; + } + + // iterate through offsets + BamAlignment bAlignment; + bool result = true; + for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { + + // attempt seek & load first available alignment + // set flag to true if data exists + result &= m_BGZF->Seek(*o); + *hasAlignmentsInRegion = m_reader->GetNextAlignmentCore(bAlignment); + + // if this alignment corresponds to desired position + // return success of seeking back to the offset before the 'current offset' (to cover overlaps) + if ( ((bAlignment.RefID == region.LeftRefID) && + ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) || + (bAlignment.RefID > region.LeftRefID) ) + { + if ( o != offsets.begin() ) --o; + return m_BGZF->Seek(*o); + } + } + + // if error in jumping, print message & set flag + if ( !result ) { + fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n"); + *hasAlignmentsInRegion = false; + } + + // return success/failure + return result; +} + +// clears index data from all references except the first +void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + KeepOnlyReferenceOffsets((*indexBegin).first); +} + +// clears index data from all references except the one specified +void BamStandardIndex::KeepOnlyReferenceOffsets(const int& refId) { + BamStandardIndexData::iterator mapIter = m_indexData.begin(); + BamStandardIndexData::iterator mapEnd = m_indexData.end(); + for ( ; mapIter != mapEnd; ++mapIter ) { + const int entryRefId = (*mapIter).first; + if ( entryRefId != refId ) + ClearReferenceOffsets(entryRefId); + } +} + +bool BamStandardIndex::LoadAllReferences(bool saveData) { + + // skip if data already loaded + if ( m_hasFullDataCache ) return true; + + // get number of reference sequences + uint32_t numReferences; + if ( !LoadReferenceCount((int&)numReferences) ) + return false; + + // iterate over reference entries + bool loadedOk = true; + for ( int i = 0; i < (int)numReferences; ++i ) + loadedOk &= LoadReference(i, saveData); + + // set flag + if ( loadedOk && saveData ) + m_hasFullDataCache = true; + + // return success/failure of loading references + return loadedOk; +} + +// load header data from index file, return true if loaded OK +bool BamStandardIndex::LoadHeader(void) { + + bool loadedOk = CheckMagicNumber(); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_indexStream); + + // return success/failure of load + return loadedOk; +} + +// load a single index bin entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadBin(ReferenceIndex& refEntry, bool saveData) { + + size_t elementsRead = 0; + + // get bin ID + uint32_t binId; + elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(binId); + + // load alignment chunks for this bin + ChunkVector chunks; + bool chunksOk = LoadChunks(chunks, saveData); + + // store bin entry + if ( chunksOk && saveData ) + refEntry.Bins.insert(pair<uint32_t, ChunkVector>(binId, chunks)); + + // return success/failure of load + return ( (elementsRead == 1) && chunksOk ); +} + +bool BamStandardIndex::LoadBins(ReferenceIndex& refEntry, bool saveData) { + + size_t elementsRead = 0; + + // get number of bins + int32_t numBins; + elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numBins); + + // set flag + refEntry.HasAlignments = ( numBins != 0 ); + + // iterate over bins + bool binsOk = true; + for ( int i = 0; i < numBins; ++i ) + binsOk &= LoadBin(refEntry, saveData); + + // return success/failure of load + return ( (elementsRead == 1) && binsOk ); +} + +// load a single index bin entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadChunk(ChunkVector& chunks, bool saveData) { + + size_t elementsRead = 0; + + // read in chunk data + uint64_t start; + uint64_t stop; + elementsRead += fread(&start, sizeof(start), 1, m_indexStream); + elementsRead += fread(&stop, sizeof(stop), 1, m_indexStream); + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + + // save data if requested + if ( saveData ) chunks.push_back( Chunk(start, stop) ); + + // return success/failure of load + return ( elementsRead == 2 ); +} + +bool BamStandardIndex::LoadChunks(ChunkVector& chunks, bool saveData) { + + size_t elementsRead = 0; + + // read in number of chunks + uint32_t numChunks; + elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numChunks); + + // initialize space for chunks if we're storing this data + if ( saveData ) chunks.reserve(numChunks); + + // iterate over chunks + bool chunksOk = true; + for ( int i = 0; i < (int)numChunks; ++i ) + chunksOk &= LoadChunk(chunks, saveData); + + // sort chunk vector + sort( chunks.begin(), chunks.end(), ChunkLessThan ); + + // return success/failure of load + return ( (elementsRead == 1) && chunksOk ); +} + +// load a single index linear offset entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) { + + size_t elementsRead = 0; + + // read in number of linear offsets + int32_t numLinearOffsets; + elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); + + // set up destination vector (if we're saving the data) + LinearOffsetVector linearOffsets; + if ( saveData ) linearOffsets.reserve(numLinearOffsets); + + // iterate over linear offsets + uint64_t linearOffset; + for ( int i = 0; i < numLinearOffsets; ++i ) { + elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + if ( saveData ) linearOffsets.push_back(linearOffset); + } + + // sort linear offsets + sort ( linearOffsets.begin(), linearOffsets.end() ); + + // save in reference index entry if desired + if ( saveData ) refEntry.Offsets = linearOffsets; + + // return success/failure of load + return ( elementsRead == (size_t)(numLinearOffsets + 1) ); +} + +bool BamStandardIndex::LoadFirstReference(bool saveData) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + return LoadReference((*indexBegin).first, saveData); +} + +// load a single reference from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadReference(const int& refId, bool saveData) { + + // look up refId + BamStandardIndexData::iterator indexIter = m_indexData.find(refId); + + // if reference not previously loaded, create new entry + if ( indexIter == m_indexData.end() ) { + ReferenceIndex newEntry; + newEntry.HasAlignments = false; + m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, newEntry) ); + } + + // load reference data + indexIter = m_indexData.find(refId); + ReferenceIndex& entry = (*indexIter).second; + bool loadedOk = true; + loadedOk &= LoadBins(entry, saveData); + loadedOk &= LoadLinearOffsets(entry, saveData); + return loadedOk; +} + +// loads number of references, return true if loaded OK +bool BamStandardIndex::LoadReferenceCount(int& numReferences) { + + size_t elementsRead = 0; + + // read reference count + elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // return success/failure of load + return ( elementsRead == 1 ); +} + +// merges 'alignment chunks' in BAM bin (used for index building) +void BamStandardIndex::MergeChunks(void) { + + // iterate over reference enties + BamStandardIndexData::iterator indexIter = m_indexData.begin(); + BamStandardIndexData::iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + + // get BAM bin map for this reference + ReferenceIndex& refIndex = (*indexIter).second; + BamBinMap& bamBinMap = refIndex.Bins; + + // iterate over BAM bins + BamBinMap::iterator binIter = bamBinMap.begin(); + BamBinMap::iterator binEnd = bamBinMap.end(); + for ( ; binIter != binEnd; ++binIter ) { + + // get chunk vector for this bin + ChunkVector& binChunks = (*binIter).second; + if ( binChunks.size() == 0 ) continue; + + ChunkVector mergedChunks; + mergedChunks.push_back( binChunks[0] ); + + // iterate over chunks + int i = 0; + ChunkVector::iterator chunkIter = binChunks.begin(); + ChunkVector::iterator chunkEnd = binChunks.end(); + for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { + + // get 'currentChunk' based on numeric index + Chunk& currentChunk = mergedChunks[i]; + + // get iteratorChunk based on vector iterator + Chunk& iteratorChunk = (*chunkIter); + + // if chunk ends where (iterator) chunk starts, then merge + if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) + currentChunk.Stop = iteratorChunk.Stop; + + // otherwise + else { + // set currentChunk + 1 to iteratorChunk + mergedChunks.push_back(iteratorChunk); + ++i; + } + } + + // saved merged chunk vector + (*binIter).second = mergedChunks; + } + } +} + +// saves BAM bin entry for index +void BamStandardIndex::SaveBinEntry(BamBinMap& binMap, + const uint32_t& saveBin, + const uint64_t& saveOffset, + const uint64_t& lastOffset) +{ + // look up saveBin + BamBinMap::iterator binIter = binMap.find(saveBin); + + // create new chunk + Chunk newChunk(saveOffset, lastOffset); + + // if entry doesn't exist + if ( binIter == binMap.end() ) { + ChunkVector newChunks; + newChunks.push_back(newChunk); + binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks)); + } + + // otherwise + else { + ChunkVector& binChunks = (*binIter).second; + binChunks.push_back( newChunk ); + } +} + +// saves linear offset entry for index +void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets, + const BamAlignment& bAlignment, + const uint64_t& lastOffset) +{ + // get converted offsets + int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; + int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; + + // resize vector if necessary + int oldSize = offsets.size(); + int newSize = endOffset + 1; + if ( oldSize < newSize ) + offsets.resize(newSize, 0); + + // store offset + for( int i = beginOffset + 1; i <= endOffset; ++i ) { + if ( offsets[i] == 0 ) + offsets[i] = lastOffset; + } +} + +// initializes index data structure to hold @count references +void BamStandardIndex::SetReferenceCount(const int& count) { + for ( int i = 0; i < count; ++i ) + m_indexData[i].HasAlignments = false; +} + +bool BamStandardIndex::SkipToFirstReference(void) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + return SkipToReference( (*indexBegin).first ); +} + +// position file pointer to desired reference begin, return true if skipped OK +bool BamStandardIndex::SkipToReference(const int& refId) { + + // attempt rewind + if ( !Rewind() ) return false; + + // read in number of references + uint32_t numReferences; + size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // iterate over reference entries + bool skippedOk = true; + int currentRefId = 0; + while (currentRefId != refId) { + skippedOk &= LoadReference(currentRefId, false); + ++currentRefId; + } + + // return success + return skippedOk; +} + +// write header to new index file +bool BamStandardIndex::WriteHeader(void) { + + size_t elementsWritten = 0; + + // write magic number + elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_indexStream); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_indexStream); + + // return success/failure of write + return (elementsWritten == 4); +} + +// write index data for all references to new index file +bool BamStandardIndex::WriteAllReferences(void) { + + size_t elementsWritten = 0; + + // write number of reference sequences + int32_t numReferenceSeqs = m_indexData.size(); + if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs); + elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_indexStream); + + // iterate over reference sequences + bool refsOk = true; + BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); + BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++ indexIter ) + refsOk &= WriteReference( (*indexIter).second ); + + // return success/failure of write + return ( (elementsWritten == 1) && refsOk ); +} + +// write index data for bin to new index file +bool BamStandardIndex::WriteBin(const uint32_t& binId, const ChunkVector& chunks) { + + size_t elementsWritten = 0; + + // write BAM bin ID + uint32_t binKey = binId; + if ( m_isBigEndian ) SwapEndian_32(binKey); + elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream); + + // write chunks + bool chunksOk = WriteChunks(chunks); + + // return success/failure of write + return ( (elementsWritten == 1) && chunksOk ); +} + +// write index data for bins to new index file +bool BamStandardIndex::WriteBins(const BamBinMap& bins) { + + size_t elementsWritten = 0; + + // write number of bins + int32_t binCount = bins.size(); + if ( m_isBigEndian ) SwapEndian_32(binCount); + elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream); + + // iterate over bins + bool binsOk = true; + BamBinMap::const_iterator binIter = bins.begin(); + BamBinMap::const_iterator binEnd = bins.end(); + for ( ; binIter != binEnd; ++binIter ) + binsOk &= WriteBin( (*binIter).first, (*binIter).second ); + + // return success/failure of write + return ( (elementsWritten == 1) && binsOk ); +} + +// write index data for chunk entry to new index file +bool BamStandardIndex::WriteChunk(const Chunk& chunk) { + + size_t elementsWritten = 0; + + // localize alignment chunk offsets + uint64_t start = chunk.Start; + uint64_t stop = chunk.Stop; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + + // write to index file + elementsWritten += fwrite(&start, sizeof(start), 1, m_indexStream); + elementsWritten += fwrite(&stop, sizeof(stop), 1, m_indexStream); + + // return success/failure of write + return ( elementsWritten == 2 ); +} + +// write index data for chunk entry to new index file +bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) { + + size_t elementsWritten = 0; + + // write chunks + int32_t chunkCount = chunks.size(); + if ( m_isBigEndian ) SwapEndian_32(chunkCount); + elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_indexStream); + + // iterate over chunks + bool chunksOk = true; + ChunkVector::const_iterator chunkIter = chunks.begin(); + ChunkVector::const_iterator chunkEnd = chunks.end(); + for ( ; chunkIter != chunkEnd; ++chunkIter ) + chunksOk &= WriteChunk( (*chunkIter) ); + + // return success/failure of write + return ( (elementsWritten == 1) && chunksOk ); +} + +// write index data for linear offsets entry to new index file +bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) { + + size_t elementsWritten = 0; + + // write number of linear offsets + int32_t offsetCount = offsets.size(); + if ( m_isBigEndian ) SwapEndian_32(offsetCount); + elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_indexStream); + + // iterate over linear offsets + LinearOffsetVector::const_iterator offsetIter = offsets.begin(); + LinearOffsetVector::const_iterator offsetEnd = offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) { + + // write linear offset + uint64_t linearOffset = (*offsetIter); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_indexStream); + } + + // return success/failure of write + return ( elementsWritten == (size_t)(offsetCount + 1) ); +} + +// write index data for a single reference to new index file +bool BamStandardIndex::WriteReference(const ReferenceIndex& refEntry) { + bool refOk = true; + refOk &= WriteBins(refEntry.Bins); + refOk &= WriteLinearOffsets(refEntry.Offsets); + return refOk; +}