view spp/src/BamToolsIndex_p.cpp @ 15:e689b83b0257 draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:15:21 -0500
parents ce08b0efa3fd
children
line wrap: on
line source

// ***************************************************************************
// BamToolsIndex.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 22 November 2010 (DB)
// ---------------------------------------------------------------------------
// Provides index operations for the BamTools index format (".bti")
// ***************************************************************************

#include <BamAlignment.h>
#include <BamReader.h>
#include <BGZF.h>
#include <BamToolsIndex_p.h>
using namespace BamTools;
using namespace BamTools::Internal;

#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <map>
using namespace std;

BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader)
    : BamIndex(bgzf, reader)
    , m_blockSize(1000)
    , m_dataBeginOffset(0)
    , m_hasFullDataCache(false)
    , m_inputVersion(0)
    , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
{
    m_isBigEndian = BamTools::SystemIsBigEndian();
}

// dtor
BamToolsIndex::~BamToolsIndex(void) {
    ClearAllData();
}

// creates index data (in-memory) from current reader data
bool BamToolsIndex::Build(void) {

    // be sure reader & BGZF file are valid & open for reading
    if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
	return false;

    // move file pointer to beginning of alignments
    if ( !m_reader->Rewind() ) return false;

    // initialize index data structure with space for all references
    const int numReferences = (int)m_references.size();
    m_indexData.clear();
    m_hasFullDataCache = false;
    SetReferenceCount(numReferences);

    // set up counters and markers
    int32_t currentBlockCount      = 0;
    int64_t currentAlignmentOffset = m_BGZF->Tell();
    int32_t blockRefId             = 0;
    int32_t blockMaxEndPosition    = 0;
    int64_t blockStartOffset       = currentAlignmentOffset;
    int32_t blockStartPosition     = -1;

    // plow through alignments, storing index entries
    BamAlignment al;
    while ( m_reader->GetNextAlignmentCore(al) ) {

	// if block contains data (not the first time through) AND alignment is on a new reference
	if ( currentBlockCount > 0 && al.RefID != blockRefId ) {

	    // store previous data
	    BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
	    SaveOffsetEntry(blockRefId, entry);

	    // intialize new block for current alignment's reference
	    currentBlockCount   = 0;
	    blockMaxEndPosition = al.GetEndPosition();
	    blockStartOffset    = currentAlignmentOffset;
	}

	// if beginning of block, save first alignment's refID & position
	if ( currentBlockCount == 0 ) {
	    blockRefId         = al.RefID;
	    blockStartPosition = al.Position;
	}

	// increment block counter
	++currentBlockCount;

	// check end position
	int32_t alignmentEndPosition = al.GetEndPosition();
	if ( alignmentEndPosition > blockMaxEndPosition )
	    blockMaxEndPosition = alignmentEndPosition;

	// if block is full, get offset for next block, reset currentBlockCount
	if ( currentBlockCount == m_blockSize ) {
	    BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
	    SaveOffsetEntry(blockRefId, entry);
	    blockStartOffset  = m_BGZF->Tell();
	    currentBlockCount = 0;
	}

	// not the best name, but for the next iteration, this value will be the offset of the *current* alignment
	// necessary because we won't know if this next alignment is on a new reference until we actually read it
	currentAlignmentOffset = m_BGZF->Tell();
    }

    // store final block with data
    BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
    SaveOffsetEntry(blockRefId, entry);

    // set flag
    m_hasFullDataCache = true;

    // return success/failure of rewind
    return m_reader->Rewind();
}

// check index file magic number, return true if OK
bool BamToolsIndex::CheckMagicNumber(void) {

    // see if index is valid BAM index
    char magic[4];
    size_t elementsRead = fread(magic, 1, 4, m_indexStream);
    if ( elementsRead != 4 ) return false;
    if ( strncmp(magic, "BTI\1", 4) != 0 ) {
	fprintf(stderr, "Problem with index file - invalid format.\n");
	return false;
    }

    // otherwise ok
    return true;
}

// check index file version, return true if OK
bool BamToolsIndex::CheckVersion(void) {

    // read version from file
    size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream);
    if ( elementsRead != 1 ) return false;
    if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);

    // if version is negative, or zero
    if ( m_inputVersion <= 0 ) {
	fprintf(stderr, "Problem with index file - invalid version.\n");
	return false;
    }

    // if version is newer than can be supported by this version of bamtools
    else if ( m_inputVersion > m_outputVersion ) {
	fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n");
	fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n");
	return false;
    }

    // ------------------------------------------------------------------
    // check for deprecated, unsupported versions
    // (typically whose format did not accomodate a particular bug fix)

    else if ( (Version)m_inputVersion == BTI_1_0 ) {
	fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n");
	fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
	return false;
    }

    else if ( (Version)m_inputVersion == BTI_1_1 ) {
	fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n");
	fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
	return false;
    }

    // otherwise ok
    else return true;
}

// clear all current index offset data in memory
void BamToolsIndex::ClearAllData(void) {
    BamToolsIndexData::const_iterator indexIter = m_indexData.begin();
    BamToolsIndexData::const_iterator indexEnd  = m_indexData.end();
    for ( ; indexIter != indexEnd; ++indexIter ) {
	const int& refId = (*indexIter).first;
	ClearReferenceOffsets(refId);
    }
}

// clear all index offset data for desired reference
void BamToolsIndex::ClearReferenceOffsets(const int& refId) {
    if ( m_indexData.find(refId) == m_indexData.end() ) return;
    vector<BamToolsIndexEntry>& offsets = m_indexData[refId].Offsets;
    offsets.clear();
    m_hasFullDataCache = false;
}

// return file position after header metadata
const off_t BamToolsIndex::DataBeginOffset(void) const {
    return m_dataBeginOffset;
}

// calculate BAM file offset for desired region
// return true if no error (*NOT* equivalent to "has alignments or valid offset")
//   check @hasAlignmentsInRegion to determine this status
// @region - target region
// @offset - resulting seek target
// @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
// N.B. - ignores isRightBoundSpecified
bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {

    // return false if leftBound refID is not found in index data
    BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
    if ( indexIter == m_indexData.end()) return false;

    // load index data for region if not already cached
    if ( !IsDataLoaded(region.LeftRefID) ) {
	bool loadedOk = true;
	loadedOk &= SkipToReference(region.LeftRefID);
	loadedOk &= LoadReference(region.LeftRefID);
	if ( !loadedOk ) return false;
    }

    // localize index data for this reference (& sanity check that data actually exists)
    indexIter = m_indexData.find(region.LeftRefID);
    if ( indexIter == m_indexData.end()) return false;
    const vector<BamToolsIndexEntry>& referenceOffsets = (*indexIter).second.Offsets;
    if ( referenceOffsets.empty() ) return false;

    // -------------------------------------------------------
    // calculate nearest index to jump to

    // save first offset
    offset = (*referenceOffsets.begin()).StartOffset;

    // iterate over offsets entries on this reference
    vector<BamToolsIndexEntry>::const_iterator offsetIter = referenceOffsets.begin();
    vector<BamToolsIndexEntry>::const_iterator offsetEnd  = referenceOffsets.end();
    for ( ; offsetIter != offsetEnd; ++offsetIter ) {
	const BamToolsIndexEntry& entry = (*offsetIter);
	// break if alignment 'entry' overlaps region
	if ( entry.MaxEndPosition >= region.LeftPosition ) break;
	offset = (*offsetIter).StartOffset;
    }

    // set flag based on whether an index entry was found for this region
    *hasAlignmentsInRegion = ( offsetIter != offsetEnd );

    // if cache mode set to none, dump the data we just loaded
    if (m_cacheMode == BamIndex::NoIndexCaching )
	ClearReferenceOffsets(region.LeftRefID);

    // return success
    return true;
}

// returns whether reference has alignments or no
bool BamToolsIndex::HasAlignments(const int& refId) const {

    BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
    if ( indexIter == m_indexData.end()) return false;
    const BamToolsReferenceEntry& refEntry = (*indexIter).second;
    return refEntry.HasAlignments;
}

// return true if all index data is cached
bool BamToolsIndex::HasFullDataCache(void) const {
    return m_hasFullDataCache;
}

// returns true if index cache has data for desired reference
bool BamToolsIndex::IsDataLoaded(const int& refId) const {

    BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
    if ( indexIter == m_indexData.end()) return false;
    const BamToolsReferenceEntry& refEntry = (*indexIter).second;

    if ( !refEntry.HasAlignments ) return true; // no data period

    // return whether offsets list contains data
    return !refEntry.Offsets.empty();
}

// attempts to use index to jump to region; returns success/fail
bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {

    // clear flag
    *hasAlignmentsInRegion = false;

    // check valid BamReader state
    if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) {
	fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
	return false;
    }

    // make sure left-bound position is valid
    if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength )
	return false;

    // calculate nearest offset to jump to
    int64_t offset;
    if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
	fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
	return false;
    }

    // return success/failure of seek
    return m_BGZF->Seek(offset);
}

// clears index data from all references except the first
void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) {
    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
    KeepOnlyReferenceOffsets( (*indexBegin).first );
}

// clears index data from all references except the one specified
void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) {
    BamToolsIndexData::iterator mapIter = m_indexData.begin();
    BamToolsIndexData::iterator mapEnd  = m_indexData.end();
    for ( ; mapIter != mapEnd; ++mapIter ) {
	const int entryRefId = (*mapIter).first;
	if ( entryRefId != refId )
	    ClearReferenceOffsets(entryRefId);
    }
}

// load index data for all references, return true if loaded OK
bool BamToolsIndex::LoadAllReferences(bool saveData) {

    // skip if data already loaded
    if ( m_hasFullDataCache ) return true;

    // read in number of references
    int32_t numReferences;
    if ( !LoadReferenceCount(numReferences) ) return false;
    //SetReferenceCount(numReferences);

    // iterate over reference entries
    bool loadedOk = true;
    for ( int i = 0; i < numReferences; ++i )
	loadedOk &= LoadReference(i, saveData);

    // set flag
    if ( loadedOk && saveData )
	m_hasFullDataCache = true;

    // return success/failure of load
    return loadedOk;
}

// load header data from index file, return true if loaded OK
bool BamToolsIndex::LoadHeader(void) {

    // check magic number
    if ( !CheckMagicNumber() ) return false;

    // check BTI version
    if ( !CheckVersion() ) return false;

    // read in block size
    size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream);
    if ( elementsRead != 1 ) return false;
    if ( m_isBigEndian ) SwapEndian_32(m_blockSize);

    // store offset of beginning of data
    m_dataBeginOffset = ftell64(m_indexStream);

    // return success/failure of load
    return (elementsRead == 1);
}

// load a single index entry from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) {

    // read in index entry data members
    size_t elementsRead = 0;
    BamToolsIndexEntry entry;
    elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream);
    elementsRead += fread(&entry.StartOffset,    sizeof(entry.StartOffset),    1, m_indexStream);
    elementsRead += fread(&entry.StartPosition,  sizeof(entry.StartPosition),  1, m_indexStream);
    if ( elementsRead != 3 ) {
	cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl;
	return false;
    }

    // swap endian-ness if necessary
    if ( m_isBigEndian ) {
	SwapEndian_32(entry.MaxEndPosition);
	SwapEndian_64(entry.StartOffset);
	SwapEndian_32(entry.StartPosition);
    }

    // save data
    if ( saveData )
	SaveOffsetEntry(refId, entry);

    // return success/failure of load
    return true;
}

// load a single reference from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool BamToolsIndex::LoadFirstReference(bool saveData) {
    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
    return LoadReference( (*indexBegin).first, saveData );
}

// load a single reference from file, return true if loaded OK
// @saveData - save data in memory if true, just read & discard if false
bool BamToolsIndex::LoadReference(const int& refId, bool saveData) {

    // read in number of offsets for this reference
    uint32_t numOffsets;
    size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream);
    if ( elementsRead != 1 ) return false;
    if ( m_isBigEndian ) SwapEndian_32(numOffsets);

    // initialize offsets container for this reference
    SetOffsetCount(refId, (int)numOffsets);

    // iterate over offset entries
    for ( unsigned int j = 0; j < numOffsets; ++j )
	LoadIndexEntry(refId, saveData);

    // return success/failure of load
    return true;
}

// loads number of references, return true if loaded OK
bool BamToolsIndex::LoadReferenceCount(int& numReferences) {

    size_t elementsRead = 0;

    // read reference count
    elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
    if ( m_isBigEndian ) SwapEndian_32(numReferences);

    // return success/failure of load
    return ( elementsRead == 1 );
}

// saves an index offset entry in memory
void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) {
    BamToolsReferenceEntry& refEntry = m_indexData[refId];
    refEntry.HasAlignments = true;
    refEntry.Offsets.push_back(entry);
}

// pre-allocates size for offset vector
void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) {
    BamToolsReferenceEntry& refEntry = m_indexData[refId];
    refEntry.Offsets.reserve(offsetCount);
    refEntry.HasAlignments = ( offsetCount > 0);
}

// initializes index data structure to hold @count references
void BamToolsIndex::SetReferenceCount(const int& count) {
    for ( int i = 0; i < count; ++i )
	m_indexData[i].HasAlignments = false;
}

// position file pointer to first reference begin, return true if skipped OK
bool BamToolsIndex::SkipToFirstReference(void) {
    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
    return SkipToReference( (*indexBegin).first );
}

// position file pointer to desired reference begin, return true if skipped OK
bool BamToolsIndex::SkipToReference(const int& refId) {

    // attempt rewind
    if ( !Rewind() ) return false;

    // read in number of references
    int32_t numReferences;
    size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
    if ( elementsRead != 1 ) return false;
    if ( m_isBigEndian ) SwapEndian_32(numReferences);

    // iterate over reference entries
    bool skippedOk = true;
    int currentRefId = 0;
    while (currentRefId != refId) {
	skippedOk &= LoadReference(currentRefId, false);
	++currentRefId;
    }

    // return success/failure of skip
    return skippedOk;
}

// write header to new index file
bool BamToolsIndex::WriteHeader(void) {

    size_t elementsWritten = 0;

    // write BTI index format 'magic number'
    elementsWritten += fwrite("BTI\1", 1, 4, m_indexStream);

    // write BTI index format version
    int32_t currentVersion = (int32_t)m_outputVersion;
    if ( m_isBigEndian ) SwapEndian_32(currentVersion);
    elementsWritten += fwrite(&currentVersion, sizeof(currentVersion), 1, m_indexStream);

    // write block size
    int32_t blockSize = m_blockSize;
    if ( m_isBigEndian ) SwapEndian_32(blockSize);
    elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream);

    // store offset of beginning of data
    m_dataBeginOffset = ftell64(m_indexStream);

    // return success/failure of write
    return ( elementsWritten == 6 );
}

// write index data for all references to new index file
bool BamToolsIndex::WriteAllReferences(void) {

    size_t elementsWritten = 0;

    // write number of references
    int32_t numReferences = (int32_t)m_indexData.size();
    if ( m_isBigEndian ) SwapEndian_32(numReferences);
    elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream);

    // iterate through references in index
    bool refOk = true;
    BamToolsIndexData::const_iterator refIter = m_indexData.begin();
    BamToolsIndexData::const_iterator refEnd  = m_indexData.end();
    for ( ; refIter != refEnd; ++refIter )
	refOk &= WriteReferenceEntry( (*refIter).second );

    return ( (elementsWritten == 1) && refOk );
}

// write current reference index data to new index file
bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) {

    size_t elementsWritten = 0;

    // write number of offsets listed for this reference
    uint32_t numOffsets = refEntry.Offsets.size();
    if ( m_isBigEndian ) SwapEndian_32(numOffsets);
    elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream);

    // iterate over offset entries
    bool entriesOk = true;
    vector<BamToolsIndexEntry>::const_iterator offsetIter = refEntry.Offsets.begin();
    vector<BamToolsIndexEntry>::const_iterator offsetEnd  = refEntry.Offsets.end();
    for ( ; offsetIter != offsetEnd; ++offsetIter )
	entriesOk &= WriteIndexEntry( (*offsetIter) );

    return ( (elementsWritten == 1) && entriesOk );
}

// write current index offset entry to new index file
bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) {

    // copy entry data
    int32_t maxEndPosition = entry.MaxEndPosition;
    int64_t startOffset    = entry.StartOffset;
    int32_t startPosition  = entry.StartPosition;

    // swap endian-ness if necessary
    if ( m_isBigEndian ) {
	SwapEndian_32(maxEndPosition);
	SwapEndian_64(startOffset);
	SwapEndian_32(startPosition);
    }

    // write the reference index entry
    size_t elementsWritten = 0;
    elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream);
    elementsWritten += fwrite(&startOffset,    sizeof(startOffset),    1, m_indexStream);
    elementsWritten += fwrite(&startPosition,  sizeof(startPosition),  1, m_indexStream);
    return ( elementsWritten == 3 );
}