Mercurial > repos > zzhou > spp_phantompeak
comparison spp/src/BamStandardIndex_p.cpp @ 6:ce08b0efa3fd draft
Uploaded
| author | zzhou |
|---|---|
| date | Tue, 27 Nov 2012 16:11:40 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:608a8e0eac56 | 6:ce08b0efa3fd |
|---|---|
| 1 // *************************************************************************** | |
| 2 // BamStandardIndex.cpp (c) 2010 Derek Barnett | |
| 3 // Marth Lab, Department of Biology, Boston College | |
| 4 // All rights reserved. | |
| 5 // --------------------------------------------------------------------------- | |
| 6 // Last modified: 22 November 2010 (DB) | |
| 7 // --------------------------------------------------------------------------- | |
| 8 // Provides index operations for the standardized BAM index format (".bai") | |
| 9 // *************************************************************************** | |
| 10 | |
| 11 #include <BamAlignment.h> | |
| 12 #include <BamReader.h> | |
| 13 #include <BGZF.h> | |
| 14 #include <BamStandardIndex_p.h> | |
| 15 using namespace BamTools; | |
| 16 using namespace BamTools::Internal; | |
| 17 | |
| 18 #include <cstdio> | |
| 19 #include <cstdlib> | |
| 20 #include <algorithm> | |
| 21 #include <iostream> | |
| 22 #include <map> | |
| 23 using namespace std; | |
| 24 | |
| 25 BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader) | |
| 26 : BamIndex(bgzf, reader) | |
| 27 , m_dataBeginOffset(0) | |
| 28 , m_hasFullDataCache(false) | |
| 29 { | |
| 30 m_isBigEndian = BamTools::SystemIsBigEndian(); | |
| 31 } | |
| 32 | |
| 33 BamStandardIndex::~BamStandardIndex(void) { | |
| 34 ClearAllData(); | |
| 35 } | |
| 36 | |
| 37 // calculate bins that overlap region | |
| 38 int BamStandardIndex::BinsFromRegion(const BamRegion& region, | |
| 39 const bool isRightBoundSpecified, | |
| 40 uint16_t bins[MAX_BIN]) | |
| 41 { | |
| 42 // get region boundaries | |
| 43 uint32_t begin = (unsigned int)region.LeftPosition; | |
| 44 uint32_t end; | |
| 45 | |
| 46 // if right bound specified AND left&right bounds are on same reference | |
| 47 // OK to use right bound position | |
| 48 if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) | |
| 49 end = (unsigned int)region.RightPosition; | |
| 50 | |
| 51 // otherwise, use end of left bound reference as cutoff | |
| 52 else | |
| 53 end = (unsigned int)m_references.at(region.LeftRefID).RefLength - 1; | |
| 54 | |
| 55 // initialize list, bin '0' always a valid bin | |
| 56 int i = 0; | |
| 57 bins[i++] = 0; | |
| 58 | |
| 59 // get rest of bins that contain this region | |
| 60 unsigned int k; | |
| 61 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; } | |
| 62 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; } | |
| 63 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; } | |
| 64 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; } | |
| 65 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; } | |
| 66 | |
| 67 // return number of bins stored | |
| 68 return i; | |
| 69 } | |
| 70 | |
| 71 // creates index data (in-memory) from current reader data | |
| 72 bool BamStandardIndex::Build(void) { | |
| 73 | |
| 74 // be sure reader & BGZF file are valid & open for reading | |
| 75 if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) | |
| 76 return false; | |
| 77 | |
| 78 // move file pointer to beginning of alignments | |
| 79 m_reader->Rewind(); | |
| 80 | |
| 81 // get reference count, reserve index space | |
| 82 const int numReferences = (int)m_references.size(); | |
| 83 m_indexData.clear(); | |
| 84 m_hasFullDataCache = false; | |
| 85 SetReferenceCount(numReferences); | |
| 86 | |
| 87 // sets default constant for bin, ID, offset, coordinate variables | |
| 88 const uint32_t defaultValue = 0xffffffffu; | |
| 89 | |
| 90 // bin data | |
| 91 uint32_t saveBin(defaultValue); | |
| 92 uint32_t lastBin(defaultValue); | |
| 93 | |
| 94 // reference ID data | |
| 95 int32_t saveRefID(defaultValue); | |
| 96 int32_t lastRefID(defaultValue); | |
| 97 | |
| 98 // offset data | |
| 99 uint64_t saveOffset = m_BGZF->Tell(); | |
| 100 uint64_t lastOffset = saveOffset; | |
| 101 | |
| 102 // coordinate data | |
| 103 int32_t lastCoordinate = defaultValue; | |
| 104 | |
| 105 BamAlignment bAlignment; | |
| 106 while ( m_reader->GetNextAlignmentCore(bAlignment) ) { | |
| 107 | |
| 108 // change of chromosome, save ID, reset bin | |
| 109 if ( lastRefID != bAlignment.RefID ) { | |
| 110 lastRefID = bAlignment.RefID; | |
| 111 lastBin = defaultValue; | |
| 112 } | |
| 113 | |
| 114 // if lastCoordinate greater than BAM position - file not sorted properly | |
| 115 else if ( lastCoordinate > bAlignment.Position ) { | |
| 116 fprintf(stderr, "BAM file not properly sorted:\n"); | |
| 117 fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), | |
| 118 lastCoordinate, bAlignment.Position, bAlignment.RefID); | |
| 119 exit(1); | |
| 120 } | |
| 121 | |
| 122 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) | |
| 123 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { | |
| 124 | |
| 125 // save linear offset entry (matched to BAM entry refID) | |
| 126 BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID); | |
| 127 if ( indexIter == m_indexData.end() ) return false; // error | |
| 128 ReferenceIndex& refIndex = (*indexIter).second; | |
| 129 LinearOffsetVector& offsets = refIndex.Offsets; | |
| 130 SaveLinearOffset(offsets, bAlignment, lastOffset); | |
| 131 } | |
| 132 | |
| 133 // if current BamAlignment bin != lastBin, "then possibly write the binning index" | |
| 134 if ( bAlignment.Bin != lastBin ) { | |
| 135 | |
| 136 // if not first time through | |
| 137 if ( saveBin != defaultValue ) { | |
| 138 | |
| 139 // save Bam bin entry | |
| 140 BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); | |
| 141 if ( indexIter == m_indexData.end() ) return false; // error | |
| 142 ReferenceIndex& refIndex = (*indexIter).second; | |
| 143 BamBinMap& binMap = refIndex.Bins; | |
| 144 SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); | |
| 145 } | |
| 146 | |
| 147 // update saveOffset | |
| 148 saveOffset = lastOffset; | |
| 149 | |
| 150 // update bin values | |
| 151 saveBin = bAlignment.Bin; | |
| 152 lastBin = bAlignment.Bin; | |
| 153 | |
| 154 // update saveRefID | |
| 155 saveRefID = bAlignment.RefID; | |
| 156 | |
| 157 // if invalid RefID, break out | |
| 158 if ( saveRefID < 0 ) break; | |
| 159 } | |
| 160 | |
| 161 // make sure that current file pointer is beyond lastOffset | |
| 162 if ( m_BGZF->Tell() <= (int64_t)lastOffset ) { | |
| 163 fprintf(stderr, "Error in BGZF offsets.\n"); | |
| 164 exit(1); | |
| 165 } | |
| 166 | |
| 167 // update lastOffset | |
| 168 lastOffset = m_BGZF->Tell(); | |
| 169 | |
| 170 // update lastCoordinate | |
| 171 lastCoordinate = bAlignment.Position; | |
| 172 } | |
| 173 | |
| 174 // save any leftover BAM data (as long as refID is valid) | |
| 175 if ( saveRefID >= 0 ) { | |
| 176 // save Bam bin entry | |
| 177 BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); | |
| 178 if ( indexIter == m_indexData.end() ) return false; // error | |
| 179 ReferenceIndex& refIndex = (*indexIter).second; | |
| 180 BamBinMap& binMap = refIndex.Bins; | |
| 181 SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); | |
| 182 } | |
| 183 | |
| 184 // simplify index by merging chunks | |
| 185 MergeChunks(); | |
| 186 | |
| 187 // iterate through references in index | |
| 188 // sort offsets in linear offset vector | |
| 189 BamStandardIndexData::iterator indexIter = m_indexData.begin(); | |
| 190 BamStandardIndexData::iterator indexEnd = m_indexData.end(); | |
| 191 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { | |
| 192 | |
| 193 // get reference index data | |
| 194 ReferenceIndex& refIndex = (*indexIter).second; | |
| 195 LinearOffsetVector& offsets = refIndex.Offsets; | |
| 196 | |
| 197 // sort linear offsets | |
| 198 sort(offsets.begin(), offsets.end()); | |
| 199 } | |
| 200 | |
| 201 // rewind file pointer to beginning of alignments, return success/fail | |
| 202 return m_reader->Rewind(); | |
| 203 } | |
| 204 | |
| 205 // check index file magic number, return true if OK | |
| 206 bool BamStandardIndex::CheckMagicNumber(void) { | |
| 207 | |
| 208 // read in magic number | |
| 209 char magic[4]; | |
| 210 size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream); | |
| 211 | |
| 212 // compare to expected value | |
| 213 if ( strncmp(magic, "BAI\1", 4) != 0 ) { | |
| 214 fprintf(stderr, "Problem with index file - invalid format.\n"); | |
| 215 fclose(m_indexStream); | |
| 216 return false; | |
| 217 } | |
| 218 | |
| 219 // return success/failure of load | |
| 220 return (elementsRead == 4); | |
| 221 } | |
| 222 | |
| 223 // clear all current index offset data in memory | |
| 224 void BamStandardIndex::ClearAllData(void) { | |
| 225 BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); | |
| 226 BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); | |
| 227 for ( ; indexIter != indexEnd; ++indexIter ) { | |
| 228 const int& refId = (*indexIter).first; | |
| 229 ClearReferenceOffsets(refId); | |
| 230 } | |
| 231 } | |
| 232 | |
| 233 // clear all index offset data for desired reference | |
| 234 void BamStandardIndex::ClearReferenceOffsets(const int& refId) { | |
| 235 | |
| 236 // look up refId, skip if not found | |
| 237 BamStandardIndexData::iterator indexIter = m_indexData.find(refId); | |
| 238 if ( indexIter == m_indexData.end() ) return ; | |
| 239 | |
| 240 // clear reference data | |
| 241 ReferenceIndex& refEntry = (*indexIter).second; | |
| 242 refEntry.Bins.clear(); | |
| 243 refEntry.Offsets.clear(); | |
| 244 | |
| 245 // set flag | |
| 246 m_hasFullDataCache = false; | |
| 247 } | |
| 248 | |
| 249 // return file position after header metadata | |
| 250 const off_t BamStandardIndex::DataBeginOffset(void) const { | |
| 251 return m_dataBeginOffset; | |
| 252 } | |
| 253 | |
| 254 // calculates offset(s) for a given region | |
| 255 bool BamStandardIndex::GetOffsets(const BamRegion& region, | |
| 256 const bool isRightBoundSpecified, | |
| 257 vector<int64_t>& offsets, | |
| 258 bool* hasAlignmentsInRegion) | |
| 259 { | |
| 260 // return false if leftBound refID is not found in index data | |
| 261 if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) | |
| 262 return false; | |
| 263 | |
| 264 // load index data for region if not already cached | |
| 265 if ( !IsDataLoaded(region.LeftRefID) ) { | |
| 266 bool loadedOk = true; | |
| 267 loadedOk &= SkipToReference(region.LeftRefID); | |
| 268 loadedOk &= LoadReference(region.LeftRefID); | |
| 269 if ( !loadedOk ) return false; | |
| 270 } | |
| 271 | |
| 272 // calculate which bins overlap this region | |
| 273 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); | |
| 274 int numBins = BinsFromRegion(region, isRightBoundSpecified, bins); | |
| 275 | |
| 276 // get bins for this reference | |
| 277 BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); | |
| 278 if ( indexIter == m_indexData.end() ) return false; // error | |
| 279 const ReferenceIndex& refIndex = (*indexIter).second; | |
| 280 const BamBinMap& binMap = refIndex.Bins; | |
| 281 | |
| 282 // get minimum offset to consider | |
| 283 const LinearOffsetVector& linearOffsets = refIndex.Offsets; | |
| 284 const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) | |
| 285 ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); | |
| 286 | |
| 287 // store all alignment 'chunk' starts (file offsets) for bins in this region | |
| 288 for ( int i = 0; i < numBins; ++i ) { | |
| 289 | |
| 290 const uint16_t binKey = bins[i]; | |
| 291 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey); | |
| 292 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { | |
| 293 | |
| 294 // iterate over chunks | |
| 295 const ChunkVector& chunks = (*binIter).second; | |
| 296 std::vector<Chunk>::const_iterator chunksIter = chunks.begin(); | |
| 297 std::vector<Chunk>::const_iterator chunksEnd = chunks.end(); | |
| 298 for ( ; chunksIter != chunksEnd; ++chunksIter) { | |
| 299 | |
| 300 // if valid chunk found, store its file offset | |
| 301 const Chunk& chunk = (*chunksIter); | |
| 302 if ( chunk.Stop > minOffset ) | |
| 303 offsets.push_back( chunk.Start ); | |
| 304 } | |
| 305 } | |
| 306 } | |
| 307 | |
| 308 // clean up memory | |
| 309 free(bins); | |
| 310 | |
| 311 // sort the offsets before returning | |
| 312 sort(offsets.begin(), offsets.end()); | |
| 313 | |
| 314 // set flag & return success | |
| 315 *hasAlignmentsInRegion = (offsets.size() != 0 ); | |
| 316 | |
| 317 // if cache mode set to none, dump the data we just loaded | |
| 318 if (m_cacheMode == BamIndex::NoIndexCaching ) | |
| 319 ClearReferenceOffsets(region.LeftRefID); | |
| 320 | |
| 321 // return succes | |
| 322 return true; | |
| 323 } | |
| 324 | |
| 325 // returns whether reference has alignments or no | |
| 326 bool BamStandardIndex::HasAlignments(const int& refId) const { | |
| 327 BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); | |
| 328 if ( indexIter == m_indexData.end() ) return false; // error | |
| 329 const ReferenceIndex& refEntry = (*indexIter).second; | |
| 330 return refEntry.HasAlignments; | |
| 331 } | |
| 332 | |
| 333 // return true if all index data is cached | |
| 334 bool BamStandardIndex::HasFullDataCache(void) const { | |
| 335 return m_hasFullDataCache; | |
| 336 } | |
| 337 | |
| 338 // returns true if index cache has data for desired reference | |
| 339 bool BamStandardIndex::IsDataLoaded(const int& refId) const { | |
| 340 | |
| 341 // look up refId, return false if not found | |
| 342 BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); | |
| 343 if ( indexIter == m_indexData.end() ) return false; | |
| 344 | |
| 345 // see if reference has alignments | |
| 346 // if not, it's not a problem to have no offset data | |
| 347 const ReferenceIndex& refEntry = (*indexIter).second; | |
| 348 if ( !refEntry.HasAlignments ) return true; | |
| 349 | |
| 350 // return whether bin map contains data | |
| 351 return ( !refEntry.Bins.empty() ); | |
| 352 } | |
| 353 | |
| 354 // attempts to use index to jump to region; returns success/fail | |
| 355 bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { | |
| 356 | |
| 357 // be sure reader & BGZF file are valid & open for reading | |
| 358 if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) | |
| 359 return false; | |
| 360 | |
| 361 // make sure left-bound position is valid | |
| 362 if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) | |
| 363 return false; | |
| 364 | |
| 365 // calculate offsets for this region | |
| 366 // if failed, print message, set flag, and return failure | |
| 367 vector<int64_t> offsets; | |
| 368 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) { | |
| 369 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n"); | |
| 370 *hasAlignmentsInRegion = false; | |
| 371 return false; | |
| 372 } | |
| 373 | |
| 374 // iterate through offsets | |
| 375 BamAlignment bAlignment; | |
| 376 bool result = true; | |
| 377 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { | |
| 378 | |
| 379 // attempt seek & load first available alignment | |
| 380 // set flag to true if data exists | |
| 381 result &= m_BGZF->Seek(*o); | |
| 382 *hasAlignmentsInRegion = m_reader->GetNextAlignmentCore(bAlignment); | |
| 383 | |
| 384 // if this alignment corresponds to desired position | |
| 385 // return success of seeking back to the offset before the 'current offset' (to cover overlaps) | |
| 386 if ( ((bAlignment.RefID == region.LeftRefID) && | |
| 387 ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) || | |
| 388 (bAlignment.RefID > region.LeftRefID) ) | |
| 389 { | |
| 390 if ( o != offsets.begin() ) --o; | |
| 391 return m_BGZF->Seek(*o); | |
| 392 } | |
| 393 } | |
| 394 | |
| 395 // if error in jumping, print message & set flag | |
| 396 if ( !result ) { | |
| 397 fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n"); | |
| 398 *hasAlignmentsInRegion = false; | |
| 399 } | |
| 400 | |
| 401 // return success/failure | |
| 402 return result; | |
| 403 } | |
| 404 | |
| 405 // clears index data from all references except the first | |
| 406 void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { | |
| 407 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); | |
| 408 KeepOnlyReferenceOffsets((*indexBegin).first); | |
| 409 } | |
| 410 | |
| 411 // clears index data from all references except the one specified | |
| 412 void BamStandardIndex::KeepOnlyReferenceOffsets(const int& refId) { | |
| 413 BamStandardIndexData::iterator mapIter = m_indexData.begin(); | |
| 414 BamStandardIndexData::iterator mapEnd = m_indexData.end(); | |
| 415 for ( ; mapIter != mapEnd; ++mapIter ) { | |
| 416 const int entryRefId = (*mapIter).first; | |
| 417 if ( entryRefId != refId ) | |
| 418 ClearReferenceOffsets(entryRefId); | |
| 419 } | |
| 420 } | |
| 421 | |
| 422 bool BamStandardIndex::LoadAllReferences(bool saveData) { | |
| 423 | |
| 424 // skip if data already loaded | |
| 425 if ( m_hasFullDataCache ) return true; | |
| 426 | |
| 427 // get number of reference sequences | |
| 428 uint32_t numReferences; | |
| 429 if ( !LoadReferenceCount((int&)numReferences) ) | |
| 430 return false; | |
| 431 | |
| 432 // iterate over reference entries | |
| 433 bool loadedOk = true; | |
| 434 for ( int i = 0; i < (int)numReferences; ++i ) | |
| 435 loadedOk &= LoadReference(i, saveData); | |
| 436 | |
| 437 // set flag | |
| 438 if ( loadedOk && saveData ) | |
| 439 m_hasFullDataCache = true; | |
| 440 | |
| 441 // return success/failure of loading references | |
| 442 return loadedOk; | |
| 443 } | |
| 444 | |
| 445 // load header data from index file, return true if loaded OK | |
| 446 bool BamStandardIndex::LoadHeader(void) { | |
| 447 | |
| 448 bool loadedOk = CheckMagicNumber(); | |
| 449 | |
| 450 // store offset of beginning of data | |
| 451 m_dataBeginOffset = ftell64(m_indexStream); | |
| 452 | |
| 453 // return success/failure of load | |
| 454 return loadedOk; | |
| 455 } | |
| 456 | |
| 457 // load a single index bin entry from file, return true if loaded OK | |
| 458 // @saveData - save data in memory if true, just read & discard if false | |
| 459 bool BamStandardIndex::LoadBin(ReferenceIndex& refEntry, bool saveData) { | |
| 460 | |
| 461 size_t elementsRead = 0; | |
| 462 | |
| 463 // get bin ID | |
| 464 uint32_t binId; | |
| 465 elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream); | |
| 466 if ( m_isBigEndian ) SwapEndian_32(binId); | |
| 467 | |
| 468 // load alignment chunks for this bin | |
| 469 ChunkVector chunks; | |
| 470 bool chunksOk = LoadChunks(chunks, saveData); | |
| 471 | |
| 472 // store bin entry | |
| 473 if ( chunksOk && saveData ) | |
| 474 refEntry.Bins.insert(pair<uint32_t, ChunkVector>(binId, chunks)); | |
| 475 | |
| 476 // return success/failure of load | |
| 477 return ( (elementsRead == 1) && chunksOk ); | |
| 478 } | |
| 479 | |
| 480 bool BamStandardIndex::LoadBins(ReferenceIndex& refEntry, bool saveData) { | |
| 481 | |
| 482 size_t elementsRead = 0; | |
| 483 | |
| 484 // get number of bins | |
| 485 int32_t numBins; | |
| 486 elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream); | |
| 487 if ( m_isBigEndian ) SwapEndian_32(numBins); | |
| 488 | |
| 489 // set flag | |
| 490 refEntry.HasAlignments = ( numBins != 0 ); | |
| 491 | |
| 492 // iterate over bins | |
| 493 bool binsOk = true; | |
| 494 for ( int i = 0; i < numBins; ++i ) | |
| 495 binsOk &= LoadBin(refEntry, saveData); | |
| 496 | |
| 497 // return success/failure of load | |
| 498 return ( (elementsRead == 1) && binsOk ); | |
| 499 } | |
| 500 | |
| 501 // load a single index bin entry from file, return true if loaded OK | |
| 502 // @saveData - save data in memory if true, just read & discard if false | |
| 503 bool BamStandardIndex::LoadChunk(ChunkVector& chunks, bool saveData) { | |
| 504 | |
| 505 size_t elementsRead = 0; | |
| 506 | |
| 507 // read in chunk data | |
| 508 uint64_t start; | |
| 509 uint64_t stop; | |
| 510 elementsRead += fread(&start, sizeof(start), 1, m_indexStream); | |
| 511 elementsRead += fread(&stop, sizeof(stop), 1, m_indexStream); | |
| 512 | |
| 513 // swap endian-ness if necessary | |
| 514 if ( m_isBigEndian ) { | |
| 515 SwapEndian_64(start); | |
| 516 SwapEndian_64(stop); | |
| 517 } | |
| 518 | |
| 519 // save data if requested | |
| 520 if ( saveData ) chunks.push_back( Chunk(start, stop) ); | |
| 521 | |
| 522 // return success/failure of load | |
| 523 return ( elementsRead == 2 ); | |
| 524 } | |
| 525 | |
| 526 bool BamStandardIndex::LoadChunks(ChunkVector& chunks, bool saveData) { | |
| 527 | |
| 528 size_t elementsRead = 0; | |
| 529 | |
| 530 // read in number of chunks | |
| 531 uint32_t numChunks; | |
| 532 elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_indexStream); | |
| 533 if ( m_isBigEndian ) SwapEndian_32(numChunks); | |
| 534 | |
| 535 // initialize space for chunks if we're storing this data | |
| 536 if ( saveData ) chunks.reserve(numChunks); | |
| 537 | |
| 538 // iterate over chunks | |
| 539 bool chunksOk = true; | |
| 540 for ( int i = 0; i < (int)numChunks; ++i ) | |
| 541 chunksOk &= LoadChunk(chunks, saveData); | |
| 542 | |
| 543 // sort chunk vector | |
| 544 sort( chunks.begin(), chunks.end(), ChunkLessThan ); | |
| 545 | |
| 546 // return success/failure of load | |
| 547 return ( (elementsRead == 1) && chunksOk ); | |
| 548 } | |
| 549 | |
| 550 // load a single index linear offset entry from file, return true if loaded OK | |
| 551 // @saveData - save data in memory if true, just read & discard if false | |
| 552 bool BamStandardIndex::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) { | |
| 553 | |
| 554 size_t elementsRead = 0; | |
| 555 | |
| 556 // read in number of linear offsets | |
| 557 int32_t numLinearOffsets; | |
| 558 elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream); | |
| 559 if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); | |
| 560 | |
| 561 // set up destination vector (if we're saving the data) | |
| 562 LinearOffsetVector linearOffsets; | |
| 563 if ( saveData ) linearOffsets.reserve(numLinearOffsets); | |
| 564 | |
| 565 // iterate over linear offsets | |
| 566 uint64_t linearOffset; | |
| 567 for ( int i = 0; i < numLinearOffsets; ++i ) { | |
| 568 elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream); | |
| 569 if ( m_isBigEndian ) SwapEndian_64(linearOffset); | |
| 570 if ( saveData ) linearOffsets.push_back(linearOffset); | |
| 571 } | |
| 572 | |
| 573 // sort linear offsets | |
| 574 sort ( linearOffsets.begin(), linearOffsets.end() ); | |
| 575 | |
| 576 // save in reference index entry if desired | |
| 577 if ( saveData ) refEntry.Offsets = linearOffsets; | |
| 578 | |
| 579 // return success/failure of load | |
| 580 return ( elementsRead == (size_t)(numLinearOffsets + 1) ); | |
| 581 } | |
| 582 | |
| 583 bool BamStandardIndex::LoadFirstReference(bool saveData) { | |
| 584 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); | |
| 585 return LoadReference((*indexBegin).first, saveData); | |
| 586 } | |
| 587 | |
| 588 // load a single reference from file, return true if loaded OK | |
| 589 // @saveData - save data in memory if true, just read & discard if false | |
| 590 bool BamStandardIndex::LoadReference(const int& refId, bool saveData) { | |
| 591 | |
| 592 // look up refId | |
| 593 BamStandardIndexData::iterator indexIter = m_indexData.find(refId); | |
| 594 | |
| 595 // if reference not previously loaded, create new entry | |
| 596 if ( indexIter == m_indexData.end() ) { | |
| 597 ReferenceIndex newEntry; | |
| 598 newEntry.HasAlignments = false; | |
| 599 m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, newEntry) ); | |
| 600 } | |
| 601 | |
| 602 // load reference data | |
| 603 indexIter = m_indexData.find(refId); | |
| 604 ReferenceIndex& entry = (*indexIter).second; | |
| 605 bool loadedOk = true; | |
| 606 loadedOk &= LoadBins(entry, saveData); | |
| 607 loadedOk &= LoadLinearOffsets(entry, saveData); | |
| 608 return loadedOk; | |
| 609 } | |
| 610 | |
| 611 // loads number of references, return true if loaded OK | |
| 612 bool BamStandardIndex::LoadReferenceCount(int& numReferences) { | |
| 613 | |
| 614 size_t elementsRead = 0; | |
| 615 | |
| 616 // read reference count | |
| 617 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); | |
| 618 if ( m_isBigEndian ) SwapEndian_32(numReferences); | |
| 619 | |
| 620 // return success/failure of load | |
| 621 return ( elementsRead == 1 ); | |
| 622 } | |
| 623 | |
| 624 // merges 'alignment chunks' in BAM bin (used for index building) | |
| 625 void BamStandardIndex::MergeChunks(void) { | |
| 626 | |
| 627 // iterate over reference enties | |
| 628 BamStandardIndexData::iterator indexIter = m_indexData.begin(); | |
| 629 BamStandardIndexData::iterator indexEnd = m_indexData.end(); | |
| 630 for ( ; indexIter != indexEnd; ++indexIter ) { | |
| 631 | |
| 632 // get BAM bin map for this reference | |
| 633 ReferenceIndex& refIndex = (*indexIter).second; | |
| 634 BamBinMap& bamBinMap = refIndex.Bins; | |
| 635 | |
| 636 // iterate over BAM bins | |
| 637 BamBinMap::iterator binIter = bamBinMap.begin(); | |
| 638 BamBinMap::iterator binEnd = bamBinMap.end(); | |
| 639 for ( ; binIter != binEnd; ++binIter ) { | |
| 640 | |
| 641 // get chunk vector for this bin | |
| 642 ChunkVector& binChunks = (*binIter).second; | |
| 643 if ( binChunks.size() == 0 ) continue; | |
| 644 | |
| 645 ChunkVector mergedChunks; | |
| 646 mergedChunks.push_back( binChunks[0] ); | |
| 647 | |
| 648 // iterate over chunks | |
| 649 int i = 0; | |
| 650 ChunkVector::iterator chunkIter = binChunks.begin(); | |
| 651 ChunkVector::iterator chunkEnd = binChunks.end(); | |
| 652 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { | |
| 653 | |
| 654 // get 'currentChunk' based on numeric index | |
| 655 Chunk& currentChunk = mergedChunks[i]; | |
| 656 | |
| 657 // get iteratorChunk based on vector iterator | |
| 658 Chunk& iteratorChunk = (*chunkIter); | |
| 659 | |
| 660 // if chunk ends where (iterator) chunk starts, then merge | |
| 661 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) | |
| 662 currentChunk.Stop = iteratorChunk.Stop; | |
| 663 | |
| 664 // otherwise | |
| 665 else { | |
| 666 // set currentChunk + 1 to iteratorChunk | |
| 667 mergedChunks.push_back(iteratorChunk); | |
| 668 ++i; | |
| 669 } | |
| 670 } | |
| 671 | |
| 672 // saved merged chunk vector | |
| 673 (*binIter).second = mergedChunks; | |
| 674 } | |
| 675 } | |
| 676 } | |
| 677 | |
| 678 // saves BAM bin entry for index | |
| 679 void BamStandardIndex::SaveBinEntry(BamBinMap& binMap, | |
| 680 const uint32_t& saveBin, | |
| 681 const uint64_t& saveOffset, | |
| 682 const uint64_t& lastOffset) | |
| 683 { | |
| 684 // look up saveBin | |
| 685 BamBinMap::iterator binIter = binMap.find(saveBin); | |
| 686 | |
| 687 // create new chunk | |
| 688 Chunk newChunk(saveOffset, lastOffset); | |
| 689 | |
| 690 // if entry doesn't exist | |
| 691 if ( binIter == binMap.end() ) { | |
| 692 ChunkVector newChunks; | |
| 693 newChunks.push_back(newChunk); | |
| 694 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks)); | |
| 695 } | |
| 696 | |
| 697 // otherwise | |
| 698 else { | |
| 699 ChunkVector& binChunks = (*binIter).second; | |
| 700 binChunks.push_back( newChunk ); | |
| 701 } | |
| 702 } | |
| 703 | |
| 704 // saves linear offset entry for index | |
| 705 void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets, | |
| 706 const BamAlignment& bAlignment, | |
| 707 const uint64_t& lastOffset) | |
| 708 { | |
| 709 // get converted offsets | |
| 710 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; | |
| 711 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; | |
| 712 | |
| 713 // resize vector if necessary | |
| 714 int oldSize = offsets.size(); | |
| 715 int newSize = endOffset + 1; | |
| 716 if ( oldSize < newSize ) | |
| 717 offsets.resize(newSize, 0); | |
| 718 | |
| 719 // store offset | |
| 720 for( int i = beginOffset + 1; i <= endOffset; ++i ) { | |
| 721 if ( offsets[i] == 0 ) | |
| 722 offsets[i] = lastOffset; | |
| 723 } | |
| 724 } | |
| 725 | |
| 726 // initializes index data structure to hold @count references | |
| 727 void BamStandardIndex::SetReferenceCount(const int& count) { | |
| 728 for ( int i = 0; i < count; ++i ) | |
| 729 m_indexData[i].HasAlignments = false; | |
| 730 } | |
| 731 | |
| 732 bool BamStandardIndex::SkipToFirstReference(void) { | |
| 733 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); | |
| 734 return SkipToReference( (*indexBegin).first ); | |
| 735 } | |
| 736 | |
| 737 // position file pointer to desired reference begin, return true if skipped OK | |
| 738 bool BamStandardIndex::SkipToReference(const int& refId) { | |
| 739 | |
| 740 // attempt rewind | |
| 741 if ( !Rewind() ) return false; | |
| 742 | |
| 743 // read in number of references | |
| 744 uint32_t numReferences; | |
| 745 size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); | |
| 746 if ( elementsRead != 1 ) return false; | |
| 747 if ( m_isBigEndian ) SwapEndian_32(numReferences); | |
| 748 | |
| 749 // iterate over reference entries | |
| 750 bool skippedOk = true; | |
| 751 int currentRefId = 0; | |
| 752 while (currentRefId != refId) { | |
| 753 skippedOk &= LoadReference(currentRefId, false); | |
| 754 ++currentRefId; | |
| 755 } | |
| 756 | |
| 757 // return success | |
| 758 return skippedOk; | |
| 759 } | |
| 760 | |
| 761 // write header to new index file | |
| 762 bool BamStandardIndex::WriteHeader(void) { | |
| 763 | |
| 764 size_t elementsWritten = 0; | |
| 765 | |
| 766 // write magic number | |
| 767 elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_indexStream); | |
| 768 | |
| 769 // store offset of beginning of data | |
| 770 m_dataBeginOffset = ftell64(m_indexStream); | |
| 771 | |
| 772 // return success/failure of write | |
| 773 return (elementsWritten == 4); | |
| 774 } | |
| 775 | |
| 776 // write index data for all references to new index file | |
| 777 bool BamStandardIndex::WriteAllReferences(void) { | |
| 778 | |
| 779 size_t elementsWritten = 0; | |
| 780 | |
| 781 // write number of reference sequences | |
| 782 int32_t numReferenceSeqs = m_indexData.size(); | |
| 783 if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs); | |
| 784 elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_indexStream); | |
| 785 | |
| 786 // iterate over reference sequences | |
| 787 bool refsOk = true; | |
| 788 BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); | |
| 789 BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); | |
| 790 for ( ; indexIter != indexEnd; ++ indexIter ) | |
| 791 refsOk &= WriteReference( (*indexIter).second ); | |
| 792 | |
| 793 // return success/failure of write | |
| 794 return ( (elementsWritten == 1) && refsOk ); | |
| 795 } | |
| 796 | |
| 797 // write index data for bin to new index file | |
| 798 bool BamStandardIndex::WriteBin(const uint32_t& binId, const ChunkVector& chunks) { | |
| 799 | |
| 800 size_t elementsWritten = 0; | |
| 801 | |
| 802 // write BAM bin ID | |
| 803 uint32_t binKey = binId; | |
| 804 if ( m_isBigEndian ) SwapEndian_32(binKey); | |
| 805 elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream); | |
| 806 | |
| 807 // write chunks | |
| 808 bool chunksOk = WriteChunks(chunks); | |
| 809 | |
| 810 // return success/failure of write | |
| 811 return ( (elementsWritten == 1) && chunksOk ); | |
| 812 } | |
| 813 | |
| 814 // write index data for bins to new index file | |
| 815 bool BamStandardIndex::WriteBins(const BamBinMap& bins) { | |
| 816 | |
| 817 size_t elementsWritten = 0; | |
| 818 | |
| 819 // write number of bins | |
| 820 int32_t binCount = bins.size(); | |
| 821 if ( m_isBigEndian ) SwapEndian_32(binCount); | |
| 822 elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream); | |
| 823 | |
| 824 // iterate over bins | |
| 825 bool binsOk = true; | |
| 826 BamBinMap::const_iterator binIter = bins.begin(); | |
| 827 BamBinMap::const_iterator binEnd = bins.end(); | |
| 828 for ( ; binIter != binEnd; ++binIter ) | |
| 829 binsOk &= WriteBin( (*binIter).first, (*binIter).second ); | |
| 830 | |
| 831 // return success/failure of write | |
| 832 return ( (elementsWritten == 1) && binsOk ); | |
| 833 } | |
| 834 | |
| 835 // write index data for chunk entry to new index file | |
| 836 bool BamStandardIndex::WriteChunk(const Chunk& chunk) { | |
| 837 | |
| 838 size_t elementsWritten = 0; | |
| 839 | |
| 840 // localize alignment chunk offsets | |
| 841 uint64_t start = chunk.Start; | |
| 842 uint64_t stop = chunk.Stop; | |
| 843 | |
| 844 // swap endian-ness if necessary | |
| 845 if ( m_isBigEndian ) { | |
| 846 SwapEndian_64(start); | |
| 847 SwapEndian_64(stop); | |
| 848 } | |
| 849 | |
| 850 // write to index file | |
| 851 elementsWritten += fwrite(&start, sizeof(start), 1, m_indexStream); | |
| 852 elementsWritten += fwrite(&stop, sizeof(stop), 1, m_indexStream); | |
| 853 | |
| 854 // return success/failure of write | |
| 855 return ( elementsWritten == 2 ); | |
| 856 } | |
| 857 | |
| 858 // write index data for chunk entry to new index file | |
| 859 bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) { | |
| 860 | |
| 861 size_t elementsWritten = 0; | |
| 862 | |
| 863 // write chunks | |
| 864 int32_t chunkCount = chunks.size(); | |
| 865 if ( m_isBigEndian ) SwapEndian_32(chunkCount); | |
| 866 elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_indexStream); | |
| 867 | |
| 868 // iterate over chunks | |
| 869 bool chunksOk = true; | |
| 870 ChunkVector::const_iterator chunkIter = chunks.begin(); | |
| 871 ChunkVector::const_iterator chunkEnd = chunks.end(); | |
| 872 for ( ; chunkIter != chunkEnd; ++chunkIter ) | |
| 873 chunksOk &= WriteChunk( (*chunkIter) ); | |
| 874 | |
| 875 // return success/failure of write | |
| 876 return ( (elementsWritten == 1) && chunksOk ); | |
| 877 } | |
| 878 | |
| 879 // write index data for linear offsets entry to new index file | |
| 880 bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) { | |
| 881 | |
| 882 size_t elementsWritten = 0; | |
| 883 | |
| 884 // write number of linear offsets | |
| 885 int32_t offsetCount = offsets.size(); | |
| 886 if ( m_isBigEndian ) SwapEndian_32(offsetCount); | |
| 887 elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_indexStream); | |
| 888 | |
| 889 // iterate over linear offsets | |
| 890 LinearOffsetVector::const_iterator offsetIter = offsets.begin(); | |
| 891 LinearOffsetVector::const_iterator offsetEnd = offsets.end(); | |
| 892 for ( ; offsetIter != offsetEnd; ++offsetIter ) { | |
| 893 | |
| 894 // write linear offset | |
| 895 uint64_t linearOffset = (*offsetIter); | |
| 896 if ( m_isBigEndian ) SwapEndian_64(linearOffset); | |
| 897 elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_indexStream); | |
| 898 } | |
| 899 | |
| 900 // return success/failure of write | |
| 901 return ( elementsWritten == (size_t)(offsetCount + 1) ); | |
| 902 } | |
| 903 | |
| 904 // write index data for a single reference to new index file | |
| 905 bool BamStandardIndex::WriteReference(const ReferenceIndex& refEntry) { | |
| 906 bool refOk = true; | |
| 907 refOk &= WriteBins(refEntry.Bins); | |
| 908 refOk &= WriteLinearOffsets(refEntry.Offsets); | |
| 909 return refOk; | |
| 910 } |
