Mercurial > repos > zzhou > spp_phantompeak
comparison spp/src/BamReader_p.cpp @ 6:ce08b0efa3fd draft
Uploaded
author | zzhou |
---|---|
date | Tue, 27 Nov 2012 16:11:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:608a8e0eac56 | 6:ce08b0efa3fd |
---|---|
1 // *************************************************************************** | |
2 // BamReader_p.cpp (c) 2009 Derek Barnett | |
3 // Marth Lab, Department of Biology, Boston College | |
4 // All rights reserved. | |
5 // --------------------------------------------------------------------------- | |
6 // Last modified: 22 November 2010 (DB) | |
7 // --------------------------------------------------------------------------- | |
8 // Provides the basic functionality for reading BAM files | |
9 // *************************************************************************** | |
10 | |
11 #include <BamReader.h> | |
12 #include <BGZF.h> | |
13 #include <BamReader_p.h> | |
14 #include <BamStandardIndex_p.h> | |
15 #include <BamToolsIndex_p.h> | |
16 using namespace BamTools; | |
17 using namespace BamTools::Internal; | |
18 | |
19 #include <algorithm> | |
20 #include <iostream> | |
21 #include <iterator> | |
22 #include <vector> | |
23 using namespace std; | |
24 | |
25 // constructor | |
26 BamReaderPrivate::BamReaderPrivate(BamReader* parent) | |
27 : HeaderText("") | |
28 , Index(0) | |
29 , HasIndex(false) | |
30 , AlignmentsBeginOffset(0) | |
31 // , m_header(0) | |
32 , IndexCacheMode(BamIndex::LimitedIndexCaching) | |
33 , HasAlignmentsInRegion(true) | |
34 , Parent(parent) | |
35 , DNA_LOOKUP("=ACMGRSVTWYHKDBN") | |
36 , CIGAR_LOOKUP("MIDNSHP") | |
37 { | |
38 IsBigEndian = SystemIsBigEndian(); | |
39 } | |
40 | |
41 // destructor | |
42 BamReaderPrivate::~BamReaderPrivate(void) { | |
43 Close(); | |
44 } | |
45 | |
46 // adjusts requested region if necessary (depending on where data actually begins) | |
47 void BamReaderPrivate::AdjustRegion(BamRegion& region) { | |
48 | |
49 // check for valid index first | |
50 if ( Index == 0 ) return; | |
51 | |
52 // see if any references in region have alignments | |
53 HasAlignmentsInRegion = false; | |
54 int currentId = region.LeftRefID; | |
55 | |
56 const int rightBoundRefId = ( region.isRightBoundSpecified() ? region.RightRefID : References.size() - 1 ); | |
57 while ( currentId <= rightBoundRefId ) { | |
58 HasAlignmentsInRegion = Index->HasAlignments(currentId); | |
59 if ( HasAlignmentsInRegion ) break; | |
60 ++currentId; | |
61 } | |
62 | |
63 // if no data found on any reference in region | |
64 if ( !HasAlignmentsInRegion ) return; | |
65 | |
66 // if left bound of desired region had no data, use first reference that had data | |
67 // otherwise, leave requested region as-is | |
68 if ( currentId != region.LeftRefID ) { | |
69 region.LeftRefID = currentId; | |
70 region.LeftPosition = 0; | |
71 } | |
72 } | |
73 | |
74 // fills out character data for BamAlignment data | |
75 bool BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { | |
76 | |
77 // calculate character lengths/offsets | |
78 const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; | |
79 const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); | |
80 const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; | |
81 const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; | |
82 const unsigned int tagDataLength = dataLength - tagDataOffset; | |
83 | |
84 // check offsets to see what char data exists | |
85 const bool hasSeqData = ( seqDataOffset < dataLength ); | |
86 const bool hasQualData = ( qualDataOffset < dataLength ); | |
87 const bool hasTagData = ( tagDataOffset < dataLength ); | |
88 | |
89 // set up char buffers | |
90 const char* allCharData = bAlignment.SupportData.AllCharData.data(); | |
91 const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 ); | |
92 const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 ); | |
93 char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 ); | |
94 | |
95 // store alignment name (relies on null char in name as terminator) | |
96 bAlignment.Name.assign((const char*)(allCharData)); | |
97 | |
98 // save query sequence | |
99 bAlignment.QueryBases.clear(); | |
100 if ( hasSeqData ) { | |
101 bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength); | |
102 for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { | |
103 char singleBase = DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; | |
104 bAlignment.QueryBases.append(1, singleBase); | |
105 } | |
106 } | |
107 | |
108 // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character | |
109 bAlignment.Qualities.clear(); | |
110 if ( hasQualData ) { | |
111 bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength); | |
112 for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { | |
113 char singleQuality = (char)(qualData[i]+33); | |
114 bAlignment.Qualities.append(1, singleQuality); | |
115 } | |
116 } | |
117 | |
118 // if QueryBases is empty (and this is a allowed case) | |
119 if ( bAlignment.QueryBases.empty() ) | |
120 bAlignment.AlignedBases = bAlignment.QueryBases; | |
121 | |
122 // if QueryBases contains data, then build AlignedBases using CIGAR data | |
123 else { | |
124 | |
125 // resize AlignedBases | |
126 bAlignment.AlignedBases.clear(); | |
127 bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength); | |
128 | |
129 // iterate over CigarOps | |
130 int k = 0; | |
131 vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin(); | |
132 vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end(); | |
133 for ( ; cigarIter != cigarEnd; ++cigarIter ) { | |
134 | |
135 const CigarOp& op = (*cigarIter); | |
136 switch(op.Type) { | |
137 | |
138 case ('M') : | |
139 case ('I') : | |
140 bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases | |
141 // fall through | |
142 | |
143 case ('S') : | |
144 k += op.Length; // for 'S' - soft clip, skip over query bases | |
145 break; | |
146 | |
147 case ('D') : | |
148 bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character | |
149 break; | |
150 | |
151 case ('P') : | |
152 bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character | |
153 break; | |
154 | |
155 case ('N') : | |
156 bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence | |
157 break; | |
158 | |
159 case ('H') : | |
160 break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op | |
161 | |
162 default: | |
163 fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here | |
164 exit(1); | |
165 } | |
166 } | |
167 } | |
168 | |
169 // save tag data | |
170 bAlignment.TagData.clear(); | |
171 if ( hasTagData ) { | |
172 if ( IsBigEndian ) { | |
173 int i = 0; | |
174 while ( (unsigned int)i < tagDataLength ) { | |
175 | |
176 i += 2; // skip tag type (e.g. "RG", "NM", etc) | |
177 uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning | |
178 ++i; // skip value type | |
179 | |
180 switch (type) { | |
181 | |
182 case('A') : | |
183 case('C') : | |
184 ++i; | |
185 break; | |
186 | |
187 case('S') : | |
188 SwapEndian_16p(&tagData[i]); | |
189 i += sizeof(uint16_t); | |
190 break; | |
191 | |
192 case('F') : | |
193 case('I') : | |
194 SwapEndian_32p(&tagData[i]); | |
195 i += sizeof(uint32_t); | |
196 break; | |
197 | |
198 case('D') : | |
199 SwapEndian_64p(&tagData[i]); | |
200 i += sizeof(uint64_t); | |
201 break; | |
202 | |
203 case('H') : | |
204 case('Z') : | |
205 while (tagData[i]) { ++i; } | |
206 ++i; // increment one more for null terminator | |
207 break; | |
208 | |
209 default : | |
210 fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here | |
211 exit(1); | |
212 } | |
213 } | |
214 } | |
215 | |
216 // store tagData in alignment | |
217 bAlignment.TagData.resize(tagDataLength); | |
218 memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength); | |
219 } | |
220 | |
221 // clear the core-only flag | |
222 bAlignment.SupportData.HasCoreOnly = false; | |
223 | |
224 // return success | |
225 return true; | |
226 } | |
227 | |
228 // clear index data structure | |
229 void BamReaderPrivate::ClearIndex(void) { | |
230 delete Index; | |
231 Index = 0; | |
232 HasIndex = false; | |
233 } | |
234 | |
235 // closes the BAM file | |
236 void BamReaderPrivate::Close(void) { | |
237 | |
238 // close BGZF file stream | |
239 mBGZF.Close(); | |
240 | |
241 // clear out index data | |
242 ClearIndex(); | |
243 | |
244 // clear out header data | |
245 HeaderText.clear(); | |
246 // if ( m_header ) { | |
247 // delete m_header; | |
248 // m_header = 0; | |
249 // } | |
250 | |
251 // clear out region flags | |
252 Region.clear(); | |
253 } | |
254 | |
255 // creates index for BAM file, saves to file | |
256 // default behavior is to create the BAM standard index (".bai") | |
257 // set flag to false to create the BamTools-specific index (".bti") | |
258 bool BamReaderPrivate::CreateIndex(bool useStandardIndex) { | |
259 | |
260 // clear out prior index data | |
261 ClearIndex(); | |
262 | |
263 // create index based on type requested | |
264 if ( useStandardIndex ) | |
265 Index = new BamStandardIndex(&mBGZF, Parent); | |
266 else | |
267 Index = new BamToolsIndex(&mBGZF, Parent); | |
268 | |
269 // set index cache mode to full for writing | |
270 Index->SetCacheMode(BamIndex::FullIndexCaching); | |
271 | |
272 // build new index | |
273 bool ok = true; | |
274 ok &= Index->Build(); | |
275 HasIndex = ok; | |
276 | |
277 // mark empty references | |
278 MarkReferences(); | |
279 | |
280 // attempt to save index data to file | |
281 ok &= Index->Write(Filename); | |
282 | |
283 // set client's desired index cache mode | |
284 Index->SetCacheMode(IndexCacheMode); | |
285 | |
286 // return success/fail of both building & writing index | |
287 return ok; | |
288 } | |
289 | |
290 const string BamReaderPrivate::GetHeaderText(void) const { | |
291 | |
292 return HeaderText; | |
293 | |
294 // if ( m_header ) | |
295 // return m_header->Text(); | |
296 // else | |
297 // return string(""); | |
298 } | |
299 | |
300 // get next alignment (from specified region, if given) | |
301 bool BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { | |
302 | |
303 // if valid alignment found, attempt to parse char data, and return success/failure | |
304 if ( GetNextAlignmentCore(bAlignment) ) | |
305 return BuildCharData(bAlignment); | |
306 | |
307 // no valid alignment found | |
308 else return false; | |
309 } | |
310 | |
311 // retrieves next available alignment core data (returns success/fail) | |
312 // ** DOES NOT parse any character data (read name, bases, qualities, tag data) | |
313 // these can be accessed, if necessary, from the supportData | |
314 // useful for operations requiring ONLY positional or other alignment-related information | |
315 bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) { | |
316 | |
317 // if region is set but has no alignments | |
318 if ( !Region.isNull() && !HasAlignmentsInRegion ) | |
319 return false; | |
320 | |
321 // if valid alignment available | |
322 if ( LoadNextAlignment(bAlignment) ) { | |
323 | |
324 // set core-only flag | |
325 bAlignment.SupportData.HasCoreOnly = true; | |
326 | |
327 // if region not specified with at least a left boundary, return success | |
328 if ( !Region.isLeftBoundSpecified() ) return true; | |
329 | |
330 // determine region state (before, within, after) | |
331 BamReaderPrivate::RegionState state = IsOverlap(bAlignment); | |
332 | |
333 // if alignment lies after region, return false | |
334 if ( state == AFTER_REGION ) return false; | |
335 | |
336 while ( state != WITHIN_REGION ) { | |
337 // if no valid alignment available (likely EOF) return failure | |
338 if ( !LoadNextAlignment(bAlignment) ) return false; | |
339 // if alignment lies after region, return false (no available read within region) | |
340 state = IsOverlap(bAlignment); | |
341 if ( state == AFTER_REGION ) return false; | |
342 } | |
343 | |
344 // return success (alignment found that overlaps region) | |
345 return true; | |
346 } | |
347 | |
348 // no valid alignment | |
349 else return false; | |
350 } | |
351 | |
352 // returns RefID for given RefName (returns References.size() if not found) | |
353 int BamReaderPrivate::GetReferenceID(const string& refName) const { | |
354 | |
355 // retrieve names from reference data | |
356 vector<string> refNames; | |
357 RefVector::const_iterator refIter = References.begin(); | |
358 RefVector::const_iterator refEnd = References.end(); | |
359 for ( ; refIter != refEnd; ++refIter) | |
360 refNames.push_back( (*refIter).RefName ); | |
361 | |
362 // return 'index-of' refName ( if not found, returns refNames.size() ) | |
363 return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); | |
364 } | |
365 | |
366 // returns region state - whether alignment ends before, overlaps, or starts after currently specified region | |
367 // this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true | |
368 BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { | |
369 | |
370 // if alignment is on any reference sequence before left bound | |
371 if ( bAlignment.RefID < Region.LeftRefID ) return BEFORE_REGION; | |
372 | |
373 // if alignment starts on left bound reference | |
374 else if ( bAlignment.RefID == Region.LeftRefID ) { | |
375 | |
376 // if alignment starts at or after left boundary | |
377 if ( bAlignment.Position >= Region.LeftPosition) { | |
378 | |
379 // if right boundary is specified AND | |
380 // left/right boundaries are on same reference AND | |
381 // alignment starts past right boundary | |
382 if ( Region.isRightBoundSpecified() && | |
383 Region.LeftRefID == Region.RightRefID && | |
384 bAlignment.Position > Region.RightPosition ) | |
385 return AFTER_REGION; | |
386 | |
387 // otherwise, alignment is within region | |
388 return WITHIN_REGION; | |
389 } | |
390 | |
391 // alignment starts before left boundary | |
392 else { | |
393 // check if alignment overlaps left boundary | |
394 if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) return WITHIN_REGION; | |
395 else return BEFORE_REGION; | |
396 } | |
397 } | |
398 | |
399 // alignment starts on a reference after the left bound | |
400 else { | |
401 | |
402 // if region has a right boundary | |
403 if ( Region.isRightBoundSpecified() ) { | |
404 | |
405 // alignment is on reference between boundaries | |
406 if ( bAlignment.RefID < Region.RightRefID ) return WITHIN_REGION; | |
407 | |
408 // alignment is on reference after right boundary | |
409 else if ( bAlignment.RefID > Region.RightRefID ) return AFTER_REGION; | |
410 | |
411 // alignment is on right bound reference | |
412 else { | |
413 // check if alignment starts before or at right boundary | |
414 if ( bAlignment.Position <= Region.RightPosition ) return WITHIN_REGION; | |
415 else return AFTER_REGION; | |
416 } | |
417 } | |
418 | |
419 // otherwise, alignment is after left bound reference, but there is no right boundary | |
420 else return WITHIN_REGION; | |
421 } | |
422 } | |
423 | |
424 // load BAM header data | |
425 void BamReaderPrivate::LoadHeaderData(void) { | |
426 | |
427 // m_header = new BamHeader(&mBGZF); | |
428 // bool headerLoadedOk = m_header->Load(); | |
429 // if ( !headerLoadedOk ) | |
430 // cerr << "BamReader could not load header" << endl; | |
431 | |
432 // check to see if proper BAM header | |
433 char buffer[4]; | |
434 if (mBGZF.Read(buffer, 4) != 4) { | |
435 fprintf(stderr, "Could not read header type\n"); | |
436 exit(1); | |
437 } | |
438 | |
439 if (strncmp(buffer, "BAM\001", 4)) { | |
440 fprintf(stderr, "wrong header type!\n"); | |
441 exit(1); | |
442 } | |
443 | |
444 // get BAM header text length | |
445 mBGZF.Read(buffer, 4); | |
446 unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); | |
447 if ( IsBigEndian ) SwapEndian_32(headerTextLength); | |
448 | |
449 // get BAM header text | |
450 char* headerText = (char*)calloc(headerTextLength + 1, 1); | |
451 mBGZF.Read(headerText, headerTextLength); | |
452 HeaderText = (string)((const char*)headerText); | |
453 | |
454 // clean up calloc-ed temp variable | |
455 free(headerText); | |
456 } | |
457 | |
458 // load existing index data from BAM index file (".bti" OR ".bai"), return success/fail | |
459 bool BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) { | |
460 | |
461 // clear out any existing index data | |
462 ClearIndex(); | |
463 | |
464 // if no index filename provided, so we need to look for available index files | |
465 if ( IndexFilename.empty() ) { | |
466 | |
467 // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag | |
468 const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS); | |
469 Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, type); | |
470 | |
471 // if null, return failure | |
472 if ( Index == 0 ) return false; | |
473 | |
474 // generate proper IndexFilename based on type of index created | |
475 IndexFilename = Filename + Index->Extension(); | |
476 } | |
477 | |
478 else { | |
479 | |
480 // attempt to load BamIndex based on IndexFilename provided by client | |
481 Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent); | |
482 | |
483 // if null, return failure | |
484 if ( Index == 0 ) return false; | |
485 } | |
486 | |
487 // set cache mode for BamIndex | |
488 Index->SetCacheMode(IndexCacheMode); | |
489 | |
490 // loading the index data from file | |
491 HasIndex = Index->Load(IndexFilename); | |
492 | |
493 // mark empty references | |
494 MarkReferences(); | |
495 | |
496 // return index status | |
497 return HasIndex; | |
498 } | |
499 | |
500 // populates BamAlignment with alignment data under file pointer, returns success/fail | |
501 bool BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { | |
502 | |
503 // read in the 'block length' value, make sure it's not zero | |
504 char buffer[4]; | |
505 mBGZF.Read(buffer, 4); | |
506 bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer); | |
507 if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); } | |
508 if ( bAlignment.SupportData.BlockLength == 0 ) return false; | |
509 | |
510 // read in core alignment data, make sure the right size of data was read | |
511 char x[BAM_CORE_SIZE]; | |
512 if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false; | |
513 | |
514 if ( IsBigEndian ) { | |
515 for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) | |
516 SwapEndian_32p(&x[i]); | |
517 } | |
518 | |
519 // set BamAlignment 'core' and 'support' data | |
520 bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); | |
521 bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); | |
522 | |
523 unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]); | |
524 bAlignment.Bin = tempValue >> 16; | |
525 bAlignment.MapQuality = tempValue >> 8 & 0xff; | |
526 bAlignment.SupportData.QueryNameLength = tempValue & 0xff; | |
527 | |
528 tempValue = BgzfData::UnpackUnsignedInt(&x[12]); | |
529 bAlignment.AlignmentFlag = tempValue >> 16; | |
530 bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff; | |
531 | |
532 bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); | |
533 bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); | |
534 bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); | |
535 bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); | |
536 | |
537 // set BamAlignment length | |
538 bAlignment.Length = bAlignment.SupportData.QuerySequenceLength; | |
539 | |
540 // read in character data - make sure proper data size was read | |
541 bool readCharDataOK = false; | |
542 const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; | |
543 char* allCharData = (char*)calloc(sizeof(char), dataLength); | |
544 | |
545 if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { | |
546 | |
547 // store 'allCharData' in supportData structure | |
548 bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); | |
549 | |
550 // set success flag | |
551 readCharDataOK = true; | |
552 | |
553 // save CIGAR ops | |
554 // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly, | |
555 // even when GetNextAlignmentCore() is called | |
556 const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; | |
557 uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); | |
558 CigarOp op; | |
559 bAlignment.CigarData.clear(); | |
560 bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); | |
561 for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) { | |
562 | |
563 // swap if necessary | |
564 if ( IsBigEndian ) SwapEndian_32(cigarData[i]); | |
565 | |
566 // build CigarOp structure | |
567 op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); | |
568 op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; | |
569 | |
570 // save CigarOp | |
571 bAlignment.CigarData.push_back(op); | |
572 } | |
573 } | |
574 | |
575 free(allCharData); | |
576 return readCharDataOK; | |
577 } | |
578 | |
579 // loads reference data from BAM file | |
580 void BamReaderPrivate::LoadReferenceData(void) { | |
581 | |
582 // get number of reference sequences | |
583 char buffer[4]; | |
584 mBGZF.Read(buffer, 4); | |
585 unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); | |
586 if ( IsBigEndian ) SwapEndian_32(numberRefSeqs); | |
587 if ( numberRefSeqs == 0 ) return; | |
588 References.reserve((int)numberRefSeqs); | |
589 | |
590 // iterate over all references in header | |
591 for (unsigned int i = 0; i != numberRefSeqs; ++i) { | |
592 | |
593 // get length of reference name | |
594 mBGZF.Read(buffer, 4); | |
595 unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); | |
596 if ( IsBigEndian ) SwapEndian_32(refNameLength); | |
597 char* refName = (char*)calloc(refNameLength, 1); | |
598 | |
599 // get reference name and reference sequence length | |
600 mBGZF.Read(refName, refNameLength); | |
601 mBGZF.Read(buffer, 4); | |
602 int refLength = BgzfData::UnpackSignedInt(buffer); | |
603 if ( IsBigEndian ) SwapEndian_32(refLength); | |
604 | |
605 // store data for reference | |
606 RefData aReference; | |
607 aReference.RefName = (string)((const char*)refName); | |
608 aReference.RefLength = refLength; | |
609 References.push_back(aReference); | |
610 | |
611 // clean up calloc-ed temp variable | |
612 free(refName); | |
613 } | |
614 } | |
615 | |
616 // mark references with no alignment data | |
617 void BamReaderPrivate::MarkReferences(void) { | |
618 | |
619 // ensure index is available | |
620 if ( !HasIndex ) return; | |
621 | |
622 // mark empty references | |
623 for ( int i = 0; i < (int)References.size(); ++i ) | |
624 References.at(i).RefHasAlignments = Index->HasAlignments(i); | |
625 } | |
626 | |
627 // opens BAM file (and index) | |
628 bool BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) { | |
629 | |
630 // store filenames | |
631 Filename = filename; | |
632 IndexFilename = indexFilename; | |
633 | |
634 // open the BGZF file for reading, return false on failure | |
635 if ( !mBGZF.Open(filename, "rb") ) return false; | |
636 | |
637 // retrieve header text & reference data | |
638 LoadHeaderData(); | |
639 LoadReferenceData(); | |
640 | |
641 // store file offset of first alignment | |
642 AlignmentsBeginOffset = mBGZF.Tell(); | |
643 | |
644 // if no index filename provided | |
645 if ( IndexFilename.empty() ) { | |
646 | |
647 // client did not specify that index SHOULD be found | |
648 // useful for cases where sequential access is all that is required | |
649 if ( !lookForIndex ) return true; | |
650 | |
651 // otherwise, look for index file, return success/fail | |
652 return LoadIndex(lookForIndex, preferStandardIndex) ; | |
653 } | |
654 | |
655 // client supplied an index filename | |
656 // attempt to load index data, return success/fail | |
657 return LoadIndex(lookForIndex, preferStandardIndex); | |
658 } | |
659 | |
660 // returns BAM file pointer to beginning of alignment data | |
661 bool BamReaderPrivate::Rewind(void) { | |
662 | |
663 // rewind to first alignment, return false if unable to seek | |
664 if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false; | |
665 | |
666 // retrieve first alignment data, return false if unable to read | |
667 BamAlignment al; | |
668 if ( !LoadNextAlignment(al) ) return false; | |
669 | |
670 // reset default region info using first alignment in file | |
671 Region.clear(); | |
672 HasAlignmentsInRegion = true; | |
673 | |
674 // rewind back to beginning of first alignment | |
675 // return success/fail of seek | |
676 return mBGZF.Seek(AlignmentsBeginOffset); | |
677 } | |
678 | |
679 // change the index caching behavior | |
680 void BamReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { | |
681 IndexCacheMode = mode; | |
682 if ( Index == 0 ) return; | |
683 Index->SetCacheMode(mode); | |
684 } | |
685 | |
686 // asks Index to attempt a Jump() to specified region | |
687 // returns success/failure | |
688 bool BamReaderPrivate::SetRegion(const BamRegion& region) { | |
689 | |
690 // clear out any prior BamReader region data | |
691 // | |
692 // N.B. - this is cleared so that BamIndex now has free reign to call | |
693 // GetNextAlignmentCore() and do overlap checking without worrying about BamReader | |
694 // performing any overlap checking of its own and moving on to the next read... Calls | |
695 // to GetNextAlignmentCore() with no Region set, simply return the next alignment. | |
696 // This ensures that the Index is able to do just that. (All without exposing | |
697 // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature) | |
698 Region.clear(); | |
699 | |
700 // check for existing index | |
701 if ( !HasIndex ) return false; | |
702 | |
703 // adjust region if necessary to reflect where data actually begins | |
704 BamRegion adjustedRegion(region); | |
705 AdjustRegion(adjustedRegion); | |
706 | |
707 // if no data present, return true | |
708 // not an error, but BamReader knows that no data is there for future alignment access | |
709 // (this is useful in a MultiBamReader setting where some BAM files may lack data in regions | |
710 // that other BAMs have data) | |
711 if ( !HasAlignmentsInRegion ) { | |
712 Region = adjustedRegion; | |
713 return true; | |
714 } | |
715 | |
716 // attempt jump to user-specified region return false if jump could not be performed at all | |
717 // (invalid index, unknown reference, etc) | |
718 // | |
719 // Index::Jump() is allowed to modify the HasAlignmentsInRegion flag | |
720 // * This covers case where a region is requested that lies beyond the last alignment on a reference | |
721 // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false | |
722 // BamMultiReader is then able to successfully pull alignments from a region from multiple files | |
723 // even if one or more have no data. | |
724 if ( !Index->Jump(adjustedRegion, &HasAlignmentsInRegion) ) return false; | |
725 | |
726 // save region and return success | |
727 Region = adjustedRegion; | |
728 return true; | |
729 } |