Mercurial > repos > zzhou > spp_phantompeak
comparison spp/src/BamWriter_p.cpp @ 6:ce08b0efa3fd draft
Uploaded
author | zzhou |
---|---|
date | Tue, 27 Nov 2012 16:11:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:608a8e0eac56 | 6:ce08b0efa3fd |
---|---|
1 // *************************************************************************** | |
2 // BamWriter_p.cpp (c) 2010 Derek Barnett | |
3 // Marth Lab, Department of Biology, Boston College | |
4 // All rights reserved. | |
5 // --------------------------------------------------------------------------- | |
6 // Last modified: 22 November 2010 (DB) | |
7 // --------------------------------------------------------------------------- | |
8 // Provides the basic functionality for producing BAM files | |
9 // *************************************************************************** | |
10 | |
11 #include <BamAlignment.h> | |
12 #include <BamWriter_p.h> | |
13 using namespace BamTools; | |
14 using namespace BamTools::Internal; | |
15 using namespace std; | |
16 | |
17 BamWriterPrivate::BamWriterPrivate(void) { | |
18 IsBigEndian = SystemIsBigEndian(); | |
19 } | |
20 | |
21 BamWriterPrivate::~BamWriterPrivate(void) { | |
22 mBGZF.Close(); | |
23 } | |
24 | |
25 // closes the alignment archive | |
26 void BamWriterPrivate::Close(void) { | |
27 mBGZF.Close(); | |
28 } | |
29 | |
30 // calculates minimum bin for a BAM alignment interval | |
31 const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { | |
32 --end; | |
33 if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); | |
34 if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); | |
35 if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); | |
36 if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); | |
37 if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); | |
38 return 0; | |
39 } | |
40 | |
41 // creates a cigar string from the supplied alignment | |
42 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) { | |
43 | |
44 // initialize | |
45 const unsigned int numCigarOperations = cigarOperations.size(); | |
46 packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); | |
47 | |
48 // pack the cigar data into the string | |
49 unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); | |
50 | |
51 unsigned int cigarOp; | |
52 vector<CigarOp>::const_iterator coIter; | |
53 for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) { | |
54 | |
55 switch(coIter->Type) { | |
56 case 'M': | |
57 cigarOp = BAM_CMATCH; | |
58 break; | |
59 case 'I': | |
60 cigarOp = BAM_CINS; | |
61 break; | |
62 case 'D': | |
63 cigarOp = BAM_CDEL; | |
64 break; | |
65 case 'N': | |
66 cigarOp = BAM_CREF_SKIP; | |
67 break; | |
68 case 'S': | |
69 cigarOp = BAM_CSOFT_CLIP; | |
70 break; | |
71 case 'H': | |
72 cigarOp = BAM_CHARD_CLIP; | |
73 break; | |
74 case 'P': | |
75 cigarOp = BAM_CPAD; | |
76 break; | |
77 default: | |
78 fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type); | |
79 exit(1); | |
80 } | |
81 | |
82 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; | |
83 pPackedCigar++; | |
84 } | |
85 } | |
86 | |
87 // encodes the supplied query sequence into 4-bit notation | |
88 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { | |
89 | |
90 // prepare the encoded query string | |
91 const unsigned int queryLen = query.size(); | |
92 const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); | |
93 encodedQuery.resize(encodedQueryLen); | |
94 char* pEncodedQuery = (char*)encodedQuery.data(); | |
95 const char* pQuery = (const char*)query.data(); | |
96 | |
97 unsigned char nucleotideCode; | |
98 bool useHighWord = true; | |
99 | |
100 while(*pQuery) { | |
101 | |
102 switch(*pQuery) { | |
103 | |
104 case '=': | |
105 nucleotideCode = 0; | |
106 break; | |
107 | |
108 case 'A': | |
109 nucleotideCode = 1; | |
110 break; | |
111 | |
112 case 'C': | |
113 nucleotideCode = 2; | |
114 break; | |
115 | |
116 case 'G': | |
117 nucleotideCode = 4; | |
118 break; | |
119 | |
120 case 'T': | |
121 nucleotideCode = 8; | |
122 break; | |
123 | |
124 case 'N': | |
125 nucleotideCode = 15; | |
126 break; | |
127 | |
128 default: | |
129 fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); | |
130 exit(1); | |
131 } | |
132 | |
133 // pack the nucleotide code | |
134 if(useHighWord) { | |
135 *pEncodedQuery = nucleotideCode << 4; | |
136 useHighWord = false; | |
137 } else { | |
138 *pEncodedQuery |= nucleotideCode; | |
139 pEncodedQuery++; | |
140 useHighWord = true; | |
141 } | |
142 | |
143 // increment the query position | |
144 pQuery++; | |
145 } | |
146 } | |
147 | |
148 // opens the alignment archive | |
149 bool BamWriterPrivate::Open(const string& filename, | |
150 const string& samHeader, | |
151 const RefVector& referenceSequences, | |
152 bool isWriteUncompressed) | |
153 { | |
154 // open the BGZF file for writing, return failure if error | |
155 if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) | |
156 return false; | |
157 | |
158 // ================ | |
159 // write the header | |
160 // ================ | |
161 | |
162 // write the BAM signature | |
163 const unsigned char SIGNATURE_LENGTH = 4; | |
164 const char* BAM_SIGNATURE = "BAM\1"; | |
165 mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); | |
166 | |
167 // write the SAM header text length | |
168 uint32_t samHeaderLen = samHeader.size(); | |
169 if (IsBigEndian) SwapEndian_32(samHeaderLen); | |
170 mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); | |
171 | |
172 // write the SAM header text | |
173 if(samHeaderLen > 0) | |
174 mBGZF.Write(samHeader.data(), samHeaderLen); | |
175 | |
176 // write the number of reference sequences | |
177 uint32_t numReferenceSequences = referenceSequences.size(); | |
178 if (IsBigEndian) SwapEndian_32(numReferenceSequences); | |
179 mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); | |
180 | |
181 // ============================= | |
182 // write the sequence dictionary | |
183 // ============================= | |
184 | |
185 RefVector::const_iterator rsIter; | |
186 for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { | |
187 | |
188 // write the reference sequence name length | |
189 uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; | |
190 if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); | |
191 mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); | |
192 | |
193 // write the reference sequence name | |
194 mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); | |
195 | |
196 // write the reference sequence length | |
197 int32_t referenceLength = rsIter->RefLength; | |
198 if (IsBigEndian) SwapEndian_32(referenceLength); | |
199 mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); | |
200 } | |
201 | |
202 // return success | |
203 return true; | |
204 } | |
205 | |
206 // saves the alignment to the alignment archive | |
207 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { | |
208 | |
209 // if BamAlignment contains only the core data and a raw char data buffer | |
210 // (as a result of BamReader::GetNextAlignmentCore()) | |
211 if ( al.SupportData.HasCoreOnly ) { | |
212 | |
213 // write the block size | |
214 unsigned int blockSize = al.SupportData.BlockLength; | |
215 if (IsBigEndian) SwapEndian_32(blockSize); | |
216 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); | |
217 | |
218 // assign the BAM core data | |
219 uint32_t buffer[8]; | |
220 buffer[0] = al.RefID; | |
221 buffer[1] = al.Position; | |
222 buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; | |
223 buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; | |
224 buffer[4] = al.SupportData.QuerySequenceLength; | |
225 buffer[5] = al.MateRefID; | |
226 buffer[6] = al.MatePosition; | |
227 buffer[7] = al.InsertSize; | |
228 | |
229 // swap BAM core endian-ness, if necessary | |
230 if ( IsBigEndian ) { | |
231 for ( int i = 0; i < 8; ++i ) | |
232 SwapEndian_32(buffer[i]); | |
233 } | |
234 | |
235 // write the BAM core | |
236 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); | |
237 | |
238 // write the raw char data | |
239 mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); | |
240 } | |
241 | |
242 // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc | |
243 // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) | |
244 else { | |
245 | |
246 // calculate char lengths | |
247 const unsigned int nameLength = al.Name.size() + 1; | |
248 const unsigned int numCigarOperations = al.CigarData.size(); | |
249 const unsigned int queryLength = al.QueryBases.size(); | |
250 const unsigned int tagDataLength = al.TagData.size(); | |
251 | |
252 // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) | |
253 // force calculation of Bin before storing | |
254 const int endPosition = al.GetEndPosition(); | |
255 const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); | |
256 | |
257 // create our packed cigar string | |
258 string packedCigar; | |
259 CreatePackedCigar(al.CigarData, packedCigar); | |
260 const unsigned int packedCigarLength = packedCigar.size(); | |
261 | |
262 // encode the query | |
263 string encodedQuery; | |
264 EncodeQuerySequence(al.QueryBases, encodedQuery); | |
265 const unsigned int encodedQueryLength = encodedQuery.size(); | |
266 | |
267 // write the block size | |
268 const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength; | |
269 unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; | |
270 if (IsBigEndian) SwapEndian_32(blockSize); | |
271 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); | |
272 | |
273 // assign the BAM core data | |
274 uint32_t buffer[8]; | |
275 buffer[0] = al.RefID; | |
276 buffer[1] = al.Position; | |
277 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; | |
278 buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; | |
279 buffer[4] = queryLength; | |
280 buffer[5] = al.MateRefID; | |
281 buffer[6] = al.MatePosition; | |
282 buffer[7] = al.InsertSize; | |
283 | |
284 // swap BAM core endian-ness, if necessary | |
285 if ( IsBigEndian ) { | |
286 for ( int i = 0; i < 8; ++i ) | |
287 SwapEndian_32(buffer[i]); | |
288 } | |
289 | |
290 // write the BAM core | |
291 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); | |
292 | |
293 // write the query name | |
294 mBGZF.Write(al.Name.c_str(), nameLength); | |
295 | |
296 // write the packed cigar | |
297 if ( IsBigEndian ) { | |
298 | |
299 char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); | |
300 memcpy(cigarData, packedCigar.data(), packedCigarLength); | |
301 | |
302 for (unsigned int i = 0; i < packedCigarLength; ++i) { | |
303 if ( IsBigEndian ) | |
304 SwapEndian_32p(&cigarData[i]); | |
305 } | |
306 | |
307 mBGZF.Write(cigarData, packedCigarLength); | |
308 free(cigarData); | |
309 } | |
310 else | |
311 mBGZF.Write(packedCigar.data(), packedCigarLength); | |
312 | |
313 // write the encoded query sequence | |
314 mBGZF.Write(encodedQuery.data(), encodedQueryLength); | |
315 | |
316 // write the base qualities | |
317 string baseQualities(al.Qualities); | |
318 char* pBaseQualities = (char*)al.Qualities.data(); | |
319 for(unsigned int i = 0; i < queryLength; i++) { | |
320 pBaseQualities[i] -= 33; | |
321 } | |
322 mBGZF.Write(pBaseQualities, queryLength); | |
323 | |
324 // write the read group tag | |
325 if ( IsBigEndian ) { | |
326 | |
327 char* tagData = (char*)calloc(sizeof(char), tagDataLength); | |
328 memcpy(tagData, al.TagData.data(), tagDataLength); | |
329 | |
330 int i = 0; | |
331 while ( (unsigned int)i < tagDataLength ) { | |
332 | |
333 i += 2; // skip tag type (e.g. "RG", "NM", etc) | |
334 uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning | |
335 ++i; // skip value type | |
336 | |
337 switch (type) { | |
338 | |
339 case('A') : | |
340 case('C') : | |
341 ++i; | |
342 break; | |
343 | |
344 case('S') : | |
345 SwapEndian_16p(&tagData[i]); | |
346 i+=2; // sizeof(uint16_t) | |
347 break; | |
348 | |
349 case('F') : | |
350 case('I') : | |
351 SwapEndian_32p(&tagData[i]); | |
352 i+=4; // sizeof(uint32_t) | |
353 break; | |
354 | |
355 case('D') : | |
356 SwapEndian_64p(&tagData[i]); | |
357 i+=8; // sizeof(uint64_t) | |
358 break; | |
359 | |
360 case('H') : | |
361 case('Z') : | |
362 while (tagData[i]) { ++i; } | |
363 ++i; // increment one more for null terminator | |
364 break; | |
365 | |
366 default : | |
367 fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here | |
368 free(tagData); | |
369 exit(1); | |
370 } | |
371 } | |
372 | |
373 mBGZF.Write(tagData, tagDataLength); | |
374 free(tagData); | |
375 } | |
376 else | |
377 mBGZF.Write(al.TagData.data(), tagDataLength); | |
378 } | |
379 } |