6
|
1 // ***************************************************************************
|
|
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
|
|
3 // Marth Lab, Department of Biology, Boston College
|
|
4 // All rights reserved.
|
|
5 // ---------------------------------------------------------------------------
|
|
6 // Last modified: 22 November 2010 (DB)
|
|
7 // ---------------------------------------------------------------------------
|
|
8 // Provides the basic functionality for producing BAM files
|
|
9 // ***************************************************************************
|
|
10
|
|
11 #include <BamAlignment.h>
|
|
12 #include <BamWriter_p.h>
|
|
13 using namespace BamTools;
|
|
14 using namespace BamTools::Internal;
|
|
15 using namespace std;
|
|
16
|
|
17 BamWriterPrivate::BamWriterPrivate(void) {
|
|
18 IsBigEndian = SystemIsBigEndian();
|
|
19 }
|
|
20
|
|
21 BamWriterPrivate::~BamWriterPrivate(void) {
|
|
22 mBGZF.Close();
|
|
23 }
|
|
24
|
|
25 // closes the alignment archive
|
|
26 void BamWriterPrivate::Close(void) {
|
|
27 mBGZF.Close();
|
|
28 }
|
|
29
|
|
30 // calculates minimum bin for a BAM alignment interval
|
|
31 const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
|
|
32 --end;
|
|
33 if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
|
|
34 if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);
|
|
35 if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);
|
|
36 if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);
|
|
37 if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);
|
|
38 return 0;
|
|
39 }
|
|
40
|
|
41 // creates a cigar string from the supplied alignment
|
|
42 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
|
|
43
|
|
44 // initialize
|
|
45 const unsigned int numCigarOperations = cigarOperations.size();
|
|
46 packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
|
|
47
|
|
48 // pack the cigar data into the string
|
|
49 unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
|
|
50
|
|
51 unsigned int cigarOp;
|
|
52 vector<CigarOp>::const_iterator coIter;
|
|
53 for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
|
|
54
|
|
55 switch(coIter->Type) {
|
|
56 case 'M':
|
|
57 cigarOp = BAM_CMATCH;
|
|
58 break;
|
|
59 case 'I':
|
|
60 cigarOp = BAM_CINS;
|
|
61 break;
|
|
62 case 'D':
|
|
63 cigarOp = BAM_CDEL;
|
|
64 break;
|
|
65 case 'N':
|
|
66 cigarOp = BAM_CREF_SKIP;
|
|
67 break;
|
|
68 case 'S':
|
|
69 cigarOp = BAM_CSOFT_CLIP;
|
|
70 break;
|
|
71 case 'H':
|
|
72 cigarOp = BAM_CHARD_CLIP;
|
|
73 break;
|
|
74 case 'P':
|
|
75 cigarOp = BAM_CPAD;
|
|
76 break;
|
|
77 default:
|
|
78 fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
|
|
79 exit(1);
|
|
80 }
|
|
81
|
|
82 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
|
|
83 pPackedCigar++;
|
|
84 }
|
|
85 }
|
|
86
|
|
87 // encodes the supplied query sequence into 4-bit notation
|
|
88 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
|
|
89
|
|
90 // prepare the encoded query string
|
|
91 const unsigned int queryLen = query.size();
|
|
92 const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
|
|
93 encodedQuery.resize(encodedQueryLen);
|
|
94 char* pEncodedQuery = (char*)encodedQuery.data();
|
|
95 const char* pQuery = (const char*)query.data();
|
|
96
|
|
97 unsigned char nucleotideCode;
|
|
98 bool useHighWord = true;
|
|
99
|
|
100 while(*pQuery) {
|
|
101
|
|
102 switch(*pQuery) {
|
|
103
|
|
104 case '=':
|
|
105 nucleotideCode = 0;
|
|
106 break;
|
|
107
|
|
108 case 'A':
|
|
109 nucleotideCode = 1;
|
|
110 break;
|
|
111
|
|
112 case 'C':
|
|
113 nucleotideCode = 2;
|
|
114 break;
|
|
115
|
|
116 case 'G':
|
|
117 nucleotideCode = 4;
|
|
118 break;
|
|
119
|
|
120 case 'T':
|
|
121 nucleotideCode = 8;
|
|
122 break;
|
|
123
|
|
124 case 'N':
|
|
125 nucleotideCode = 15;
|
|
126 break;
|
|
127
|
|
128 default:
|
|
129 fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
|
|
130 exit(1);
|
|
131 }
|
|
132
|
|
133 // pack the nucleotide code
|
|
134 if(useHighWord) {
|
|
135 *pEncodedQuery = nucleotideCode << 4;
|
|
136 useHighWord = false;
|
|
137 } else {
|
|
138 *pEncodedQuery |= nucleotideCode;
|
|
139 pEncodedQuery++;
|
|
140 useHighWord = true;
|
|
141 }
|
|
142
|
|
143 // increment the query position
|
|
144 pQuery++;
|
|
145 }
|
|
146 }
|
|
147
|
|
148 // opens the alignment archive
|
|
149 bool BamWriterPrivate::Open(const string& filename,
|
|
150 const string& samHeader,
|
|
151 const RefVector& referenceSequences,
|
|
152 bool isWriteUncompressed)
|
|
153 {
|
|
154 // open the BGZF file for writing, return failure if error
|
|
155 if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
|
|
156 return false;
|
|
157
|
|
158 // ================
|
|
159 // write the header
|
|
160 // ================
|
|
161
|
|
162 // write the BAM signature
|
|
163 const unsigned char SIGNATURE_LENGTH = 4;
|
|
164 const char* BAM_SIGNATURE = "BAM\1";
|
|
165 mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
|
|
166
|
|
167 // write the SAM header text length
|
|
168 uint32_t samHeaderLen = samHeader.size();
|
|
169 if (IsBigEndian) SwapEndian_32(samHeaderLen);
|
|
170 mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);
|
|
171
|
|
172 // write the SAM header text
|
|
173 if(samHeaderLen > 0)
|
|
174 mBGZF.Write(samHeader.data(), samHeaderLen);
|
|
175
|
|
176 // write the number of reference sequences
|
|
177 uint32_t numReferenceSequences = referenceSequences.size();
|
|
178 if (IsBigEndian) SwapEndian_32(numReferenceSequences);
|
|
179 mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
|
|
180
|
|
181 // =============================
|
|
182 // write the sequence dictionary
|
|
183 // =============================
|
|
184
|
|
185 RefVector::const_iterator rsIter;
|
|
186 for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
|
|
187
|
|
188 // write the reference sequence name length
|
|
189 uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
|
|
190 if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
|
|
191 mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
|
|
192
|
|
193 // write the reference sequence name
|
|
194 mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
|
|
195
|
|
196 // write the reference sequence length
|
|
197 int32_t referenceLength = rsIter->RefLength;
|
|
198 if (IsBigEndian) SwapEndian_32(referenceLength);
|
|
199 mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
|
|
200 }
|
|
201
|
|
202 // return success
|
|
203 return true;
|
|
204 }
|
|
205
|
|
206 // saves the alignment to the alignment archive
|
|
207 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
|
|
208
|
|
209 // if BamAlignment contains only the core data and a raw char data buffer
|
|
210 // (as a result of BamReader::GetNextAlignmentCore())
|
|
211 if ( al.SupportData.HasCoreOnly ) {
|
|
212
|
|
213 // write the block size
|
|
214 unsigned int blockSize = al.SupportData.BlockLength;
|
|
215 if (IsBigEndian) SwapEndian_32(blockSize);
|
|
216 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
|
|
217
|
|
218 // assign the BAM core data
|
|
219 uint32_t buffer[8];
|
|
220 buffer[0] = al.RefID;
|
|
221 buffer[1] = al.Position;
|
|
222 buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
|
|
223 buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
|
|
224 buffer[4] = al.SupportData.QuerySequenceLength;
|
|
225 buffer[5] = al.MateRefID;
|
|
226 buffer[6] = al.MatePosition;
|
|
227 buffer[7] = al.InsertSize;
|
|
228
|
|
229 // swap BAM core endian-ness, if necessary
|
|
230 if ( IsBigEndian ) {
|
|
231 for ( int i = 0; i < 8; ++i )
|
|
232 SwapEndian_32(buffer[i]);
|
|
233 }
|
|
234
|
|
235 // write the BAM core
|
|
236 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
|
|
237
|
|
238 // write the raw char data
|
|
239 mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
|
|
240 }
|
|
241
|
|
242 // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
|
|
243 // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
|
|
244 else {
|
|
245
|
|
246 // calculate char lengths
|
|
247 const unsigned int nameLength = al.Name.size() + 1;
|
|
248 const unsigned int numCigarOperations = al.CigarData.size();
|
|
249 const unsigned int queryLength = al.QueryBases.size();
|
|
250 const unsigned int tagDataLength = al.TagData.size();
|
|
251
|
|
252 // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
|
|
253 // force calculation of Bin before storing
|
|
254 const int endPosition = al.GetEndPosition();
|
|
255 const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
|
|
256
|
|
257 // create our packed cigar string
|
|
258 string packedCigar;
|
|
259 CreatePackedCigar(al.CigarData, packedCigar);
|
|
260 const unsigned int packedCigarLength = packedCigar.size();
|
|
261
|
|
262 // encode the query
|
|
263 string encodedQuery;
|
|
264 EncodeQuerySequence(al.QueryBases, encodedQuery);
|
|
265 const unsigned int encodedQueryLength = encodedQuery.size();
|
|
266
|
|
267 // write the block size
|
|
268 const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;
|
|
269 unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
|
|
270 if (IsBigEndian) SwapEndian_32(blockSize);
|
|
271 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
|
|
272
|
|
273 // assign the BAM core data
|
|
274 uint32_t buffer[8];
|
|
275 buffer[0] = al.RefID;
|
|
276 buffer[1] = al.Position;
|
|
277 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
|
|
278 buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
|
|
279 buffer[4] = queryLength;
|
|
280 buffer[5] = al.MateRefID;
|
|
281 buffer[6] = al.MatePosition;
|
|
282 buffer[7] = al.InsertSize;
|
|
283
|
|
284 // swap BAM core endian-ness, if necessary
|
|
285 if ( IsBigEndian ) {
|
|
286 for ( int i = 0; i < 8; ++i )
|
|
287 SwapEndian_32(buffer[i]);
|
|
288 }
|
|
289
|
|
290 // write the BAM core
|
|
291 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
|
|
292
|
|
293 // write the query name
|
|
294 mBGZF.Write(al.Name.c_str(), nameLength);
|
|
295
|
|
296 // write the packed cigar
|
|
297 if ( IsBigEndian ) {
|
|
298
|
|
299 char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
|
|
300 memcpy(cigarData, packedCigar.data(), packedCigarLength);
|
|
301
|
|
302 for (unsigned int i = 0; i < packedCigarLength; ++i) {
|
|
303 if ( IsBigEndian )
|
|
304 SwapEndian_32p(&cigarData[i]);
|
|
305 }
|
|
306
|
|
307 mBGZF.Write(cigarData, packedCigarLength);
|
|
308 free(cigarData);
|
|
309 }
|
|
310 else
|
|
311 mBGZF.Write(packedCigar.data(), packedCigarLength);
|
|
312
|
|
313 // write the encoded query sequence
|
|
314 mBGZF.Write(encodedQuery.data(), encodedQueryLength);
|
|
315
|
|
316 // write the base qualities
|
|
317 string baseQualities(al.Qualities);
|
|
318 char* pBaseQualities = (char*)al.Qualities.data();
|
|
319 for(unsigned int i = 0; i < queryLength; i++) {
|
|
320 pBaseQualities[i] -= 33;
|
|
321 }
|
|
322 mBGZF.Write(pBaseQualities, queryLength);
|
|
323
|
|
324 // write the read group tag
|
|
325 if ( IsBigEndian ) {
|
|
326
|
|
327 char* tagData = (char*)calloc(sizeof(char), tagDataLength);
|
|
328 memcpy(tagData, al.TagData.data(), tagDataLength);
|
|
329
|
|
330 int i = 0;
|
|
331 while ( (unsigned int)i < tagDataLength ) {
|
|
332
|
|
333 i += 2; // skip tag type (e.g. "RG", "NM", etc)
|
|
334 uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
|
|
335 ++i; // skip value type
|
|
336
|
|
337 switch (type) {
|
|
338
|
|
339 case('A') :
|
|
340 case('C') :
|
|
341 ++i;
|
|
342 break;
|
|
343
|
|
344 case('S') :
|
|
345 SwapEndian_16p(&tagData[i]);
|
|
346 i+=2; // sizeof(uint16_t)
|
|
347 break;
|
|
348
|
|
349 case('F') :
|
|
350 case('I') :
|
|
351 SwapEndian_32p(&tagData[i]);
|
|
352 i+=4; // sizeof(uint32_t)
|
|
353 break;
|
|
354
|
|
355 case('D') :
|
|
356 SwapEndian_64p(&tagData[i]);
|
|
357 i+=8; // sizeof(uint64_t)
|
|
358 break;
|
|
359
|
|
360 case('H') :
|
|
361 case('Z') :
|
|
362 while (tagData[i]) { ++i; }
|
|
363 ++i; // increment one more for null terminator
|
|
364 break;
|
|
365
|
|
366 default :
|
|
367 fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
|
|
368 free(tagData);
|
|
369 exit(1);
|
|
370 }
|
|
371 }
|
|
372
|
|
373 mBGZF.Write(tagData, tagDataLength);
|
|
374 free(tagData);
|
|
375 }
|
|
376 else
|
|
377 mBGZF.Write(al.TagData.data(), tagDataLength);
|
|
378 }
|
|
379 }
|