annotate spp/src/BamWriter_p.cpp @ 15:e689b83b0257 draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:15:21 -0500
parents ce08b0efa3fd
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1 // ***************************************************************************
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
3 // Marth Lab, Department of Biology, Boston College
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
4 // All rights reserved.
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
5 // ---------------------------------------------------------------------------
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
6 // Last modified: 22 November 2010 (DB)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
7 // ---------------------------------------------------------------------------
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
8 // Provides the basic functionality for producing BAM files
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
9 // ***************************************************************************
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
10
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
11 #include <BamAlignment.h>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
12 #include <BamWriter_p.h>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
13 using namespace BamTools;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
14 using namespace BamTools::Internal;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
15 using namespace std;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
16
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
17 BamWriterPrivate::BamWriterPrivate(void) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
18 IsBigEndian = SystemIsBigEndian();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
19 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
20
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
21 BamWriterPrivate::~BamWriterPrivate(void) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
22 mBGZF.Close();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
23 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
24
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
25 // closes the alignment archive
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
26 void BamWriterPrivate::Close(void) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
27 mBGZF.Close();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
28 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
29
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
30 // calculates minimum bin for a BAM alignment interval
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
31 const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
32 --end;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
33 if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
34 if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
35 if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
36 if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
37 if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
38 return 0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
39 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
40
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
41 // creates a cigar string from the supplied alignment
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
42 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
43
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
44 // initialize
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
45 const unsigned int numCigarOperations = cigarOperations.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
46 packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
47
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
48 // pack the cigar data into the string
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
49 unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
50
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
51 unsigned int cigarOp;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
52 vector<CigarOp>::const_iterator coIter;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
53 for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
54
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
55 switch(coIter->Type) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
56 case 'M':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
57 cigarOp = BAM_CMATCH;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
58 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
59 case 'I':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
60 cigarOp = BAM_CINS;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
61 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
62 case 'D':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
63 cigarOp = BAM_CDEL;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
64 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
65 case 'N':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
66 cigarOp = BAM_CREF_SKIP;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
67 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
68 case 'S':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
69 cigarOp = BAM_CSOFT_CLIP;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
70 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
71 case 'H':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
72 cigarOp = BAM_CHARD_CLIP;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
73 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
74 case 'P':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
75 cigarOp = BAM_CPAD;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
76 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
77 default:
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
78 fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
79 exit(1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
80 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
81
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
82 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
83 pPackedCigar++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
84 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
85 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
86
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
87 // encodes the supplied query sequence into 4-bit notation
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
88 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
89
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
90 // prepare the encoded query string
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
91 const unsigned int queryLen = query.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
92 const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
93 encodedQuery.resize(encodedQueryLen);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
94 char* pEncodedQuery = (char*)encodedQuery.data();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
95 const char* pQuery = (const char*)query.data();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
96
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
97 unsigned char nucleotideCode;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
98 bool useHighWord = true;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
99
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
100 while(*pQuery) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
101
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
102 switch(*pQuery) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
103
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
104 case '=':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
105 nucleotideCode = 0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
106 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
107
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
108 case 'A':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
109 nucleotideCode = 1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
110 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
111
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
112 case 'C':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
113 nucleotideCode = 2;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
114 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
115
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
116 case 'G':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
117 nucleotideCode = 4;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
118 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
119
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
120 case 'T':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
121 nucleotideCode = 8;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
122 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
123
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
124 case 'N':
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
125 nucleotideCode = 15;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
126 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
127
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
128 default:
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
129 fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
130 exit(1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
131 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
132
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
133 // pack the nucleotide code
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
134 if(useHighWord) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
135 *pEncodedQuery = nucleotideCode << 4;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
136 useHighWord = false;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
137 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
138 *pEncodedQuery |= nucleotideCode;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
139 pEncodedQuery++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
140 useHighWord = true;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
141 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
142
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
143 // increment the query position
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
144 pQuery++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
145 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
146 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
147
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
148 // opens the alignment archive
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
149 bool BamWriterPrivate::Open(const string& filename,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
150 const string& samHeader,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
151 const RefVector& referenceSequences,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
152 bool isWriteUncompressed)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
153 {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
154 // open the BGZF file for writing, return failure if error
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
155 if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
156 return false;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
157
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
158 // ================
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
159 // write the header
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
160 // ================
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
161
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
162 // write the BAM signature
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
163 const unsigned char SIGNATURE_LENGTH = 4;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
164 const char* BAM_SIGNATURE = "BAM\1";
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
165 mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
166
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
167 // write the SAM header text length
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
168 uint32_t samHeaderLen = samHeader.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
169 if (IsBigEndian) SwapEndian_32(samHeaderLen);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
170 mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
171
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
172 // write the SAM header text
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
173 if(samHeaderLen > 0)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
174 mBGZF.Write(samHeader.data(), samHeaderLen);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
175
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
176 // write the number of reference sequences
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
177 uint32_t numReferenceSequences = referenceSequences.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
178 if (IsBigEndian) SwapEndian_32(numReferenceSequences);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
179 mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
180
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
181 // =============================
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
182 // write the sequence dictionary
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
183 // =============================
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
184
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
185 RefVector::const_iterator rsIter;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
186 for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
187
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
188 // write the reference sequence name length
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
189 uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
190 if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
191 mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
192
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
193 // write the reference sequence name
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
194 mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
195
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
196 // write the reference sequence length
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
197 int32_t referenceLength = rsIter->RefLength;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
198 if (IsBigEndian) SwapEndian_32(referenceLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
199 mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
200 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
201
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
202 // return success
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
203 return true;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
204 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
205
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
206 // saves the alignment to the alignment archive
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
207 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
208
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
209 // if BamAlignment contains only the core data and a raw char data buffer
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
210 // (as a result of BamReader::GetNextAlignmentCore())
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
211 if ( al.SupportData.HasCoreOnly ) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
212
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
213 // write the block size
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
214 unsigned int blockSize = al.SupportData.BlockLength;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
215 if (IsBigEndian) SwapEndian_32(blockSize);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
216 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
217
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
218 // assign the BAM core data
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
219 uint32_t buffer[8];
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
220 buffer[0] = al.RefID;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
221 buffer[1] = al.Position;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
222 buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
223 buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
224 buffer[4] = al.SupportData.QuerySequenceLength;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
225 buffer[5] = al.MateRefID;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
226 buffer[6] = al.MatePosition;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
227 buffer[7] = al.InsertSize;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
228
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
229 // swap BAM core endian-ness, if necessary
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
230 if ( IsBigEndian ) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
231 for ( int i = 0; i < 8; ++i )
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
232 SwapEndian_32(buffer[i]);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
233 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
234
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
235 // write the BAM core
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
236 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
237
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
238 // write the raw char data
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
239 mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
240 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
241
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
242 // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
243 // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
244 else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
245
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
246 // calculate char lengths
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
247 const unsigned int nameLength = al.Name.size() + 1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
248 const unsigned int numCigarOperations = al.CigarData.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
249 const unsigned int queryLength = al.QueryBases.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
250 const unsigned int tagDataLength = al.TagData.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
251
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
252 // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
253 // force calculation of Bin before storing
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
254 const int endPosition = al.GetEndPosition();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
255 const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
256
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
257 // create our packed cigar string
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
258 string packedCigar;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
259 CreatePackedCigar(al.CigarData, packedCigar);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
260 const unsigned int packedCigarLength = packedCigar.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
261
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
262 // encode the query
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
263 string encodedQuery;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
264 EncodeQuerySequence(al.QueryBases, encodedQuery);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
265 const unsigned int encodedQueryLength = encodedQuery.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
266
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
267 // write the block size
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
268 const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
269 unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
270 if (IsBigEndian) SwapEndian_32(blockSize);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
271 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
272
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
273 // assign the BAM core data
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
274 uint32_t buffer[8];
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
275 buffer[0] = al.RefID;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
276 buffer[1] = al.Position;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
277 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
278 buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
279 buffer[4] = queryLength;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
280 buffer[5] = al.MateRefID;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
281 buffer[6] = al.MatePosition;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
282 buffer[7] = al.InsertSize;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
283
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
284 // swap BAM core endian-ness, if necessary
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
285 if ( IsBigEndian ) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
286 for ( int i = 0; i < 8; ++i )
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
287 SwapEndian_32(buffer[i]);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
288 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
289
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
290 // write the BAM core
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
291 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
292
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
293 // write the query name
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
294 mBGZF.Write(al.Name.c_str(), nameLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
295
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
296 // write the packed cigar
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
297 if ( IsBigEndian ) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
298
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
299 char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
300 memcpy(cigarData, packedCigar.data(), packedCigarLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
301
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
302 for (unsigned int i = 0; i < packedCigarLength; ++i) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
303 if ( IsBigEndian )
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
304 SwapEndian_32p(&cigarData[i]);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
305 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
306
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
307 mBGZF.Write(cigarData, packedCigarLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
308 free(cigarData);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
309 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
310 else
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
311 mBGZF.Write(packedCigar.data(), packedCigarLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
312
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
313 // write the encoded query sequence
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
314 mBGZF.Write(encodedQuery.data(), encodedQueryLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
315
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
316 // write the base qualities
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
317 string baseQualities(al.Qualities);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
318 char* pBaseQualities = (char*)al.Qualities.data();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
319 for(unsigned int i = 0; i < queryLength; i++) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
320 pBaseQualities[i] -= 33;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
321 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
322 mBGZF.Write(pBaseQualities, queryLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
323
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
324 // write the read group tag
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
325 if ( IsBigEndian ) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
326
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
327 char* tagData = (char*)calloc(sizeof(char), tagDataLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
328 memcpy(tagData, al.TagData.data(), tagDataLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
329
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
330 int i = 0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
331 while ( (unsigned int)i < tagDataLength ) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
332
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
333 i += 2; // skip tag type (e.g. "RG", "NM", etc)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
334 uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
335 ++i; // skip value type
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
336
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
337 switch (type) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
338
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
339 case('A') :
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
340 case('C') :
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
341 ++i;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
342 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
343
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
344 case('S') :
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
345 SwapEndian_16p(&tagData[i]);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
346 i+=2; // sizeof(uint16_t)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
347 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
348
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
349 case('F') :
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
350 case('I') :
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
351 SwapEndian_32p(&tagData[i]);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
352 i+=4; // sizeof(uint32_t)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
353 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
354
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
355 case('D') :
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
356 SwapEndian_64p(&tagData[i]);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
357 i+=8; // sizeof(uint64_t)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
358 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
359
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
360 case('H') :
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
361 case('Z') :
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
362 while (tagData[i]) { ++i; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
363 ++i; // increment one more for null terminator
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
364 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
365
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
366 default :
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
367 fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
368 free(tagData);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
369 exit(1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
370 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
371 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
372
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
373 mBGZF.Write(tagData, tagDataLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
374 free(tagData);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
375 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
376 else
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
377 mBGZF.Write(al.TagData.data(), tagDataLength);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
378 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
379 }