6
|
1 // ***************************************************************************
|
|
2 // BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg
|
|
3 // Marth Lab, Department of Biology, Boston College
|
|
4 // All rights reserved.
|
|
5 // ---------------------------------------------------------------------------
|
|
6 // Last modified: 19 November 2010 (DB)
|
|
7 // ---------------------------------------------------------------------------
|
|
8 // BGZF routines were adapted from the bgzf.c code developed at the Broad
|
|
9 // Institute.
|
|
10 // ---------------------------------------------------------------------------
|
|
11 // Provides the basic functionality for reading & writing BGZF files
|
|
12 // ***************************************************************************
|
|
13
|
|
14 #include <BGZF.h>
|
|
15 using namespace BamTools;
|
|
16
|
|
17 #include <algorithm>
|
|
18 using namespace std;
|
|
19
|
|
20 BgzfData::BgzfData(void)
|
|
21 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
|
|
22 , CompressedBlockSize(MAX_BLOCK_SIZE)
|
|
23 , BlockLength(0)
|
|
24 , BlockOffset(0)
|
|
25 , BlockAddress(0)
|
|
26 , IsOpen(false)
|
|
27 , IsWriteOnly(false)
|
|
28 , IsWriteUncompressed(false)
|
|
29 , Stream(NULL)
|
|
30 , UncompressedBlock(NULL)
|
|
31 , CompressedBlock(NULL)
|
|
32 {
|
|
33 try {
|
|
34 CompressedBlock = new char[CompressedBlockSize];
|
|
35 UncompressedBlock = new char[UncompressedBlockSize];
|
|
36 } catch( std::bad_alloc& ba ) {
|
|
37 fprintf(stderr, "BGZF ERROR: unable to allocate memory for our BGZF object.\n");
|
|
38 exit(1);
|
|
39 }
|
|
40 }
|
|
41
|
|
42 // destructor
|
|
43 BgzfData::~BgzfData(void) {
|
|
44 if( CompressedBlock ) delete[] CompressedBlock;
|
|
45 if( UncompressedBlock ) delete[] UncompressedBlock;
|
|
46 }
|
|
47
|
|
48 // closes BGZF file
|
|
49 void BgzfData::Close(void) {
|
|
50
|
|
51 // skip if file not open, otherwise set flag
|
|
52 if ( !IsOpen ) return;
|
|
53
|
|
54 // if writing to file, flush the current BGZF block,
|
|
55 // then write an empty block (as EOF marker)
|
|
56 if ( IsWriteOnly ) {
|
|
57 FlushBlock();
|
|
58 int blockLength = DeflateBlock();
|
|
59 fwrite(CompressedBlock, 1, blockLength, Stream);
|
|
60 }
|
|
61
|
|
62 // flush and close
|
|
63 fflush(Stream);
|
|
64 fclose(Stream);
|
|
65 IsWriteUncompressed = false;
|
|
66 IsOpen = false;
|
|
67 }
|
|
68
|
|
69 // compresses the current block
|
|
70 int BgzfData::DeflateBlock(void) {
|
|
71
|
|
72 // initialize the gzip header
|
|
73 char* buffer = CompressedBlock;
|
|
74 memset(buffer, 0, 18);
|
|
75 buffer[0] = GZIP_ID1;
|
|
76 buffer[1] = (char)GZIP_ID2;
|
|
77 buffer[2] = CM_DEFLATE;
|
|
78 buffer[3] = FLG_FEXTRA;
|
|
79 buffer[9] = (char)OS_UNKNOWN;
|
|
80 buffer[10] = BGZF_XLEN;
|
|
81 buffer[12] = BGZF_ID1;
|
|
82 buffer[13] = BGZF_ID2;
|
|
83 buffer[14] = BGZF_LEN;
|
|
84
|
|
85 // set compression level
|
|
86 const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION );
|
|
87
|
|
88 // loop to retry for blocks that do not compress enough
|
|
89 int inputLength = BlockOffset;
|
|
90 int compressedLength = 0;
|
|
91 unsigned int bufferSize = CompressedBlockSize;
|
|
92
|
|
93 while ( true ) {
|
|
94
|
|
95 // initialize zstream values
|
|
96 z_stream zs;
|
|
97 zs.zalloc = NULL;
|
|
98 zs.zfree = NULL;
|
|
99 zs.next_in = (Bytef*)UncompressedBlock;
|
|
100 zs.avail_in = inputLength;
|
|
101 zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
|
|
102 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
|
|
103
|
|
104 // initialize the zlib compression algorithm
|
|
105 if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) {
|
|
106 fprintf(stderr, "BGZF ERROR: zlib deflate initialization failed.\n");
|
|
107 exit(1);
|
|
108 }
|
|
109
|
|
110 // compress the data
|
|
111 int status = deflate(&zs, Z_FINISH);
|
|
112 if ( status != Z_STREAM_END ) {
|
|
113
|
|
114 deflateEnd(&zs);
|
|
115
|
|
116 // reduce the input length and try again
|
|
117 if ( status == Z_OK ) {
|
|
118 inputLength -= 1024;
|
|
119 if( inputLength < 0 ) {
|
|
120 fprintf(stderr, "BGZF ERROR: input reduction failed.\n");
|
|
121 exit(1);
|
|
122 }
|
|
123 continue;
|
|
124 }
|
|
125
|
|
126 fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n");
|
|
127 exit(1);
|
|
128 }
|
|
129
|
|
130 // finalize the compression routine
|
|
131 if ( deflateEnd(&zs) != Z_OK ) {
|
|
132 fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n");
|
|
133 exit(1);
|
|
134 }
|
|
135
|
|
136 compressedLength = zs.total_out;
|
|
137 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
|
|
138 if ( compressedLength > MAX_BLOCK_SIZE ) {
|
|
139 fprintf(stderr, "BGZF ERROR: deflate overflow.\n");
|
|
140 exit(1);
|
|
141 }
|
|
142
|
|
143 break;
|
|
144 }
|
|
145
|
|
146 // store the compressed length
|
|
147 BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
|
|
148
|
|
149 // store the CRC32 checksum
|
|
150 unsigned int crc = crc32(0, NULL, 0);
|
|
151 crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
|
|
152 BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);
|
|
153 BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
|
|
154
|
|
155 // ensure that we have less than a block of data left
|
|
156 int remaining = BlockOffset - inputLength;
|
|
157 if ( remaining > 0 ) {
|
|
158 if ( remaining > inputLength ) {
|
|
159 fprintf(stderr, "BGZF ERROR: after deflate, remainder too large.\n");
|
|
160 exit(1);
|
|
161 }
|
|
162 memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
|
|
163 }
|
|
164
|
|
165 BlockOffset = remaining;
|
|
166 return compressedLength;
|
|
167 }
|
|
168
|
|
169 // flushes the data in the BGZF block
|
|
170 void BgzfData::FlushBlock(void) {
|
|
171
|
|
172 // flush all of the remaining blocks
|
|
173 while ( BlockOffset > 0 ) {
|
|
174
|
|
175 // compress the data block
|
|
176 int blockLength = DeflateBlock();
|
|
177
|
|
178 // flush the data to our output stream
|
|
179 int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
|
|
180
|
|
181 if ( numBytesWritten != blockLength ) {
|
|
182 fprintf(stderr, "BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
|
|
183 exit(1);
|
|
184 }
|
|
185
|
|
186 BlockAddress += blockLength;
|
|
187 }
|
|
188 }
|
|
189
|
|
190 // de-compresses the current block
|
|
191 int BgzfData::InflateBlock(const int& blockLength) {
|
|
192
|
|
193 // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
|
|
194 z_stream zs;
|
|
195 zs.zalloc = NULL;
|
|
196 zs.zfree = NULL;
|
|
197 zs.next_in = (Bytef*)CompressedBlock + 18;
|
|
198 zs.avail_in = blockLength - 16;
|
|
199 zs.next_out = (Bytef*)UncompressedBlock;
|
|
200 zs.avail_out = UncompressedBlockSize;
|
|
201
|
|
202 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
|
|
203 if ( status != Z_OK ) {
|
|
204 fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");
|
|
205 return -1;
|
|
206 }
|
|
207
|
|
208 status = inflate(&zs, Z_FINISH);
|
|
209 if ( status != Z_STREAM_END ) {
|
|
210 inflateEnd(&zs);
|
|
211 fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflate() failed\n");
|
|
212 return -1;
|
|
213 }
|
|
214
|
|
215 status = inflateEnd(&zs);
|
|
216 if ( status != Z_OK ) {
|
|
217 fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");
|
|
218 return -1;
|
|
219 }
|
|
220
|
|
221 return zs.total_out;
|
|
222 }
|
|
223
|
|
224 // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
|
|
225 bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) {
|
|
226
|
|
227 // determine open mode
|
|
228 if ( strcmp(mode, "rb") == 0 )
|
|
229 IsWriteOnly = false;
|
|
230 else if ( strcmp(mode, "wb") == 0)
|
|
231 IsWriteOnly = true;
|
|
232 else {
|
|
233 fprintf(stderr, "BGZF ERROR: unknown file mode: %s\n", mode);
|
|
234 return false;
|
|
235 }
|
|
236
|
|
237 // ----------------------------------------------------------------
|
|
238 // open Stream to read to/write from file, stdin, or stdout
|
|
239 // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)
|
|
240
|
|
241 // read/write BGZF data to/from a file
|
|
242 if ( (filename != "stdin") && (filename != "stdout") )
|
|
243 Stream = fopen(filename.c_str(), mode);
|
|
244
|
|
245 // read BGZF data from stdin
|
|
246 else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) )
|
|
247 Stream = freopen(NULL, mode, stdin);
|
|
248
|
|
249 // write BGZF data to stdout
|
|
250 else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) )
|
|
251 Stream = freopen(NULL, mode, stdout);
|
|
252
|
|
253 if ( !Stream ) {
|
|
254 fprintf(stderr, "BGZF ERROR: unable to open file %s\n", filename.c_str() );
|
|
255 return false;
|
|
256 }
|
|
257
|
|
258 // set flags, return success
|
|
259 IsOpen = true;
|
|
260 IsWriteUncompressed = isWriteUncompressed;
|
|
261 return true;
|
|
262 }
|
|
263
|
|
264 // reads BGZF data into a byte buffer
|
|
265 int BgzfData::Read(char* data, const unsigned int dataLength) {
|
|
266
|
|
267 if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0;
|
|
268
|
|
269 char* output = data;
|
|
270 unsigned int numBytesRead = 0;
|
|
271 while ( numBytesRead < dataLength ) {
|
|
272
|
|
273 int bytesAvailable = BlockLength - BlockOffset;
|
|
274 if ( bytesAvailable <= 0 ) {
|
|
275 if ( !ReadBlock() ) return -1;
|
|
276 bytesAvailable = BlockLength - BlockOffset;
|
|
277 if ( bytesAvailable <= 0 ) break;
|
|
278 }
|
|
279
|
|
280 char* buffer = UncompressedBlock;
|
|
281 int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
|
|
282 memcpy(output, buffer + BlockOffset, copyLength);
|
|
283
|
|
284 BlockOffset += copyLength;
|
|
285 output += copyLength;
|
|
286 numBytesRead += copyLength;
|
|
287 }
|
|
288
|
|
289 if ( BlockOffset == BlockLength ) {
|
|
290 BlockAddress = ftell64(Stream);
|
|
291 BlockOffset = 0;
|
|
292 BlockLength = 0;
|
|
293 }
|
|
294
|
|
295 return numBytesRead;
|
|
296 }
|
|
297
|
|
298 // reads a BGZF block
|
|
299 bool BgzfData::ReadBlock(void) {
|
|
300
|
|
301 char header[BLOCK_HEADER_LENGTH];
|
|
302 int64_t blockAddress = ftell64(Stream);
|
|
303
|
|
304 int count = fread(header, 1, sizeof(header), Stream);
|
|
305 if ( count == 0 ) {
|
|
306 BlockLength = 0;
|
|
307 return true;
|
|
308 }
|
|
309
|
|
310 if ( count != sizeof(header) ) {
|
|
311 fprintf(stderr, "BGZF ERROR: read block failed - could not read block header\n");
|
|
312 return false;
|
|
313 }
|
|
314
|
|
315 if ( !BgzfData::CheckBlockHeader(header) ) {
|
|
316 fprintf(stderr, "BGZF ERROR: read block failed - invalid block header\n");
|
|
317 return false;
|
|
318 }
|
|
319
|
|
320 int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;
|
|
321 char* compressedBlock = CompressedBlock;
|
|
322 memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
|
|
323 int remaining = blockLength - BLOCK_HEADER_LENGTH;
|
|
324
|
|
325 count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);
|
|
326 if ( count != remaining ) {
|
|
327 fprintf(stderr, "BGZF ERROR: read block failed - could not read data from block\n");
|
|
328 return false;
|
|
329 }
|
|
330
|
|
331 count = InflateBlock(blockLength);
|
|
332 if ( count < 0 ) {
|
|
333 fprintf(stderr, "BGZF ERROR: read block failed - could not decompress block data\n");
|
|
334 return false;
|
|
335 }
|
|
336
|
|
337 if ( BlockLength != 0 )
|
|
338 BlockOffset = 0;
|
|
339
|
|
340 BlockAddress = blockAddress;
|
|
341 BlockLength = count;
|
|
342 return true;
|
|
343 }
|
|
344
|
|
345 // seek to position in BGZF file
|
|
346 bool BgzfData::Seek(int64_t position) {
|
|
347
|
|
348 if ( !IsOpen ) return false;
|
|
349
|
|
350 int blockOffset = (position & 0xFFFF);
|
|
351 int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
|
|
352
|
|
353 if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) {
|
|
354 fprintf(stderr, "BGZF ERROR: unable to seek in file\n");
|
|
355 return false;
|
|
356 }
|
|
357
|
|
358 BlockLength = 0;
|
|
359 BlockAddress = blockAddress;
|
|
360 BlockOffset = blockOffset;
|
|
361 return true;
|
|
362 }
|
|
363
|
|
364 // get file position in BGZF file
|
|
365 int64_t BgzfData::Tell(void) {
|
|
366 if ( !IsOpen )
|
|
367 return false;
|
|
368 else
|
|
369 return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
|
|
370 }
|
|
371
|
|
372 // writes the supplied data into the BGZF buffer
|
|
373 unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {
|
|
374
|
|
375 if ( !IsOpen || !IsWriteOnly ) return false;
|
|
376
|
|
377 // initialize
|
|
378 unsigned int numBytesWritten = 0;
|
|
379 const char* input = data;
|
|
380 unsigned int blockLength = UncompressedBlockSize;
|
|
381
|
|
382 // copy the data to the buffer
|
|
383 while ( numBytesWritten < dataLen ) {
|
|
384
|
|
385 unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);
|
|
386 char* buffer = UncompressedBlock;
|
|
387 memcpy(buffer + BlockOffset, input, copyLength);
|
|
388
|
|
389 BlockOffset += copyLength;
|
|
390 input += copyLength;
|
|
391 numBytesWritten += copyLength;
|
|
392
|
|
393 if ( BlockOffset == blockLength )
|
|
394 FlushBlock();
|
|
395 }
|
|
396
|
|
397 return numBytesWritten;
|
|
398 }
|