diff spp/src/BGZF.cpp @ 6:ce08b0efa3fd draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:11:40 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/spp/src/BGZF.cpp	Tue Nov 27 16:11:40 2012 -0500
@@ -0,0 +1,398 @@
+// ***************************************************************************
+// BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading & writing BGZF files
+// ***************************************************************************
+
+#include <BGZF.h>
+using namespace BamTools;
+
+#include <algorithm>
+using namespace std;
+
+BgzfData::BgzfData(void)
+    : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
+    , CompressedBlockSize(MAX_BLOCK_SIZE)
+    , BlockLength(0)
+    , BlockOffset(0)
+    , BlockAddress(0)
+    , IsOpen(false)
+    , IsWriteOnly(false)
+    , IsWriteUncompressed(false)
+    , Stream(NULL)
+    , UncompressedBlock(NULL)
+    , CompressedBlock(NULL)
+{
+    try {
+        CompressedBlock   = new char[CompressedBlockSize];
+        UncompressedBlock = new char[UncompressedBlockSize];
+    } catch( std::bad_alloc& ba ) {
+        fprintf(stderr, "BGZF ERROR: unable to allocate memory for our BGZF object.\n");
+        exit(1);
+    }
+}
+
+// destructor
+BgzfData::~BgzfData(void) {
+    if( CompressedBlock   ) delete[] CompressedBlock;
+    if( UncompressedBlock ) delete[] UncompressedBlock;
+}
+
+// closes BGZF file
+void BgzfData::Close(void) {
+
+    // skip if file not open, otherwise set flag
+    if ( !IsOpen ) return;
+
+    // if writing to file, flush the current BGZF block,
+    // then write an empty block (as EOF marker)
+    if ( IsWriteOnly ) {
+        FlushBlock();
+        int blockLength = DeflateBlock();
+        fwrite(CompressedBlock, 1, blockLength, Stream);
+    }
+    
+    // flush and close
+    fflush(Stream);
+    fclose(Stream);
+    IsWriteUncompressed = false;
+    IsOpen = false;
+}
+
+// compresses the current block
+int BgzfData::DeflateBlock(void) {
+
+    // initialize the gzip header
+    char* buffer = CompressedBlock;
+    memset(buffer, 0, 18);
+    buffer[0]  = GZIP_ID1;
+    buffer[1]  = (char)GZIP_ID2;
+    buffer[2]  = CM_DEFLATE;
+    buffer[3]  = FLG_FEXTRA;
+    buffer[9]  = (char)OS_UNKNOWN;
+    buffer[10] = BGZF_XLEN;
+    buffer[12] = BGZF_ID1;
+    buffer[13] = BGZF_ID2;
+    buffer[14] = BGZF_LEN;
+
+    // set compression level
+    const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION );
+    
+    // loop to retry for blocks that do not compress enough
+    int inputLength = BlockOffset;
+    int compressedLength = 0;
+    unsigned int bufferSize = CompressedBlockSize;
+
+    while ( true ) {
+        
+        // initialize zstream values
+        z_stream zs;
+        zs.zalloc    = NULL;
+        zs.zfree     = NULL;
+        zs.next_in   = (Bytef*)UncompressedBlock;
+        zs.avail_in  = inputLength;
+        zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
+        zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+        // initialize the zlib compression algorithm
+        if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) {
+            fprintf(stderr, "BGZF ERROR: zlib deflate initialization failed.\n");
+            exit(1);
+        }
+
+        // compress the data
+        int status = deflate(&zs, Z_FINISH);
+        if ( status != Z_STREAM_END ) {
+
+            deflateEnd(&zs);
+
+            // reduce the input length and try again
+            if ( status == Z_OK ) {
+                inputLength -= 1024;
+                if( inputLength < 0 ) {
+                    fprintf(stderr, "BGZF ERROR: input reduction failed.\n");
+                    exit(1);
+                }
+                continue;
+            }
+
+            fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n");
+            exit(1);
+        }
+
+        // finalize the compression routine
+        if ( deflateEnd(&zs) != Z_OK ) {
+            fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n");
+            exit(1);
+        }
+
+        compressedLength = zs.total_out;
+        compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+        if ( compressedLength > MAX_BLOCK_SIZE ) {
+            fprintf(stderr, "BGZF ERROR: deflate overflow.\n");
+            exit(1);
+        }
+
+        break;
+    }
+
+    // store the compressed length
+    BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
+
+    // store the CRC32 checksum
+    unsigned int crc = crc32(0, NULL, 0);
+    crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
+    BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);
+    BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
+
+    // ensure that we have less than a block of data left
+    int remaining = BlockOffset - inputLength;
+    if ( remaining > 0 ) {
+        if ( remaining > inputLength ) {
+            fprintf(stderr, "BGZF ERROR: after deflate, remainder too large.\n");
+            exit(1);
+        }
+        memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
+    }
+
+    BlockOffset = remaining;
+    return compressedLength;
+}
+
+// flushes the data in the BGZF block
+void BgzfData::FlushBlock(void) {
+
+    // flush all of the remaining blocks
+    while ( BlockOffset > 0 ) {
+
+        // compress the data block
+        int blockLength = DeflateBlock();
+
+        // flush the data to our output stream
+        int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
+
+        if ( numBytesWritten != blockLength ) {
+          fprintf(stderr, "BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
+          exit(1);
+        }
+              
+        BlockAddress += blockLength;
+    }
+}
+
+// de-compresses the current block
+int BgzfData::InflateBlock(const int& blockLength) {
+
+    // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
+    z_stream zs;
+    zs.zalloc    = NULL;
+    zs.zfree     = NULL;
+    zs.next_in   = (Bytef*)CompressedBlock + 18;
+    zs.avail_in  = blockLength - 16;
+    zs.next_out  = (Bytef*)UncompressedBlock;
+    zs.avail_out = UncompressedBlockSize;
+
+    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+    if ( status != Z_OK ) {
+        fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");
+        return -1;
+    }
+
+    status = inflate(&zs, Z_FINISH);
+    if ( status != Z_STREAM_END ) {
+        inflateEnd(&zs);
+        fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflate() failed\n");
+        return -1;
+    }
+
+    status = inflateEnd(&zs);
+    if ( status != Z_OK ) {
+        fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");
+        return -1;
+    }
+
+    return zs.total_out;
+}
+
+// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
+bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) {
+
+    // determine open mode
+    if ( strcmp(mode, "rb") == 0 )
+        IsWriteOnly = false;
+    else if ( strcmp(mode, "wb") == 0) 
+        IsWriteOnly = true;
+    else {
+        fprintf(stderr, "BGZF ERROR: unknown file mode: %s\n", mode);
+        return false; 
+    }
+
+    // ----------------------------------------------------------------
+    // open Stream to read to/write from file, stdin, or stdout
+    // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)
+    
+    // read/write BGZF data to/from a file
+    if ( (filename != "stdin") && (filename != "stdout") )
+        Stream = fopen(filename.c_str(), mode);
+    
+    // read BGZF data from stdin
+    else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) )
+        Stream = freopen(NULL, mode, stdin);
+    
+    // write BGZF data to stdout
+    else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) )
+        Stream = freopen(NULL, mode, stdout);
+
+    if ( !Stream ) {
+        fprintf(stderr, "BGZF ERROR: unable to open file %s\n", filename.c_str() );
+        return false;
+    }
+    
+    // set flags, return success
+    IsOpen = true;
+    IsWriteUncompressed = isWriteUncompressed;
+    return true;
+}
+
+// reads BGZF data into a byte buffer
+int BgzfData::Read(char* data, const unsigned int dataLength) {
+
+   if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0;
+
+   char* output = data;
+   unsigned int numBytesRead = 0;
+   while ( numBytesRead < dataLength ) {
+
+       int bytesAvailable = BlockLength - BlockOffset;
+       if ( bytesAvailable <= 0 ) {
+           if ( !ReadBlock() ) return -1; 
+           bytesAvailable = BlockLength - BlockOffset;
+           if ( bytesAvailable <= 0 ) break;
+       }
+
+       char* buffer   = UncompressedBlock;
+       int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
+       memcpy(output, buffer + BlockOffset, copyLength);
+
+       BlockOffset  += copyLength;
+       output       += copyLength;
+       numBytesRead += copyLength;
+   }
+
+   if ( BlockOffset == BlockLength ) {
+       BlockAddress = ftell64(Stream);
+       BlockOffset  = 0;
+       BlockLength  = 0;
+   }
+
+   return numBytesRead;
+}
+
+// reads a BGZF block
+bool BgzfData::ReadBlock(void) {
+
+    char    header[BLOCK_HEADER_LENGTH];
+    int64_t blockAddress = ftell64(Stream);
+    
+    int count = fread(header, 1, sizeof(header), Stream);
+    if ( count == 0 ) {
+        BlockLength = 0;
+        return true;
+    }
+
+    if ( count != sizeof(header) ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - could not read block header\n");
+        return false;
+    }
+
+    if ( !BgzfData::CheckBlockHeader(header) ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - invalid block header\n");
+        return false;
+    }
+
+    int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;
+    char* compressedBlock = CompressedBlock;
+    memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
+    int remaining = blockLength - BLOCK_HEADER_LENGTH;
+
+    count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);
+    if ( count != remaining ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - could not read data from block\n");
+        return false;
+    }
+
+    count = InflateBlock(blockLength);
+    if ( count < 0 ) { 
+      fprintf(stderr, "BGZF ERROR: read block failed - could not decompress block data\n");
+      return false;
+    }
+
+    if ( BlockLength != 0 )
+        BlockOffset = 0;
+
+    BlockAddress = blockAddress;
+    BlockLength  = count;
+    return true;
+}
+
+// seek to position in BGZF file
+bool BgzfData::Seek(int64_t position) {
+
+    if ( !IsOpen ) return false;
+  
+    int     blockOffset  = (position & 0xFFFF);
+    int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
+
+    if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) {
+        fprintf(stderr, "BGZF ERROR: unable to seek in file\n");
+        return false;
+    }
+
+    BlockLength  = 0;
+    BlockAddress = blockAddress;
+    BlockOffset  = blockOffset;
+    return true;
+}
+
+// get file position in BGZF file
+int64_t BgzfData::Tell(void) {
+    if ( !IsOpen ) 
+        return false;
+    else 
+        return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
+}
+
+// writes the supplied data into the BGZF buffer
+unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {
+
+    if ( !IsOpen || !IsWriteOnly ) return false;
+  
+    // initialize
+    unsigned int numBytesWritten = 0;
+    const char* input = data;
+    unsigned int blockLength = UncompressedBlockSize;
+
+    // copy the data to the buffer
+    while ( numBytesWritten < dataLen ) {
+      
+        unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);
+        char* buffer = UncompressedBlock;
+        memcpy(buffer + BlockOffset, input, copyLength);
+
+        BlockOffset     += copyLength;
+        input           += copyLength;
+        numBytesWritten += copyLength;
+
+        if ( BlockOffset == blockLength )
+            FlushBlock();
+    }
+
+    return numBytesWritten;
+}