6
|
1 // ***************************************************************************
|
|
2 // BGZF.h (c) 2009 Derek Barnett, Michael Strömberg
|
|
3 // Marth Lab, Department of Biology, Boston College
|
|
4 // All rights reserved.
|
|
5 // ---------------------------------------------------------------------------
|
|
6 // Last modified: 19 November 2010 (DB)
|
|
7 // ---------------------------------------------------------------------------
|
|
8 // BGZF routines were adapted from the bgzf.c code developed at the Broad
|
|
9 // Institute.
|
|
10 // ---------------------------------------------------------------------------
|
|
11 // Provides the basic functionality for reading & writing BGZF files
|
|
12 // ***************************************************************************
|
|
13
|
|
14 #ifndef BGZF_H
|
|
15 #define BGZF_H
|
|
16
|
|
17 #include <api_global.h>
|
|
18 #include <zlib.h>
|
|
19
|
|
20 #include <cstdio>
|
|
21 #include <cstdlib>
|
|
22 #include <cstring>
|
|
23 #include <string>
|
|
24
|
|
// Large-file stream positioning helpers.
// ftell64()/fseek64() expand to _ftelli64()/_fseeki64() when WIN32 is defined,
// and to ftello()/fseeko() otherwise. The BAMTOOLS_LFS guard ensures this
// section is only processed once per translation unit.
#if !defined(BAMTOOLS_LFS)
#define BAMTOOLS_LFS
#  if !defined(WIN32)
#    define ftell64(stream)                 ftello(stream)
#    define fseek64(stream, offset, origin) fseeko(stream, offset, origin)
#  else
#    define ftell64(stream)                 _ftelli64(stream)
#    define fseek64(stream, offset, origin) _fseeki64(stream, offset, origin)
#  endif
#endif // BAMTOOLS_LFS
|
|
36
|
|
// Fixed-width integer type names.
// Compilers that define _MSC_VER get the hand-written typedefs below; every
// other compiler takes the names from <stdint.h>. The BAMTOOLS_TYPES guard
// ensures this section is only processed once per translation unit.
// NOTE(review): int8_t is an alias of plain 'char' in the _MSC_VER branch, so
// it is only signed where 'char' is signed - confirm this is intended.
#if !defined(BAMTOOLS_TYPES)
#define BAMTOOLS_TYPES
#  if !defined(_MSC_VER)
#    include <stdint.h>
#  else
     typedef char               int8_t;
     typedef unsigned char      uint8_t;
     typedef short              int16_t;
     typedef unsigned short     uint16_t;
     typedef int                int32_t;
     typedef unsigned int       uint32_t;
     typedef long long          int64_t;
     typedef unsigned long long uint64_t;
#  endif
#endif // BAMTOOLS_TYPES
|
|
53
|
|
54 namespace BamTools {
|
|
55
|
|
// zlib / gzip constants
const int GZIP_ID1            = 0x1f;  // compared against header byte 0 in CheckBlockHeader
const int GZIP_ID2            = 0x8b;  // compared against header byte 1 in CheckBlockHeader
const int CM_DEFLATE          = 8;
const int FLG_FEXTRA          = 0x04;  // bit tested in header byte 3 in CheckBlockHeader
const int OS_UNKNOWN          = 0xff;
const int BGZF_XLEN           = 6;     // expected 16-bit value at header bytes 10-11
const int BGZF_ID1            = 66;    // ASCII 'B'; compared against header byte 12
const int BGZF_ID2            = 67;    // ASCII 'C'; compared against header byte 13
const int BGZF_LEN            = 2;     // expected 16-bit value at header bytes 14-15
const int GZIP_WINDOW_BITS    = -15;   // presumably the zlib windowBits argument - confirm at call sites
const int Z_DEFAULT_MEM_LEVEL = 8;     // presumably the zlib memLevel argument - confirm at call sites

// BGZF constants
const int BLOCK_HEADER_LENGTH = 18;
const int BLOCK_FOOTER_LENGTH = 8;
const int MAX_BLOCK_SIZE      = 1 << 16;  // 65536
const int DEFAULT_BLOCK_SIZE  = 1 << 16;  // 65536
|
|
74
|
|
75 struct API_EXPORT BgzfData {
|
|
76
|
|
77 // data members
|
|
78 public:
|
|
79 unsigned int UncompressedBlockSize;
|
|
80 unsigned int CompressedBlockSize;
|
|
81 unsigned int BlockLength;
|
|
82 unsigned int BlockOffset;
|
|
83 uint64_t BlockAddress;
|
|
84 bool IsOpen;
|
|
85 bool IsWriteOnly;
|
|
86 bool IsWriteUncompressed;
|
|
87 FILE* Stream;
|
|
88 char* UncompressedBlock;
|
|
89 char* CompressedBlock;
|
|
90
|
|
91 // constructor & destructor
|
|
92 public:
|
|
93 BgzfData(void);
|
|
94 ~BgzfData(void);
|
|
95
|
|
96 // main interface methods
|
|
97 public:
|
|
98 // closes BGZF file
|
|
99 void Close(void);
|
|
100 // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing)
|
|
101 bool Open(const std::string& filename, const char* mode, bool isWriteUncompressed = false);
|
|
102 // reads BGZF data into a byte buffer
|
|
103 int Read(char* data, const unsigned int dataLength);
|
|
104 // seek to position in BGZF file
|
|
105 bool Seek(int64_t position);
|
|
106 // get file position in BGZF file
|
|
107 int64_t Tell(void);
|
|
108 // writes the supplied data into the BGZF buffer
|
|
109 unsigned int Write(const char* data, const unsigned int dataLen);
|
|
110
|
|
111 // internal methods
|
|
112 private:
|
|
113 // compresses the current block
|
|
114 int DeflateBlock(void);
|
|
115 // flushes the data in the BGZF block
|
|
116 void FlushBlock(void);
|
|
117 // de-compresses the current block
|
|
118 int InflateBlock(const int& blockLength);
|
|
119 // reads a BGZF block
|
|
120 bool ReadBlock(void);
|
|
121
|
|
122 // static 'utility' methods
|
|
123 public:
|
|
124 // checks BGZF block header
|
|
125 static inline bool CheckBlockHeader(char* header);
|
|
126 // packs an unsigned integer into the specified buffer
|
|
127 static inline void PackUnsignedInt(char* buffer, unsigned int value);
|
|
128 // packs an unsigned short into the specified buffer
|
|
129 static inline void PackUnsignedShort(char* buffer, unsigned short value);
|
|
130 // unpacks a buffer into a double
|
|
131 static inline double UnpackDouble(char* buffer);
|
|
132 static inline double UnpackDouble(const char* buffer);
|
|
133 // unpacks a buffer into a float
|
|
134 static inline float UnpackFloat(char* buffer);
|
|
135 static inline float UnpackFloat(const char* buffer);
|
|
136 // unpacks a buffer into a signed int
|
|
137 static inline signed int UnpackSignedInt(char* buffer);
|
|
138 static inline signed int UnpackSignedInt(const char* buffer);
|
|
139 // unpacks a buffer into a signed short
|
|
140 static inline signed short UnpackSignedShort(char* buffer);
|
|
141 static inline signed short UnpackSignedShort(const char* buffer);
|
|
142 // unpacks a buffer into an unsigned int
|
|
143 static inline unsigned int UnpackUnsignedInt(char* buffer);
|
|
144 static inline unsigned int UnpackUnsignedInt(const char* buffer);
|
|
145 // unpacks a buffer into an unsigned short
|
|
146 static inline unsigned short UnpackUnsignedShort(char* buffer);
|
|
147 static inline unsigned short UnpackUnsignedShort(const char* buffer);
|
|
148 };
|
|
149
|
|
150 // -------------------------------------------------------------
|
|
151 // static 'utility' method implementations
|
|
152
|
|
153 // checks BGZF block header
|
|
154 inline
|
|
155 bool BgzfData::CheckBlockHeader(char* header) {
|
|
156 return (header[0] == GZIP_ID1 &&
|
|
157 header[1] == (char)GZIP_ID2 &&
|
|
158 header[2] == Z_DEFLATED &&
|
|
159 (header[3] & FLG_FEXTRA) != 0 &&
|
|
160 BgzfData::UnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
|
|
161 header[12] == BGZF_ID1 &&
|
|
162 header[13] == BGZF_ID2 &&
|
|
163 BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN );
|
|
164 }
|
|
165
|
|
166 // 'packs' an unsigned integer into the specified buffer
|
|
167 inline
|
|
168 void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) {
|
|
169 buffer[0] = (char)value;
|
|
170 buffer[1] = (char)(value >> 8);
|
|
171 buffer[2] = (char)(value >> 16);
|
|
172 buffer[3] = (char)(value >> 24);
|
|
173 }
|
|
174
|
|
175 // 'packs' an unsigned short into the specified buffer
|
|
176 inline
|
|
177 void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) {
|
|
178 buffer[0] = (char)value;
|
|
179 buffer[1] = (char)(value >> 8);
|
|
180 }
|
|
181
|
|
182 // 'unpacks' a buffer into a double (includes both non-const & const char* flavors)
|
|
183 inline
|
|
184 double BgzfData::UnpackDouble(char* buffer) {
|
|
185 union { double value; unsigned char valueBuffer[sizeof(double)]; } un;
|
|
186 un.value = 0;
|
|
187 un.valueBuffer[0] = buffer[0];
|
|
188 un.valueBuffer[1] = buffer[1];
|
|
189 un.valueBuffer[2] = buffer[2];
|
|
190 un.valueBuffer[3] = buffer[3];
|
|
191 un.valueBuffer[4] = buffer[4];
|
|
192 un.valueBuffer[5] = buffer[5];
|
|
193 un.valueBuffer[6] = buffer[6];
|
|
194 un.valueBuffer[7] = buffer[7];
|
|
195 return un.value;
|
|
196 }
|
|
197
|
|
198 inline
|
|
199 double BgzfData::UnpackDouble(const char* buffer) {
|
|
200 union { double value; unsigned char valueBuffer[sizeof(double)]; } un;
|
|
201 un.value = 0;
|
|
202 un.valueBuffer[0] = buffer[0];
|
|
203 un.valueBuffer[1] = buffer[1];
|
|
204 un.valueBuffer[2] = buffer[2];
|
|
205 un.valueBuffer[3] = buffer[3];
|
|
206 un.valueBuffer[4] = buffer[4];
|
|
207 un.valueBuffer[5] = buffer[5];
|
|
208 un.valueBuffer[6] = buffer[6];
|
|
209 un.valueBuffer[7] = buffer[7];
|
|
210 return un.value;
|
|
211 }
|
|
212
|
|
213 // 'unpacks' a buffer into a float (includes both non-const & const char* flavors)
|
|
214 inline
|
|
215 float BgzfData::UnpackFloat(char* buffer) {
|
|
216 union { float value; unsigned char valueBuffer[sizeof(float)]; } un;
|
|
217 un.value = 0;
|
|
218 un.valueBuffer[0] = buffer[0];
|
|
219 un.valueBuffer[1] = buffer[1];
|
|
220 un.valueBuffer[2] = buffer[2];
|
|
221 un.valueBuffer[3] = buffer[3];
|
|
222 return un.value;
|
|
223 }
|
|
224
|
|
225 inline
|
|
226 float BgzfData::UnpackFloat(const char* buffer) {
|
|
227 union { float value; unsigned char valueBuffer[sizeof(float)]; } un;
|
|
228 un.value = 0;
|
|
229 un.valueBuffer[0] = buffer[0];
|
|
230 un.valueBuffer[1] = buffer[1];
|
|
231 un.valueBuffer[2] = buffer[2];
|
|
232 un.valueBuffer[3] = buffer[3];
|
|
233 return un.value;
|
|
234 }
|
|
235
|
|
236 // 'unpacks' a buffer into a signed int (includes both non-const & const char* flavors)
|
|
237 inline
|
|
238 signed int BgzfData::UnpackSignedInt(char* buffer) {
|
|
239 union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un;
|
|
240 un.value = 0;
|
|
241 un.valueBuffer[0] = buffer[0];
|
|
242 un.valueBuffer[1] = buffer[1];
|
|
243 un.valueBuffer[2] = buffer[2];
|
|
244 un.valueBuffer[3] = buffer[3];
|
|
245 return un.value;
|
|
246 }
|
|
247
|
|
248 inline
|
|
249 signed int BgzfData::UnpackSignedInt(const char* buffer) {
|
|
250 union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un;
|
|
251 un.value = 0;
|
|
252 un.valueBuffer[0] = buffer[0];
|
|
253 un.valueBuffer[1] = buffer[1];
|
|
254 un.valueBuffer[2] = buffer[2];
|
|
255 un.valueBuffer[3] = buffer[3];
|
|
256 return un.value;
|
|
257 }
|
|
258
|
|
259 // 'unpacks' a buffer into a signed short (includes both non-const & const char* flavors)
|
|
260 inline
|
|
261 signed short BgzfData::UnpackSignedShort(char* buffer) {
|
|
262 union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un;
|
|
263 un.value = 0;
|
|
264 un.valueBuffer[0] = buffer[0];
|
|
265 un.valueBuffer[1] = buffer[1];
|
|
266 return un.value;
|
|
267 }
|
|
268
|
|
269 inline
|
|
270 signed short BgzfData::UnpackSignedShort(const char* buffer) {
|
|
271 union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un;
|
|
272 un.value = 0;
|
|
273 un.valueBuffer[0] = buffer[0];
|
|
274 un.valueBuffer[1] = buffer[1];
|
|
275 return un.value;
|
|
276 }
|
|
277
|
|
278 // 'unpacks' a buffer into an unsigned int (includes both non-const & const char* flavors)
|
|
279 inline
|
|
280 unsigned int BgzfData::UnpackUnsignedInt(char* buffer) {
|
|
281 union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
|
|
282 un.value = 0;
|
|
283 un.valueBuffer[0] = buffer[0];
|
|
284 un.valueBuffer[1] = buffer[1];
|
|
285 un.valueBuffer[2] = buffer[2];
|
|
286 un.valueBuffer[3] = buffer[3];
|
|
287 return un.value;
|
|
288 }
|
|
289
|
|
290 inline
|
|
291 unsigned int BgzfData::UnpackUnsignedInt(const char* buffer) {
|
|
292 union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
|
|
293 un.value = 0;
|
|
294 un.valueBuffer[0] = buffer[0];
|
|
295 un.valueBuffer[1] = buffer[1];
|
|
296 un.valueBuffer[2] = buffer[2];
|
|
297 un.valueBuffer[3] = buffer[3];
|
|
298 return un.value;
|
|
299 }
|
|
300
|
|
301 // 'unpacks' a buffer into an unsigned short (includes both non-const & const char* flavors)
|
|
302 inline
|
|
303 unsigned short BgzfData::UnpackUnsignedShort(char* buffer) {
|
|
304 union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un;
|
|
305 un.value = 0;
|
|
306 un.valueBuffer[0] = buffer[0];
|
|
307 un.valueBuffer[1] = buffer[1];
|
|
308 return un.value;
|
|
309 }
|
|
310
|
|
311 inline
|
|
312 unsigned short BgzfData::UnpackUnsignedShort(const char* buffer) {
|
|
313 union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un;
|
|
314 un.value = 0;
|
|
315 un.valueBuffer[0] = buffer[0];
|
|
316 un.valueBuffer[1] = buffer[1];
|
|
317 return un.value;
|
|
318 }
|
|
319
|
|
320 } // namespace BamTools
|
|
321
|
|
322 #endif // BGZF_H
|