6
|
// ***************************************************************************
// BamStandardIndex.h (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 19 November 2010 (DB)
// ---------------------------------------------------------------------------
// Provides index operations for the standardized BAM index format (".bai")
// ***************************************************************************
|
|
10
|
|
11 #ifndef BAM_STANDARD_INDEX_FORMAT_H
|
|
12 #define BAM_STANDARD_INDEX_FORMAT_H
|
|
13
|
|
// -------------
// W A R N I N G
// -------------
//
// This file is not part of the BamTools API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
|
|
23
|
|
24 #include <BamAux.h>
|
|
25 #include <BamIndex.h>
|
|
26 #include <map>
|
|
27 #include <string>
|
|
28 #include <vector>
|
|
29
|
|
30 namespace BamTools {
|
|
31
|
|
32 class BamAlignment;
|
|
33
|
|
34 namespace Internal {
|
|
35
|
|
// BAM index constants

// Upper bound on the number of bins per reference: (8^6 - 1) / 7 + 1 == 37450.
// Also used below as the extent of the bin array passed to BinsFromRegion().
const int MAX_BIN        = 37450;

// Shift used for the linear index (1 << 14 == 16384).
// NOTE(review): presumably the linear-offset window size in bases, as in the
// BAI format - confirm against the implementation file.
const int BAM_LIDX_SHIFT = 14;
|
|
39
|
|
// --------------------------------------------------
// BamStandardIndex data structures & typedefs
|
|
// A [Start, Stop] pair of 64-bit offsets describing one index chunk.
// NOTE(review): the offsets are presumably positions in the BAM file
// - confirm against the code that fills them in.
struct Chunk {

    // data members
    uint64_t Start;  // offset where the chunk begins
    uint64_t Stop;   // offset where the chunk ends

    // constructor
    // Both bounds default to 0, so a default-constructed Chunk is {0, 0}.
    Chunk(const uint64_t& start = 0,
          const uint64_t& stop  = 0)
        : Start(start)
        , Stop(stop)
    { }
};
|
|
55
|
|
56 inline
|
|
57 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
|
|
58 return lhs.Start < rhs.Start;
|
|
59 }
|
|
60
|
|
61 typedef std::vector<Chunk> ChunkVector;
|
|
62 typedef std::map<uint32_t, ChunkVector> BamBinMap;
|
|
63 typedef std::vector<uint64_t> LinearOffsetVector;
|
|
64
|
|
65 struct ReferenceIndex {
|
|
66
|
|
67 // data members
|
|
68 BamBinMap Bins;
|
|
69 LinearOffsetVector Offsets;
|
|
70 bool HasAlignments;
|
|
71
|
|
72 // constructor
|
|
73 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
|
|
74 const LinearOffsetVector& offsets = LinearOffsetVector(),
|
|
75 const bool hasAlignments = false)
|
|
76 : Bins(binMap)
|
|
77 , Offsets(offsets)
|
|
78 , HasAlignments(hasAlignments)
|
|
79 { }
|
|
80 };
|
|
81
|
|
82 typedef std::map<int32_t, ReferenceIndex> BamStandardIndexData;
|
|
83
|
|
84 class BamStandardIndex : public BamIndex {
|
|
85
|
|
86 // ctor & dtor
|
|
87 public:
|
|
88 BamStandardIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader);
|
|
89 ~BamStandardIndex(void);
|
|
90
|
|
91 // interface (implements BamIndex virtual methods)
|
|
92 public:
|
|
93 // creates index data (in-memory) from current reader data
|
|
94 bool Build(void);
|
|
95 // returns supported file extension
|
|
96 const std::string Extension(void) const { return std::string(".bai"); }
|
|
97 // returns whether reference has alignments or no
|
|
98 bool HasAlignments(const int& referenceID) const;
|
|
99 // attempts to use index to jump to region; returns success/fail
|
|
100 // a "successful" jump indicates no error, but not whether this region has data
|
|
101 // * thus, the method sets a flag to indicate whether there are alignments
|
|
102 // available after the jump position
|
|
103 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
|
|
104 public:
|
|
105 // clear all current index offset data in memory
|
|
106 void ClearAllData(void);
|
|
107 // return file position after header metadata
|
|
108 const off_t DataBeginOffset(void) const;
|
|
109 // return true if all index data is cached
|
|
110 bool HasFullDataCache(void) const;
|
|
111 // clears index data from all references except the first
|
|
112 void KeepOnlyFirstReferenceOffsets(void);
|
|
113 // load index data for all references, return true if loaded OK
|
|
114 // @saveData - save data in memory if true, just read & discard if false
|
|
115 bool LoadAllReferences(bool saveData = true);
|
|
116 // load first reference from file, return true if loaded OK
|
|
117 // @saveData - save data in memory if true, just read & discard if false
|
|
118 bool LoadFirstReference(bool saveData = true);
|
|
119 // load header data from index file, return true if loaded OK
|
|
120 bool LoadHeader(void);
|
|
121 // position file pointer to first reference begin, return true if skipped OK
|
|
122 bool SkipToFirstReference(void);
|
|
123 // write index reference data
|
|
124 bool WriteAllReferences(void);
|
|
125 // write index header data
|
|
126 bool WriteHeader(void);
|
|
127
|
|
128 // 'internal' methods
|
|
129 public:
|
|
130
|
|
131 // -----------------------
|
|
132 // index file operations
|
|
133
|
|
134 // check index file magic number, return true if OK
|
|
135 bool CheckMagicNumber(void);
|
|
136 // check index file version, return true if OK
|
|
137 bool CheckVersion(void);
|
|
138 // load a single index bin entry from file, return true if loaded OK
|
|
139 // @saveData - save data in memory if true, just read & discard if false
|
|
140 bool LoadBin(ReferenceIndex& refEntry, bool saveData = true);
|
|
141 bool LoadBins(ReferenceIndex& refEntry, bool saveData = true);
|
|
142 // load a single index bin entry from file, return true if loaded OK
|
|
143 // @saveData - save data in memory if true, just read & discard if false
|
|
144 bool LoadChunk(ChunkVector& chunks, bool saveData = true);
|
|
145 bool LoadChunks(ChunkVector& chunks, bool saveData = true);
|
|
146 // load a single index linear offset entry from file, return true if loaded OK
|
|
147 // @saveData - save data in memory if true, just read & discard if false
|
|
148 bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true);
|
|
149 // load a single reference from file, return true if loaded OK
|
|
150 // @saveData - save data in memory if true, just read & discard if false
|
|
151 bool LoadReference(const int& refId, bool saveData = true);
|
|
152 // loads number of references, return true if loaded OK
|
|
153 bool LoadReferenceCount(int& numReferences);
|
|
154 // position file pointer to desired reference begin, return true if skipped OK
|
|
155 bool SkipToReference(const int& refId);
|
|
156 // write index data for bin to new index file
|
|
157 bool WriteBin(const uint32_t& binId, const ChunkVector& chunks);
|
|
158 // write index data for bins to new index file
|
|
159 bool WriteBins(const BamBinMap& bins);
|
|
160 // write index data for chunk entry to new index file
|
|
161 bool WriteChunk(const Chunk& chunk);
|
|
162 // write index data for chunk entry to new index file
|
|
163 bool WriteChunks(const ChunkVector& chunks);
|
|
164 // write index data for linear offsets entry to new index file
|
|
165 bool WriteLinearOffsets(const LinearOffsetVector& offsets);
|
|
166 // write index data single reference to new index file
|
|
167 bool WriteReference(const ReferenceIndex& refEntry);
|
|
168
|
|
169 // -----------------------
|
|
170 // index data operations
|
|
171
|
|
172 // calculate bins that overlap region
|
|
173 int BinsFromRegion(const BamRegion& region,
|
|
174 const bool isRightBoundSpecified,
|
|
175 uint16_t bins[MAX_BIN]);
|
|
176 // clear all index offset data for desired reference
|
|
177 void ClearReferenceOffsets(const int& refId);
|
|
178 // calculates offset(s) for a given region
|
|
179 bool GetOffsets(const BamRegion& region,
|
|
180 const bool isRightBoundSpecified,
|
|
181 std::vector<int64_t>& offsets,
|
|
182 bool* hasAlignmentsInRegion);
|
|
183 // returns true if index cache has data for desired reference
|
|
184 bool IsDataLoaded(const int& refId) const;
|
|
185 // clears index data from all references except the one specified
|
|
186 void KeepOnlyReferenceOffsets(const int& refId);
|
|
187 // simplifies index by merging 'chunks'
|
|
188 void MergeChunks(void);
|
|
189 // saves BAM bin entry for index
|
|
190 void SaveBinEntry(BamBinMap& binMap,
|
|
191 const uint32_t& saveBin,
|
|
192 const uint64_t& saveOffset,
|
|
193 const uint64_t& lastOffset);
|
|
194 // saves linear offset entry for index
|
|
195 void SaveLinearOffset(LinearOffsetVector& offsets,
|
|
196 const BamAlignment& bAlignment,
|
|
197 const uint64_t& lastOffset);
|
|
198 // initializes index data structure to hold @count references
|
|
199 void SetReferenceCount(const int& count);
|
|
200
|
|
201 // data members
|
|
202 private:
|
|
203
|
|
204 BamStandardIndexData m_indexData;
|
|
205 off_t m_dataBeginOffset;
|
|
206 bool m_hasFullDataCache;
|
|
207 bool m_isBigEndian;
|
|
208 };
|
|
209
|
|
210 } // namespace Internal
|
|
211 } // namespace BamTools
|
|
212
|
|
213 #endif // BAM_STANDARD_INDEX_FORMAT_H
|