annotate spp/src/BamStandardIndex_p.h @ 6:ce08b0efa3fd draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:11:40 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1 // ***************************************************************************
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2 // BamStandardIndex.h (c) 2010 Derek Barnett
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
3 // Marth Lab, Department of Biology, Boston College
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
4 // All rights reserved.
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
5 // ---------------------------------------------------------------------------
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
6 // Last modified: 19 November 2010 (DB)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
7 // ---------------------------------------------------------------------------
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
8 // Provides index operations for the standardized BAM index format (".bai")
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
9 // ***************************************************************************
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
10
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
11 #ifndef BAM_STANDARD_INDEX_FORMAT_H
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
12 #define BAM_STANDARD_INDEX_FORMAT_H
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
13
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
14 // -------------
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
15 // W A R N I N G
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
16 // -------------
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
17 //
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
18 // This file is not part of the BamTools API. It exists purely as an
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
19 // implementation detail. This header file may change from version to
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
20 // version without notice, or even be removed.
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
21 //
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
22 // We mean it.
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
23
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
24 #include <BamAux.h>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
25 #include <BamIndex.h>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
26 #include <map>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
27 #include <string>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
28 #include <vector>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
29
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
30 namespace BamTools {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
31
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
32 class BamAlignment;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
33
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
34 namespace Internal {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
35
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
// BAM index constants
// MAX_BIN: size of the bin-id buffers used by this index (see BinsFromRegion).
const int MAX_BIN        = 37450;   // =(8^6-1)/7+1
// BAM_LIDX_SHIFT: bit shift associated with the linear offsets
// NOTE(review): presumably 2^14-base windows per the BAM spec - confirm in the .cpp
const int BAM_LIDX_SHIFT = 14;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
39
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
40 // --------------------------------------------------
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
41 // BamStandardIndex data structures & typedefs
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
// A pair of offsets (Start, Stop) stored per index bin.
// Both values default to 0 when not supplied.
struct Chunk {

    // data members
    uint64_t Start;     // offset where the chunk begins
    uint64_t Stop;      // offset where the chunk ends

    // constructor
    // @start - initial value for Start (default 0)
    // @stop  - initial value for Stop  (default 0)
    Chunk(const uint64_t& start = 0,
          const uint64_t& stop  = 0)
        : Start(start)
        , Stop(stop)
    { }
};
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
55
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
56 inline
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
57 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
58 return lhs.Start < rhs.Start;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
59 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
60
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
61 typedef std::vector<Chunk> ChunkVector;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
62 typedef std::map<uint32_t, ChunkVector> BamBinMap;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
63 typedef std::vector<uint64_t> LinearOffsetVector;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
64
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
65 struct ReferenceIndex {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
66
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
67 // data members
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
68 BamBinMap Bins;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
69 LinearOffsetVector Offsets;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
70 bool HasAlignments;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
71
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
72 // constructor
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
73 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
74 const LinearOffsetVector& offsets = LinearOffsetVector(),
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
75 const bool hasAlignments = false)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
76 : Bins(binMap)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
77 , Offsets(offsets)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
78 , HasAlignments(hasAlignments)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
79 { }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
80 };
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
81
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
82 typedef std::map<int32_t, ReferenceIndex> BamStandardIndexData;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
83
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
84 class BamStandardIndex : public BamIndex {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
85
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
86 // ctor & dtor
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
87 public:
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
88 BamStandardIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
89 ~BamStandardIndex(void);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
90
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
91 // interface (implements BamIndex virtual methods)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
92 public:
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
93 // creates index data (in-memory) from current reader data
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
94 bool Build(void);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
95 // returns supported file extension
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
96 const std::string Extension(void) const { return std::string(".bai"); }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
97 // returns whether reference has alignments or no
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
98 bool HasAlignments(const int& referenceID) const;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
99 // attempts to use index to jump to region; returns success/fail
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
100 // a "successful" jump indicates no error, but not whether this region has data
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
101 // * thus, the method sets a flag to indicate whether there are alignments
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
102 // available after the jump position
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
103 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
104 public:
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
105 // clear all current index offset data in memory
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
106 void ClearAllData(void);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
107 // return file position after header metadata
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
108 const off_t DataBeginOffset(void) const;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
109 // return true if all index data is cached
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
110 bool HasFullDataCache(void) const;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
111 // clears index data from all references except the first
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
112 void KeepOnlyFirstReferenceOffsets(void);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
113 // load index data for all references, return true if loaded OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
114 // @saveData - save data in memory if true, just read & discard if false
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
115 bool LoadAllReferences(bool saveData = true);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
116 // load first reference from file, return true if loaded OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
117 // @saveData - save data in memory if true, just read & discard if false
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
118 bool LoadFirstReference(bool saveData = true);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
119 // load header data from index file, return true if loaded OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
120 bool LoadHeader(void);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
121 // position file pointer to first reference begin, return true if skipped OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
122 bool SkipToFirstReference(void);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
123 // write index reference data
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
124 bool WriteAllReferences(void);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
125 // write index header data
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
126 bool WriteHeader(void);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
127
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
128 // 'internal' methods
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
129 public:
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
130
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
131 // -----------------------
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
132 // index file operations
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
133
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
134 // check index file magic number, return true if OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
135 bool CheckMagicNumber(void);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
136 // check index file version, return true if OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
137 bool CheckVersion(void);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
138 // load a single index bin entry from file, return true if loaded OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
139 // @saveData - save data in memory if true, just read & discard if false
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
140 bool LoadBin(ReferenceIndex& refEntry, bool saveData = true);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
141 bool LoadBins(ReferenceIndex& refEntry, bool saveData = true);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
142 // load a single index bin entry from file, return true if loaded OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
143 // @saveData - save data in memory if true, just read & discard if false
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
144 bool LoadChunk(ChunkVector& chunks, bool saveData = true);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
145 bool LoadChunks(ChunkVector& chunks, bool saveData = true);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
146 // load a single index linear offset entry from file, return true if loaded OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
147 // @saveData - save data in memory if true, just read & discard if false
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
148 bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
149 // load a single reference from file, return true if loaded OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
150 // @saveData - save data in memory if true, just read & discard if false
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
151 bool LoadReference(const int& refId, bool saveData = true);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
152 // loads number of references, return true if loaded OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
153 bool LoadReferenceCount(int& numReferences);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
154 // position file pointer to desired reference begin, return true if skipped OK
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
155 bool SkipToReference(const int& refId);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
156 // write index data for bin to new index file
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
157 bool WriteBin(const uint32_t& binId, const ChunkVector& chunks);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
158 // write index data for bins to new index file
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
159 bool WriteBins(const BamBinMap& bins);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
160 // write index data for chunk entry to new index file
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
161 bool WriteChunk(const Chunk& chunk);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
162 // write index data for chunk entry to new index file
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
163 bool WriteChunks(const ChunkVector& chunks);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
164 // write index data for linear offsets entry to new index file
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
165 bool WriteLinearOffsets(const LinearOffsetVector& offsets);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
166 // write index data single reference to new index file
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
167 bool WriteReference(const ReferenceIndex& refEntry);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
168
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
169 // -----------------------
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
170 // index data operations
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
171
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
172 // calculate bins that overlap region
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
173 int BinsFromRegion(const BamRegion& region,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
174 const bool isRightBoundSpecified,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
175 uint16_t bins[MAX_BIN]);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
176 // clear all index offset data for desired reference
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
177 void ClearReferenceOffsets(const int& refId);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
178 // calculates offset(s) for a given region
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
179 bool GetOffsets(const BamRegion& region,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
180 const bool isRightBoundSpecified,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
181 std::vector<int64_t>& offsets,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
182 bool* hasAlignmentsInRegion);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
183 // returns true if index cache has data for desired reference
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
184 bool IsDataLoaded(const int& refId) const;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
185 // clears index data from all references except the one specified
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
186 void KeepOnlyReferenceOffsets(const int& refId);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
187 // simplifies index by merging 'chunks'
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
188 void MergeChunks(void);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
189 // saves BAM bin entry for index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
190 void SaveBinEntry(BamBinMap& binMap,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
191 const uint32_t& saveBin,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
192 const uint64_t& saveOffset,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
193 const uint64_t& lastOffset);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
194 // saves linear offset entry for index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
195 void SaveLinearOffset(LinearOffsetVector& offsets,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
196 const BamAlignment& bAlignment,
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
197 const uint64_t& lastOffset);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
198 // initializes index data structure to hold @count references
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
199 void SetReferenceCount(const int& count);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
200
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
201 // data members
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
202 private:
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
203
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
204 BamStandardIndexData m_indexData;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
205 off_t m_dataBeginOffset;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
206 bool m_hasFullDataCache;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
207 bool m_isBigEndian;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
208 };
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
209
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
210 } // namespace Internal
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
211 } // namespace BamTools
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
212
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
213 #endif // BAM_STANDARD_INDEX_FORMAT_H