Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/coverageBed/coverageBed.cpp @ 0:dfcd8b6c1bda
Uploaded
| author | aaronquinlan |
|---|---|
| date | Thu, 03 Nov 2011 10:25:04 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfcd8b6c1bda |
|---|---|
| 1 /***************************************************************************** | |
| 2 coverageBed.cpp | |
| 3 | |
| 4 (c) 2009 - Aaron Quinlan | |
| 5 Hall Laboratory | |
| 6 Department of Biochemistry and Molecular Genetics | |
| 7 University of Virginia | |
| 8 aaronquinlan@gmail.com | |
| 9 | |
| 10 Licenced under the GNU General Public License 2.0 license. | |
| 11 ******************************************************************************/ | |
| 12 #include "lineFileUtilities.h" | |
| 13 #include "coverageBed.h" | |
| 14 | |
| 15 // build | |
| 16 BedCoverage::BedCoverage(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand, | |
| 17 bool writeHistogram, bool bamInput, bool obeySplits, | |
| 18 bool eachBase, bool countsOnly) { | |
| 19 | |
| 20 _bedAFile = bedAFile; | |
| 21 _bedBFile = bedBFile; | |
| 22 | |
| 23 _bedA = new BedFile(bedAFile); | |
| 24 _bedB = new BedFile(bedBFile); | |
| 25 | |
| 26 _sameStrand = sameStrand; | |
| 27 _diffStrand = diffStrand; | |
| 28 _obeySplits = obeySplits; | |
| 29 _eachBase = eachBase; | |
| 30 _writeHistogram = writeHistogram; | |
| 31 _bamInput = bamInput; | |
| 32 _countsOnly = countsOnly; | |
| 33 | |
| 34 | |
| 35 if (_bamInput == false) | |
| 36 CollectCoverageBed(); | |
| 37 else | |
| 38 CollectCoverageBam(_bedA->bedFile); | |
| 39 } | |
| 40 | |
| 41 // destroy | |
| 42 BedCoverage::~BedCoverage(void) { | |
| 43 delete _bedA; | |
| 44 delete _bedB; | |
| 45 } | |
| 46 | |
| 47 | |
| 48 void BedCoverage::CollectCoverageBed() { | |
| 49 | |
| 50 // load the "B" bed file into a map so | |
| 51 // that we can easily compare "A" to it for overlaps | |
| 52 _bedB->loadBedCovFileIntoMap(); | |
| 53 | |
| 54 int lineNum = 0; // current input line number | |
| 55 BED a, nullBed; | |
| 56 BedLineStatus bedStatus; | |
| 57 | |
| 58 _bedA->Open(); | |
| 59 // process each entry in A | |
| 60 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { | |
| 61 if (bedStatus == BED_VALID) { | |
| 62 // process the BED entry as a single block | |
| 63 if (_obeySplits == false) | |
| 64 _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly); | |
| 65 // split the BED into discrete blocksand process each independently. | |
| 66 else { | |
| 67 bedVector bedBlocks; | |
| 68 splitBedIntoBlocks(a, lineNum, bedBlocks); | |
| 69 | |
| 70 // use countSplitHits to avoid over-counting each split chunk | |
| 71 // as distinct read coverage. | |
| 72 _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly); | |
| 73 } | |
| 74 a = nullBed; | |
| 75 } | |
| 76 } | |
| 77 _bedA->Close(); | |
| 78 | |
| 79 // report the coverage (summary or histogram) for BED B. | |
| 80 if (_countsOnly == true) | |
| 81 ReportCounts(); | |
| 82 else | |
| 83 ReportCoverage(); | |
| 84 } | |
| 85 | |
| 86 | |
| 87 void BedCoverage::CollectCoverageBam(string bamFile) { | |
| 88 | |
| 89 // load the "B" bed file into a map so | |
| 90 // that we can easily compare "A" to it for overlaps | |
| 91 _bedB->loadBedCovFileIntoMap(); | |
| 92 | |
| 93 // open the BAM file | |
| 94 BamReader reader; | |
| 95 reader.Open(bamFile); | |
| 96 | |
| 97 // get header & reference information | |
| 98 string header = reader.GetHeaderText(); | |
| 99 RefVector refs = reader.GetReferenceData(); | |
| 100 | |
| 101 // convert each aligned BAM entry to BED | |
| 102 // and compute coverage on B | |
| 103 BamAlignment bam; | |
| 104 while (reader.GetNextAlignment(bam)) { | |
| 105 if (bam.IsMapped()) { | |
| 106 // treat the BAM alignment as a single "block" | |
| 107 if (_obeySplits == false) { | |
| 108 // construct a new BED entry from the current BAM alignment. | |
| 109 BED a; | |
| 110 a.chrom = refs.at(bam.RefID).RefName; | |
| 111 a.start = bam.Position; | |
| 112 a.end = bam.GetEndPosition(false, false); | |
| 113 a.strand = "+"; | |
| 114 if (bam.IsReverseStrand()) a.strand = "-"; | |
| 115 | |
| 116 _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly); | |
| 117 } | |
| 118 // split the BAM alignment into discrete blocks and | |
| 119 // look for overlaps only within each block. | |
| 120 else { | |
| 121 // vec to store the discrete BED "blocks" from a | |
| 122 bedVector bedBlocks; | |
| 123 // since we are counting coverage, we do want to split blocks when a | |
| 124 // deletion (D) CIGAR op is encountered (hence the true for the last parm) | |
| 125 getBamBlocks(bam, refs, bedBlocks, true); | |
| 126 // use countSplitHits to avoid over-counting each split chunk | |
| 127 // as distinct read coverage. | |
| 128 _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly); | |
| 129 } | |
| 130 } | |
| 131 } | |
| 132 // report the coverage (summary or histogram) for BED B. | |
| 133 if (_countsOnly == true) | |
| 134 ReportCounts(); | |
| 135 else | |
| 136 ReportCoverage(); | |
| 137 // close the BAM file | |
| 138 reader.Close(); | |
| 139 } | |
| 140 | |
| 141 | |
| 142 void BedCoverage::ReportCounts() { | |
| 143 | |
| 144 | |
| 145 // process each chromosome | |
| 146 masterBedCovMap::const_iterator chromItr = _bedB->bedCovMap.begin(); | |
| 147 masterBedCovMap::const_iterator chromEnd = _bedB->bedCovMap.end(); | |
| 148 for (; chromItr != chromEnd; ++chromItr) | |
| 149 { | |
| 150 // for each chrom, process each bin | |
| 151 binsToBedCovs::const_iterator binItr = chromItr->second.begin(); | |
| 152 binsToBedCovs::const_iterator binEnd = chromItr->second.end(); | |
| 153 for (; binItr != binEnd; ++binItr) | |
| 154 { | |
| 155 // for each chrom & bin, compute and report | |
| 156 // the observed coverage for each feature | |
| 157 vector<BEDCOV>::const_iterator bedItr = binItr->second.begin(); | |
| 158 vector<BEDCOV>::const_iterator bedEnd = binItr->second.end(); | |
| 159 for (; bedItr != bedEnd; ++bedItr) | |
| 160 { | |
| 161 _bedB->reportBedTab(*bedItr); | |
| 162 printf("%d\n", bedItr->count); | |
| 163 } | |
| 164 } | |
| 165 } | |
| 166 } | |
| 167 | |
| 168 void BedCoverage::ReportCoverage() { | |
| 169 | |
| 170 map<unsigned int, unsigned int> allDepthHist; | |
| 171 unsigned int totalLength = 0; | |
| 172 | |
| 173 // process each chromosome | |
| 174 masterBedCovMap::const_iterator chromItr = _bedB->bedCovMap.begin(); | |
| 175 masterBedCovMap::const_iterator chromEnd = _bedB->bedCovMap.end(); | |
| 176 for (; chromItr != chromEnd; ++chromItr) | |
| 177 { | |
| 178 // for each chrom, process each bin | |
| 179 binsToBedCovs::const_iterator binItr = chromItr->second.begin(); | |
| 180 binsToBedCovs::const_iterator binEnd = chromItr->second.end(); | |
| 181 for (; binItr != binEnd; ++binItr) | |
| 182 { | |
| 183 // for each chrom & bin, compute and report | |
| 184 // the observed coverage for each feature | |
| 185 vector<BEDCOV>::const_iterator bedItr = binItr->second.begin(); | |
| 186 vector<BEDCOV>::const_iterator bedEnd = binItr->second.end(); | |
| 187 for (; bedItr != bedEnd; ++bedItr) | |
| 188 { | |
| 189 int zeroDepthCount = 0; // number of bases with zero depth | |
| 190 int depth = 0; // tracks the depth at the current base | |
| 191 | |
| 192 // the start is either the first base in the feature OR | |
| 193 // the leftmost position of an overlapping feature. e.g. (s = start): | |
| 194 // A ---------- | |
| 195 // B s ------------ | |
| 196 int start = min(bedItr->minOverlapStart, bedItr->start); | |
| 197 | |
| 198 // track the number of bases in the feature covered by | |
| 199 // 0, 1, 2, ... n features in A | |
| 200 map<unsigned int, unsigned int> depthHist; | |
| 201 map<unsigned int, DEPTH>::const_iterator depthItr; | |
| 202 | |
| 203 // compute the coverage observed at each base in the feature marching from start to end. | |
| 204 for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) | |
| 205 { | |
| 206 // map pointer grabbing the starts and ends observed at this position | |
| 207 depthItr = bedItr->depthMap.find(pos); | |
| 208 // increment coverage if starts observed at this position. | |
| 209 if (depthItr != bedItr->depthMap.end()) | |
| 210 depth += depthItr->second.starts; | |
| 211 // update coverage assuming the current position is within the current B feature | |
| 212 if ((pos > bedItr->start) && (pos <= bedItr->end)) { | |
| 213 if (depth == 0) zeroDepthCount++; | |
| 214 // update our histograms, assuming we are not reporting "per-base" coverage. | |
| 215 if (_eachBase == false) { | |
| 216 depthHist[depth]++; | |
| 217 allDepthHist[depth]++; | |
| 218 } | |
| 219 else if ((_eachBase == true) && (bedItr->zeroLength == false)) | |
| 220 { | |
| 221 _bedB->reportBedTab(*bedItr); | |
| 222 printf("%d\t%d\n", pos-bedItr->start, depth); | |
| 223 } | |
| 224 } | |
| 225 // decrement coverage if ends observed at this position. | |
| 226 if (depthItr != bedItr->depthMap.end()) | |
| 227 depth = depth - depthItr->second.ends; | |
| 228 } | |
| 229 | |
| 230 // handle the special case where the user wants "per-base" depth | |
| 231 // but the current feature is length = 0. | |
| 232 if ((_eachBase == true) && (bedItr->zeroLength == true)) { | |
| 233 _bedB->reportBedTab(*bedItr); | |
| 234 printf("1\t%d\n",depth); | |
| 235 } | |
| 236 // Summarize the coverage for the current interval, | |
| 237 // assuming the user has not requested "per-base" coverage. | |
| 238 else if (_eachBase == false) | |
| 239 { | |
| 240 CHRPOS length = bedItr->end - bedItr->start; | |
| 241 if (bedItr->zeroLength == true) { | |
| 242 length = 0; | |
| 243 } | |
| 244 totalLength += length; | |
| 245 int nonZeroBases = (length - zeroDepthCount); | |
| 246 | |
| 247 float fractCovered = 0.0; | |
| 248 if (bedItr->zeroLength == false) { | |
| 249 fractCovered = (float) nonZeroBases / length; | |
| 250 } | |
| 251 | |
| 252 // print a summary of the coverage | |
| 253 if (_writeHistogram == false) { | |
| 254 _bedB->reportBedTab(*bedItr); | |
| 255 printf("%d\t%d\t%d\t%0.7f\n", bedItr->count, nonZeroBases, length, fractCovered); | |
| 256 } | |
| 257 // HISTOGRAM | |
| 258 // report the number of bases with coverage == x | |
| 259 else { | |
| 260 // produce a histogram when not a zero length feature. | |
| 261 if (bedItr->zeroLength == false) { | |
| 262 map<unsigned int, unsigned int>::const_iterator histItr = depthHist.begin(); | |
| 263 map<unsigned int, unsigned int>::const_iterator histEnd = depthHist.end(); | |
| 264 for (; histItr != histEnd; ++histItr) | |
| 265 { | |
| 266 float fractAtThisDepth = (float) histItr->second / length; | |
| 267 _bedB->reportBedTab(*bedItr); | |
| 268 printf("%d\t%d\t%d\t%0.7f\n", histItr->first, histItr->second, length, fractAtThisDepth); | |
| 269 } | |
| 270 } | |
| 271 // special case when it is a zero length feauture. | |
| 272 else { | |
| 273 _bedB->reportBedTab(*bedItr); | |
| 274 printf("%d\t%d\t%d\t%0.7f\n", bedItr->count, 0, 0, 1.0000000); | |
| 275 } | |
| 276 } | |
| 277 } | |
| 278 } | |
| 279 } | |
| 280 } | |
| 281 // report a histogram of coverage among _all_ | |
| 282 // features in B. | |
| 283 if (_writeHistogram == true) { | |
| 284 map<unsigned int, unsigned int>::const_iterator histItr = allDepthHist.begin(); | |
| 285 map<unsigned int, unsigned int>::const_iterator histEnd = allDepthHist.end(); | |
| 286 for (; histItr != histEnd; ++histItr) { | |
| 287 float fractAtThisDepth = (float) histItr->second / totalLength; | |
| 288 printf("all\t%d\t%d\t%d\t%0.7f\n", histItr->first, histItr->second, totalLength, fractAtThisDepth); | |
| 289 } | |
| 290 } | |
| 291 } | |
| 292 | |
| 293 |
