Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/intersectBed/intersectBed.cpp @ 1:bec36315bd12 default tip
Deleted selected files
| author | aaronquinlan |
|---|---|
| date | Sat, 19 Nov 2011 14:17:03 -0500 |
| parents | dfcd8b6c1bda |
| children |
comparison
equal
deleted
inserted
replaced
| 0:dfcd8b6c1bda | 1:bec36315bd12 |
|---|---|
| 1 /***************************************************************************** | |
| 2 intersectBed.cpp | |
| 3 | |
| 4 (c) 2009 - Aaron Quinlan | |
| 5 Hall Laboratory | |
| 6 Department of Biochemistry and Molecular Genetics | |
| 7 University of Virginia | |
| 8 aaronquinlan@gmail.com | |
| 9 | |
| 10 Licenced under the GNU General Public License 2.0 license. | |
| 11 ******************************************************************************/ | |
| 12 #include "lineFileUtilities.h" | |
| 13 #include "intersectBed.h" | |
| 14 | |
| 15 /************************************ | |
| 16 Helper functions | |
| 17 ************************************/ | |
| 18 bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) { | |
| 19 | |
| 20 // how many overlaps are there b/w the bed and the set of hits? | |
| 21 CHRPOS s, e; | |
| 22 int overlapBases; | |
| 23 int numOverlaps = 0; | |
| 24 bool hitsFound = false; | |
| 25 int aLength = (a.end - a.start); // the length of a in b.p. | |
| 26 | |
| 27 // loop through the hits and report those that meet the user's criteria | |
| 28 vector<BED>::const_iterator h = hits.begin(); | |
| 29 vector<BED>::const_iterator hitsEnd = hits.end(); | |
| 30 for (; h != hitsEnd; ++h) { | |
| 31 s = max(a.start, h->start); | |
| 32 e = min(a.end, h->end); | |
| 33 overlapBases = (e - s); // the number of overlapping bases b/w a and b | |
| 34 | |
| 35 // is there enough overlap relative to the user's request? (default ~ 1bp) | |
| 36 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { | |
| 37 // Report the hit if the user doesn't care about reciprocal overlap between A and B. | |
| 38 if (_reciprocal == false) { | |
| 39 hitsFound = true; | |
| 40 numOverlaps++; | |
| 41 if (_printable == true) | |
| 42 ReportOverlapDetail(overlapBases, a, *h, s, e); | |
| 43 } | |
| 44 // we require there to be sufficient __reciprocal__ overlap | |
| 45 else { | |
| 46 int bLength = (h->end - h->start); | |
| 47 float bOverlap = ( (float) overlapBases / (float) bLength ); | |
| 48 if (bOverlap >= _overlapFraction) { | |
| 49 hitsFound = true; | |
| 50 numOverlaps++; | |
| 51 if (_printable == true) | |
| 52 ReportOverlapDetail(overlapBases, a, *h, s, e); | |
| 53 } | |
| 54 } | |
| 55 } | |
| 56 } | |
| 57 // report the summary of the overlaps if requested. | |
| 58 ReportOverlapSummary(a, numOverlaps); | |
| 59 // were hits found for this BED feature? | |
| 60 return hitsFound; | |
| 61 } | |
| 62 | |
| 63 | |
| 64 /* | |
| 65 Constructor | |
| 66 */ | |
| 67 BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, | |
| 68 bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, | |
| 69 float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand, | |
| 70 bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam, | |
| 71 bool sortedInput) { | |
| 72 | |
| 73 _bedAFile = bedAFile; | |
| 74 _bedBFile = bedBFile; | |
| 75 _anyHit = anyHit; | |
| 76 _noHit = noHit; | |
| 77 _writeA = writeA; | |
| 78 _writeB = writeB; | |
| 79 _writeOverlap = writeOverlap; | |
| 80 _writeAllOverlap = writeAllOverlap; | |
| 81 _writeCount = writeCount; | |
| 82 _overlapFraction = overlapFraction; | |
| 83 _sameStrand = sameStrand; | |
| 84 _diffStrand = diffStrand; | |
| 85 _reciprocal = reciprocal; | |
| 86 _obeySplits = obeySplits; | |
| 87 _bamInput = bamInput; | |
| 88 _bamOutput = bamOutput; | |
| 89 _isUncompressedBam = isUncompressedBam; | |
| 90 _sortedInput = sortedInput; | |
| 91 | |
| 92 // should we print each overlap, or does the user want summary information? | |
| 93 _printable = true; | |
| 94 if (_anyHit || _noHit || _writeCount) | |
| 95 _printable = false; | |
| 96 | |
| 97 if (_bamInput == false) | |
| 98 IntersectBed(); | |
| 99 else | |
| 100 IntersectBam(bedAFile); | |
| 101 } | |
| 102 | |
| 103 | |
| 104 /* | |
| 105 Destructor | |
| 106 */ | |
| 107 BedIntersect::~BedIntersect(void) { | |
| 108 } | |
| 109 | |
| 110 | |
| 111 bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { | |
| 112 bool hitsFound = false; | |
| 113 | |
| 114 // collect and report the sufficient hits | |
| 115 _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand); | |
| 116 hitsFound = processHits(a, hits); | |
| 117 return hitsFound; | |
| 118 } | |
| 119 | |
| 120 | |
| 121 void BedIntersect::ReportOverlapDetail(int overlapBases, const BED &a, const BED &b, CHRPOS s, CHRPOS e) { | |
| 122 // default. simple intersection only | |
| 123 if (_writeA == false && _writeB == false && _writeOverlap == false) { | |
| 124 _bedA->reportBedRangeNewLine(a,s,e); | |
| 125 } | |
| 126 // -wa -wb write the original A and B | |
| 127 else if (_writeA == true && _writeB == true) { | |
| 128 _bedA->reportBedTab(a); | |
| 129 _bedB->reportBedNewLine(b); | |
| 130 } | |
| 131 // -wa write just the original A | |
| 132 else if (_writeA == true) { | |
| 133 _bedA->reportBedNewLine(a); | |
| 134 } | |
| 135 // -wb write the intersected portion of A and the original B | |
| 136 else if (_writeB == true) { | |
| 137 _bedA->reportBedRangeTab(a,s,e); | |
| 138 _bedB->reportBedNewLine(b); | |
| 139 } | |
| 140 // -wo write the original A and B plus the no. of overlapping bases. | |
| 141 else if (_writeOverlap == true) { | |
| 142 _bedA->reportBedTab(a); | |
| 143 _bedB->reportBedTab(b); | |
| 144 if (b.zeroLength == false) printf("%d\n", overlapBases); | |
| 145 else printf("0\n"); | |
| 146 } | |
| 147 } | |
| 148 | |
| 149 | |
| 150 void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) { | |
| 151 // -u just report the fact that there was >= 1 overlaps | |
| 152 if (_anyHit && (numOverlapsFound >= 1)) { | |
| 153 _bedA->reportBedNewLine(a); | |
| 154 } | |
| 155 // -c report the total number of features overlapped in B | |
| 156 else if (_writeCount) { | |
| 157 _bedA->reportBedTab(a); | |
| 158 printf("%d\n", numOverlapsFound); | |
| 159 } | |
| 160 // -v report iff there were no overlaps | |
| 161 else if (_noHit && (numOverlapsFound == 0)) { | |
| 162 _bedA->reportBedNewLine(a); | |
| 163 } | |
| 164 // -wao the user wants to force the reporting of 0 overlap | |
| 165 else if (_writeAllOverlap && (numOverlapsFound == 0)) { | |
| 166 _bedA->reportBedTab(a); | |
| 167 _bedB->reportNullBedTab(); | |
| 168 printf("0\n"); | |
| 169 } | |
| 170 } | |
| 171 | |
| 172 | |
| 173 bool BedIntersect::FindOneOrMoreOverlap(const BED &a) { | |
| 174 bool overlapsFound = false; | |
| 175 if (_reciprocal == false) { | |
| 176 overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, | |
| 177 _sameStrand, _diffStrand, _overlapFraction); | |
| 178 } | |
| 179 else { | |
| 180 overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand, | |
| 181 _sameStrand, _diffStrand, _overlapFraction); | |
| 182 } | |
| 183 return overlapsFound; | |
| 184 } | |
| 185 | |
| 186 | |
| 187 void BedIntersect::IntersectBed() { | |
| 188 | |
| 189 // create new BED file objects for A and B | |
| 190 _bedA = new BedFile(_bedAFile); | |
| 191 _bedB = new BedFile(_bedBFile); | |
| 192 | |
| 193 if (_sortedInput == false) { | |
| 194 // load the "B" file into a map in order to | |
| 195 // compare each entry in A to it in search of overlaps. | |
| 196 _bedB->loadBedFileIntoMap(); | |
| 197 | |
| 198 int lineNum = 0; | |
| 199 vector<BED> hits; | |
| 200 hits.reserve(100); | |
| 201 BED a, nullBed; | |
| 202 BedLineStatus bedStatus; | |
| 203 | |
| 204 // open the "A" file, process each BED entry and searh for overlaps. | |
| 205 _bedA->Open(); | |
| 206 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { | |
| 207 if (bedStatus == BED_VALID) { | |
| 208 // treat the BED as a single "block" | |
| 209 if (_obeySplits == false) { | |
| 210 FindOverlaps(a, hits); | |
| 211 hits.clear(); | |
| 212 } | |
| 213 // split the BED12 into blocks and look for overlaps in each discrete block | |
| 214 else { | |
| 215 bedVector bedBlocks; // vec to store the discrete BED "blocks" | |
| 216 splitBedIntoBlocks(a, lineNum, bedBlocks); | |
| 217 | |
| 218 vector<BED>::const_iterator bedItr = bedBlocks.begin(); | |
| 219 vector<BED>::const_iterator bedEnd = bedBlocks.end(); | |
| 220 for (; bedItr != bedEnd; ++bedItr) { | |
| 221 FindOverlaps(*bedItr, hits); | |
| 222 hits.clear(); | |
| 223 } | |
| 224 } | |
| 225 } | |
| 226 a = nullBed; | |
| 227 } | |
| 228 _bedA->Close(); | |
| 229 } | |
| 230 else { | |
| 231 // use the chromsweep algorithm to detect overlaps on the fly. | |
| 232 ChromSweep sweep = ChromSweep(_bedA, _bedB, _sameStrand, _diffStrand); | |
| 233 | |
| 234 pair<BED, vector<BED> > hit_set; | |
| 235 hit_set.second.reserve(10000); | |
| 236 while (sweep.Next(hit_set)) { | |
| 237 processHits(hit_set.first, hit_set.second); | |
| 238 } | |
| 239 } | |
| 240 } | |
| 241 | |
| 242 | |
| 243 void BedIntersect::IntersectBam(string bamFile) { | |
| 244 | |
| 245 // load the "B" bed file into a map so | |
| 246 // that we can easily compare "A" to it for overlaps | |
| 247 _bedB = new BedFile(_bedBFile); | |
| 248 _bedB->loadBedFileIntoMap(); | |
| 249 | |
| 250 // create a dummy BED A file for printing purposes if not | |
| 251 // using BAM output. | |
| 252 if (_bamOutput == false) { | |
| 253 _bedA = new BedFile(_bedAFile); | |
| 254 _bedA->bedType = 6; | |
| 255 } | |
| 256 | |
| 257 // open the BAM file | |
| 258 BamReader reader; | |
| 259 BamWriter writer; | |
| 260 | |
| 261 reader.Open(bamFile); | |
| 262 | |
| 263 // get header & reference information | |
| 264 string bamHeader = reader.GetHeaderText(); | |
| 265 RefVector refs = reader.GetReferenceData(); | |
| 266 | |
| 267 // open a BAM output to stdout if we are writing BAM | |
| 268 if (_bamOutput == true) { | |
| 269 // set compression mode | |
| 270 BamWriter::CompressionMode compressionMode = BamWriter::Compressed; | |
| 271 if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; | |
| 272 writer.SetCompressionMode(compressionMode); | |
| 273 // open our BAM writer | |
| 274 writer.Open("stdout", bamHeader, refs); | |
| 275 } | |
| 276 | |
| 277 vector<BED> hits; | |
| 278 // reserve some space | |
| 279 hits.reserve(100); | |
| 280 | |
| 281 | |
| 282 BamAlignment bam; | |
| 283 // get each set of alignments for each pair. | |
| 284 while (reader.GetNextAlignment(bam)) { | |
| 285 | |
| 286 if (bam.IsMapped()) { | |
| 287 BED a; | |
| 288 a.chrom = refs.at(bam.RefID).RefName; | |
| 289 a.start = bam.Position; | |
| 290 a.end = bam.GetEndPosition(false, false); | |
| 291 | |
| 292 // build the name field from the BAM alignment. | |
| 293 a.name = bam.Name; | |
| 294 if (bam.IsFirstMate()) a.name += "/1"; | |
| 295 if (bam.IsSecondMate()) a.name += "/2"; | |
| 296 | |
| 297 a.score = ToString(bam.MapQuality); | |
| 298 | |
| 299 a.strand = "+"; | |
| 300 if (bam.IsReverseStrand()) a.strand = "-"; | |
| 301 | |
| 302 if (_bamOutput == true) { | |
| 303 bool overlapsFound = false; | |
| 304 // treat the BAM alignment as a single "block" | |
| 305 if (_obeySplits == false) { | |
| 306 overlapsFound = FindOneOrMoreOverlap(a); | |
| 307 } | |
| 308 // split the BAM alignment into discrete blocks and | |
| 309 // look for overlaps only within each block. | |
| 310 else { | |
| 311 bool overlapFoundForBlock; | |
| 312 bedVector bedBlocks; // vec to store the discrete BED "blocks" from a | |
| 313 // we don't want to split on "D" ops, hence the "false" | |
| 314 getBamBlocks(bam, refs, bedBlocks, false); | |
| 315 | |
| 316 vector<BED>::const_iterator bedItr = bedBlocks.begin(); | |
| 317 vector<BED>::const_iterator bedEnd = bedBlocks.end(); | |
| 318 for (; bedItr != bedEnd; ++bedItr) { | |
| 319 overlapFoundForBlock = FindOneOrMoreOverlap(*bedItr); | |
| 320 if (overlapFoundForBlock == true) | |
| 321 overlapsFound = true; | |
| 322 } | |
| 323 } | |
| 324 if (overlapsFound == true) { | |
| 325 if (_noHit == false) | |
| 326 writer.SaveAlignment(bam); | |
| 327 } | |
| 328 else { | |
| 329 if (_noHit == true) { | |
| 330 writer.SaveAlignment(bam); | |
| 331 } | |
| 332 } | |
| 333 } | |
| 334 else { | |
| 335 // treat the BAM alignment as a single BED "block" | |
| 336 if (_obeySplits == false) { | |
| 337 FindOverlaps(a, hits); | |
| 338 hits.clear(); | |
| 339 } | |
| 340 // split the BAM alignment into discrete BED blocks and | |
| 341 // look for overlaps only within each block. | |
| 342 else { | |
| 343 bedVector bedBlocks; // vec to store the discrete BED "blocks" from a | |
| 344 getBamBlocks(bam, refs, bedBlocks, false); | |
| 345 | |
| 346 vector<BED>::const_iterator bedItr = bedBlocks.begin(); | |
| 347 vector<BED>::const_iterator bedEnd = bedBlocks.end(); | |
| 348 for (; bedItr != bedEnd; ++bedItr) { | |
| 349 FindOverlaps(*bedItr, hits); | |
| 350 hits.clear(); | |
| 351 } | |
| 352 } | |
| 353 } | |
| 354 } | |
| 355 // BAM IsMapped() is false | |
| 356 else if (_noHit == true) { | |
| 357 writer.SaveAlignment(bam); | |
| 358 } | |
| 359 } | |
| 360 | |
| 361 // close the relevant BAM files. | |
| 362 reader.Close(); | |
| 363 if (_bamOutput == true) { | |
| 364 writer.Close(); | |
| 365 } | |
| 366 } | |
| 367 |
