Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/utils/bedFile/bedFile.cpp @ 0:dfcd8b6c1bda
Uploaded
| author | aaronquinlan |
|---|---|
| date | Thu, 03 Nov 2011 10:25:04 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfcd8b6c1bda |
|---|---|
| 1 /***************************************************************************** | |
| 2 bedFile.cpp | |
| 3 | |
| 4 (c) 2009 - Aaron Quinlan | |
| 5 Hall Laboratory | |
| 6 Department of Biochemistry and Molecular Genetics | |
| 7 University of Virginia | |
| 8 aaronquinlan@gmail.com | |
| 9 | |
| 10 Licensed under the GNU General Public License 2.0 license. | |
| 11 ******************************************************************************/ | |
| 12 #include "bedFile.h" | |
| 13 | |
| 14 | |
| 15 /************************************************ | |
| 16 Helper functions | |
| 17 *************************************************/ | |
| 18 void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks) { | |
| 19 | |
| 20 if (bed.otherFields.size() < 6) { | |
| 21 cerr << "Input error: Cannot split into blocks. Found interval with fewer than 12 columns on line " << lineNum << "." << endl; | |
| 22 exit(1); | |
| 23 } | |
| 24 | |
| 25 int blockCount = atoi(bed.otherFields[3].c_str()); | |
| 26 if ( blockCount <= 0 ) { | |
| 27 cerr << "Input error: found interval having <= 0 blocks on line " << lineNum << "." << endl; | |
| 28 exit(1); | |
| 29 } | |
| 30 else if ( blockCount == 1 ) { | |
| 31 //take a short-cut for single blocks | |
| 32 bedBlocks.push_back(bed); | |
| 33 } | |
| 34 else { | |
| 35 // get the comma-delimited strings for the BED12 block starts and block ends. | |
| 36 string blockSizes(bed.otherFields[4]); | |
| 37 string blockStarts(bed.otherFields[5]); | |
| 38 | |
| 39 vector<int> sizes; | |
| 40 vector<int> starts; | |
| 41 Tokenize(blockSizes, sizes, ","); | |
| 42 Tokenize(blockStarts, starts, ","); | |
| 43 | |
| 44 if ( sizes.size() != (size_t) blockCount || starts.size() != (size_t) blockCount ) { | |
| 45 cerr << "Input error: found interval with block-counts not matching starts/sizes on line " << lineNum << "." << endl; | |
| 46 exit(1); | |
| 47 } | |
| 48 | |
| 49 // add each BED block to the bedBlocks vector | |
| 50 for (UINT i = 0; i < (UINT) blockCount; ++i) { | |
| 51 CHRPOS blockStart = bed.start + starts[i]; | |
| 52 CHRPOS blockEnd = bed.start + starts[i] + sizes[i]; | |
| 53 BED currBedBlock(bed.chrom, blockStart, blockEnd, bed.name, bed.score, bed.strand, bed.otherFields); | |
| 54 bedBlocks.push_back(currBedBlock); | |
| 55 } | |
| 56 } | |
| 57 } | |
| 58 | |
| 59 | |
| 60 /*********************************************** | |
| 61 Sorting comparison functions | |
| 62 ************************************************/ | |
| 63 bool sortByChrom(BED const &a, BED const &b) { | |
| 64 if (a.chrom < b.chrom) return true; | |
| 65 else return false; | |
| 66 }; | |
| 67 | |
| 68 bool sortByStart(const BED &a, const BED &b) { | |
| 69 if (a.start < b.start) return true; | |
| 70 else return false; | |
| 71 }; | |
| 72 | |
| 73 bool sortBySizeAsc(const BED &a, const BED &b) { | |
| 74 | |
| 75 CHRPOS aLen = a.end - a.start; | |
| 76 CHRPOS bLen = b.end - b.start; | |
| 77 | |
| 78 if (aLen < bLen) return true; | |
| 79 else return false; | |
| 80 }; | |
| 81 | |
| 82 bool sortBySizeDesc(const BED &a, const BED &b) { | |
| 83 | |
| 84 CHRPOS aLen = a.end - a.start; | |
| 85 CHRPOS bLen = b.end - b.start; | |
| 86 | |
| 87 if (aLen > bLen) return true; | |
| 88 else return false; | |
| 89 }; | |
| 90 | |
| 91 bool sortByScoreAsc(const BED &a, const BED &b) { | |
| 92 if (a.score < b.score) return true; | |
| 93 else return false; | |
| 94 }; | |
| 95 | |
| 96 bool sortByScoreDesc(const BED &a, const BED &b) { | |
| 97 if (a.score > b.score) return true; | |
| 98 else return false; | |
| 99 }; | |
| 100 | |
| 101 bool byChromThenStart(BED const &a, BED const &b) { | |
| 102 | |
| 103 if (a.chrom < b.chrom) return true; | |
| 104 else if (a.chrom > b.chrom) return false; | |
| 105 | |
| 106 if (a.start < b.start) return true; | |
| 107 else if (a.start >= b.start) return false; | |
| 108 | |
| 109 return false; | |
| 110 }; | |
| 111 | |
| 112 | |
| 113 /******************************************* | |
| 114 Class methods | |
| 115 *******************************************/ | |
| 116 | |
| 117 // Constructor | |
| 118 BedFile::BedFile(string &bedFile) | |
| 119 : bedFile(bedFile), | |
| 120 _isGff(false), | |
| 121 _isVcf(false), | |
| 122 _typeIsKnown(false), | |
| 123 _merged_start(-1), | |
| 124 _merged_end(-1), | |
| 125 _merged_chrom(""), | |
| 126 _prev_start(-1), | |
| 127 _prev_chrom("") | |
| 128 {} | |
| 129 | |
| 130 // Destructor | |
| 131 BedFile::~BedFile(void) { | |
| 132 } | |
| 133 | |
| 134 | |
| 135 void BedFile::Open(void) { | |
| 136 | |
| 137 _bedFields.reserve(12); | |
| 138 | |
| 139 if (bedFile == "stdin" || bedFile == "-") { | |
| 140 _bedStream = &cin; | |
| 141 } | |
| 142 else { | |
| 143 _bedStream = new ifstream(bedFile.c_str(), ios::in); | |
| 144 | |
| 145 if( isGzipFile(_bedStream) ) { | |
| 146 delete _bedStream; | |
| 147 _bedStream = new igzstream(bedFile.c_str(), ios::in); | |
| 148 } | |
| 149 if ( !(_bedStream->good()) ) { | |
| 150 cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl; | |
| 151 exit (1); | |
| 152 } | |
| 153 } | |
| 154 } | |
| 155 | |
| 156 // Rewind the pointer back to the beginning of the file | |
| 157 void BedFile::Rewind(void) { | |
| 158 _bedStream->seekg(0, ios::beg); | |
| 159 } | |
| 160 | |
| 161 // Jump to a specific byte in the file | |
| 162 void BedFile::Seek(unsigned long offset) { | |
| 163 _bedStream->seekg(offset); | |
| 164 } | |
| 165 | |
| 166 // Jump to a specific byte in the file | |
| 167 bool BedFile::Empty() { | |
| 168 return _bedStream->eof(); | |
| 169 } | |
| 170 | |
| 171 | |
| 172 // Close the BED file | |
| 173 void BedFile::Close(void) { | |
| 174 if (bedFile != "stdin" && bedFile != "-") delete _bedStream; | |
| 175 } | |
| 176 | |
| 177 | |
| 178 BedLineStatus BedFile::GetNextBed(BED &bed, int &lineNum, bool forceSorted) { | |
| 179 | |
| 180 // make sure there are still lines to process. | |
| 181 // if so, tokenize, validate and return the BED entry. | |
| 182 _bedFields.clear(); | |
| 183 // clear out the previous bed's data | |
| 184 if (_bedStream->good()) { | |
| 185 // parse the bedStream pointer | |
| 186 getline(*_bedStream, _bedLine); | |
| 187 lineNum++; | |
| 188 | |
| 189 // split into a string vector. | |
| 190 Tokenize(_bedLine, _bedFields); | |
| 191 | |
| 192 // load the BED struct as long as it's a valid BED entry. | |
| 193 BedLineStatus status = parseLine(bed, _bedFields, lineNum); | |
| 194 if (!forceSorted) { | |
| 195 return status; | |
| 196 } | |
| 197 else if (status == BED_VALID) { | |
| 198 if (bed.chrom == _prev_chrom) { | |
| 199 if ((int) bed.start >= _prev_start) { | |
| 200 _prev_chrom = bed.chrom; | |
| 201 _prev_start = bed.start; | |
| 202 return status; | |
| 203 } | |
| 204 else { | |
| 205 cerr << "ERROR: input file: (" << bedFile << ") is not sorted by chrom then start" << endl; | |
| 206 exit(1); | |
| 207 } | |
| 208 } | |
| 209 else if (bed.chrom > _prev_chrom) { | |
| 210 _prev_chrom = bed.chrom; | |
| 211 _prev_start = bed.start; | |
| 212 return status; | |
| 213 } | |
| 214 else if (bed.chrom < _prev_chrom) { | |
| 215 cerr << "ERROR: input file: (" << bedFile << ") is not sorted by chrom then start" << endl; | |
| 216 exit(1); | |
| 217 } | |
| 218 } | |
| 219 } | |
| 220 | |
| 221 // default if file is closed or EOF | |
| 222 return BED_INVALID; | |
| 223 } | |
| 224 | |
| 225 | |
| 226 bool BedFile::GetNextMergedBed(BED &merged_bed, int &lineNum) { | |
| 227 | |
| 228 if (_bedStream->good()) { | |
| 229 BED bed; | |
| 230 BedLineStatus bedStatus; | |
| 231 // force sorting; hence third param = true | |
| 232 while ((bedStatus = GetNextBed(bed, lineNum, true)) != BED_INVALID) { | |
| 233 if (bedStatus == BED_VALID) { | |
| 234 if (((int) bed.start - _merged_end > 0) || | |
| 235 (_merged_end < 0) || | |
| 236 (bed.chrom != _merged_chrom)) | |
| 237 { | |
| 238 if (_merged_start >= 0) { | |
| 239 merged_bed.chrom = _merged_chrom; | |
| 240 merged_bed.start = _merged_start; | |
| 241 merged_bed.end = _merged_end; | |
| 242 | |
| 243 _merged_chrom = bed.chrom; | |
| 244 _merged_start = bed.start; | |
| 245 _merged_end = bed.end; | |
| 246 | |
| 247 return true; | |
| 248 } | |
| 249 else { | |
| 250 _merged_start = bed.start; | |
| 251 _merged_chrom = bed.chrom; | |
| 252 _merged_end = bed.end; | |
| 253 } | |
| 254 } | |
| 255 else if ((int) bed.end > _merged_end) | |
| 256 { | |
| 257 _merged_end = bed.end; | |
| 258 } | |
| 259 } | |
| 260 } | |
| 261 // handle the last merged block in the file. | |
| 262 if (bedStatus == BED_INVALID) | |
| 263 { | |
| 264 merged_bed.chrom = _merged_chrom; | |
| 265 merged_bed.start = _merged_start; | |
| 266 merged_bed.end = _merged_end; | |
| 267 return true; | |
| 268 } | |
| 269 } | |
| 270 return false; | |
| 271 } | |
| 272 | |
| 273 | |
| 274 void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, | |
| 275 string strand, vector<BED> &hits, bool sameStrand, bool diffStrand) { | |
| 276 | |
| 277 BIN startBin, endBin; | |
| 278 startBin = (start >> _binFirstShift); | |
| 279 endBin = ((end-1) >> _binFirstShift); | |
| 280 | |
| 281 // loop through each bin "level" in the binning hierarchy | |
| 282 for (BINLEVEL i = 0; i < _binLevels; ++i) { | |
| 283 | |
| 284 // loop through each bin at this level of the hierarchy | |
| 285 BIN offset = _binOffsetsExtended[i]; | |
| 286 for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { | |
| 287 | |
| 288 // loop through each feature in this chrom/bin and see if it overlaps | |
| 289 // with the feature that was passed in. if so, add the feature to | |
| 290 // the list of hits. | |
| 291 vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); | |
| 292 vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); | |
| 293 | |
| 294 for (; bedItr != bedEnd; ++bedItr) { | |
| 295 // do we have sufficient overlap? | |
| 296 if (overlaps(bedItr->start, bedItr->end, start, end) > 0) { | |
| 297 | |
| 298 bool strands_are_same = (strand == bedItr->strand); | |
| 299 | |
| 300 // test for necessary strandedness | |
| 301 if ( (sameStrand == false && diffStrand == false) | |
| 302 || | |
| 303 (sameStrand == true && strands_are_same == true) | |
| 304 || | |
| 305 (diffStrand == true && strands_are_same == false) | |
| 306 ) | |
| 307 { | |
| 308 hits.push_back(*bedItr); | |
| 309 } | |
| 310 } | |
| 311 } | |
| 312 } | |
| 313 startBin >>= _binNextShift; | |
| 314 endBin >>= _binNextShift; | |
| 315 } | |
| 316 } | |
| 317 | |
| 318 | |
| 319 bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, | |
| 320 bool sameStrand, bool diffStrand, float overlapFraction) { | |
| 321 | |
| 322 BIN startBin, endBin; | |
| 323 startBin = (start >> _binFirstShift); | |
| 324 endBin = ((end-1) >> _binFirstShift); | |
| 325 | |
| 326 CHRPOS aLength = (end - start); | |
| 327 | |
| 328 // loop through each bin "level" in the binning hierarchy | |
| 329 for (BINLEVEL i = 0; i < _binLevels; ++i) { | |
| 330 | |
| 331 // loop through each bin at this level of the hierarchy | |
| 332 BIN offset = _binOffsetsExtended[i]; | |
| 333 for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { | |
| 334 | |
| 335 // loop through each feature in this chrom/bin and see if it overlaps | |
| 336 // with the feature that was passed in. if so, add the feature to | |
| 337 // the list of hits. | |
| 338 vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); | |
| 339 vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); | |
| 340 for (; bedItr != bedEnd; ++bedItr) { | |
| 341 | |
| 342 CHRPOS s = max(start, bedItr->start); | |
| 343 CHRPOS e = min(end, bedItr->end); | |
| 344 // the number of overlapping bases b/w a and b | |
| 345 int overlapBases = (e - s); | |
| 346 | |
| 347 // do we have sufficient overlap? | |
| 348 if ( (float) overlapBases / (float) aLength >= overlapFraction) { | |
| 349 | |
| 350 bool strands_are_same = (strand == bedItr->strand); | |
| 351 // test for necessary strandedness | |
| 352 if ( (sameStrand == false && diffStrand == false) | |
| 353 || | |
| 354 (sameStrand == true && strands_are_same == true) | |
| 355 || | |
| 356 (diffStrand == true && strands_are_same == false) | |
| 357 ) | |
| 358 { | |
| 359 return true; | |
| 360 } | |
| 361 } | |
| 362 } | |
| 363 } | |
| 364 startBin >>= _binNextShift; | |
| 365 endBin >>= _binNextShift; | |
| 366 } | |
| 367 return false; | |
| 368 } | |
| 369 | |
| 370 | |
| 371 bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, | |
| 372 bool sameStrand, bool diffStrand, float overlapFraction) { | |
| 373 | |
| 374 BIN startBin, endBin; | |
| 375 startBin = (start >> _binFirstShift); | |
| 376 endBin = ((end-1) >> _binFirstShift); | |
| 377 | |
| 378 CHRPOS aLength = (end - start); | |
| 379 | |
| 380 // loop through each bin "level" in the binning hierarchy | |
| 381 for (BINLEVEL i = 0; i < _binLevels; ++i) { | |
| 382 | |
| 383 // loop through each bin at this level of the hierarchy | |
| 384 BIN offset = _binOffsetsExtended[i]; | |
| 385 for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { | |
| 386 | |
| 387 // loop through each feature in this chrom/bin and see if it overlaps | |
| 388 // with the feature that was passed in. if so, add the feature to | |
| 389 // the list of hits. | |
| 390 vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); | |
| 391 vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); | |
| 392 for (; bedItr != bedEnd; ++bedItr) { | |
| 393 CHRPOS s = max(start, bedItr->start); | |
| 394 CHRPOS e = min(end, bedItr->end); | |
| 395 | |
| 396 // the number of overlapping bases b/w a and b | |
| 397 int overlapBases = (e - s); | |
| 398 | |
| 399 // do we have sufficient overlap? | |
| 400 if ( (float) overlapBases / (float) aLength >= overlapFraction) { | |
| 401 CHRPOS bLength = (bedItr->end - bedItr->start); | |
| 402 float bOverlap = ( (float) overlapBases / (float) bLength ); | |
| 403 bool strands_are_same = (strand == bedItr->strand); | |
| 404 | |
| 405 // test for sufficient reciprocal overlap and strandedness | |
| 406 if ( (bOverlap >= overlapFraction) && | |
| 407 ((sameStrand == false && diffStrand == false) | |
| 408 || | |
| 409 (sameStrand == true && strands_are_same == true) | |
| 410 || | |
| 411 (diffStrand == true && strands_are_same == false)) | |
| 412 ) | |
| 413 { | |
| 414 return true; | |
| 415 } | |
| 416 } | |
| 417 } | |
| 418 } | |
| 419 startBin >>= _binNextShift; | |
| 420 endBin >>= _binNextShift; | |
| 421 } | |
| 422 return false; | |
| 423 } | |
| 424 | |
| 425 | |
| 426 void BedFile::countHits(const BED &a, bool sameStrand, bool diffStrand, bool countsOnly) { | |
| 427 | |
| 428 BIN startBin, endBin; | |
| 429 startBin = (a.start >> _binFirstShift); | |
| 430 endBin = ((a.end-1) >> _binFirstShift); | |
| 431 | |
| 432 // loop through each bin "level" in the binning hierarchy | |
| 433 for (BINLEVEL i = 0; i < _binLevels; ++i) { | |
| 434 | |
| 435 // loop through each bin at this level of the hierarchy | |
| 436 BIN offset = _binOffsetsExtended[i]; | |
| 437 for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { | |
| 438 | |
| 439 // loop through each feature in this chrom/bin and see if it overlaps | |
| 440 // with the feature that was passed in. if so, add the feature to | |
| 441 // the list of hits. | |
| 442 vector<BEDCOV>::iterator bedItr = bedCovMap[a.chrom][j].begin(); | |
| 443 vector<BEDCOV>::iterator bedEnd = bedCovMap[a.chrom][j].end(); | |
| 444 for (; bedItr != bedEnd; ++bedItr) { | |
| 445 | |
| 446 bool strands_are_same = (a.strand == bedItr->strand); | |
| 447 // skip the hit if not on the same strand (and we care) | |
| 448 if ((sameStrand == true && strands_are_same == false) || | |
| 449 (diffStrand == true && strands_are_same == true) | |
| 450 ) | |
| 451 { | |
| 452 continue; | |
| 453 } | |
| 454 else if (overlaps(bedItr->start, bedItr->end, a.start, a.end) > 0) { | |
| 455 | |
| 456 bedItr->count++; | |
| 457 if (countsOnly == false) { | |
| 458 if (a.zeroLength == false) { | |
| 459 bedItr->depthMap[a.start+1].starts++; | |
| 460 bedItr->depthMap[a.end].ends++; | |
| 461 } | |
| 462 else { | |
| 463 // correct for the fact that we artificially expanded the zeroLength feature | |
| 464 bedItr->depthMap[a.start+2].starts++; | |
| 465 bedItr->depthMap[a.end-1].ends++; | |
| 466 } | |
| 467 | |
| 468 if (a.start < bedItr->minOverlapStart) { | |
| 469 bedItr->minOverlapStart = a.start; | |
| 470 } | |
| 471 } | |
| 472 } | |
| 473 } | |
| 474 } | |
| 475 startBin >>= _binNextShift; | |
| 476 endBin >>= _binNextShift; | |
| 477 } | |
| 478 } | |
| 479 | |
| 480 | |
| 481 void BedFile::countSplitHits(const vector<BED> &bedBlocks, bool sameStrand, bool diffStrand, bool countsOnly) { | |
| 482 | |
| 483 // set to track the distinct B features that had coverage. | |
| 484 // we'll update the counts of coverage for these features by one | |
| 485 // at the end of this function to avoid over-counting. | |
| 486 set< vector<BEDCOV>::iterator > validHits; | |
| 487 | |
| 488 vector<BED>::const_iterator blockItr = bedBlocks.begin(); | |
| 489 vector<BED>::const_iterator blockEnd = bedBlocks.end(); | |
| 490 for (; blockItr != blockEnd; ++blockItr) { | |
| 491 | |
| 492 BIN startBin, endBin; | |
| 493 startBin = (blockItr->start >> _binFirstShift); | |
| 494 endBin = ((blockItr->end-1) >> _binFirstShift); | |
| 495 | |
| 496 // loop through each bin "level" in the binning hierarchy | |
| 497 for (BINLEVEL i = 0; i < _binLevels; ++i) { | |
| 498 | |
| 499 // loop through each bin at this level of the hierarchy | |
| 500 BIN offset = _binOffsetsExtended[i]; | |
| 501 for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { | |
| 502 | |
| 503 // loop through each feature in this chrom/bin and see if it overlaps | |
| 504 // with the feature that was passed in. if so, add the feature to | |
| 505 // the list of hits. | |
| 506 vector<BEDCOV>::iterator bedItr = bedCovMap[blockItr->chrom][j].begin(); | |
| 507 vector<BEDCOV>::iterator bedEnd = bedCovMap[blockItr->chrom][j].end(); | |
| 508 for (; bedItr != bedEnd; ++bedItr) { | |
| 509 | |
| 510 bool strands_are_same = (blockItr->strand == bedItr->strand); | |
| 511 // skip the hit if not on the same strand (and we care) | |
| 512 if ((sameStrand == true && strands_are_same == false) || | |
| 513 (diffStrand == true && strands_are_same == true) | |
| 514 ) | |
| 515 { | |
| 516 continue; | |
| 517 } | |
| 518 else if (overlaps(bedItr->start, bedItr->end, blockItr->start, blockItr->end) > 0) { | |
| 519 if (countsOnly == false) { | |
| 520 if (blockItr->zeroLength == false) { | |
| 521 bedItr->depthMap[blockItr->start+1].starts++; | |
| 522 bedItr->depthMap[blockItr->end].ends++; | |
| 523 } | |
| 524 else { | |
| 525 // correct for the fact that we artificially expanded the zeroLength feature | |
| 526 bedItr->depthMap[blockItr->start+2].starts++; | |
| 527 bedItr->depthMap[blockItr->end-1].ends++; | |
| 528 } | |
| 529 } | |
| 530 | |
| 531 validHits.insert(bedItr); | |
| 532 if (blockItr->start < bedItr->minOverlapStart) | |
| 533 bedItr->minOverlapStart = blockItr->start; | |
| 534 } | |
| 535 } | |
| 536 } | |
| 537 startBin >>= _binNextShift; | |
| 538 endBin >>= _binNextShift; | |
| 539 } | |
| 540 } | |
| 541 // incrment the count of overlap by one for each B feature that overlapped | |
| 542 // the current passed hit. This is necessary to prevent over-counting for | |
| 543 // each "split"" of a single read. | |
| 544 set< vector<BEDCOV>::iterator >::iterator validHitsItr = validHits.begin(); | |
| 545 set< vector<BEDCOV>::iterator >::iterator validHitsEnd = validHits.end(); | |
| 546 for (; validHitsItr != validHitsEnd; ++validHitsItr) | |
| 547 // the validHitsItr points to another itr, hence the (*itr)-> dereferencing. | |
| 548 // ugly, but that's C++. | |
| 549 (*validHitsItr)->count++; | |
| 550 } | |
| 551 | |
| 552 | |
| 553 void BedFile::countListHits(const BED &a, int index, bool sameStrand, bool diffStrand) { | |
| 554 | |
| 555 BIN startBin, endBin; | |
| 556 startBin = (a.start >> _binFirstShift); | |
| 557 endBin = ((a.end-1) >> _binFirstShift); | |
| 558 | |
| 559 // loop through each bin "level" in the binning hierarchy | |
| 560 for (BINLEVEL i = 0; i < _binLevels; ++i) { | |
| 561 | |
| 562 // loop through each bin at this level of the hierarchy | |
| 563 BIN offset = _binOffsetsExtended[i]; | |
| 564 for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { | |
| 565 | |
| 566 // loop through each feature in this chrom/bin and see if it overlaps | |
| 567 // with the feature that was passed in. if so, add the feature to | |
| 568 // the list of hits. | |
| 569 vector<BEDCOVLIST>::iterator bedItr = bedCovListMap[a.chrom][j].begin(); | |
| 570 vector<BEDCOVLIST>::iterator bedEnd = bedCovListMap[a.chrom][j].end(); | |
| 571 for (; bedItr != bedEnd; ++bedItr) { | |
| 572 | |
| 573 bool strands_are_same = (a.strand == bedItr->strand); | |
| 574 // skip the hit if not on the same strand (and we care) | |
| 575 if ((sameStrand == true && strands_are_same == false) || | |
| 576 (diffStrand == true && strands_are_same == true) | |
| 577 ) | |
| 578 { | |
| 579 continue; | |
| 580 } | |
| 581 else if (overlaps(bedItr->start, bedItr->end, a.start, a.end) > 0) { | |
| 582 bedItr->counts[index]++; | |
| 583 if (a.zeroLength == false) { | |
| 584 bedItr->depthMapList[index][a.start+1].starts++; | |
| 585 bedItr->depthMapList[index][a.end].ends++; | |
| 586 } | |
| 587 else { | |
| 588 // correct for the fact that we artificially expanded the zeroLength feature | |
| 589 bedItr->depthMapList[index][a.start+2].starts++; | |
| 590 bedItr->depthMapList[index][a.end-1].ends++; | |
| 591 } | |
| 592 | |
| 593 if (a.start < bedItr->minOverlapStarts[index]) { | |
| 594 bedItr->minOverlapStarts[index] = a.start; | |
| 595 } | |
| 596 } | |
| 597 } | |
| 598 } | |
| 599 startBin >>= _binNextShift; | |
| 600 endBin >>= _binNextShift; | |
| 601 } | |
| 602 } | |
| 603 | |
| 604 void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; } | |
| 605 | |
| 606 void BedFile::setGff (bool gff) { this->_isGff = gff; } | |
| 607 | |
| 608 | |
| 609 void BedFile::setVcf (bool vcf) { this->_isVcf = vcf; } | |
| 610 | |
| 611 | |
| 612 void BedFile::setFileType (FileType type) { | |
| 613 _fileType = type; | |
| 614 _typeIsKnown = true; | |
| 615 } | |
| 616 | |
| 617 | |
| 618 void BedFile::setBedType (int colNums) { | |
| 619 bedType = colNums; | |
| 620 } | |
| 621 | |
| 622 | |
| 623 void BedFile::loadBedFileIntoMap() { | |
| 624 | |
| 625 BED bedEntry, nullBed; | |
| 626 int lineNum = 0; | |
| 627 BedLineStatus bedStatus; | |
| 628 | |
| 629 Open(); | |
| 630 while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { | |
| 631 if (bedStatus == BED_VALID) { | |
| 632 BIN bin = getBin(bedEntry.start, bedEntry.end); | |
| 633 bedMap[bedEntry.chrom][bin].push_back(bedEntry); | |
| 634 bedEntry = nullBed; | |
| 635 } | |
| 636 } | |
| 637 Close(); | |
| 638 } | |
| 639 | |
| 640 | |
| 641 void BedFile::loadBedCovFileIntoMap() { | |
| 642 | |
| 643 BED bedEntry, nullBed; | |
| 644 int lineNum = 0; | |
| 645 BedLineStatus bedStatus; | |
| 646 | |
| 647 Open(); | |
| 648 while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { | |
| 649 if (bedStatus == BED_VALID) { | |
| 650 BIN bin = getBin(bedEntry.start, bedEntry.end); | |
| 651 | |
| 652 BEDCOV bedCov; | |
| 653 bedCov.chrom = bedEntry.chrom; | |
| 654 bedCov.start = bedEntry.start; | |
| 655 bedCov.end = bedEntry.end; | |
| 656 bedCov.name = bedEntry.name; | |
| 657 bedCov.score = bedEntry.score; | |
| 658 bedCov.strand = bedEntry.strand; | |
| 659 bedCov.otherFields = bedEntry.otherFields; | |
| 660 bedCov.zeroLength = bedEntry.zeroLength; | |
| 661 bedCov.count = 0; | |
| 662 bedCov.minOverlapStart = INT_MAX; | |
| 663 | |
| 664 bedCovMap[bedEntry.chrom][bin].push_back(bedCov); | |
| 665 bedEntry = nullBed; | |
| 666 } | |
| 667 } | |
| 668 Close(); | |
| 669 } | |
| 670 | |
| 671 void BedFile::loadBedCovListFileIntoMap() { | |
| 672 | |
| 673 BED bedEntry, nullBed; | |
| 674 int lineNum = 0; | |
| 675 BedLineStatus bedStatus; | |
| 676 | |
| 677 Open(); | |
| 678 while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { | |
| 679 if (bedStatus == BED_VALID) { | |
| 680 BIN bin = getBin(bedEntry.start, bedEntry.end); | |
| 681 | |
| 682 BEDCOVLIST bedCovList; | |
| 683 bedCovList.chrom = bedEntry.chrom; | |
| 684 bedCovList.start = bedEntry.start; | |
| 685 bedCovList.end = bedEntry.end; | |
| 686 bedCovList.name = bedEntry.name; | |
| 687 bedCovList.score = bedEntry.score; | |
| 688 bedCovList.strand = bedEntry.strand; | |
| 689 bedCovList.otherFields = bedEntry.otherFields; | |
| 690 bedCovList.zeroLength = bedEntry.zeroLength; | |
| 691 | |
| 692 bedCovListMap[bedEntry.chrom][bin].push_back(bedCovList); | |
| 693 bedEntry = nullBed; | |
| 694 } | |
| 695 } | |
| 696 Close(); | |
| 697 } | |
| 698 | |
| 699 | |
| 700 void BedFile::loadBedFileIntoMapNoBin() { | |
| 701 | |
| 702 BED bedEntry, nullBed; | |
| 703 int lineNum = 0; | |
| 704 BedLineStatus bedStatus; | |
| 705 | |
| 706 Open(); | |
| 707 while ((bedStatus = this->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { | |
| 708 if (bedStatus == BED_VALID) { | |
| 709 bedMapNoBin[bedEntry.chrom].push_back(bedEntry); | |
| 710 bedEntry = nullBed; | |
| 711 } | |
| 712 } | |
| 713 Close(); | |
| 714 | |
| 715 // sort the BED entries for each chromosome | |
| 716 // in ascending order of start position | |
| 717 for (masterBedMapNoBin::iterator m = this->bedMapNoBin.begin(); m != this->bedMapNoBin.end(); ++m) { | |
| 718 sort(m->second.begin(), m->second.end(), sortByStart); | |
| 719 } | |
| 720 } |
