Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Assembly/Scaffold.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: Scaffold.pm,v 1.2 2002/11/11 18:16:30 lapp Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::Assembly::Scaffold | |
| 4 # | |
| 5 # Copyright by Robson F. de Souza | |
| 6 # | |
| 7 # You may distribute this module under the same terms as perl itself | |
| 8 # | |
| 9 # POD documentation - main docs before the code | |
| 10 | |
| 11 =head1 NAME | |
| 12 | |
| 13 Bio::Assembly::Scaffold - Perl module to hold and manipulate sequence assembly data. | |
| 14 | |
| 15 =head1 SYNOPSYS | |
| 16 | |
| 17 # Module loading | |
| 18 use Bio::Assembly::IO; | |
| 19 | |
| 20 # Assembly loading methods | |
| 21 my $aio = new Bio::Assembly::IO(-file=>"test.ace.1", -format=>'phrap'); | |
| 22 my $assembly = $aio->next_assembly; | |
| 23 | |
| 24 foreach my $contig ($assembly->all_contigs) { | |
| 25 # do something... (see Bio::Assembly::Contig) | |
| 26 } | |
| 27 | |
| 28 =head1 DESCRIPTION | |
| 29 | |
| 30 Bio::Assembly::Scaffold was developed to store and manipulate data | |
| 31 from sequence assembly programs like Phrap. It implements the | |
| 32 ScaffoldI interface and intends to be generic enough to be used by | |
| 33 Bio::Assembly::IO drivers written to programs other than Phrap. | |
| 34 | |
| 35 =head1 FEEDBACK | |
| 36 | |
| 37 =head2 Mailing Lists | |
| 38 | |
| 39 User feedback is an integral part of the evolution of this and other | |
| 40 Bioperl modules. Send your comments and suggestions preferably to the | |
| 41 Bioperl mailing lists Your participation is much appreciated. | |
| 42 | |
| 43 bioperl-l@bioperl.org - General discussion | |
| 44 http://bio.perl.org/MailList.html - About the mailing lists | |
| 45 | |
| 46 =head2 Reporting Bugs | |
| 47 | |
| 48 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 49 the bugs and their resolution. Bug reports can be submitted via email | |
| 50 or the web: | |
| 51 | |
| 52 bioperl-bugs@bio.perl.org | |
| 53 http://bugzilla.bioperl.org/ | |
| 54 | |
| 55 =head1 AUTHOR - Robson Francisco de Souza | |
| 56 | |
| 57 rfsouza@citri.iq.usp.br | |
| 58 | |
| 59 =head1 APPENDIX | |
| 60 | |
| 61 The rest of the documentation details each of the object | |
| 62 methods. Internal methods are usually preceded with a _ | |
| 63 | |
| 64 =cut | |
| 65 | |
| 66 package Bio::Assembly::Scaffold; | |
| 67 | |
| 68 use strict; | |
| 69 use vars qw(@ISA $VERSION); | |
| 70 | |
| 71 use Bio::Root::Root; | |
| 72 use Bio::Assembly::ScaffoldI; | |
| 73 use Bio::Annotation::Collection; | |
| 74 | |
| 75 $VERSION = '0.0.1'; | |
| 76 @ISA = qw(Bio::Root::Root Bio::Assembly::ScaffoldI); | |
| 77 | |
| 78 =head2 new () | |
| 79 | |
| 80 Title : new | |
| 81 Usage : $assembly = new (-source=>'program_name', | |
| 82 -contigs=>\@contigs, | |
| 83 -id=>"assembly 1"); | |
| 84 Function: creates a new assembly object | |
| 85 Returns : | |
| 86 Args : | |
| 87 -source : [string] sequence assembly program | |
| 88 -contigs : reference to array of | |
| 89 Bio::Assembly::Contig objects | |
| 90 -id : [string] assembly name | |
| 91 | |
| 92 =cut | |
| 93 | |
| 94 sub new { | |
| 95 my($class,@args) = @_; | |
| 96 | |
| 97 my $self = $class->SUPER::new(@args); | |
| 98 | |
| 99 my ($src,$contigs,$id) = $self->_rearrange([qw(SOURCE CONTIGS ID)], @args); | |
| 100 | |
| 101 $self->{'_contigs'} = {}; | |
| 102 $self->{'_singlets'} = {}; | |
| 103 $self->{'_seqs'} = {}; | |
| 104 $self->{'_annotation'} = Bio::Annotation::Collection->new(); | |
| 105 $self->{'_id'} = 'NoName'; | |
| 106 | |
| 107 if (defined $contigs && ref($contigs = 'ARRAY')) { | |
| 108 foreach my $contig (@{$contigs}) { | |
| 109 $self->add_contig($contig); | |
| 110 } | |
| 111 } | |
| 112 | |
| 113 $self->{'_id'} = $id if (defined $id); | |
| 114 | |
| 115 return $self; | |
| 116 } | |
| 117 | |
| 118 =head1 Accessing general assembly data | |
| 119 | |
| 120 =cut | |
| 121 | |
| 122 =head2 id | |
| 123 | |
| 124 Title : id | |
| 125 Usage : $assembly->id() | |
| 126 Function: Get/Set assembly ID | |
| 127 Returns : string or undef | |
| 128 Args : string | |
| 129 | |
| 130 =cut | |
| 131 | |
| 132 sub id { | |
| 133 my $self = shift; | |
| 134 my $id = shift; | |
| 135 | |
| 136 $self->{'_id'} = $id if (defined $id); | |
| 137 | |
| 138 return $self->{'_id'}; | |
| 139 } | |
| 140 | |
| 141 =head2 annotation | |
| 142 | |
| 143 Title : annotation | |
| 144 Usage : $assembly->annotation() | |
| 145 Function: Get/Set assembly annotation object | |
| 146 Returns : Bio::Annotation::Collection | |
| 147 Args : none | |
| 148 | |
| 149 =cut | |
| 150 | |
| 151 sub annotation { | |
| 152 my ($self,$ref) = shift; | |
| 153 | |
| 154 $self->{'_annotation'} = $ref if (defined $ref); | |
| 155 return $self->{'_annotation'}; | |
| 156 } | |
| 157 | |
| 158 =head2 get_nof_contigs | |
| 159 | |
| 160 Title : get_nof_contigs | |
| 161 Usage : $assembly->get_nof_contigs() | |
| 162 Function: Get the number of contigs included in the assembly | |
| 163 Returns : integer | |
| 164 Args : none | |
| 165 | |
| 166 =cut | |
| 167 | |
| 168 sub get_nof_contigs { | |
| 169 my $self = shift; | |
| 170 | |
| 171 return scalar( $self->get_contig_ids() ); | |
| 172 } | |
| 173 | |
| 174 =head2 get_nof_sequences_in_contigs | |
| 175 | |
| 176 Title : get_nof_sequences_in_contigs | |
| 177 Usage : $assembly->get_nof_sequences_in_contigs() | |
| 178 Function: | |
| 179 | |
| 180 Get the number of sequences included in the | |
| 181 assembly. This number refers only to the sequences used | |
| 182 to build contigs in this assembly. It does not includes | |
| 183 contig consensus sequences or singlets. | |
| 184 | |
| 185 Returns : integer | |
| 186 Args : none | |
| 187 | |
| 188 =cut | |
| 189 | |
| 190 sub get_nof_sequences_in_contigs { | |
| 191 my $self = shift; | |
| 192 | |
| 193 my $nof_seqs = 0; | |
| 194 foreach my $contig ($self->all_contigs) { | |
| 195 $nof_seqs += scalar( $contig->get_seq_ids() ); | |
| 196 } | |
| 197 | |
| 198 return $nof_seqs; | |
| 199 } | |
| 200 | |
| 201 =head2 get_nof_singlets | |
| 202 | |
| 203 Title : nof_singlets | |
| 204 Usage : $assembly->nof_singlets() | |
| 205 Function: Get the number of singlets included in the assembly | |
| 206 Returns : integer | |
| 207 Args : none | |
| 208 | |
| 209 =cut | |
| 210 | |
| 211 sub get_nof_singlets { | |
| 212 my $self = shift; | |
| 213 | |
| 214 return scalar( $self->get_singlet_ids() ); | |
| 215 } | |
| 216 | |
| 217 =head2 get_seq_ids | |
| 218 | |
| 219 Title : get_seq_ids | |
| 220 Usage : $assembly->get_seq_ids() | |
| 221 Function: | |
| 222 | |
| 223 Get the ID of sequences from all contigs. This list | |
| 224 refers only to the aligned sequences in contigs. It does | |
| 225 not includes contig consensus sequences or singlets. | |
| 226 | |
| 227 Returns : array of strings | |
| 228 Args : none | |
| 229 | |
| 230 =cut | |
| 231 | |
| 232 sub get_seq_ids { | |
| 233 my $self = shift; | |
| 234 | |
| 235 return keys %{ $self->{'_seqs'} }; | |
| 236 } | |
| 237 | |
| 238 =head2 get_contig_ids | |
| 239 | |
| 240 Title : get_contig_ids | |
| 241 Usage : $assembly->get_contig_ids() | |
| 242 Function: Access list of contig IDs from assembly | |
| 243 Returns : an array, if there are any contigs in the | |
| 244 assembly. An empty array otherwise | |
| 245 Args : none | |
| 246 | |
| 247 =cut | |
| 248 | |
| 249 sub get_contig_ids { | |
| 250 my $self = shift; | |
| 251 | |
| 252 return sort keys %{$self->{'_contigs'}}; | |
| 253 } | |
| 254 | |
| 255 =head2 get_singlet_ids | |
| 256 | |
| 257 Title : get_singlet_ids | |
| 258 Usage : $assembly->get_singlet_ids() | |
| 259 Function: Access list of singlet IDs from assembly | |
| 260 Returns : array of strings if there are any singlets | |
| 261 otherwise an empty array | |
| 262 Args : none | |
| 263 | |
| 264 =cut | |
| 265 | |
| 266 sub get_singlet_ids { | |
| 267 my $self = shift; | |
| 268 | |
| 269 return sort keys %{$self->{'_singlets'}}; | |
| 270 } | |
| 271 | |
| 272 =head2 get_seq_by_id | |
| 273 | |
| 274 Title : get_seq_by_id | |
| 275 Usage : $assembly->get_seq_by_id($id) | |
| 276 Function: | |
| 277 | |
| 278 Get a reference for an aligned sequence | |
| 279 This sequence must be part of a contig | |
| 280 in the assembly. | |
| 281 | |
| 282 Returns : a Bio::LocatableSeq object | |
| 283 undef if sequence $id is not found | |
| 284 in any contig | |
| 285 Args : [string] sequence identifier (id) | |
| 286 | |
| 287 =cut | |
| 288 | |
| 289 sub get_seq_by_id { | |
| 290 my $self = shift; | |
| 291 my $seqID = shift; | |
| 292 | |
| 293 return undef unless (exists $self->{'_seqs'}{$seqID}); | |
| 294 | |
| 295 return $self->{'_seqs'}{$seqID}->get_seq_by_name($seqID); | |
| 296 } | |
| 297 | |
| 298 =head2 get_contig_by_id | |
| 299 | |
| 300 Title : get_contig_by_id | |
| 301 Usage : $assembly->get_contig_by_id($id) | |
| 302 Function: Get a reference for a contig | |
| 303 Returns : a Bio::Assembly::Contig object or undef | |
| 304 Args : [string] contig unique identifier (ID) | |
| 305 | |
| 306 =cut | |
| 307 | |
| 308 sub get_contig_by_id { | |
| 309 my $self = shift; | |
| 310 my $contigID = shift; | |
| 311 | |
| 312 return undef unless (exists $self->{'_contigs'}{$contigID}); | |
| 313 | |
| 314 return $self->{'_contigs'}{$contigID}; | |
| 315 } | |
| 316 | |
| 317 =head2 get_singlet_by_id | |
| 318 | |
| 319 Title : get_singlet_by_id | |
| 320 Usage : $assembly->get_singlet_by_id() | |
| 321 Function: Get a reference for a singlet | |
| 322 Returns : Bio::PrimarySeqI object or undef | |
| 323 Args : [string] a singlet ID | |
| 324 | |
| 325 =cut | |
| 326 | |
| 327 sub get_singlet_by_id { | |
| 328 my $self = shift; | |
| 329 | |
| 330 my $singletID = shift; | |
| 331 | |
| 332 return undef unless (exists $self->{'_singlets'}{$singletID}); | |
| 333 | |
| 334 return $self->{'_singlets'}{$singletID}; | |
| 335 } | |
| 336 | |
| 337 =head1 Modifier methods | |
| 338 | |
| 339 =cut | |
| 340 | |
| 341 =head2 add_contig | |
| 342 | |
| 343 Title : add_contig | |
| 344 Usage : $assembly->add_contig($contig) | |
| 345 Function: Add a contig to the assembly | |
| 346 Returns : 1 on success | |
| 347 Args : a Bio::Assembly::Contig object | |
| 348 order (optional) | |
| 349 | |
| 350 =cut | |
| 351 | |
| 352 sub add_contig { | |
| 353 my $self = shift; | |
| 354 my $contig = shift; | |
| 355 | |
| 356 if( !ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) { | |
| 357 $self->throw("Unable to process non Bio::Assembly::Contig object [", ref($contig), "]"); | |
| 358 } | |
| 359 my $contigID = $contig->id(); | |
| 360 if( !defined $contigID ) { | |
| 361 $contigID = 'Unknown_' . ($self->get_nof_contigs() + 1); | |
| 362 $contig->id($contigID); | |
| 363 $self->warn("Attributing ID $contigID to unidentified Bio::Assembly::Contig object."); | |
| 364 } | |
| 365 | |
| 366 $self->warn("Replacing contig $contigID with a new contig object") | |
| 367 if (exists $self->{'_contigs'}{$contigID}); | |
| 368 $self->{'_contigs'}{$contigID} = $contig; | |
| 369 | |
| 370 foreach my $seqID ($contig->get_seq_ids()) { | |
| 371 if (exists $self->{'_seqs'}{$seqID}) { | |
| 372 $self->warn( "Sequence $seqID already assigned to contig ". | |
| 373 $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID") | |
| 374 unless ($self->{'_seqs'}{$seqID} eq $contig); | |
| 375 } | |
| 376 $self->{'_seqs'}{$seqID} = $contig; | |
| 377 } | |
| 378 | |
| 379 return 1; | |
| 380 } | |
| 381 | |
| 382 =head2 add_singlet | |
| 383 | |
| 384 Title : add_singlet | |
| 385 Usage : $assembly->add_singlet($seq) | |
| 386 Function: Add a singlet to the assembly | |
| 387 Returns : 1 on success, 0 otherwise | |
| 388 Args : a Bio::PrimarySeqI object | |
| 389 order (optional) | |
| 390 | |
| 391 =cut | |
| 392 | |
| 393 sub add_singlet { | |
| 394 my $self = shift; | |
| 395 my $singlet = shift; | |
| 396 | |
| 397 if( !ref $singlet || ! $singlet->isa('Bio::PrimarySeqI') ) { | |
| 398 $self->warn("Unable to process non Bio::SeqI object [", ref($singlet), "]"); | |
| 399 return 0; | |
| 400 } | |
| 401 | |
| 402 my $singletID = $singlet->id(); | |
| 403 $self->warn("Replacing singlet $singletID wih a new sequence object") | |
| 404 if (exists $self->{'_contigs'}{$singletID}); | |
| 405 $self->{'_singlets'}{$singletID} = $singlet; | |
| 406 | |
| 407 return 1; | |
| 408 } | |
| 409 | |
| 410 =head2 update_seq_list | |
| 411 | |
| 412 Title : update_seq_list | |
| 413 Usage : $assembly->update_seq_list() | |
| 414 Function: | |
| 415 | |
| 416 Synchronizes the assembly registry for sequences in | |
| 417 contigs and contig actual aligned sequences content. You | |
| 418 probably want to run this after you remove/add a | |
| 419 sequence from/to a contig in the assembly. | |
| 420 | |
| 421 Returns : nothing | |
| 422 Args : none | |
| 423 | |
| 424 =cut | |
| 425 | |
| 426 sub update_seq_list { | |
| 427 my $self = shift; | |
| 428 | |
| 429 $self->{'_seqs'} = {}; | |
| 430 foreach my $contig ($self->all_contigs) { | |
| 431 foreach my $seqID ($contig->get_seq_ids) { | |
| 432 $self->{'_seqs'}{$seqID} = $contig; | |
| 433 } | |
| 434 } | |
| 435 | |
| 436 return 1; | |
| 437 } | |
| 438 | |
| 439 =head2 remove_contigs | |
| 440 | |
| 441 Title : remove_contigs | |
| 442 Usage : $assembly->remove_contigs(1..4) | |
| 443 Function: Remove contig from assembly object | |
| 444 Returns : an array of removed Bio::Assembly::Contig | |
| 445 objects | |
| 446 Args : an array of contig IDs | |
| 447 | |
| 448 See function get_contig_ids() above | |
| 449 | |
| 450 =cut | |
| 451 | |
| 452 #--------------------- | |
| 453 sub remove_contigs { | |
| 454 #--------------------- | |
| 455 my ($self,@args) = @_; | |
| 456 | |
| 457 my @ret = (); | |
| 458 foreach my $contigID (@args) { | |
| 459 foreach my $seqID ($self->get_contig_by_id($contigID)->get_seq_ids()) { | |
| 460 delete $self->{'_seqs'}{$seqID}; | |
| 461 } | |
| 462 push(@ret,$self->{'_contigs'}{$contigID}); | |
| 463 delete $self->{'_contigs'}{$contigID}; | |
| 464 } | |
| 465 | |
| 466 return @ret; | |
| 467 } | |
| 468 | |
| 469 =head2 remove_singlets | |
| 470 | |
| 471 Title : remove_singlets | |
| 472 Usage : $assembly->remove_singlets(@singlet_ids) | |
| 473 Function: Remove singlet from assembly object | |
| 474 Returns : the Bio::SeqI objects removed | |
| 475 Args : a list of singlet IDs | |
| 476 | |
| 477 See function get_singlet_ids() above | |
| 478 | |
| 479 =cut | |
| 480 | |
| 481 #--------------------- | |
| 482 sub remove_singlets { | |
| 483 #--------------------- | |
| 484 my ($self,@args) = @_; | |
| 485 | |
| 486 my @ret = (); | |
| 487 foreach my $singletID (@args) { | |
| 488 push(@ret,$self->{'_singlets'}{$singletID}); | |
| 489 delete $self->{'_singlets'}{$singletID}; | |
| 490 } | |
| 491 | |
| 492 return @ret; | |
| 493 } | |
| 494 | |
| 495 =head1 Contig and singlet selection methos | |
| 496 | |
| 497 =cut | |
| 498 | |
| 499 =head2 select_contigs | |
| 500 | |
| 501 Title : select_contigs | |
| 502 Usage : $assembly->select_contigs(@list) | |
| 503 Function: Select an array of contigs from the assembly | |
| 504 Returns : an array of Bio::Assembly::Contig objects | |
| 505 Args : an array of contig ids | |
| 506 | |
| 507 See function get_contig_ids() above | |
| 508 | |
| 509 =cut | |
| 510 | |
| 511 #--------------------- | |
| 512 sub select_contigs { | |
| 513 #--------------------- | |
| 514 my ($self,@args) = @_; | |
| 515 | |
| 516 my @contigs = (); | |
| 517 foreach my $contig (@args) { | |
| 518 unless (exists $self->{'_contigs'}{$contig}) { | |
| 519 $self->warn("$contig contig not found. Ignoring..."); | |
| 520 next; | |
| 521 } | |
| 522 push(@contigs, $self->{'_contigs'}{$contig}); | |
| 523 } | |
| 524 | |
| 525 return @contigs; | |
| 526 } | |
| 527 | |
| 528 =head2 select_singlets | |
| 529 | |
| 530 Title : select_singlets | |
| 531 Usage : $assembly->select_singlets(@list) | |
| 532 Function: Selects an array of singlets from the assembly | |
| 533 Returns : an array of Bio::SeqI objects | |
| 534 Args : an array of singlet ids | |
| 535 | |
| 536 See function get_singlet_ids() above | |
| 537 | |
| 538 =cut | |
| 539 | |
| 540 #--------------------- | |
| 541 sub select_singlets { | |
| 542 #--------------------- | |
| 543 my ($self,@args) = @_; | |
| 544 | |
| 545 my @singlets = (); | |
| 546 foreach my $singlet (@args) { | |
| 547 unless (exists $self->{'_singlets'}{$singlet}) { | |
| 548 $self->warn("$singlet singlet not found. Ignoring..."); | |
| 549 next; | |
| 550 } | |
| 551 push(@singlets, $self->{'_singlets'}{$singlet}); | |
| 552 } | |
| 553 | |
| 554 return @singlets; | |
| 555 } | |
| 556 | |
| 557 =head2 all_contigs | |
| 558 | |
| 559 Title : all_contigs | |
| 560 Usage : my @contigs = $assembly->all_contigs | |
| 561 Function: | |
| 562 | |
| 563 Returns a list of all contigs in this assembly. Contigs | |
| 564 are both clusters and alignments of one or more reads, | |
| 565 with an associated consensus sequence. | |
| 566 | |
| 567 Returns : array of Bio::Assembly::Contig (in lexical id order) | |
| 568 Args : none | |
| 569 | |
| 570 =cut | |
| 571 | |
| 572 #--------------------- | |
| 573 sub all_contigs { | |
| 574 #--------------------- | |
| 575 my ($self) = @_; | |
| 576 | |
| 577 my @contigs = (); | |
| 578 foreach my $contig (sort { $a cmp $b } keys %{ $self->{'_contigs'} }) { | |
| 579 push(@contigs, $self->{'_contigs'}{$contig}); | |
| 580 } | |
| 581 | |
| 582 return @contigs; | |
| 583 } | |
| 584 | |
| 585 =head2 all_singlets | |
| 586 | |
| 587 Title : all_singlets | |
| 588 Usage : my @singlets = $assembly->all_singlets | |
| 589 Function: | |
| 590 | |
| 591 Returns a list of all singlets in this assembly. | |
| 592 Singlets are isolated reads, without non-vector | |
| 593 matches to any other read in the assembly. | |
| 594 | |
| 595 Returns : array of Bio::SeqI (in lexical order by id) | |
| 596 Args : none | |
| 597 | |
| 598 =cut | |
| 599 | |
| 600 #--------------------- | |
| 601 sub all_singlets { | |
| 602 #--------------------- | |
| 603 my ($self) = @_; | |
| 604 | |
| 605 my @singlets = (); | |
| 606 foreach my $singlet (sort { $a cmp $b } keys %{ $self->{'_singlets'} }) { | |
| 607 push(@singlets, $self->{'_singlets'}{$singlet}); | |
| 608 } | |
| 609 | |
| 610 return @singlets; | |
| 611 } | |
| 612 | |
| 613 # =head1 Internal Methods | |
| 614 | |
| 615 1; |
