Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Assembly/Contig.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: Contig.pm,v 1.1 2002/11/04 11:50:11 heikki Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::Assembly::Contig | |
| 4 # Mostly based on Bio::SimpleAlign by Ewan Birney | |
| 5 # | |
| 6 # Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br> | |
| 7 # | |
| 8 # Copyright Robson Francisco de Souza | |
| 9 # | |
| 10 # You may distribute this module under the same terms as perl itself | |
| 11 | |
| 12 # POD documentation - main docs before the code | |
| 13 | |
| 14 =head1 NAME | |
| 15 | |
| 16 Bio::Assembly::Contig - Perl module to hold and manipulate | |
| 17 sequence assembly contigs. | |
| 18 | |
| 19 =head1 SYNOPSYS | |
| 20 | |
| 21 # Module loading | |
| 22 use Bio::Assembly::IO; | |
| 23 | |
| 24 # Assembly loading methods | |
| 25 $aio = new Bio::Assembly::IO(-file=>"test.ace.1", | |
| 26 -format=>'phrap'); | |
| 27 | |
| 28 $assembly = $aio->next_assembly; | |
| 29 foreach $contig ($assembly->all_contigs) { | |
| 30 # do something | |
| 31 } | |
| 32 | |
| 33 # OR, if you want to build the contig yourself, | |
| 34 | |
| 35 use Bio::Assembly::Contig; | |
| 36 $c = Bio::Assembly::Contig->new(-id=>"1"); | |
| 37 | |
| 38 $ls = Bio::LocatableSeq->new(-seq=>"ACCG-T", | |
| 39 -id=>"r1", | |
| 40 -alphabet=>'dna'); | |
| 41 $ls2 = Bio::LocatableSeq->new(-seq=>"ACA-CG-T", | |
| 42 -id=>"r2", | |
| 43 -alphabet=>'dna'); | |
| 44 | |
| 45 $ls_coord = Bio::SeqFeature::Generic->new(-start=>3, | |
| 46 -end=>8, | |
| 47 -strand=>1); | |
| 48 $ls2_coord = Bio::SeqFeature::Generic->new(-start=>1, | |
| 49 -end=>8, | |
| 50 -strand=>1); | |
| 51 $c->add_seq($ls); | |
| 52 $c->add_seq($ls2); | |
| 53 $c->set_seq_coord($ls_coord,$ls); | |
| 54 $c->set_seq_coord($ls2_coord,$ls2); | |
| 55 | |
| 56 $con = Bio::LocatableSeq->new(-seq=>"ACACCG-T", | |
| 57 -alphabet=>'dna'); | |
| 58 $c->set_consensus_sequence($con); | |
| 59 | |
| 60 $l = $c->change_coord('unaligned r2','ungapped consensus',6); | |
| 61 print "6 in unaligned r2 => $l in ungapped consensus\n"; | |
| 62 | |
| 63 | |
| 64 =head1 DESCRIPTION | |
| 65 | |
| 66 A contig is as a set of sequences, locally aligned to each other, so | |
| 67 that every sequence has overlapping regions with at least one sequence | |
| 68 in the contig, such that a continuous of overlapping sequences is | |
| 69 formed, allowing the deduction of a consensus sequence which may be | |
| 70 longer than any of the sequences from which it was deduced. | |
| 71 | |
| 72 In this documentation we refer to the overlapping sequences used to | |
| 73 build the contig as "aligned sequences" and to the sequence deduced | |
| 74 from the overlap of aligned sequences as the "consensus". Methods to | |
| 75 deduce the consensus sequence from aligned sequences were not yet | |
| 76 implemented in this module, but its posssible to add a consensus | |
| 77 sequence deduced by other means, e.g, by the assembly program used to | |
| 78 build the alignment. | |
| 79 | |
| 80 All aligned sequences in a Bio::Assembly::Contig must be Bio::Assembly::Locatable | |
| 81 objects and have a unique ID. The unique ID restriction is due to the | |
| 82 nature of the module's internal data structures and is also a request | |
| 83 of some assembly programs. If two sequences with the same ID are added | |
| 84 to a contig, the first sequence added is replaced by the second one. | |
| 85 | |
| 86 =head2 Coordinate_systems | |
| 87 | |
| 88 There are four base coordinate systems in Bio::Assembly::Contig. When | |
| 89 you need to access contig elements or data that exists on a certain | |
| 90 range or location, you may be specifying coordinates in relation to | |
| 91 different sequences, which may be either the contig consensus or one | |
| 92 of the aligned sequences that were used to do the assembly. | |
| 93 | |
| 94 ========================================================= | |
| 95 Name | Referenced sequence | |
| 96 --------------------------------------------------------- | |
| 97 "gapped consensus" | Contig (with gaps) | |
| 98 "ungapped consensus" | Contig (without gaps) | |
| 99 "aligned $seqID" | sequence $seqID (with gaps) | |
| 100 "unaligned $seqID" | sequence $seqID (without gaps) | |
| 101 ========================================================= | |
| 102 | |
| 103 "gapped consensus" refers to positions in the aligned consensus | |
| 104 sequence, which is the consensus sequence including the gaps inserted | |
| 105 to align it agains the aligned sequences that were used to assemble | |
| 106 the contig. So, its limits are [ 1, (consensus length + number of gaps | |
| 107 in consensus) ] | |
| 108 | |
| 109 "ungapped consensus" is a coordinate system based on the consensus | |
| 110 sequence, but excluding consensus gaps. This is just the coordinate | |
| 111 system that you have when considering the consensus sequence alone, | |
| 112 instead of aligned to other sequences. | |
| 113 | |
| 114 "aligned $seqID" refers to locations in the sequence $seqID after | |
| 115 alignment of $seqID against the consensus sequence (reverse | |
| 116 complementing the original sequence, if needed). Coordinate 1 in | |
| 117 "aligned $seqID" is equivalent to the start location (first base) of | |
| 118 $seqID in the consensus sequence, just like if the aligned sequence | |
| 119 $seqID was a feature of the consensus sequence. | |
| 120 | |
| 121 "unaligned $seqID" is equivalent to a location in the isolated | |
| 122 sequence, just like you would have when considering the sequence | |
| 123 alone, out of an alignment. When changing coordinates from "aligned | |
| 124 $seq2" to "unaligned $seq2", if $seq2 was reverse complemented when | |
| 125 included in the alignment, the output coordinates will be reversed to | |
| 126 fit that fact, i.e. 1 will be changed to length($seq2), 2 will be | |
| 127 length($seq)-1 and so on. | |
| 128 | |
| 129 An important note: when you change gap coordinates from a gapped | |
| 130 system ("gapped consensus" or "aligned $seqID") to a system that does | |
| 131 not include gaps ("ungapped consensus" or "unaligned $seqID"), the | |
| 132 position returned will be the first location before all gaps | |
| 133 neighboring the input location. | |
| 134 | |
| 135 =head2 Feature_collection | |
| 136 | |
| 137 Bio::Assembly::Contig stores much information about a contig in a | |
| 138 Bio::Assembly::SeqFeature::Collection object. Relevant information on the | |
| 139 alignment is accessed by selecting features based on their primary | |
| 140 tags (e.g. all features which have a primary tag of the form | |
| 141 '_aligned_coord:$seqID', where $seqID is an aligned sequence ID, are | |
| 142 coordinates for sequences in the contig alignment) and, by using | |
| 143 methods from Bio::Assembly::SeqFeature::Collection, it's possible to select | |
| 144 features by overlap with other features. | |
| 145 | |
| 146 We suggest that you use the primary tags of features as identifiers | |
| 147 for feature classes. By convention, features with primary tags | |
| 148 starting with a '_' are generated by modules that populate the contig | |
| 149 data structure and return the contig object, maybe as part of an | |
| 150 assembly object, e.g. drivers from the Bio::Assembly::IO set. | |
| 151 | |
| 152 Features in the features collection may be associated with particular | |
| 153 aligned sequences. To obtain this, you must attach the sequence to the | |
| 154 feature, using attach() seq from Bio::Assembly::SeqFeatureI, before you add the | |
| 155 feature to the feature collection. We also suggest to add the sequence | |
| 156 id to the primary tag, so that is easy to select feature for a | |
| 157 particular sequence. | |
| 158 | |
| 159 There is only one feature class that some methods in | |
| 160 Bio::Assembly::Contig expect to find in the feature collection: features | |
| 161 with primary tags of the form '_aligned_coord:$seqID', where $seqID is | |
| 162 the aligned sequence id (like returned by $seq-E<gt>id()). These features | |
| 163 describe the position (in "gapped consensus" coordinates) of aligned | |
| 164 sequences, and the method set_seq_coord() automatically changes a | |
| 165 feature's primary tag to this form whenever the feature is added to | |
| 166 the collection by this method. Only two methods in Bio::Assembly::Contig | |
| 167 will not work unless there are features from this class: | |
| 168 change_coord() and get_seq_coord(). | |
| 169 | |
| 170 Other feature classes will be automatically available only when | |
| 171 Bio::Assembly::Contig objects are created by a specific module. Such | |
| 172 feature classes are (or should be) documented in the documentation of | |
| 173 the module which create them, to which the user should refer. | |
| 174 | |
| 175 =head1 FEEDBACK | |
| 176 | |
| 177 =head2 Mailing Lists | |
| 178 | |
| 179 User feedback is an integral part of the evolution of this and other | |
| 180 Bioperl modules. Send your comments and suggestions preferably to the | |
| 181 Bioperl mailing lists Your participation is much appreciated. | |
| 182 | |
| 183 bioperl-l@bioperl.org - General discussion | |
| 184 http://bio.perl.org/MailList.html - About the mailing lists | |
| 185 | |
| 186 =head2 Reporting Bugs | |
| 187 | |
| 188 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 189 the bugs and their resolution. Bug reports can be submitted via email | |
| 190 or the web: | |
| 191 | |
| 192 bioperl-bugs@bio.perl.org | |
| 193 http://bugzilla.bioperl.org/ | |
| 194 | |
| 195 =head1 AUTHOR - Robson Francisco de Souza | |
| 196 | |
| 197 rfsouza@citri.iq.usp.br | |
| 198 | |
| 199 =head1 APPENDIX | |
| 200 | |
| 201 The rest of the documentation details each of the object | |
| 202 methods. Internal methods are usually preceded with a _ | |
| 203 | |
| 204 =cut | |
| 205 | |
| 206 #' | |
| 207 package Bio::Assembly::Contig; | |
| 208 | |
| 209 use strict; | |
| 210 use vars qw(@ISA $VERSION); | |
| 211 | |
| 212 use Bio::Root::Root; | |
| 213 use Bio::Align::AlignI; | |
| 214 use Bio::SeqFeature::Collection; | |
| 215 use Bio::Seq::PrimaryQual; | |
| 216 | |
| 217 @ISA = qw(Bio::Root::Root Bio::Align::AlignI); | |
| 218 | |
| 219 =head1 Object creator | |
| 220 | |
| 221 =head2 new | |
| 222 | |
| 223 Title : new | |
| 224 Usage : my $contig = new Bio::Assembly::Contig(); | |
| 225 Function : Creates a new contig object | |
| 226 Returns : Bio::Assembly::Contig | |
| 227 Args : -source => string representing the source | |
| 228 program where this contig came | |
| 229 from | |
| 230 -id => contig unique ID | |
| 231 | |
| 232 =cut | |
| 233 | |
| 234 #----------- | |
| 235 sub new { | |
| 236 #----------- | |
| 237 my ($class,@args) = @_; | |
| 238 | |
| 239 my $self = $class->SUPER::new(@args); | |
| 240 | |
| 241 my ($src, $id) = $self->_rearrange([qw(SOURCE ID)], @args); | |
| 242 $src && $self->source($src); | |
| 243 ($id && $self->id($id)) || ($self->{'_id'} = 'NoName'); # Alignment (contig) name | |
| 244 ($id && $self->id($id)) || ($self->{'_source'} = 'Unknown'); # Program used to build the contig | |
| 245 # we need to set up internal hashes first! | |
| 246 | |
| 247 # Bio::SimpleAlign derived fields (check which ones are needed for AlignI compatibility) | |
| 248 $self->{'_elem'} = {}; # contig elements: aligned sequence objects (keyed by ID) | |
| 249 $self->{'_order'} = {}; # store sequence order | |
| 250 # $self->{'start_end_lists'} = {}; # References to entries in {'_seq'}. Keyed by seq ids. | |
| 251 # $self->{'_dis_name'} = {}; # Display names for each sequence | |
| 252 $self->{'_symbols'} = {}; # List of symbols | |
| 253 | |
| 254 #Contig specific slots | |
| 255 $self->{'_consensus_sequence'} = undef; | |
| 256 $self->{'_consensus_quality'} = undef; | |
| 257 $self->{'_nof_residues'} = 0; | |
| 258 $self->{'_nof_seqs'} = 0; | |
| 259 # $self->{'_nof_segments'} = 0; # Let's not make it heavier than needed by now... | |
| 260 $self->{'_sfc'} = Bio::SeqFeature::Collection->new(); | |
| 261 | |
| 262 # Assembly specifcs | |
| 263 $self->{'_assembly'} = undef; # Reference to a Bio::Assembly::Scaffold object, if contig belongs to one. | |
| 264 $self->{'_strand'} = 0; # Reverse (-1) or forward (1), if contig is in a scaffold. 0 otherwise | |
| 265 $self->{'_neighbor_start'} = undef; # Will hold a reference to another contig | |
| 266 $self->{'_neighbor_end'} = undef; # Will hold a reference to another contig | |
| 267 | |
| 268 return $self; # success - we hope! | |
| 269 } | |
| 270 | |
| 271 =head1 Assembly related methods | |
| 272 | |
| 273 These methods exist to enable adding information about possible | |
| 274 relations among contigs, e.g. when you already have a scaffold for | |
| 275 your assembly, describing the ordering of contigs in the final | |
| 276 assembly, but no sequences covering the gaps between neighboring | |
| 277 contigs. | |
| 278 | |
| 279 =head2 source | |
| 280 | |
| 281 Title : source | |
| 282 Usage : $contig->source($program); | |
| 283 Function : Get/Set program used to build this contig | |
| 284 Returns : string | |
| 285 Argument : [optional] string | |
| 286 | |
| 287 =cut | |
| 288 | |
| 289 sub source { | |
| 290 my $self = shift; | |
| 291 my $source = shift; | |
| 292 | |
| 293 $self->{'_source'} = $source if (defined $source); | |
| 294 return $self->{'_source'}; | |
| 295 } | |
| 296 | |
| 297 =head2 assembly | |
| 298 | |
| 299 Title : assembly | |
| 300 Usage : $contig->assembly($assembly); | |
| 301 Function : Get/Set assembly object for this contig | |
| 302 Returns : a Bio::Assembly::Scaffold object | |
| 303 Argument : a Bio::Assembly::Scaffold object | |
| 304 | |
| 305 =cut | |
| 306 | |
| 307 sub assembly { | |
| 308 my $self = shift; | |
| 309 my $assembly = shift; | |
| 310 | |
| 311 $self->throw("Using non Bio::Assembly::Scaffold object when assign contig to assembly") | |
| 312 if (defined $assembly && ! $assembly->isa("Bio::Assembly::Scaffold")); | |
| 313 | |
| 314 $self->{'_assembly'} = $assembly if (defined $assembly); | |
| 315 return $self->{'_assembly'}; | |
| 316 } | |
| 317 | |
| 318 =head2 strand | |
| 319 | |
| 320 Title : strand | |
| 321 Usage : $contig->strand($num); | |
| 322 Function : Get/Set contig orientation in a scaffold/assembly. | |
| 323 Its equivalent to the strand property of sequence | |
| 324 objects and sets whether the contig consensus should | |
| 325 be reversed and complemented before being added to a | |
| 326 scaffold or assembly. | |
| 327 Returns : integer | |
| 328 Argument : 1 if orientaion is forward, -1 if reverse and | |
| 329 0 if none | |
| 330 | |
| 331 =cut | |
| 332 | |
| 333 sub strand { | |
| 334 my $self = shift; | |
| 335 my $ori = shift; | |
| 336 | |
| 337 $self->throw("Contig strand must be either 1, -1 or 0") | |
| 338 unless (defined $ori && ($ori == 1 || $ori == 0 || $ori == -1)); | |
| 339 | |
| 340 $self->{'_strand'} = $ori; | |
| 341 return $self->{'_strand'}; | |
| 342 } | |
| 343 | |
| 344 =head2 upstream_neighbor | |
| 345 | |
| 346 Title : upstream_neighbor | |
| 347 Usage : $contig->upstream_neighbor($contig); | |
| 348 Function : Get/Set a contig neighbor for the current contig when | |
| 349 building a scaffold. The upstream neighbor is | |
| 350 located before $contig first base | |
| 351 Returns : nothing | |
| 352 Argument : Bio::Assembly::Contig | |
| 353 | |
| 354 =cut | |
| 355 | |
| 356 sub upstream_neighbor { | |
| 357 my $self = shift; | |
| 358 my $ref = shift; | |
| 359 | |
| 360 $self->throw("Trying to assign a non Bio::Assembly::Contig object to upstream contig") | |
| 361 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig")); | |
| 362 | |
| 363 $self->{'_neighbor_start'} = $ref if (defined $ref); | |
| 364 return $self->{'_neighbor_start'}; | |
| 365 } | |
| 366 | |
| 367 =head2 downstream_neighbor | |
| 368 | |
| 369 Title : downstream_neighbor | |
| 370 Usage : $contig->downstream_neighbor($num); | |
| 371 Function : Get/Set a contig neighbor for the current contig when | |
| 372 building a scaffold. The downstream neighbor is | |
| 373 located after $contig last base | |
| 374 Returns : nothing | |
| 375 Argument : Bio::Assembly::Contig | |
| 376 | |
| 377 =cut | |
| 378 | |
| 379 sub downstream_neighbor { | |
| 380 my $self = shift; | |
| 381 my $ref = shift; | |
| 382 | |
| 383 $self->throw("Trying to assign a non Bio::Assembly::Contig object to downstream contig") | |
| 384 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig")); | |
| 385 $self->{'_neighbor_end'} = $ref if (defined $ref); | |
| 386 return $self->{'_neighbor_end'}; | |
| 387 } | |
| 388 | |
| 389 =head1 Contig feature collection methods | |
| 390 | |
| 391 =head2 add_features | |
| 392 | |
| 393 Title : add_features | |
| 394 Usage : $contig->add_features($feat,$flag) | |
| 395 Function : | |
| 396 | |
| 397 Add an array of features to the contig feature | |
| 398 collection. The consensus sequence may be attached to the | |
| 399 added feature, if $flag is set to 1. If $flag is 0 and | |
| 400 the feature attached to one of the contig aligned | |
| 401 sequences, the feature is registered as an aligned | |
| 402 sequence feature. If $flag is 0 and the feature is not | |
| 403 attched to any sequence in the contig, the feature is | |
| 404 simply added to the feature collection and no attachment | |
| 405 or registration is made. | |
| 406 | |
| 407 Note: You must attach aligned sequences to their features | |
| 408 prior to calling add_features, otherwise you won't be | |
| 409 able to access the feature through get_seq_feat_by_tag() | |
| 410 method. | |
| 411 | |
| 412 Returns : number of features added. | |
| 413 Argument : | |
| 414 $feat : A reference to an array of Bio::SeqFeatureI | |
| 415 $flag : boolean - true if consensus sequence object | |
| 416 should be attached to this feature, false if | |
| 417 no consensus attachment should be made. | |
| 418 Default: false. | |
| 419 | |
| 420 =cut | |
| 421 | |
| 422 sub add_features { | |
| 423 my ($self, $args, $flag) = @_; | |
| 424 | |
| 425 # Adding shortcuts for aligned sequence features | |
| 426 $flag = 0 unless (defined $flag); | |
| 427 if ($flag && defined $self->{'_consensus_sequence'}) { | |
| 428 foreach my $feat (@$args) { | |
| 429 next if (defined $feat->seq); | |
| 430 $feat->attach_seq($self->{'_consensus_sequence'}); | |
| 431 } | |
| 432 } elsif (!$flag) { # Register aligned sequence features | |
| 433 foreach my $feat (@$args) { | |
| 434 if (my $seq = $feat->entire_seq()) { | |
| 435 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
| 436 $self->warn("Adding contig feature attached to unknown sequence $seqID!") | |
| 437 unless (exists $self->{'_elem'}{$seqID}); | |
| 438 my $tag = $feat->primary_tag; | |
| 439 $self->{'_elem'}{$seqID}{'_feat'}{$tag} = $feat; | |
| 440 } | |
| 441 } | |
| 442 } | |
| 443 | |
| 444 # Add feature to feature collection | |
| 445 my $nof_added = $self->{'_sfc'}->add_features($args); | |
| 446 | |
| 447 return $nof_added; | |
| 448 } | |
| 449 | |
| 450 =head2 remove_features | |
| 451 | |
| 452 Title : remove_features | |
| 453 Usage : $contig->remove_features(@feat) | |
| 454 Function : Remove an array of contig features | |
| 455 Returns : number of features removed. | |
| 456 Argument : An array of Bio::SeqFeatureI | |
| 457 | |
| 458 =cut | |
| 459 | |
| 460 sub remove_features { | |
| 461 my ($self, @args) = @_; | |
| 462 | |
| 463 # Removing shortcuts for aligned sequence features | |
| 464 foreach my $feat (@args) { | |
| 465 if (my $seq = $feat->entire_seq()) { | |
| 466 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
| 467 my $tag = $feat->primary_tag; | |
| 468 $tag =~ s/:$seqID$/$1/g; | |
| 469 delete( $self->{'_elem'}{$seqID}{'_feat'}{$tag} ) | |
| 470 if (exists $self->{'_elem'}{$seqID}{'_feat'}{$tag} && | |
| 471 $self->{'_elem'}{$seqID}{'_feat'}{$tag} eq $feat); | |
| 472 } | |
| 473 } | |
| 474 | |
| 475 return $self->{'_sfc'}->remove_features(\@args); | |
| 476 } | |
| 477 | |
| 478 =head2 get_features_collection | |
| 479 | |
| 480 Title : get_features_collection | |
| 481 Usage : $contig->get_features_collection() | |
| 482 Function : Get the collection of all contig features | |
| 483 Returns : Bio::SeqFeature::Collection | |
| 484 Argument : none | |
| 485 | |
| 486 =cut | |
| 487 | |
| 488 sub get_features_collection { | |
| 489 my $self = shift; | |
| 490 | |
| 491 return $self->{'_sfc'}; | |
| 492 } | |
| 493 | |
| 494 =head1 Coordinate system's related methods | |
| 495 | |
| 496 See L<Coordinate_Systems> above. | |
| 497 | |
| 498 =head2 change_coord | |
| 499 | |
| 500 Title : change_coord | |
| 501 Usage : $contig->change_coord($in,$out,$query) | |
| 502 Function : | |
| 503 | |
| 504 Change coordinate system for $query. This method | |
| 505 transforms locations between coordinate systems described | |
| 506 in section "Coordinate Systems" of this document. | |
| 507 | |
| 508 Note: this method will throw an exception when changing | |
| 509 coordinates between "ungapped consensus" and other | |
| 510 systems if consensus sequence was not set. It will also | |
| 511 throw exceptions when changing coordinates among aligned | |
| 512 sequence, either with or without gaps, and other systems | |
| 513 if sequence locations were not set with set_seq_coord(). | |
| 514 | |
| 515 Returns : integer | |
| 516 Argument : | |
| 517 $in : [string] input coordinate system | |
| 518 $out : [string] output coordinate system | |
| 519 $query : [integer] a position in a sequence | |
| 520 | |
| 521 =cut | |
| 522 | |
| 523 sub change_coord { | |
| 524 my $self = shift; | |
| 525 my $type_in = shift; | |
| 526 my $type_out = shift; | |
| 527 my $query = shift; | |
| 528 | |
| 529 # Parsing arguments | |
| 530 # Loading read objects (these calls will throw exceptions whether $read_in or | |
| 531 # $read_out is not found | |
| 532 my ($read_in,$read_out) = (undef,undef); | |
| 533 my $in_ID = ( split(' ',$type_in) )[1]; | |
| 534 my $out_ID = ( split(' ',$type_out) )[1]; | |
| 535 | |
| 536 if ($in_ID ne 'consensus') { | |
| 537 $read_in = $self->get_seq_coord( $self->get_seq_by_name($in_ID) ); | |
| 538 $self->throw("Can't change coordinates without sequence location for $in_ID") | |
| 539 unless (defined $read_in); | |
| 540 } | |
| 541 if ($out_ID ne 'consensus') { | |
| 542 $read_out = $self->get_seq_coord( $self->get_seq_by_name($out_ID) ); | |
| 543 $self->throw("Can't change coordinates without sequence location for $out_ID") | |
| 544 unless (defined $read_out); | |
| 545 } | |
| 546 | |
| 547 # Performing transformation between coordinates | |
| 548 SWITCH1: { | |
| 549 | |
| 550 # Transformations between contig padded and contig unpadded | |
| 551 (($type_in eq 'gapped consensus') && ($type_out eq 'ungapped consensus')) && do { | |
| 552 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence") | |
| 553 unless (defined $self->{'_consensus_sequence'}); | |
| 554 $query = &_padded_unpadded($self->{'_consensus_gaps'}, $query); | |
| 555 last SWITCH1; | |
| 556 }; | |
| 557 (($type_in eq 'ungapped consensus') && ($type_out eq 'gapped consensus')) && do { | |
| 558 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence") | |
| 559 unless (defined $self->{'_consensus_sequence'}); | |
| 560 $query = &_unpadded_padded($self->{'_consensus_gaps'},$query); | |
| 561 last SWITCH1; | |
| 562 }; | |
| 563 | |
| 564 # Transformations between contig (padded) and read (padded) | |
| 565 (($type_in eq 'gapped consensus') && | |
| 566 ($type_out =~ /^aligned /) && defined($read_out)) && do { | |
| 567 $query = $query - $read_out->start() + 1; | |
| 568 last SWITCH1; | |
| 569 }; | |
| 570 (($type_in =~ /^aligned /) && defined($read_in) && | |
| 571 ($type_out eq 'gapped consensus')) && do { | |
| 572 $query = $query + $read_in->start() - 1; | |
| 573 last SWITCH1; | |
| 574 }; | |
| 575 | |
| 576 # Transformations between contig (unpadded) and read (padded) | |
| 577 (($type_in eq 'ungapped consensus') && | |
| 578 ($type_out =~ /^aligned /) && defined($read_out)) && do { | |
| 579 $query = $self->change_coord('ungapped consensus','gapped consensus',$query); | |
| 580 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); | |
| 581 last SWITCH1; | |
| 582 }; | |
| 583 (($type_in =~ /^aligned /) && defined($read_in) && | |
| 584 ($type_out eq 'ungapped consensus')) && do { | |
| 585 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); | |
| 586 $query = $self->change_coord('gapped consensus','ungapped consensus',$query); | |
| 587 last SWITCH1; | |
| 588 }; | |
| 589 | |
| 590 # Transformations between seq $read_in padded and seq $read_out padded | |
| 591 (defined($read_in) && ($type_in =~ /^aligned /) && | |
| 592 defined($read_out) && ($type_out =~ /^aligned /)) && do { | |
| 593 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); | |
| 594 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); | |
| 595 last SWITCH1; | |
| 596 }; | |
| 597 | |
| 598 # Transformations between seq $read_in padded and seq $read_out unpadded | |
| 599 (defined($read_in) && ($type_in =~ /^aligned /) && | |
| 600 defined($read_out) && ($type_out =~ /^unaligned /)) && do { | |
| 601 if ($read_in ne $read_out) { | |
| 602 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); | |
| 603 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); | |
| 604 } | |
| 605 my $list_out = $self->{'_elem'}{$out_ID}{'_gaps'}; | |
| 606 $query = &_padded_unpadded($list_out,$query); | |
| 607 # Changing read orientation if read was reverse complemented when aligned | |
| 608 if ($read_out->strand == -1) { | |
| 609 my ($length) = $read_out->length(); | |
| 610 $length = $length - &_nof_gaps($list_out,$length); | |
| 611 $query = $length - $query + 1; | |
| 612 } | |
| 613 last SWITCH1; | |
| 614 }; | |
| 615 (defined($read_in) && ($type_in =~ /^unaligned /) && | |
| 616 defined($read_out) && ($type_out =~ /^aligned /)) && do { | |
| 617 my $list_in = $self->{'_elem'}{$in_ID}{'_gaps'}; | |
| 618 # Changing read orientation if read was reverse complemented when aligned | |
| 619 if ($read_in->strand == -1) { | |
| 620 my ($length) = $read_in->length(); | |
| 621 $length = $length - &_nof_gaps($list_in,$length); | |
| 622 $query = $length - $query + 1; | |
| 623 } | |
| 624 $query = &_unpadded_padded($list_in,$query); | |
| 625 if ($read_in ne $read_out) { | |
| 626 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); | |
| 627 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); | |
| 628 } | |
| 629 last SWITCH1; | |
| 630 }; | |
| 631 | |
| 632 # Transformations between seq $read_in unpadded and seq $read_out unpadded | |
| 633 (defined($read_in) && ($type_in =~ /^unaligned /) && | |
| 634 defined($read_out) && ($type_out =~ /^unaligned /)) && do { | |
| 635 $query = $self->change_coord("unaligned $in_ID","aligned $out_ID",$query); | |
| 636 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query); | |
| 637 last SWITCH1; | |
| 638 }; | |
| 639 | |
| 640 # Transformations between contig (padded) and read (unpadded) | |
| 641 (($type_in eq 'gapped consensus') && | |
| 642 ($type_out =~ /^unaligned /) && defined($read_out)) && do { | |
| 643 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); | |
| 644 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query); | |
| 645 last SWITCH1; | |
| 646 }; | |
| 647 (($type_in =~ /^unaligned /) && defined($read_in) && | |
| 648 ($type_out eq 'gapped consensus')) && do { | |
| 649 $query = $self->change_coord("unaligned $in_ID","aligned $in_ID",$query); | |
| 650 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); | |
| 651 last SWITCH1; | |
| 652 }; | |
| 653 | |
| 654 # Transformations between contig (unpadded) and read (unpadded) | |
| 655 (($type_in eq 'ungapped consensus') && | |
| 656 ($type_out =~ /^unaligned /) && defined($read_out)) && do { | |
| 657 $query = $self->change_coord('ungapped consensus','gapped consensus',$query); | |
| 658 $query = $self->change_coord('gapped consensus',"unaligned $out_ID",$query); | |
| 659 last SWITCH1; | |
| 660 }; | |
| 661 (($type_in =~ /^unaligned /) && defined($read_in) && | |
| 662 ($type_out eq 'ungapped consensus')) && do { | |
| 663 $query = $self->change_coord("unaligned $in_ID",'gapped consensus',$query); | |
| 664 $query = $self->change_coord('gapped consensus','ungapped consensus',$query); | |
| 665 last SWITCH1; | |
| 666 }; | |
| 667 | |
| 668 $self->throw("Unknow coordinate system. Args: $type_in, $type_out."); | |
| 669 $query = undef; # If a coordinate systems just requested is unknown | |
| 670 } | |
| 671 | |
| 672 return $query; | |
| 673 } | |
| 674 | |
| 675 =head2 get_seq_coord | |
| 676 | |
| 677 Title : get_seq_coord | |
| 678 Usage : $contig->get_seq_coord($seq); | |
| 679 Function : Get "gapped consensus" location for aligned sequence | |
| 680 Returns : Bio::SeqFeature::Generic for coordinates or undef. | |
| 681 A warning is printed if sequence coordinates were not set. | |
| 682 Argument : Bio::LocatabaleSeq object | |
| 683 | |
| 684 =cut | |
| 685 | |
| 686 sub get_seq_coord { | |
| 687 my ($self,$seq) = @_; | |
| 688 | |
| 689 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
| 690 $self->throw("$seq is not a Bio::LocatableSeq"); | |
| 691 } | |
| 692 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
| 693 | |
| 694 unless (exists( $self->{'_elem'}{$seqID} )) { | |
| 695 $self->warn("No such sequence ($seqID) in contig ".$self->id); | |
| 696 return undef; | |
| 697 } | |
| 698 unless (exists( $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} )) { | |
| 699 $self->warn("Location not set for sequence ($seqID) in contig ".$self->id); | |
| 700 return undef; | |
| 701 } | |
| 702 | |
| 703 return $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"}; | |
| 704 } | |
| 705 | |
| 706 =head2 set_seq_coord | |
| 707 | |
| 708 Title : set_seq_coord | |
| 709 Usage : $contig->set_seq_coord($feat,$seq); | |
| 710 Function : | |
| 711 | |
| 712 Set "gapped consensus" location for an aligned | |
| 713 sequence. If the sequence was previously added using | |
| 714 add_seq, its coordinates are changed/set. Otherwise, | |
| 715 add_seq is called and the sequence is added to the | |
| 716 contig. | |
| 717 | |
| 718 Returns : Bio::SeqFeature::Generic for old coordinates or undef. | |
| 719 Argument : | |
| 720 $feat : a Bio::SeqFeature::Generic object | |
| 721 representing a location for the | |
| 722 aligned sequence, in "gapped | |
| 723 consensus" coordinates. | |
| 724 | |
| 725 Note: the original feature primary tag will | |
| 726 be lost. | |
| 727 | |
| 728 $seq : a Bio::LocatabaleSeq object | |
| 729 | |
| 730 =cut | |
| 731 | |
| 732 sub set_seq_coord { | |
| 733 my ($self,$feat,$seq) = @_; | |
| 734 | |
| 735 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
| 736 $self->throw("Unable to process non locatable sequences [".ref($seq)."]"); | |
| 737 } | |
| 738 | |
| 739 # Complaining about inadequate feature object | |
| 740 $self->throw("Coordinates must be a Bio::SeqFeature::Generic object!") | |
| 741 unless ( $feat->isa("Bio::SeqFeature::Generic") ); | |
| 742 $self->throw("Sequence coordinates must have an end!") | |
| 743 unless (defined $feat->end); | |
| 744 $self->throw("Sequence coordinates must have a start!") | |
| 745 unless (defined $feat->start); | |
| 746 | |
| 747 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
| 748 if (exists( $self->{'_elem'}{$seqID} ) && | |
| 749 exists( $self->{'_elem'}{$seqID}{'_seq'} ) && | |
| 750 defined( $self->{'_elem'}{$seqID}{'_seq'} ) && | |
| 751 ($seq ne $self->{'_elem'}{$seqID}{'_seq'}) ) { | |
| 752 $self->warn("Replacing sequence $seqID\n"); | |
| 753 $self->remove_seq($self->{'_elem'}{$seqID}{'_seq'}); | |
| 754 } | |
| 755 $self->add_seq($seq); | |
| 756 | |
| 757 # Remove previous coordinates, if any | |
| 758 $self->remove_features($feat); | |
| 759 | |
| 760 # Add new Bio::Generic::SeqFeature | |
| 761 $feat->add_tag_value('contig',$self->id) | |
| 762 unless ( $feat->has_tag('contig') ); | |
| 763 $feat->primary_tag("_aligned_coord:$seqID"); | |
| 764 $feat->attach_seq($seq); | |
| 765 $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} = $feat; | |
| 766 $self->add_features([ $feat ]); | |
| 767 } | |
| 768 | |
| 769 =head1 Bio::Assembly::Contig consensus methods | |
| 770 | |
| 771 =head2 set_consensus_sequence | |
| 772 | |
| 773 Title : set_consensus_sequence | |
| 774 Usage : $contig->set_consensus_sequence($seq) | |
| 775 Function : Set the consensus sequence object for this contig | |
| 776 Returns : consensus length | |
| 777 Argument : Bio::LocatableSeq | |
| 778 | |
| 779 =cut | |
| 780 | |
| 781 sub set_consensus_sequence { | |
| 782 my $self = shift; | |
| 783 my $seq = shift; | |
| 784 | |
| 785 $self->throw("Consensus sequence must be a Bio::LocatableSeq!") | |
| 786 unless ($seq->isa("Bio::LocatableSeq")); | |
| 787 | |
| 788 my $con_len = $seq->length; | |
| 789 $seq->start(1); $seq->end($con_len); | |
| 790 | |
| 791 $self->{'_consensus_gaps'} = []; # Consensus Gap registry | |
| 792 $self->_register_gaps($seq->seq, | |
| 793 $self->{'_consensus_gaps'}); | |
| 794 $self->{'_consensus_sequence'} = $seq; | |
| 795 | |
| 796 return $con_len; | |
| 797 } | |
| 798 | |
| 799 =head2 set_consensus_quality | |
| 800 | |
| 801 Title : set_consensus_quality | |
| 802 Usage : $contig->set_consensus_quality($qual) | |
| 803 Function : Set the quality object for consensus sequence | |
| 804 Returns : nothing | |
| 805 Argument : Bio::Seq::QualI object | |
| 806 | |
| 807 =cut | |
| 808 | |
| 809 sub set_consensus_quality { | |
| 810 my $self = shift; | |
| 811 my $qual = shift; | |
| 812 | |
| 813 $self->throw("Consensus quality must be a Bio::Seq::QualI object!") | |
| 814 unless ( $qual->isa("Bio::Seq::QualI") ); | |
| 815 | |
| 816 $self->throw("Consensus quality can't be added before you set the consensus sequence!") | |
| 817 unless (defined $self->{'_consensus_sequence'}); | |
| 818 | |
| 819 $self->{'_consensus_quality'} = $qual; | |
| 820 } | |
| 821 | |
| 822 =head2 get_consensus_length | |
| 823 | |
| 824 Title : get_consensus_length | |
| 825 Usage : $contig->get_consensus_length() | |
| 826 Function : Get consensus sequence length | |
| 827 Returns : integer | |
| 828 Argument : none | |
| 829 | |
| 830 =cut | |
| 831 | |
| 832 sub get_consensus_length { | |
| 833 my $self = shift; | |
| 834 | |
| 835 return $self->{'_consensus_sequence'}->length(); | |
| 836 } | |
| 837 | |
| 838 =head2 get_consensus_sequence | |
| 839 | |
| 840 Title : get_consensus_sequence | |
| 841 Usage : $contig->get_consensus_sequence() | |
| 842 Function : Get a reference to the consensus sequence object | |
| 843 for this contig | |
| 844 Returns : Bio::SeqI object | |
| 845 Argument : none | |
| 846 | |
| 847 =cut | |
| 848 | |
| 849 sub get_consensus_sequence { | |
| 850 my ($self, @args) = @_; | |
| 851 | |
| 852 return $self->{'_consensus_sequence'}; | |
| 853 } | |
| 854 | |
| 855 =head2 get_consensus_quality | |
| 856 | |
| 857 Title : get_consensus_quality | |
| 858 Usage : $contig->get_consensus_quality() | |
| 859 Function : Get a reference to the consensus quality object | |
| 860 for this contig. | |
| 861 Returns : A Bio::QualI object | |
| 862 Argument : none | |
| 863 | |
| 864 =cut | |
| 865 | |
| 866 sub get_consensus_quality { | |
| 867 my ($self, @args) = @_; | |
| 868 | |
| 869 return $self->{'_consensus_quality'}; | |
| 870 } | |
| 871 | |
| 872 =head1 Bio::Assembly::Contig aligned sequences methods | |
| 873 | |
| 874 =head2 set_seq_qual | |
| 875 | |
| 876 Title : set_seq_qual | |
| 877 Usage : $contig->set_seq_qual($seq,$qual); | |
| 878 Function : Adds quality to an aligned sequence. | |
| 879 Returns : nothing | |
| 880 Argument : a Bio::LocatableSeq object and | |
| 881 a Bio::Seq::QualI object | |
| 882 | |
| 883 See L<Bio::LocatableSeq> for more information. | |
| 884 | |
| 885 =cut | |
| 886 | |
| 887 sub set_seq_qual { | |
| 888 my ($self,$seq,$qual) = @_; | |
| 889 | |
| 890 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
| 891 $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); | |
| 892 } | |
| 893 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
| 894 | |
| 895 $self->throw("Consensus quality must be a Bio::Seq::QualI object!") | |
| 896 unless ( $qual->isa("Bio::Seq::QualI") ); | |
| 897 $self->throw("Use add_seq first: aligned sequence qualities can't be added before you load the sequence!") | |
| 898 unless (exists $self->{'_elem'}{$seqID}{'_seq'}); | |
| 899 $self->throw("Use set_seq_coord first: aligned sequence qualities can't be added before you add coordinates for the sequence!") unless (defined( $self->get_seq_coord($seq) )); | |
| 900 | |
| 901 # Adding gaps to quality object | |
| 902 my $sequence = $self->{'_elem'}{$seqID}{'_seq'}->seq(); | |
| 903 my $tmp = $qual->qual(); | |
| 904 @{$tmp} = reverse(@{$tmp}) if ($self->get_seq_coord($seq)->strand() == -1); | |
| 905 my @quality = (); | |
| 906 my $previous = 0; | |
| 907 my $next = 0; | |
| 908 my $i = 0; my $j = 0; | |
| 909 while ($i<=$#{$tmp}) { | |
| 910 # IF base is a gap, quality is the average for neighbouring sites | |
| 911 if (substr($sequence,$j,1) eq '-') { | |
| 912 $previous = $tmp->[$i-1] unless ($i == 0); | |
| 913 if ($i < $#{$tmp}) { | |
| 914 $next = $tmp->[$i+1]; | |
| 915 } else { | |
| 916 $next = 0; | |
| 917 } | |
| 918 push(@quality,int( ($previous+$next)/2 )); | |
| 919 } else { | |
| 920 push(@quality,$tmp->[$i]); | |
| 921 $i++; | |
| 922 } | |
| 923 $j++; | |
| 924 } | |
| 925 | |
| 926 $self->{'_elem'}{$seqID}{'_qual'} = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality), | |
| 927 -id=>$seqID); | |
| 928 } | |
| 929 | |
| 930 =head2 get_seq_ids | |
| 931 | |
| 932 Title : get_seq_ids | |
| 933 Usage : $contig->get_seq_ids(-start=>$start, | |
| 934 -end=>$end, | |
| 935 -type=>"gapped A0QR67B08.b"); | |
| 936 Function : Get list of sequence IDs overlapping inteval [$start, $end] | |
| 937 The default interval is [1,$contig->length] | |
| 938 Default coordinate system is "gapped contig" | |
| 939 Returns : An array | |
| 940 Argument : A hash with optional elements: | |
| 941 -start : consensus subsequence start | |
| 942 -end : consensus subsequence end | |
| 943 -type : the coordinate system type for $start and $end arguments | |
| 944 Coordinate system avaliable are: | |
| 945 "gapped consensus" : consensus coordinates with gaps | |
| 946 "ungapped consensus" : consensus coordinates without gaps | |
| 947 "aligned $ReadID" : read $ReadID coordinates with gaps | |
| 948 "unaligned $ReadID" : read $ReadID coordinates without gaps | |
| 949 | |
| 950 | |
| 951 =cut | |
| 952 | |
| 953 sub get_seq_ids { | |
| 954 my ($self, @args) = @_; | |
| 955 | |
| 956 my ($type,$start,$end) = | |
| 957 $self->_rearrange([qw(TYPE START END)], @args); | |
| 958 | |
| 959 if (defined($start) && defined($end)) { | |
| 960 if (defined($type) && ($type ne 'gapped consensus')) { | |
| 961 $start = $self->change_coord($type,'gapped consensus',$start); | |
| 962 $end = $self->change_coord($type,'gapped consensus',$end); | |
| 963 } | |
| 964 | |
| 965 my @list = grep { $_->isa("Bio::SeqFeature::Generic") && | |
| 966 ($_->primary_tag =~ /^_aligned_coord:/) } | |
| 967 $self->{'_sfc'}->features_in_range(-start=>$start, | |
| 968 -end=>$end, | |
| 969 -contain=>0, | |
| 970 -strandmatch=>'ignore'); | |
| 971 @list = map { $_->entire_seq->id } @list; | |
| 972 return @list; | |
| 973 } | |
| 974 | |
| 975 # Entire aligned sequences list | |
| 976 return map { $self->{'_order'}{$_} } sort { $a cmp $b } keys %{ $self->{'_order'} }; | |
| 977 } | |
| 978 | |
| 979 =head2 get_seq_feat_by_tag | |
| 980 | |
| 981 Title : get_seq_feat_by_tag | |
| 982 Usage : $seq = $contig->get_seq_feat_by_tag($seq,"_aligned_coord:$seqID") | |
| 983 Function : | |
| 984 | |
| 985 Get a sequence feature based on its primary_tag. | |
| 986 When you add | |
| 987 | |
| 988 Returns : a Bio::SeqFeature object | |
| 989 Argument : a Bio::LocatableSeq and a string (feature primary tag) | |
| 990 | |
| 991 =cut | |
| 992 | |
| 993 sub get_seq_feat_by_tag { | |
| 994 my ($self,$seq,$tag) = @_; | |
| 995 | |
| 996 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
| 997 $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); | |
| 998 } | |
| 999 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
| 1000 | |
| 1001 return $self->{'_elem'}{$seqID}{'_feat'}{$tag}; | |
| 1002 } | |
| 1003 | |
| 1004 =head2 get_seq_by_name | |
| 1005 | |
| 1006 Title : get_seq_by_name | |
| 1007 Usage : $seq = $contig->get_seq_by_name('Seq1') | |
| 1008 Function : Gets a sequence based on its id. | |
| 1009 Returns : a Bio::LocatableSeq object | |
| 1010 undef if name is not found | |
| 1011 Argument : string | |
| 1012 | |
| 1013 =cut | |
| 1014 | |
| 1015 sub get_seq_by_name { | |
| 1016 my $self = shift; | |
| 1017 my ($seqID) = @_; | |
| 1018 | |
| 1019 unless (exists $self->{'_elem'}{$seqID}{'_seq'}) { | |
| 1020 $self->throw("Could not find sequence $seqID in contig ".$self->id); | |
| 1021 return undef; | |
| 1022 } | |
| 1023 | |
| 1024 return $self->{'_elem'}{$seqID}{'_seq'}; | |
| 1025 } | |
| 1026 | |
| 1027 =head2 get_qual_by_name | |
| 1028 | |
| 1029 Title : get_qual_by_name | |
| 1030 Usage : $seq = $contig->get_qual_by_name('Seq1') | |
| 1031 Function : | |
| 1032 | |
| 1033 Gets Bio::Seq::QualI object for a sequence | |
| 1034 through its id ( as given by $qual->id() ). | |
| 1035 | |
| 1036 Returns : a Bio::Seq::QualI object. | |
| 1037 undef if name is not found | |
| 1038 Argument : string | |
| 1039 | |
| 1040 =cut | |
| 1041 | |
| 1042 sub get_qual_by_name { | |
| 1043 my $self = shift; | |
| 1044 my ($seqID) = @_; | |
| 1045 | |
| 1046 unless (exists $self->{'_elem'}{$seqID}{'_qual'}) { | |
| 1047 $self->warn("Could not find quality for $seqID in contig!"); | |
| 1048 return undef; | |
| 1049 } | |
| 1050 | |
| 1051 return $self->{'_elem'}{$seqID}{'_qual'}; | |
| 1052 } | |
| 1053 | |
| 1054 =head1 Bio::Align::AlignI compatible methods | |
| 1055 | |
| 1056 =head2 Modifier methods | |
| 1057 | |
| 1058 These methods modify the MSE by adding, removing or shuffling complete | |
| 1059 sequences. | |
| 1060 | |
| 1061 =head2 add_seq | |
| 1062 | |
| 1063 Title : add_seq | |
| 1064 Usage : $contig->add_seq($newseq); | |
| 1065 Function : | |
| 1066 | |
| 1067 Adds a sequence to the contig. *Does* | |
| 1068 *not* align it - just adds it to the | |
| 1069 hashes. | |
| 1070 | |
| 1071 Returns : nothing | |
| 1072 Argument : a Bio::LocatableSeq object | |
| 1073 | |
| 1074 See L<Bio::LocatableSeq> for more information. | |
| 1075 | |
| 1076 =cut | |
| 1077 | |
| 1078 sub add_seq { | |
| 1079 my $self = shift; | |
| 1080 my $seq = shift; | |
| 1081 | |
| 1082 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
| 1083 $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); | |
| 1084 } | |
| 1085 | |
| 1086 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
| 1087 $self->{'_elem'}{$seqID} = {} unless (exists $self->{'elem'}{$seqID}); | |
| 1088 | |
| 1089 if (exists( $self->{'_elem'}{$seqID}{'_seq'} ) && | |
| 1090 ($seq eq $self->{'_elem'}{$seqID}{'_seq'}) ) { | |
| 1091 $self->warn("Adding sequence $seqID, which has already been added"); | |
| 1092 } | |
| 1093 | |
| 1094 # Our locatable sequences are always considered to be complete sequences | |
| 1095 $seq->start(1); $seq->end($seq->length()); | |
| 1096 | |
| 1097 $self->warn("Adding non-nucleotidic sequence ".$seqID) | |
| 1098 if (lc($seq->alphabet) ne 'dna' && lc($seq->alphabet) ne 'rna'); | |
| 1099 | |
| 1100 # build the symbol list for this sequence, | |
| 1101 # will prune out the gap and missing/match chars | |
| 1102 # when actually asked for the symbol list in the | |
| 1103 # symbol_chars | |
| 1104 if (defined $seq->seq) { | |
| 1105 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq); | |
| 1106 } else { | |
| 1107 $self->{'_symbols'} = {}; | |
| 1108 } | |
| 1109 | |
| 1110 my $seq_no = ++$self->{'_nof_seqs'}; | |
| 1111 | |
| 1112 if (ref( $self->{'_elem'}{$seqID}{'_seq'} )) { | |
| 1113 $self->warn("Replacing one sequence [$seqID]\n"); | |
| 1114 } else { | |
| 1115 #print STDERR "Assigning $seqID to $order\n"; | |
| 1116 $self->{'_order'}->{$seq_no} = $seqID; | |
| 1117 # $self->{'_start_end_lists'}->{$id} = [] | |
| 1118 # unless(exists $self->{'_start_end_lists'}->{$id}); | |
| 1119 # push @{$self->{'_start_end_lists'}->{$id}}, $seq; | |
| 1120 } | |
| 1121 | |
| 1122 $self->{'_elem'}{$seqID}{'_seq'} = $seq; | |
| 1123 $self->{'_elem'}{$seqID}{'_feat'} = {}; | |
| 1124 $self->{'_elem'}{$seqID}{'_gaps'} = []; | |
| 1125 my $dbref = $self->{'_elem'}{$seqID}{'_gaps'}; | |
| 1126 my $nofgaps = $self->_register_gaps($seq->seq,$dbref); | |
| 1127 | |
| 1128 # Updating residue count | |
| 1129 $self->{'_nof_residues'} += $seq->length - $nofgaps; | |
| 1130 | |
| 1131 return 1; | |
| 1132 } | |
| 1133 | |
| 1134 =head2 remove_seq | |
| 1135 | |
| 1136 Title : remove_seq | |
| 1137 Usage : $contig->remove_seq($seq); | |
| 1138 Function : Removes a single sequence from an alignment | |
| 1139 Returns : 1 on success, 0 otherwise | |
| 1140 Argument : a Bio::LocatableSeq object | |
| 1141 | |
| 1142 =cut | |
| 1143 | |
| 1144 sub remove_seq { | |
| 1145 my ($self,$seq) = @_; | |
| 1146 | |
| 1147 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
| 1148 $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); | |
| 1149 } | |
| 1150 | |
| 1151 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
| 1152 unless (exists $self->{'_elem'}{$seqID} ) { | |
| 1153 $self->warn("No sequence named $seqID [$seq]"); | |
| 1154 return 0; | |
| 1155 } | |
| 1156 | |
| 1157 # Updating residue count | |
| 1158 $self->{'_nof_residues'} -= $seq->length() + | |
| 1159 &_nof_gaps( $self->{'_elem'}{$seqID}{'_gaps'}, $seq->length ); | |
| 1160 | |
| 1161 # Remove all references to features of this sequence | |
| 1162 my @feats = (); | |
| 1163 foreach my $tag (keys %{ $self->{'_elem'}{$seqID}{'_feat'} }) { | |
| 1164 push(@feats, $self->{'_elem'}{$seqID}{'_feat'}{$tag}); | |
| 1165 } | |
| 1166 $self->{'_sfc'}->remove_features(\@feats); | |
| 1167 delete $self->{'_elem'}{$seqID}; | |
| 1168 | |
| 1169 return 1; | |
| 1170 } | |
| 1171 | |
| 1172 =head2 purge | |
| 1173 | |
| 1174 Title : purge | |
| 1175 Usage : $contig->purge(0.7); | |
| 1176 Function: | |
| 1177 | |
| 1178 Removes sequences above whatever %id. | |
| 1179 | |
| 1180 This function will grind on large alignments. Beware! | |
| 1181 (perhaps not ideally implemented) | |
| 1182 | |
| 1183 Example : | |
| 1184 Returns : An array of the removed sequences | |
| 1185 Argument: | |
| 1186 | |
| 1187 | |
| 1188 =cut | |
| 1189 | |
| 1190 sub purge { | |
| 1191 my ($self) = @_; | |
| 1192 $self->throw_not_implemented(); | |
| 1193 } | |
| 1194 | |
| 1195 =head2 sort_alphabetically | |
| 1196 | |
| 1197 Title : sort_alphabetically | |
| 1198 Usage : $contig->sort_alphabetically | |
| 1199 Function : | |
| 1200 | |
| 1201 Changes the order of the alignemnt to alphabetical on name | |
| 1202 followed by numerical by number. | |
| 1203 | |
| 1204 Returns : | |
| 1205 Argument : | |
| 1206 | |
| 1207 =cut | |
| 1208 | |
| 1209 sub sort_alphabetically { | |
| 1210 my ($self) = @_; | |
| 1211 $self->throw_not_implemented(); | |
| 1212 } | |
| 1213 | |
| 1214 =head2 Sequence selection methods | |
| 1215 | |
| 1216 Methods returning one or more sequences objects. | |
| 1217 | |
| 1218 =head2 each_seq | |
| 1219 | |
| 1220 Title : each_seq | |
| 1221 Usage : foreach $seq ( $contig->each_seq() ) | |
| 1222 Function : Gets an array of Seq objects from the alignment | |
| 1223 Returns : an array | |
| 1224 Argument : | |
| 1225 | |
| 1226 =cut | |
| 1227 | |
| 1228 sub each_seq { | |
| 1229 my ($self) = @_; | |
| 1230 | |
| 1231 my (@arr,$seqID); | |
| 1232 | |
| 1233 foreach $seqID ( map { $self->{'_order'}{$_} } sort { $a <=> $b } keys %{$self->{'_order'}} ) { | |
| 1234 push(@arr,$self->{'_elem'}{$seqID}{'_seq'}); | |
| 1235 } | |
| 1236 | |
| 1237 return @arr; | |
| 1238 } | |
| 1239 | |
| 1240 =head2 each_alphabetically | |
| 1241 | |
| 1242 Title : each_alphabetically | |
| 1243 Usage : foreach $seq ( $contig->each_alphabetically() ) | |
| 1244 Function : | |
| 1245 | |
| 1246 Returns an array of sequence object sorted alphabetically | |
| 1247 by name and then by start point. | |
| 1248 Does not change the order of the alignment | |
| 1249 | |
| 1250 Returns : | |
| 1251 Argument : | |
| 1252 | |
| 1253 =cut | |
| 1254 | |
| 1255 sub each_alphabetically { | |
| 1256 my($self) = @_; | |
| 1257 $self->throw_not_implemented(); | |
| 1258 } | |
| 1259 | |
| 1260 =head2 each_seq_with_id | |
| 1261 | |
| 1262 Title : each_seq_with_id | |
| 1263 Usage : foreach $seq ( $contig->each_seq_with_id() ) | |
| 1264 Function : | |
| 1265 | |
| 1266 Gets an array of Seq objects from the | |
| 1267 alignment, the contents being those sequences | |
| 1268 with the given name (there may be more than one) | |
| 1269 | |
| 1270 Returns : an array | |
| 1271 Argument : a seq name | |
| 1272 | |
| 1273 =cut | |
| 1274 | |
| 1275 sub each_seq_with_id { | |
| 1276 my ($self) = @_; | |
| 1277 $self->throw_not_implemented(); | |
| 1278 } | |
| 1279 | |
| 1280 =head2 get_seq_by_pos | |
| 1281 | |
| 1282 Title : get_seq_by_pos | |
| 1283 Usage : $seq = $contig->get_seq_by_pos(3) | |
| 1284 Function : | |
| 1285 | |
| 1286 Gets a sequence based on its position in the alignment. | |
| 1287 Numbering starts from 1. Sequence positions larger than | |
| 1288 no_sequences() will thow an error. | |
| 1289 | |
| 1290 Returns : a Bio::LocatableSeq object | |
| 1291 Argument : positive integer for the sequence osition | |
| 1292 | |
| 1293 =cut | |
| 1294 | |
| 1295 sub get_seq_by_pos { | |
| 1296 my $self = shift; | |
| 1297 my ($pos) = @_; | |
| 1298 | |
| 1299 $self->throw("Sequence position has to be a positive integer, not [$pos]") | |
| 1300 unless $pos =~ /^\d+$/ and $pos > 0; | |
| 1301 $self->throw("No sequence at position [$pos]") | |
| 1302 unless $pos <= $self->no_sequences ; | |
| 1303 | |
| 1304 my $seqID = $self->{'_order'}->{--$pos}; | |
| 1305 return $self->{'_elem'}{$seqID}{'_seq'}; | |
| 1306 } | |
| 1307 | |
| 1308 =head2 Create new alignments | |
| 1309 | |
| 1310 The result of these methods are horizontal or vertical subsets of the | |
| 1311 current MSE. | |
| 1312 | |
| 1313 =head2 select | |
| 1314 | |
| 1315 Title : select | |
| 1316 Usage : $contig2 = $contig->select(1, 3) # three first sequences | |
| 1317 Function : | |
| 1318 | |
| 1319 Creates a new alignment from a continuous subset of | |
| 1320 sequences. Numbering starts from 1. Sequence positions | |
| 1321 larger than no_sequences() will thow an error. | |
| 1322 | |
| 1323 Returns : a Bio::Assembly::Contig object | |
| 1324 Argument : positive integer for the first sequence | |
| 1325 positive integer for the last sequence to include (optional) | |
| 1326 | |
| 1327 =cut | |
| 1328 | |
| 1329 sub select { | |
| 1330 my ($self) = @_; | |
| 1331 $self->throw_not_implemented(); | |
| 1332 } | |
| 1333 | |
| 1334 | |
| 1335 =head2 select_noncont | |
| 1336 | |
| 1337 Title : select_noncont | |
| 1338 Usage : $contig2 = $contig->select_noncont(1, 3) # first and 3rd sequences | |
| 1339 Function : | |
| 1340 | |
| 1341 Creates a new alignment from a subset of | |
| 1342 sequences. Numbering starts from 1. Sequence positions | |
| 1343 larger than no_sequences() will thow an error. | |
| 1344 | |
| 1345 Returns : a Bio::Assembly::Contig object | |
| 1346 Args : array of integers for the sequences | |
| 1347 | |
| 1348 =cut | |
| 1349 | |
| 1350 sub select_noncont { | |
| 1351 my ($self) = @_; | |
| 1352 $self->throw_not_implemented(); | |
| 1353 } | |
| 1354 | |
| 1355 =head2 slice | |
| 1356 | |
| 1357 Title : slice | |
| 1358 Usage : $contig2 = $contig->slice(20, 30) | |
| 1359 Function : | |
| 1360 | |
| 1361 Creates a slice from the alignment inclusive of start and | |
| 1362 end columns. Sequences with no residues in the slice are | |
| 1363 excluded from the new alignment and a warning is printed. | |
| 1364 Slice beyond the length of the sequence does not do | |
| 1365 padding. | |
| 1366 | |
| 1367 Returns : a Bio::Assembly::Contig object | |
| 1368 Argument : positive integer for start column | |
| 1369 positive integer for end column | |
| 1370 | |
| 1371 =cut | |
| 1372 | |
| 1373 sub slice { | |
| 1374 my ($self) = @_; | |
| 1375 $self->throw_not_implemented(); | |
| 1376 } | |
| 1377 | |
| 1378 =head2 Change sequences within the MSE | |
| 1379 | |
| 1380 These methods affect characters in all sequences without changeing the | |
| 1381 alignment. | |
| 1382 | |
| 1383 | |
| 1384 =head2 map_chars | |
| 1385 | |
| 1386 Title : map_chars | |
| 1387 Usage : $contig->map_chars('\.','-') | |
| 1388 Function : | |
| 1389 | |
| 1390 Does a s/$arg1/$arg2/ on the sequences. Useful for gap | |
| 1391 characters | |
| 1392 | |
| 1393 Notice that the from (arg1) is interpretted as a regex, | |
| 1394 so be careful about quoting meta characters (eg | |
| 1395 $contig->map_chars('.','-') wont do what you want) | |
| 1396 | |
| 1397 Returns : | |
| 1398 Argument : 'from' rexexp | |
| 1399 'to' string | |
| 1400 | |
| 1401 =cut | |
| 1402 | |
| 1403 sub map_chars { | |
| 1404 my ($self) = @_; | |
| 1405 $self->throw_not_implemented(); | |
| 1406 } | |
| 1407 | |
| 1408 =head2 uppercase | |
| 1409 | |
| 1410 Title : uppercase() | |
| 1411 Usage : $contig->uppercase() | |
| 1412 Function : Sets all the sequences to uppercase | |
| 1413 Returns : | |
| 1414 Argument : | |
| 1415 | |
| 1416 =cut | |
| 1417 | |
| 1418 sub uppercase { | |
| 1419 my ($self) = @_; | |
| 1420 $self->throw_not_implemented(); | |
| 1421 } | |
| 1422 | |
| 1423 =head2 match_line | |
| 1424 | |
| 1425 Title : match_line() | |
| 1426 Usage : $contig->match_line() | |
| 1427 Function : Generates a match line - much like consensus string | |
| 1428 except that a line indicating the '*' for a match. | |
| 1429 Argument : (optional) Match line characters ('*' by default) | |
| 1430 (optional) Strong match char (':' by default) | |
| 1431 (optional) Weak match char ('.' by default) | |
| 1432 | |
| 1433 =cut | |
| 1434 | |
| 1435 sub match_line { | |
| 1436 my ($self) = @_; | |
| 1437 $self->throw_not_implemented(); | |
| 1438 } | |
| 1439 | |
| 1440 =head2 match | |
| 1441 | |
| 1442 Title : match() | |
| 1443 Usage : $contig->match() | |
| 1444 Function : | |
| 1445 | |
| 1446 Goes through all columns and changes residues that are | |
| 1447 identical to residue in first sequence to match '.' | |
| 1448 character. Sets match_char. | |
| 1449 | |
| 1450 USE WITH CARE: Most MSE formats do not support match | |
| 1451 characters in sequences, so this is mostly for output | |
| 1452 only. NEXUS format (Bio::AlignIO::nexus) can handle | |
| 1453 it. | |
| 1454 | |
| 1455 Returns : 1 | |
| 1456 Argument : a match character, optional, defaults to '.' | |
| 1457 | |
| 1458 =cut | |
| 1459 | |
| 1460 sub match { | |
| 1461 my ($self) = @_; | |
| 1462 $self->throw_not_implemented(); | |
| 1463 } | |
| 1464 | |
| 1465 =head2 unmatch | |
| 1466 | |
| 1467 Title : unmatch() | |
| 1468 Usage : $contig->unmatch() | |
| 1469 Function : | |
| 1470 | |
| 1471 Undoes the effect of method match. Unsets match_char. | |
| 1472 | |
| 1473 Returns : 1 | |
| 1474 Argument : a match character, optional, defaults to '.' | |
| 1475 | |
| 1476 =cut | |
| 1477 | |
| 1478 sub unmatch { | |
| 1479 my ($self) = @_; | |
| 1480 $self->throw_not_implemented(); | |
| 1481 } | |
| 1482 | |
| 1483 | |
| 1484 =head2 MSE attibutes | |
| 1485 | |
| 1486 Methods for setting and reading the MSE attributes. | |
| 1487 | |
| 1488 Note that the methods defining character semantics depend on the user | |
| 1489 to set them sensibly. They are needed only by certain input/output | |
| 1490 methods. Unset them by setting to an empty string (''). | |
| 1491 | |
| 1492 =head2 id | |
| 1493 | |
| 1494 Title : id | |
| 1495 Usage : $contig->id("Ig") | |
| 1496 Function : Gets/sets the id field of the alignment | |
| 1497 Returns : An id string | |
| 1498 Argument : An id string (optional) | |
| 1499 | |
| 1500 =cut | |
| 1501 | |
| 1502 sub id { | |
| 1503 my ($self, $contig_name) = @_; | |
| 1504 | |
| 1505 if (defined( $contig_name )) { | |
| 1506 $self->{'_id'} = $contig_name; | |
| 1507 } | |
| 1508 | |
| 1509 return $self->{'_id'}; | |
| 1510 } | |
| 1511 | |
| 1512 =head2 missing_char | |
| 1513 | |
| 1514 Title : missing_char | |
| 1515 Usage : $contig->missing_char("?") | |
| 1516 Function : Gets/sets the missing_char attribute of the alignment | |
| 1517 It is generally recommended to set it to 'n' or 'N' | |
| 1518 for nucleotides and to 'X' for protein. | |
| 1519 Returns : An missing_char string, | |
| 1520 Argument : An missing_char string (optional) | |
| 1521 | |
| 1522 =cut | |
| 1523 | |
| 1524 sub missing_char { | |
| 1525 my ($self) = @_; | |
| 1526 $self->throw_not_implemented(); | |
| 1527 } | |
| 1528 | |
| 1529 =head2 match_char | |
| 1530 | |
| 1531 Title : match_char | |
| 1532 Usage : $contig->match_char('.') | |
| 1533 Function : Gets/sets the match_char attribute of the alignment | |
| 1534 Returns : An match_char string, | |
| 1535 Argument : An match_char string (optional) | |
| 1536 | |
| 1537 =cut | |
| 1538 | |
| 1539 sub match_char { | |
| 1540 my ($self) = @_; | |
| 1541 $self->throw_not_implemented(); | |
| 1542 } | |
| 1543 | |
| 1544 =head2 gap_char | |
| 1545 | |
| 1546 Title : gap_char | |
| 1547 Usage : $contig->gap_char('-') | |
| 1548 Function : Gets/sets the gap_char attribute of the alignment | |
| 1549 Returns : An gap_char string, defaults to '-' | |
| 1550 Argument : An gap_char string (optional) | |
| 1551 | |
| 1552 =cut | |
| 1553 | |
| 1554 sub gap_char { | |
| 1555 my ($self) = @_; | |
| 1556 $self->throw_not_implemented(); | |
| 1557 } | |
| 1558 | |
| 1559 =head2 symbol_chars | |
| 1560 | |
| 1561 Title : symbol_chars | |
| 1562 Usage : my @symbolchars = $contig->symbol_chars; | |
| 1563 Function: Returns all the seen symbols (other than gaps) | |
| 1564 Returns : array of characters that are the seen symbols | |
| 1565 Argument: boolean to include the gap/missing/match characters | |
| 1566 | |
| 1567 =cut | |
| 1568 | |
| 1569 sub symbol_chars{ | |
| 1570 my ($self) = @_; | |
| 1571 $self->throw_not_implemented(); | |
| 1572 } | |
| 1573 | |
| 1574 =head2 Alignment descriptors | |
| 1575 | |
| 1576 These read only methods describe the MSE in various ways. | |
| 1577 | |
| 1578 | |
| 1579 =head2 consensus_string | |
| 1580 | |
| 1581 Title : consensus_string | |
| 1582 Usage : $str = $contig->consensus_string($threshold_percent) | |
| 1583 Function : Makes a strict consensus | |
| 1584 Returns : | |
| 1585 Argument : Optional treshold ranging from 0 to 100. | |
| 1586 The consensus residue has to appear at least threshold % | |
| 1587 of the sequences at a given location, otherwise a '?' | |
| 1588 character will be placed at that location. | |
| 1589 (Default value = 0%) | |
| 1590 | |
| 1591 =cut | |
| 1592 | |
| 1593 sub consensus_string { | |
| 1594 my ($self) = @_; | |
| 1595 $self->throw_not_implemented(); | |
| 1596 } | |
| 1597 | |
| 1598 =head2 consensus_iupac | |
| 1599 | |
| 1600 Title : consensus_iupac | |
| 1601 Usage : $str = $contig->consensus_iupac() | |
| 1602 Function : | |
| 1603 | |
| 1604 Makes a consensus using IUPAC ambiguity codes from DNA | |
| 1605 and RNA. The output is in upper case except when gaps in | |
| 1606 a column force output to be in lower case. | |
| 1607 | |
| 1608 Note that if your alignment sequences contain a lot of | |
| 1609 IUPAC ambiquity codes you often have to manually set | |
| 1610 alphabet. Bio::PrimarySeq::_guess_type thinks they | |
| 1611 indicate a protein sequence. | |
| 1612 | |
| 1613 Returns : consensus string | |
| 1614 Argument : none | |
| 1615 Throws : on protein sequences | |
| 1616 | |
| 1617 | |
| 1618 =cut | |
| 1619 | |
| 1620 sub consensus_iupac { | |
| 1621 my ($self) = @_; | |
| 1622 $self->throw_not_implemented(); | |
| 1623 } | |
| 1624 | |
| 1625 =head2 is_flush | |
| 1626 | |
| 1627 Title : is_flush | |
| 1628 Usage : if( $contig->is_flush() ) | |
| 1629 : | |
| 1630 : | |
| 1631 Function : Tells you whether the alignment | |
| 1632 : is flush, ie all of the same length | |
| 1633 : | |
| 1634 : | |
| 1635 Returns : 1 or 0 | |
| 1636 Argument : | |
| 1637 | |
| 1638 =cut | |
| 1639 | |
| 1640 sub is_flush { | |
| 1641 my ($self) = @_; | |
| 1642 $self->throw_not_implemented(); | |
| 1643 } | |
| 1644 | |
| 1645 =head2 length | |
| 1646 | |
| 1647 Title : length() | |
| 1648 Usage : $len = $contig->length() | |
| 1649 Function : Returns the maximum length of the alignment. | |
| 1650 To be sure the alignment is a block, use is_flush | |
| 1651 Returns : | |
| 1652 Argument : | |
| 1653 | |
| 1654 =cut | |
| 1655 | |
| 1656 sub length { | |
| 1657 my ($self) = @_; | |
| 1658 | |
| 1659 $self->throw_not_implemented(); | |
| 1660 } | |
| 1661 | |
| 1662 =head2 maxdisplayname_length | |
| 1663 | |
| 1664 Title : maxdisplayname_length | |
| 1665 Usage : $contig->maxdisplayname_length() | |
| 1666 Function : | |
| 1667 | |
| 1668 Gets the maximum length of the displayname in the | |
| 1669 alignment. Used in writing out various MSE formats. | |
| 1670 | |
| 1671 Returns : integer | |
| 1672 Argument : | |
| 1673 | |
| 1674 =cut | |
| 1675 | |
| 1676 sub maxname_length { | |
| 1677 my ($self) = @_; | |
| 1678 $self->throw_not_implemented(); | |
| 1679 } | |
| 1680 | |
| 1681 =head2 no_residues | |
| 1682 | |
| 1683 Title : no_residues | |
| 1684 Usage : $no = $contig->no_residues | |
| 1685 Function : number of residues in total in the alignment | |
| 1686 Returns : integer | |
| 1687 Argument : | |
| 1688 | |
| 1689 =cut | |
| 1690 | |
| 1691 sub no_residues { | |
| 1692 my ($self) = @_; | |
| 1693 | |
| 1694 return $self->{'_nof_residues'}; | |
| 1695 } | |
| 1696 | |
| 1697 =head2 no_sequences | |
| 1698 | |
| 1699 Title : no_sequences | |
| 1700 Usage : $depth = $contig->no_sequences | |
| 1701 Function : number of sequence in the sequence alignment | |
| 1702 Returns : integer | |
| 1703 Argument : None | |
| 1704 | |
| 1705 =cut | |
| 1706 | |
| 1707 sub no_sequences { | |
| 1708 my ($self) = @_; | |
| 1709 | |
| 1710 return scalar( keys %{ $self->{'_elem'} } ); | |
| 1711 } | |
| 1712 | |
| 1713 =head2 percentage_identity | |
| 1714 | |
| 1715 Title : percentage_identity | |
| 1716 Usage : $id = $contig->percentage_identity | |
| 1717 Function: The function calculates the percentage identity of the alignment | |
| 1718 Returns : The percentage identity of the alignment (as defined by the | |
| 1719 implementation) | |
| 1720 Argument: None | |
| 1721 | |
| 1722 =cut | |
| 1723 | |
| 1724 sub percentage_identity{ | |
| 1725 my ($self) = @_; | |
| 1726 | |
| 1727 $self->throw_not_implemeneted(); | |
| 1728 } | |
| 1729 | |
| 1730 =head2 overall_percentage_identity | |
| 1731 | |
| 1732 Title : percentage_identity | |
| 1733 Usage : $id = $contig->percentage_identity | |
| 1734 Function: The function calculates the percentage identity of | |
| 1735 the conserved columns | |
| 1736 Returns : The percentage identity of the conserved columns | |
| 1737 Args : None | |
| 1738 | |
| 1739 =cut | |
| 1740 | |
| 1741 sub overall_percentage_identity{ | |
| 1742 my ($self) = @_; | |
| 1743 $self->throw_not_implemented(); | |
| 1744 } | |
| 1745 | |
| 1746 | |
| 1747 =head2 average_percentage_identity | |
| 1748 | |
| 1749 Title : average_percentage_identity | |
| 1750 Usage : $id = $contig->average_percentage_identity | |
| 1751 Function: The function uses a fast method to calculate the average | |
| 1752 percentage identity of the alignment | |
| 1753 Returns : The average percentage identity of the alignment | |
| 1754 Args : None | |
| 1755 | |
| 1756 =cut | |
| 1757 | |
| 1758 sub average_percentage_identity { | |
| 1759 my ($self) = @_; | |
| 1760 $self->throw_not_implemented(); | |
| 1761 } | |
| 1762 | |
| 1763 =head2 Alignment positions | |
| 1764 | |
| 1765 Methods to map a sequence position into an alignment column and back. | |
| 1766 column_from_residue_number() does the former. The latter is really a | |
| 1767 property of the sequence object and can done using | |
| 1768 L<Bio::LocatableSeq::location_from_column>: | |
| 1769 | |
| 1770 # select somehow a sequence from the alignment, e.g. | |
| 1771 my $seq = $contig->get_seq_by_pos(1); | |
| 1772 #$loc is undef or Bio::LocationI object | |
| 1773 my $loc = $seq->location_from_column(5); | |
| 1774 | |
| 1775 | |
| 1776 =head2 column_from_residue_number | |
| 1777 | |
| 1778 Title : column_from_residue_number | |
| 1779 Usage : $col = $contig->column_from_residue_number( $seqname, $resnumber) | |
| 1780 Function: | |
| 1781 | |
| 1782 This function gives the position in the alignment | |
| 1783 (i.e. column number) of the given residue number in the | |
| 1784 sequence with the given name. For example, for the | |
| 1785 alignment | |
| 1786 | |
| 1787 Seq1/91-97 AC..DEF.GH | |
| 1788 Seq2/24-30 ACGG.RTY.. | |
| 1789 Seq3/43-51 AC.DDEFGHI | |
| 1790 | |
| 1791 column_from_residue_number( "Seq1", 94 ) returns 5. | |
| 1792 column_from_residue_number( "Seq2", 25 ) returns 2. | |
| 1793 column_from_residue_number( "Seq3", 50 ) returns 9. | |
| 1794 | |
| 1795 An exception is thrown if the residue number would lie | |
| 1796 outside the length of the aligment | |
| 1797 (e.g. column_from_residue_number( "Seq2", 22 ) | |
| 1798 | |
| 1799 Note: If the the parent sequence is represented by more than | |
| 1800 one alignment sequence and the residue number is present in | |
| 1801 them, this method finds only the first one. | |
| 1802 | |
| 1803 Returns : A column number for the position in the alignment of the | |
| 1804 given residue in the given sequence (1 = first column) | |
| 1805 Args : A sequence id/name (not a name/start-end) | |
| 1806 A residue number in the whole sequence (not just that | |
| 1807 segment of it in the alignment) | |
| 1808 | |
| 1809 =cut | |
| 1810 | |
| 1811 sub column_from_residue_number { | |
| 1812 my ($self) = @_; | |
| 1813 $self->throw_not_implemented(); | |
| 1814 } | |
| 1815 | |
| 1816 =head2 Sequence names | |
| 1817 | |
| 1818 Methods to manipulate the display name. The default name based on the | |
| 1819 sequence id and subsequence positions can be overridden in various | |
| 1820 ways. | |
| 1821 | |
| 1822 =head2 displayname | |
| 1823 | |
| 1824 Title : displayname | |
| 1825 Usage : $contig->displayname("Ig", "IgA") | |
| 1826 Function : Gets/sets the display name of a sequence in the alignment | |
| 1827 : | |
| 1828 Returns : A display name string | |
| 1829 Argument : name of the sequence | |
| 1830 displayname of the sequence (optional) | |
| 1831 | |
| 1832 =cut | |
| 1833 | |
| 1834 sub displayname { # Do nothing | |
| 1835 } | |
| 1836 | |
| 1837 =head2 set_displayname_count | |
| 1838 | |
| 1839 Title : set_displayname_count | |
| 1840 Usage : $contig->set_displayname_count | |
| 1841 Function : | |
| 1842 | |
| 1843 Sets the names to be name_# where # is the number of | |
| 1844 times this name has been used. | |
| 1845 | |
| 1846 Returns : None | |
| 1847 Argument : None | |
| 1848 | |
| 1849 =cut | |
| 1850 | |
| 1851 sub set_displayname_count { | |
| 1852 my ($self) = @_; | |
| 1853 $self->throw_not_implemented(); | |
| 1854 } | |
| 1855 | |
| 1856 =head2 set_displayname_flat | |
| 1857 | |
| 1858 Title : set_displayname_flat | |
| 1859 Usage : $contig->set_displayname_flat() | |
| 1860 Function : Makes all the sequences be displayed as just their name, | |
| 1861 not name/start-end | |
| 1862 Returns : 1 | |
| 1863 Argument : None | |
| 1864 | |
| 1865 =cut | |
| 1866 | |
| 1867 sub set_displayname_flat { # Do nothing! | |
| 1868 } | |
| 1869 | |
| 1870 =head2 set_displayname_normal | |
| 1871 | |
| 1872 Title : set_displayname_normal | |
| 1873 Usage : $contig->set_displayname_normal() | |
| 1874 Function : Makes all the sequences be displayed as name/start-end | |
| 1875 Returns : None | |
| 1876 Argument : None | |
| 1877 | |
| 1878 =cut | |
| 1879 | |
| 1880 sub set_displayname_normal { # Do nothing! | |
| 1881 } | |
| 1882 | |
| 1883 =head1 Internal Methods | |
| 1884 | |
| 1885 =head2 _binary_search | |
| 1886 | |
| 1887 Title : _binary_search | |
| 1888 Usage : _binary_search($list,$query) | |
| 1889 Function : | |
| 1890 | |
| 1891 Find a number in a sorted list of numbers. Return values | |
| 1892 may be on or two integers. One positive integer or zero | |
| 1893 (>=0) is the index of the element that stores the queried | |
| 1894 value. Two positive integers (or zero and another | |
| 1895 number) are the indexes of elements among which the | |
| 1896 queried value should be placed. Negative single values | |
| 1897 mean: | |
| 1898 | |
| 1899 -1: $query is smaller than smallest element in list | |
| 1900 -2: $query is greater than greatest element in list | |
| 1901 | |
| 1902 Returns : array of integers | |
| 1903 Argument : | |
| 1904 $list : array reference | |
| 1905 $query : integer | |
| 1906 | |
| 1907 =cut | |
| 1908 | |
| 1909 sub _binary_search { | |
| 1910 my $list = shift; | |
| 1911 my $query = shift; | |
| 1912 # | |
| 1913 # If there is only one element in list | |
| 1914 if (!$#{$list} && ($query == $list->[0])) { return (0) } | |
| 1915 # If there are others... | |
| 1916 my $start = 0; | |
| 1917 my $end = $#{$list}; | |
| 1918 (&_compare($query,$list->[$start]) == 0) && do { return ($start) }; | |
| 1919 (&_compare($query,$list->[$end]) == 0) && do { return ($end) }; | |
| 1920 (&_compare($query,$list->[$start]) < 0) && do { return (-1) }; | |
| 1921 (&_compare($query,$list->[$end]) > 0) && do { return (-2) }; | |
| 1922 my $middle = 0; | |
| 1923 while ($end - $start > 1) { | |
| 1924 $middle = int(($end+$middle)/2); | |
| 1925 (&_compare($query,$list->[$middle]) == 0) && return ($middle); | |
| 1926 (&_compare($query,$list->[$middle]) < 0) && do { $end = $middle ; $middle = 0; next }; | |
| 1927 $start = $middle; # If &_compare() > 0, move region beggining | |
| 1928 } | |
| 1929 return ($start,$end); | |
| 1930 } | |
| 1931 | |
| 1932 =head2 _compare | |
| 1933 | |
| 1934 Title : _compare | |
| 1935 Usage : _compare($arg1,$arg2) | |
| 1936 Function: Perform numeric or string comparisons | |
| 1937 Returns : integer (0, 1 or -1) | |
| 1938 Args : values to be compared | |
| 1939 | |
| 1940 =cut | |
| 1941 | |
| 1942 sub _compare { | |
| 1943 my $arg1 = shift; | |
| 1944 my $arg2 = shift; | |
| 1945 # | |
| 1946 if (($arg1 =~ /^\d+$/) && ($arg2 =~ /^\d+$/)) { return $arg1 <=> $arg2 } | |
| 1947 else { return $arg1 cmp $arg2 } | |
| 1948 } | |
| 1949 | |
| 1950 =head2 _nof_gaps | |
| 1951 | |
| 1952 Title : _nof_gaps | |
| 1953 Usage : _nof_gaps($array_ref, $query) | |
| 1954 Function: number of gaps found before position $query | |
| 1955 Returns : integer | |
| 1956 Args : | |
| 1957 $array_ref : gap registry reference | |
| 1958 $query : [integer] a position in a sequence | |
| 1959 | |
| 1960 =cut | |
| 1961 | |
| 1962 #' emacs... | |
| 1963 sub _nof_gaps { | |
| 1964 my $list = shift; | |
| 1965 my $query = shift; | |
| 1966 # If there are no gaps in this contig | |
| 1967 return 0 unless (defined($list) && scalar(@{$list})); | |
| 1968 # Locate query index in gap list (if any) | |
| 1969 my @index = &_binary_search($list,$query); | |
| 1970 # If after all alignments, correct using total number of align | |
| 1971 if ($index[0] == -2) { $query = scalar(@{$list}) } | |
| 1972 # If before any alignment, return 0 | |
| 1973 elsif ($index[0] == -1) { $query = 0 } | |
| 1974 elsif ($index[0] >= 0) { | |
| 1975 # If query is between alignments, translate coordinates | |
| 1976 if ($#index > 0) { $query = $index[0] + 1 } | |
| 1977 # If query sits upon an alignment, do another correction | |
| 1978 elsif ($#index == 0) { $query = $index[0] } | |
| 1979 } | |
| 1980 # | |
| 1981 return $query; | |
| 1982 } | |
| 1983 | |
| 1984 =head2 _padded_unpadded | |
| 1985 | |
| 1986 Title : _padded_unpadded | |
| 1987 Usage : _padded_unpadded($array_ref, $query) | |
| 1988 Function: | |
| 1989 | |
| 1990 Returns a coordinate corresponding to | |
| 1991 position $query after gaps were | |
| 1992 removed from a sequence. | |
| 1993 | |
| 1994 Returns : integer | |
| 1995 Args : | |
| 1996 $array_ref : reference to this gap registry | |
| 1997 $query : [integer] coordionate to change | |
| 1998 | |
| 1999 =cut | |
| 2000 | |
| 2001 sub _padded_unpadded { | |
| 2002 my $list = shift; | |
| 2003 my $query = shift; | |
| 2004 | |
| 2005 my $align = &_nof_gaps($list,$query); | |
| 2006 $query-- if (defined($list->[$align]) && ($list->[$align] == $query)); | |
| 2007 $query = $query - $align; | |
| 2008 # | |
| 2009 return $query; | |
| 2010 } | |
| 2011 | |
| 2012 =head2 _unpadded_padded | |
| 2013 | |
| 2014 Title : _unpadded_padded | |
| 2015 Usage : _unpadded_padded($array_ref, $query) | |
| 2016 Function: | |
| 2017 | |
| 2018 Returns the value corresponding to | |
| 2019 ungapped position $query when gaps are | |
| 2020 counted as valid sites in a sequence | |
| 2021 | |
| 2022 Returns : | |
| 2023 Args : $array_ref = a reference to this sequence's gap registry | |
| 2024 $query = [integer] location to change | |
| 2025 | |
| 2026 =cut | |
| 2027 | |
| 2028 #' | |
| 2029 sub _unpadded_padded { | |
| 2030 my $list = shift; | |
| 2031 my $query = shift; | |
| 2032 | |
| 2033 my $align = &_nof_gaps($list,$query); | |
| 2034 $query = $query + $align; | |
| 2035 my $new_align = &_nof_gaps($list,$query); | |
| 2036 while ($new_align - $align > 0) { | |
| 2037 $query = $query + $new_align - $align; | |
| 2038 $align = $new_align; | |
| 2039 $new_align = &_nof_gaps($list,$query); | |
| 2040 } | |
| 2041 # If current position is also a align, look for the first upstream base | |
| 2042 while (defined($list->[$align]) && ($list->[$align] == $query)) { | |
| 2043 $query++; $align++; | |
| 2044 } | |
| 2045 # | |
| 2046 return $query; | |
| 2047 } | |
| 2048 | |
| 2049 =head2 _register_gaps | |
| 2050 | |
| 2051 Title : _register_gaps | |
| 2052 Usage : $self->_register_gaps($seq, $array_ref) | |
| 2053 Function: stores gap locations for a sequence | |
| 2054 Returns : number of gaps found | |
| 2055 Args : | |
| 2056 $seq : sequence string | |
| 2057 $array_ref : a reference to an array, | |
| 2058 where gap locations will | |
| 2059 be stored | |
| 2060 | |
| 2061 =cut | |
| 2062 | |
| 2063 sub _register_gaps { | |
| 2064 my $self = shift; | |
| 2065 my $sequence = shift; | |
| 2066 my $dbref = shift; | |
| 2067 | |
| 2068 $self->throw("Not an aligned sequence string to register gaps") | |
| 2069 if (ref($sequence)); | |
| 2070 | |
| 2071 $self->throw("Not an array reference for gap registry") | |
| 2072 unless (ref($dbref) eq 'ARRAY'); | |
| 2073 | |
| 2074 # Registering alignments | |
| 2075 @{$dbref} = (); # Cleaning registry | |
| 2076 if (defined $sequence) { | |
| 2077 my $i = -1; | |
| 2078 while(1) { | |
| 2079 $i = index($sequence,"-",$i+1); | |
| 2080 last if ($i == -1); | |
| 2081 push(@{$dbref},$i+1); | |
| 2082 } | |
| 2083 } else { | |
| 2084 # $self->warn("Found undefined sequence while registering gaps"); | |
| 2085 return 0; | |
| 2086 } | |
| 2087 | |
| 2088 return scalar(@{$dbref}); | |
| 2089 } | |
| 2090 | |
| 2091 1; |
