comparison variant_effect_predictor/Bio/Assembly/Contig.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: Contig.pm,v 1.1 2002/11/04 11:50:11 heikki Exp $
2 #
3 # BioPerl module for Bio::Assembly::Contig
4 # Mostly based on Bio::SimpleAlign by Ewan Birney
5 #
6 # Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br>
7 #
8 # Copyright Robson Francisco de Souza
9 #
10 # You may distribute this module under the same terms as perl itself
11
12 # POD documentation - main docs before the code
13
14 =head1 NAME
15
16 Bio::Assembly::Contig - Perl module to hold and manipulate
17 sequence assembly contigs.
18
19 =head1 SYNOPSYS
20
21 # Module loading
22 use Bio::Assembly::IO;
23
24 # Assembly loading methods
25 $aio = new Bio::Assembly::IO(-file=>"test.ace.1",
26 -format=>'phrap');
27
28 $assembly = $aio->next_assembly;
29 foreach $contig ($assembly->all_contigs) {
30 # do something
31 }
32
33 # OR, if you want to build the contig yourself,
34
35 use Bio::Assembly::Contig;
36 $c = Bio::Assembly::Contig->new(-id=>"1");
37
38 $ls = Bio::LocatableSeq->new(-seq=>"ACCG-T",
39 -id=>"r1",
40 -alphabet=>'dna');
41 $ls2 = Bio::LocatableSeq->new(-seq=>"ACA-CG-T",
42 -id=>"r2",
43 -alphabet=>'dna');
44
45 $ls_coord = Bio::SeqFeature::Generic->new(-start=>3,
46 -end=>8,
47 -strand=>1);
48 $ls2_coord = Bio::SeqFeature::Generic->new(-start=>1,
49 -end=>8,
50 -strand=>1);
51 $c->add_seq($ls);
52 $c->add_seq($ls2);
53 $c->set_seq_coord($ls_coord,$ls);
54 $c->set_seq_coord($ls2_coord,$ls2);
55
56 $con = Bio::LocatableSeq->new(-seq=>"ACACCG-T",
57 -alphabet=>'dna');
58 $c->set_consensus_sequence($con);
59
60 $l = $c->change_coord('unaligned r2','ungapped consensus',6);
61 print "6 in unaligned r2 => $l in ungapped consensus\n";
62
63
64 =head1 DESCRIPTION
65
66 A contig is as a set of sequences, locally aligned to each other, so
67 that every sequence has overlapping regions with at least one sequence
68 in the contig, such that a continuous of overlapping sequences is
69 formed, allowing the deduction of a consensus sequence which may be
70 longer than any of the sequences from which it was deduced.
71
72 In this documentation we refer to the overlapping sequences used to
73 build the contig as "aligned sequences" and to the sequence deduced
74 from the overlap of aligned sequences as the "consensus". Methods to
75 deduce the consensus sequence from aligned sequences were not yet
76 implemented in this module, but its posssible to add a consensus
77 sequence deduced by other means, e.g, by the assembly program used to
78 build the alignment.
79
80 All aligned sequences in a Bio::Assembly::Contig must be Bio::Assembly::Locatable
81 objects and have a unique ID. The unique ID restriction is due to the
82 nature of the module's internal data structures and is also a request
83 of some assembly programs. If two sequences with the same ID are added
84 to a contig, the first sequence added is replaced by the second one.
85
86 =head2 Coordinate_systems
87
88 There are four base coordinate systems in Bio::Assembly::Contig. When
89 you need to access contig elements or data that exists on a certain
90 range or location, you may be specifying coordinates in relation to
91 different sequences, which may be either the contig consensus or one
92 of the aligned sequences that were used to do the assembly.
93
94 =========================================================
95 Name | Referenced sequence
96 ---------------------------------------------------------
97 "gapped consensus" | Contig (with gaps)
98 "ungapped consensus" | Contig (without gaps)
99 "aligned $seqID" | sequence $seqID (with gaps)
100 "unaligned $seqID" | sequence $seqID (without gaps)
101 =========================================================
102
103 "gapped consensus" refers to positions in the aligned consensus
104 sequence, which is the consensus sequence including the gaps inserted
105 to align it agains the aligned sequences that were used to assemble
106 the contig. So, its limits are [ 1, (consensus length + number of gaps
107 in consensus) ]
108
109 "ungapped consensus" is a coordinate system based on the consensus
110 sequence, but excluding consensus gaps. This is just the coordinate
111 system that you have when considering the consensus sequence alone,
112 instead of aligned to other sequences.
113
114 "aligned $seqID" refers to locations in the sequence $seqID after
115 alignment of $seqID against the consensus sequence (reverse
116 complementing the original sequence, if needed). Coordinate 1 in
117 "aligned $seqID" is equivalent to the start location (first base) of
118 $seqID in the consensus sequence, just like if the aligned sequence
119 $seqID was a feature of the consensus sequence.
120
121 "unaligned $seqID" is equivalent to a location in the isolated
122 sequence, just like you would have when considering the sequence
123 alone, out of an alignment. When changing coordinates from "aligned
124 $seq2" to "unaligned $seq2", if $seq2 was reverse complemented when
125 included in the alignment, the output coordinates will be reversed to
126 fit that fact, i.e. 1 will be changed to length($seq2), 2 will be
127 length($seq)-1 and so on.
128
129 An important note: when you change gap coordinates from a gapped
130 system ("gapped consensus" or "aligned $seqID") to a system that does
131 not include gaps ("ungapped consensus" or "unaligned $seqID"), the
132 position returned will be the first location before all gaps
133 neighboring the input location.
134
135 =head2 Feature_collection
136
137 Bio::Assembly::Contig stores much information about a contig in a
138 Bio::Assembly::SeqFeature::Collection object. Relevant information on the
139 alignment is accessed by selecting features based on their primary
140 tags (e.g. all features which have a primary tag of the form
141 '_aligned_coord:$seqID', where $seqID is an aligned sequence ID, are
142 coordinates for sequences in the contig alignment) and, by using
143 methods from Bio::Assembly::SeqFeature::Collection, it's possible to select
144 features by overlap with other features.
145
146 We suggest that you use the primary tags of features as identifiers
147 for feature classes. By convention, features with primary tags
148 starting with a '_' are generated by modules that populate the contig
149 data structure and return the contig object, maybe as part of an
150 assembly object, e.g. drivers from the Bio::Assembly::IO set.
151
152 Features in the features collection may be associated with particular
153 aligned sequences. To obtain this, you must attach the sequence to the
154 feature, using attach() seq from Bio::Assembly::SeqFeatureI, before you add the
155 feature to the feature collection. We also suggest to add the sequence
156 id to the primary tag, so that is easy to select feature for a
157 particular sequence.
158
159 There is only one feature class that some methods in
160 Bio::Assembly::Contig expect to find in the feature collection: features
161 with primary tags of the form '_aligned_coord:$seqID', where $seqID is
162 the aligned sequence id (like returned by $seq-E<gt>id()). These features
163 describe the position (in "gapped consensus" coordinates) of aligned
164 sequences, and the method set_seq_coord() automatically changes a
165 feature's primary tag to this form whenever the feature is added to
166 the collection by this method. Only two methods in Bio::Assembly::Contig
167 will not work unless there are features from this class:
168 change_coord() and get_seq_coord().
169
170 Other feature classes will be automatically available only when
171 Bio::Assembly::Contig objects are created by a specific module. Such
172 feature classes are (or should be) documented in the documentation of
173 the module which create them, to which the user should refer.
174
175 =head1 FEEDBACK
176
177 =head2 Mailing Lists
178
179 User feedback is an integral part of the evolution of this and other
180 Bioperl modules. Send your comments and suggestions preferably to the
181 Bioperl mailing lists Your participation is much appreciated.
182
183 bioperl-l@bioperl.org - General discussion
184 http://bio.perl.org/MailList.html - About the mailing lists
185
186 =head2 Reporting Bugs
187
188 Report bugs to the Bioperl bug tracking system to help us keep track
189 the bugs and their resolution. Bug reports can be submitted via email
190 or the web:
191
192 bioperl-bugs@bio.perl.org
193 http://bugzilla.bioperl.org/
194
195 =head1 AUTHOR - Robson Francisco de Souza
196
197 rfsouza@citri.iq.usp.br
198
199 =head1 APPENDIX
200
201 The rest of the documentation details each of the object
202 methods. Internal methods are usually preceded with a _
203
204 =cut
205
206 #'
207 package Bio::Assembly::Contig;
208
209 use strict;
210 use vars qw(@ISA $VERSION);
211
212 use Bio::Root::Root;
213 use Bio::Align::AlignI;
214 use Bio::SeqFeature::Collection;
215 use Bio::Seq::PrimaryQual;
216
217 @ISA = qw(Bio::Root::Root Bio::Align::AlignI);
218
219 =head1 Object creator
220
221 =head2 new
222
223 Title : new
224 Usage : my $contig = new Bio::Assembly::Contig();
225 Function : Creates a new contig object
226 Returns : Bio::Assembly::Contig
227 Args : -source => string representing the source
228 program where this contig came
229 from
230 -id => contig unique ID
231
232 =cut
233
234 #-----------
235 sub new {
236 #-----------
237 my ($class,@args) = @_;
238
239 my $self = $class->SUPER::new(@args);
240
241 my ($src, $id) = $self->_rearrange([qw(SOURCE ID)], @args);
242 $src && $self->source($src);
243 ($id && $self->id($id)) || ($self->{'_id'} = 'NoName'); # Alignment (contig) name
244 ($id && $self->id($id)) || ($self->{'_source'} = 'Unknown'); # Program used to build the contig
245 # we need to set up internal hashes first!
246
247 # Bio::SimpleAlign derived fields (check which ones are needed for AlignI compatibility)
248 $self->{'_elem'} = {}; # contig elements: aligned sequence objects (keyed by ID)
249 $self->{'_order'} = {}; # store sequence order
250 # $self->{'start_end_lists'} = {}; # References to entries in {'_seq'}. Keyed by seq ids.
251 # $self->{'_dis_name'} = {}; # Display names for each sequence
252 $self->{'_symbols'} = {}; # List of symbols
253
254 #Contig specific slots
255 $self->{'_consensus_sequence'} = undef;
256 $self->{'_consensus_quality'} = undef;
257 $self->{'_nof_residues'} = 0;
258 $self->{'_nof_seqs'} = 0;
259 # $self->{'_nof_segments'} = 0; # Let's not make it heavier than needed by now...
260 $self->{'_sfc'} = Bio::SeqFeature::Collection->new();
261
262 # Assembly specifcs
263 $self->{'_assembly'} = undef; # Reference to a Bio::Assembly::Scaffold object, if contig belongs to one.
264 $self->{'_strand'} = 0; # Reverse (-1) or forward (1), if contig is in a scaffold. 0 otherwise
265 $self->{'_neighbor_start'} = undef; # Will hold a reference to another contig
266 $self->{'_neighbor_end'} = undef; # Will hold a reference to another contig
267
268 return $self; # success - we hope!
269 }
270
271 =head1 Assembly related methods
272
273 These methods exist to enable adding information about possible
274 relations among contigs, e.g. when you already have a scaffold for
275 your assembly, describing the ordering of contigs in the final
276 assembly, but no sequences covering the gaps between neighboring
277 contigs.
278
279 =head2 source
280
281 Title : source
282 Usage : $contig->source($program);
283 Function : Get/Set program used to build this contig
284 Returns : string
285 Argument : [optional] string
286
287 =cut
288
289 sub source {
290 my $self = shift;
291 my $source = shift;
292
293 $self->{'_source'} = $source if (defined $source);
294 return $self->{'_source'};
295 }
296
297 =head2 assembly
298
299 Title : assembly
300 Usage : $contig->assembly($assembly);
301 Function : Get/Set assembly object for this contig
302 Returns : a Bio::Assembly::Scaffold object
303 Argument : a Bio::Assembly::Scaffold object
304
305 =cut
306
307 sub assembly {
308 my $self = shift;
309 my $assembly = shift;
310
311 $self->throw("Using non Bio::Assembly::Scaffold object when assign contig to assembly")
312 if (defined $assembly && ! $assembly->isa("Bio::Assembly::Scaffold"));
313
314 $self->{'_assembly'} = $assembly if (defined $assembly);
315 return $self->{'_assembly'};
316 }
317
318 =head2 strand
319
320 Title : strand
321 Usage : $contig->strand($num);
322 Function : Get/Set contig orientation in a scaffold/assembly.
323 Its equivalent to the strand property of sequence
324 objects and sets whether the contig consensus should
325 be reversed and complemented before being added to a
326 scaffold or assembly.
327 Returns : integer
328 Argument : 1 if orientaion is forward, -1 if reverse and
329 0 if none
330
331 =cut
332
333 sub strand {
334 my $self = shift;
335 my $ori = shift;
336
337 $self->throw("Contig strand must be either 1, -1 or 0")
338 unless (defined $ori && ($ori == 1 || $ori == 0 || $ori == -1));
339
340 $self->{'_strand'} = $ori;
341 return $self->{'_strand'};
342 }
343
344 =head2 upstream_neighbor
345
346 Title : upstream_neighbor
347 Usage : $contig->upstream_neighbor($contig);
348 Function : Get/Set a contig neighbor for the current contig when
349 building a scaffold. The upstream neighbor is
350 located before $contig first base
351 Returns : nothing
352 Argument : Bio::Assembly::Contig
353
354 =cut
355
356 sub upstream_neighbor {
357 my $self = shift;
358 my $ref = shift;
359
360 $self->throw("Trying to assign a non Bio::Assembly::Contig object to upstream contig")
361 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig"));
362
363 $self->{'_neighbor_start'} = $ref if (defined $ref);
364 return $self->{'_neighbor_start'};
365 }
366
367 =head2 downstream_neighbor
368
369 Title : downstream_neighbor
370 Usage : $contig->downstream_neighbor($num);
371 Function : Get/Set a contig neighbor for the current contig when
372 building a scaffold. The downstream neighbor is
373 located after $contig last base
374 Returns : nothing
375 Argument : Bio::Assembly::Contig
376
377 =cut
378
379 sub downstream_neighbor {
380 my $self = shift;
381 my $ref = shift;
382
383 $self->throw("Trying to assign a non Bio::Assembly::Contig object to downstream contig")
384 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig"));
385 $self->{'_neighbor_end'} = $ref if (defined $ref);
386 return $self->{'_neighbor_end'};
387 }
388
389 =head1 Contig feature collection methods
390
391 =head2 add_features
392
393 Title : add_features
394 Usage : $contig->add_features($feat,$flag)
395 Function :
396
397 Add an array of features to the contig feature
398 collection. The consensus sequence may be attached to the
399 added feature, if $flag is set to 1. If $flag is 0 and
400 the feature attached to one of the contig aligned
401 sequences, the feature is registered as an aligned
402 sequence feature. If $flag is 0 and the feature is not
403 attched to any sequence in the contig, the feature is
404 simply added to the feature collection and no attachment
405 or registration is made.
406
407 Note: You must attach aligned sequences to their features
408 prior to calling add_features, otherwise you won't be
409 able to access the feature through get_seq_feat_by_tag()
410 method.
411
412 Returns : number of features added.
413 Argument :
414 $feat : A reference to an array of Bio::SeqFeatureI
415 $flag : boolean - true if consensus sequence object
416 should be attached to this feature, false if
417 no consensus attachment should be made.
418 Default: false.
419
420 =cut
421
422 sub add_features {
423 my ($self, $args, $flag) = @_;
424
425 # Adding shortcuts for aligned sequence features
426 $flag = 0 unless (defined $flag);
427 if ($flag && defined $self->{'_consensus_sequence'}) {
428 foreach my $feat (@$args) {
429 next if (defined $feat->seq);
430 $feat->attach_seq($self->{'_consensus_sequence'});
431 }
432 } elsif (!$flag) { # Register aligned sequence features
433 foreach my $feat (@$args) {
434 if (my $seq = $feat->entire_seq()) {
435 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
436 $self->warn("Adding contig feature attached to unknown sequence $seqID!")
437 unless (exists $self->{'_elem'}{$seqID});
438 my $tag = $feat->primary_tag;
439 $self->{'_elem'}{$seqID}{'_feat'}{$tag} = $feat;
440 }
441 }
442 }
443
444 # Add feature to feature collection
445 my $nof_added = $self->{'_sfc'}->add_features($args);
446
447 return $nof_added;
448 }
449
450 =head2 remove_features
451
452 Title : remove_features
453 Usage : $contig->remove_features(@feat)
454 Function : Remove an array of contig features
455 Returns : number of features removed.
456 Argument : An array of Bio::SeqFeatureI
457
458 =cut
459
460 sub remove_features {
461 my ($self, @args) = @_;
462
463 # Removing shortcuts for aligned sequence features
464 foreach my $feat (@args) {
465 if (my $seq = $feat->entire_seq()) {
466 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
467 my $tag = $feat->primary_tag;
468 $tag =~ s/:$seqID$/$1/g;
469 delete( $self->{'_elem'}{$seqID}{'_feat'}{$tag} )
470 if (exists $self->{'_elem'}{$seqID}{'_feat'}{$tag} &&
471 $self->{'_elem'}{$seqID}{'_feat'}{$tag} eq $feat);
472 }
473 }
474
475 return $self->{'_sfc'}->remove_features(\@args);
476 }
477
478 =head2 get_features_collection
479
480 Title : get_features_collection
481 Usage : $contig->get_features_collection()
482 Function : Get the collection of all contig features
483 Returns : Bio::SeqFeature::Collection
484 Argument : none
485
486 =cut
487
488 sub get_features_collection {
489 my $self = shift;
490
491 return $self->{'_sfc'};
492 }
493
494 =head1 Coordinate system's related methods
495
496 See L<Coordinate_Systems> above.
497
498 =head2 change_coord
499
500 Title : change_coord
501 Usage : $contig->change_coord($in,$out,$query)
502 Function :
503
504 Change coordinate system for $query. This method
505 transforms locations between coordinate systems described
506 in section "Coordinate Systems" of this document.
507
508 Note: this method will throw an exception when changing
509 coordinates between "ungapped consensus" and other
510 systems if consensus sequence was not set. It will also
511 throw exceptions when changing coordinates among aligned
512 sequence, either with or without gaps, and other systems
513 if sequence locations were not set with set_seq_coord().
514
515 Returns : integer
516 Argument :
517 $in : [string] input coordinate system
518 $out : [string] output coordinate system
519 $query : [integer] a position in a sequence
520
521 =cut
522
523 sub change_coord {
524 my $self = shift;
525 my $type_in = shift;
526 my $type_out = shift;
527 my $query = shift;
528
529 # Parsing arguments
530 # Loading read objects (these calls will throw exceptions whether $read_in or
531 # $read_out is not found
532 my ($read_in,$read_out) = (undef,undef);
533 my $in_ID = ( split(' ',$type_in) )[1];
534 my $out_ID = ( split(' ',$type_out) )[1];
535
536 if ($in_ID ne 'consensus') {
537 $read_in = $self->get_seq_coord( $self->get_seq_by_name($in_ID) );
538 $self->throw("Can't change coordinates without sequence location for $in_ID")
539 unless (defined $read_in);
540 }
541 if ($out_ID ne 'consensus') {
542 $read_out = $self->get_seq_coord( $self->get_seq_by_name($out_ID) );
543 $self->throw("Can't change coordinates without sequence location for $out_ID")
544 unless (defined $read_out);
545 }
546
547 # Performing transformation between coordinates
548 SWITCH1: {
549
550 # Transformations between contig padded and contig unpadded
551 (($type_in eq 'gapped consensus') && ($type_out eq 'ungapped consensus')) && do {
552 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence")
553 unless (defined $self->{'_consensus_sequence'});
554 $query = &_padded_unpadded($self->{'_consensus_gaps'}, $query);
555 last SWITCH1;
556 };
557 (($type_in eq 'ungapped consensus') && ($type_out eq 'gapped consensus')) && do {
558 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence")
559 unless (defined $self->{'_consensus_sequence'});
560 $query = &_unpadded_padded($self->{'_consensus_gaps'},$query);
561 last SWITCH1;
562 };
563
564 # Transformations between contig (padded) and read (padded)
565 (($type_in eq 'gapped consensus') &&
566 ($type_out =~ /^aligned /) && defined($read_out)) && do {
567 $query = $query - $read_out->start() + 1;
568 last SWITCH1;
569 };
570 (($type_in =~ /^aligned /) && defined($read_in) &&
571 ($type_out eq 'gapped consensus')) && do {
572 $query = $query + $read_in->start() - 1;
573 last SWITCH1;
574 };
575
576 # Transformations between contig (unpadded) and read (padded)
577 (($type_in eq 'ungapped consensus') &&
578 ($type_out =~ /^aligned /) && defined($read_out)) && do {
579 $query = $self->change_coord('ungapped consensus','gapped consensus',$query);
580 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
581 last SWITCH1;
582 };
583 (($type_in =~ /^aligned /) && defined($read_in) &&
584 ($type_out eq 'ungapped consensus')) && do {
585 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
586 $query = $self->change_coord('gapped consensus','ungapped consensus',$query);
587 last SWITCH1;
588 };
589
590 # Transformations between seq $read_in padded and seq $read_out padded
591 (defined($read_in) && ($type_in =~ /^aligned /) &&
592 defined($read_out) && ($type_out =~ /^aligned /)) && do {
593 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
594 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
595 last SWITCH1;
596 };
597
598 # Transformations between seq $read_in padded and seq $read_out unpadded
599 (defined($read_in) && ($type_in =~ /^aligned /) &&
600 defined($read_out) && ($type_out =~ /^unaligned /)) && do {
601 if ($read_in ne $read_out) {
602 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
603 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
604 }
605 my $list_out = $self->{'_elem'}{$out_ID}{'_gaps'};
606 $query = &_padded_unpadded($list_out,$query);
607 # Changing read orientation if read was reverse complemented when aligned
608 if ($read_out->strand == -1) {
609 my ($length) = $read_out->length();
610 $length = $length - &_nof_gaps($list_out,$length);
611 $query = $length - $query + 1;
612 }
613 last SWITCH1;
614 };
615 (defined($read_in) && ($type_in =~ /^unaligned /) &&
616 defined($read_out) && ($type_out =~ /^aligned /)) && do {
617 my $list_in = $self->{'_elem'}{$in_ID}{'_gaps'};
618 # Changing read orientation if read was reverse complemented when aligned
619 if ($read_in->strand == -1) {
620 my ($length) = $read_in->length();
621 $length = $length - &_nof_gaps($list_in,$length);
622 $query = $length - $query + 1;
623 }
624 $query = &_unpadded_padded($list_in,$query);
625 if ($read_in ne $read_out) {
626 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
627 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
628 }
629 last SWITCH1;
630 };
631
632 # Transformations between seq $read_in unpadded and seq $read_out unpadded
633 (defined($read_in) && ($type_in =~ /^unaligned /) &&
634 defined($read_out) && ($type_out =~ /^unaligned /)) && do {
635 $query = $self->change_coord("unaligned $in_ID","aligned $out_ID",$query);
636 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query);
637 last SWITCH1;
638 };
639
640 # Transformations between contig (padded) and read (unpadded)
641 (($type_in eq 'gapped consensus') &&
642 ($type_out =~ /^unaligned /) && defined($read_out)) && do {
643 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
644 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query);
645 last SWITCH1;
646 };
647 (($type_in =~ /^unaligned /) && defined($read_in) &&
648 ($type_out eq 'gapped consensus')) && do {
649 $query = $self->change_coord("unaligned $in_ID","aligned $in_ID",$query);
650 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
651 last SWITCH1;
652 };
653
654 # Transformations between contig (unpadded) and read (unpadded)
655 (($type_in eq 'ungapped consensus') &&
656 ($type_out =~ /^unaligned /) && defined($read_out)) && do {
657 $query = $self->change_coord('ungapped consensus','gapped consensus',$query);
658 $query = $self->change_coord('gapped consensus',"unaligned $out_ID",$query);
659 last SWITCH1;
660 };
661 (($type_in =~ /^unaligned /) && defined($read_in) &&
662 ($type_out eq 'ungapped consensus')) && do {
663 $query = $self->change_coord("unaligned $in_ID",'gapped consensus',$query);
664 $query = $self->change_coord('gapped consensus','ungapped consensus',$query);
665 last SWITCH1;
666 };
667
668 $self->throw("Unknow coordinate system. Args: $type_in, $type_out.");
669 $query = undef; # If a coordinate systems just requested is unknown
670 }
671
672 return $query;
673 }
674
675 =head2 get_seq_coord
676
677 Title : get_seq_coord
678 Usage : $contig->get_seq_coord($seq);
679 Function : Get "gapped consensus" location for aligned sequence
680 Returns : Bio::SeqFeature::Generic for coordinates or undef.
681 A warning is printed if sequence coordinates were not set.
682 Argument : Bio::LocatabaleSeq object
683
684 =cut
685
686 sub get_seq_coord {
687 my ($self,$seq) = @_;
688
689 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
690 $self->throw("$seq is not a Bio::LocatableSeq");
691 }
692 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
693
694 unless (exists( $self->{'_elem'}{$seqID} )) {
695 $self->warn("No such sequence ($seqID) in contig ".$self->id);
696 return undef;
697 }
698 unless (exists( $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} )) {
699 $self->warn("Location not set for sequence ($seqID) in contig ".$self->id);
700 return undef;
701 }
702
703 return $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"};
704 }
705
706 =head2 set_seq_coord
707
708 Title : set_seq_coord
709 Usage : $contig->set_seq_coord($feat,$seq);
710 Function :
711
712 Set "gapped consensus" location for an aligned
713 sequence. If the sequence was previously added using
714 add_seq, its coordinates are changed/set. Otherwise,
715 add_seq is called and the sequence is added to the
716 contig.
717
718 Returns : Bio::SeqFeature::Generic for old coordinates or undef.
719 Argument :
720 $feat : a Bio::SeqFeature::Generic object
721 representing a location for the
722 aligned sequence, in "gapped
723 consensus" coordinates.
724
725 Note: the original feature primary tag will
726 be lost.
727
728 $seq : a Bio::LocatabaleSeq object
729
730 =cut
731
732 sub set_seq_coord {
733 my ($self,$feat,$seq) = @_;
734
735 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
736 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
737 }
738
739 # Complaining about inadequate feature object
740 $self->throw("Coordinates must be a Bio::SeqFeature::Generic object!")
741 unless ( $feat->isa("Bio::SeqFeature::Generic") );
742 $self->throw("Sequence coordinates must have an end!")
743 unless (defined $feat->end);
744 $self->throw("Sequence coordinates must have a start!")
745 unless (defined $feat->start);
746
747 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
748 if (exists( $self->{'_elem'}{$seqID} ) &&
749 exists( $self->{'_elem'}{$seqID}{'_seq'} ) &&
750 defined( $self->{'_elem'}{$seqID}{'_seq'} ) &&
751 ($seq ne $self->{'_elem'}{$seqID}{'_seq'}) ) {
752 $self->warn("Replacing sequence $seqID\n");
753 $self->remove_seq($self->{'_elem'}{$seqID}{'_seq'});
754 }
755 $self->add_seq($seq);
756
757 # Remove previous coordinates, if any
758 $self->remove_features($feat);
759
760 # Add new Bio::Generic::SeqFeature
761 $feat->add_tag_value('contig',$self->id)
762 unless ( $feat->has_tag('contig') );
763 $feat->primary_tag("_aligned_coord:$seqID");
764 $feat->attach_seq($seq);
765 $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} = $feat;
766 $self->add_features([ $feat ]);
767 }
768
769 =head1 Bio::Assembly::Contig consensus methods
770
771 =head2 set_consensus_sequence
772
773 Title : set_consensus_sequence
774 Usage : $contig->set_consensus_sequence($seq)
775 Function : Set the consensus sequence object for this contig
776 Returns : consensus length
777 Argument : Bio::LocatableSeq
778
779 =cut
780
781 sub set_consensus_sequence {
782 my $self = shift;
783 my $seq = shift;
784
785 $self->throw("Consensus sequence must be a Bio::LocatableSeq!")
786 unless ($seq->isa("Bio::LocatableSeq"));
787
788 my $con_len = $seq->length;
789 $seq->start(1); $seq->end($con_len);
790
791 $self->{'_consensus_gaps'} = []; # Consensus Gap registry
792 $self->_register_gaps($seq->seq,
793 $self->{'_consensus_gaps'});
794 $self->{'_consensus_sequence'} = $seq;
795
796 return $con_len;
797 }
798
799 =head2 set_consensus_quality
800
801 Title : set_consensus_quality
802 Usage : $contig->set_consensus_quality($qual)
803 Function : Set the quality object for consensus sequence
804 Returns : nothing
805 Argument : Bio::Seq::QualI object
806
807 =cut
808
809 sub set_consensus_quality {
810 my $self = shift;
811 my $qual = shift;
812
813 $self->throw("Consensus quality must be a Bio::Seq::QualI object!")
814 unless ( $qual->isa("Bio::Seq::QualI") );
815
816 $self->throw("Consensus quality can't be added before you set the consensus sequence!")
817 unless (defined $self->{'_consensus_sequence'});
818
819 $self->{'_consensus_quality'} = $qual;
820 }
821
822 =head2 get_consensus_length
823
824 Title : get_consensus_length
825 Usage : $contig->get_consensus_length()
826 Function : Get consensus sequence length
827 Returns : integer
828 Argument : none
829
830 =cut
831
832 sub get_consensus_length {
833 my $self = shift;
834
835 return $self->{'_consensus_sequence'}->length();
836 }
837
838 =head2 get_consensus_sequence
839
840 Title : get_consensus_sequence
841 Usage : $contig->get_consensus_sequence()
842 Function : Get a reference to the consensus sequence object
843 for this contig
844 Returns : Bio::SeqI object
845 Argument : none
846
847 =cut
848
849 sub get_consensus_sequence {
850 my ($self, @args) = @_;
851
852 return $self->{'_consensus_sequence'};
853 }
854
855 =head2 get_consensus_quality
856
857 Title : get_consensus_quality
858 Usage : $contig->get_consensus_quality()
859 Function : Get a reference to the consensus quality object
860 for this contig.
861 Returns : A Bio::QualI object
862 Argument : none
863
864 =cut
865
866 sub get_consensus_quality {
867 my ($self, @args) = @_;
868
869 return $self->{'_consensus_quality'};
870 }
871
872 =head1 Bio::Assembly::Contig aligned sequences methods
873
874 =head2 set_seq_qual
875
876 Title : set_seq_qual
877 Usage : $contig->set_seq_qual($seq,$qual);
878 Function : Adds quality to an aligned sequence.
879 Returns : nothing
880 Argument : a Bio::LocatableSeq object and
881 a Bio::Seq::QualI object
882
883 See L<Bio::LocatableSeq> for more information.
884
885 =cut
886
887 sub set_seq_qual {
888 my ($self,$seq,$qual) = @_;
889
890 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
891 $self->throw("Unable to process non locatable sequences [", ref($seq), "]");
892 }
893 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
894
895 $self->throw("Consensus quality must be a Bio::Seq::QualI object!")
896 unless ( $qual->isa("Bio::Seq::QualI") );
897 $self->throw("Use add_seq first: aligned sequence qualities can't be added before you load the sequence!")
898 unless (exists $self->{'_elem'}{$seqID}{'_seq'});
899 $self->throw("Use set_seq_coord first: aligned sequence qualities can't be added before you add coordinates for the sequence!") unless (defined( $self->get_seq_coord($seq) ));
900
901 # Adding gaps to quality object
902 my $sequence = $self->{'_elem'}{$seqID}{'_seq'}->seq();
903 my $tmp = $qual->qual();
904 @{$tmp} = reverse(@{$tmp}) if ($self->get_seq_coord($seq)->strand() == -1);
905 my @quality = ();
906 my $previous = 0;
907 my $next = 0;
908 my $i = 0; my $j = 0;
909 while ($i<=$#{$tmp}) {
910 # IF base is a gap, quality is the average for neighbouring sites
911 if (substr($sequence,$j,1) eq '-') {
912 $previous = $tmp->[$i-1] unless ($i == 0);
913 if ($i < $#{$tmp}) {
914 $next = $tmp->[$i+1];
915 } else {
916 $next = 0;
917 }
918 push(@quality,int( ($previous+$next)/2 ));
919 } else {
920 push(@quality,$tmp->[$i]);
921 $i++;
922 }
923 $j++;
924 }
925
926 $self->{'_elem'}{$seqID}{'_qual'} = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality),
927 -id=>$seqID);
928 }
929
930 =head2 get_seq_ids
931
932 Title : get_seq_ids
933 Usage : $contig->get_seq_ids(-start=>$start,
934 -end=>$end,
935 -type=>"gapped A0QR67B08.b");
936 Function : Get list of sequence IDs overlapping inteval [$start, $end]
937 The default interval is [1,$contig->length]
938 Default coordinate system is "gapped contig"
939 Returns : An array
940 Argument : A hash with optional elements:
941 -start : consensus subsequence start
942 -end : consensus subsequence end
943 -type : the coordinate system type for $start and $end arguments
944 Coordinate system avaliable are:
945 "gapped consensus" : consensus coordinates with gaps
946 "ungapped consensus" : consensus coordinates without gaps
947 "aligned $ReadID" : read $ReadID coordinates with gaps
948 "unaligned $ReadID" : read $ReadID coordinates without gaps
949
950
951 =cut
952
953 sub get_seq_ids {
954 my ($self, @args) = @_;
955
956 my ($type,$start,$end) =
957 $self->_rearrange([qw(TYPE START END)], @args);
958
959 if (defined($start) && defined($end)) {
960 if (defined($type) && ($type ne 'gapped consensus')) {
961 $start = $self->change_coord($type,'gapped consensus',$start);
962 $end = $self->change_coord($type,'gapped consensus',$end);
963 }
964
965 my @list = grep { $_->isa("Bio::SeqFeature::Generic") &&
966 ($_->primary_tag =~ /^_aligned_coord:/) }
967 $self->{'_sfc'}->features_in_range(-start=>$start,
968 -end=>$end,
969 -contain=>0,
970 -strandmatch=>'ignore');
971 @list = map { $_->entire_seq->id } @list;
972 return @list;
973 }
974
975 # Entire aligned sequences list
976 return map { $self->{'_order'}{$_} } sort { $a cmp $b } keys %{ $self->{'_order'} };
977 }
978
979 =head2 get_seq_feat_by_tag
980
981 Title : get_seq_feat_by_tag
982 Usage : $seq = $contig->get_seq_feat_by_tag($seq,"_aligned_coord:$seqID")
983 Function :
984
985 Get a sequence feature based on its primary_tag.
986 When you add
987
988 Returns : a Bio::SeqFeature object
989 Argument : a Bio::LocatableSeq and a string (feature primary tag)
990
991 =cut
992
993 sub get_seq_feat_by_tag {
994 my ($self,$seq,$tag) = @_;
995
996 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
997 $self->throw("Unable to process non locatable sequences [", ref($seq), "]");
998 }
999 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
1000
1001 return $self->{'_elem'}{$seqID}{'_feat'}{$tag};
1002 }
1003
1004 =head2 get_seq_by_name
1005
1006 Title : get_seq_by_name
1007 Usage : $seq = $contig->get_seq_by_name('Seq1')
1008 Function : Gets a sequence based on its id.
1009 Returns : a Bio::LocatableSeq object
1010 undef if name is not found
1011 Argument : string
1012
1013 =cut
1014
1015 sub get_seq_by_name {
1016 my $self = shift;
1017 my ($seqID) = @_;
1018
1019 unless (exists $self->{'_elem'}{$seqID}{'_seq'}) {
1020 $self->throw("Could not find sequence $seqID in contig ".$self->id);
1021 return undef;
1022 }
1023
1024 return $self->{'_elem'}{$seqID}{'_seq'};
1025 }
1026
1027 =head2 get_qual_by_name
1028
1029 Title : get_qual_by_name
1030 Usage : $seq = $contig->get_qual_by_name('Seq1')
1031 Function :
1032
1033 Gets Bio::Seq::QualI object for a sequence
1034 through its id ( as given by $qual->id() ).
1035
1036 Returns : a Bio::Seq::QualI object.
1037 undef if name is not found
1038 Argument : string
1039
1040 =cut
1041
1042 sub get_qual_by_name {
1043 my $self = shift;
1044 my ($seqID) = @_;
1045
1046 unless (exists $self->{'_elem'}{$seqID}{'_qual'}) {
1047 $self->warn("Could not find quality for $seqID in contig!");
1048 return undef;
1049 }
1050
1051 return $self->{'_elem'}{$seqID}{'_qual'};
1052 }
1053
1054 =head1 Bio::Align::AlignI compatible methods
1055
1056 =head2 Modifier methods
1057
1058 These methods modify the MSE by adding, removing or shuffling complete
1059 sequences.
1060
1061 =head2 add_seq
1062
1063 Title : add_seq
1064 Usage : $contig->add_seq($newseq);
1065 Function :
1066
1067 Adds a sequence to the contig. *Does*
1068 *not* align it - just adds it to the
1069 hashes.
1070
1071 Returns : nothing
1072 Argument : a Bio::LocatableSeq object
1073
1074 See L<Bio::LocatableSeq> for more information.
1075
1076 =cut
1077
1078 sub add_seq {
1079 my $self = shift;
1080 my $seq = shift;
1081
1082 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
1083 $self->throw("Unable to process non locatable sequences [", ref($seq), "]");
1084 }
1085
1086 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
1087 $self->{'_elem'}{$seqID} = {} unless (exists $self->{'elem'}{$seqID});
1088
1089 if (exists( $self->{'_elem'}{$seqID}{'_seq'} ) &&
1090 ($seq eq $self->{'_elem'}{$seqID}{'_seq'}) ) {
1091 $self->warn("Adding sequence $seqID, which has already been added");
1092 }
1093
1094 # Our locatable sequences are always considered to be complete sequences
1095 $seq->start(1); $seq->end($seq->length());
1096
1097 $self->warn("Adding non-nucleotidic sequence ".$seqID)
1098 if (lc($seq->alphabet) ne 'dna' && lc($seq->alphabet) ne 'rna');
1099
1100 # build the symbol list for this sequence,
1101 # will prune out the gap and missing/match chars
1102 # when actually asked for the symbol list in the
1103 # symbol_chars
1104 if (defined $seq->seq) {
1105 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1106 } else {
1107 $self->{'_symbols'} = {};
1108 }
1109
1110 my $seq_no = ++$self->{'_nof_seqs'};
1111
1112 if (ref( $self->{'_elem'}{$seqID}{'_seq'} )) {
1113 $self->warn("Replacing one sequence [$seqID]\n");
1114 } else {
1115 #print STDERR "Assigning $seqID to $order\n";
1116 $self->{'_order'}->{$seq_no} = $seqID;
1117 # $self->{'_start_end_lists'}->{$id} = []
1118 # unless(exists $self->{'_start_end_lists'}->{$id});
1119 # push @{$self->{'_start_end_lists'}->{$id}}, $seq;
1120 }
1121
1122 $self->{'_elem'}{$seqID}{'_seq'} = $seq;
1123 $self->{'_elem'}{$seqID}{'_feat'} = {};
1124 $self->{'_elem'}{$seqID}{'_gaps'} = [];
1125 my $dbref = $self->{'_elem'}{$seqID}{'_gaps'};
1126 my $nofgaps = $self->_register_gaps($seq->seq,$dbref);
1127
1128 # Updating residue count
1129 $self->{'_nof_residues'} += $seq->length - $nofgaps;
1130
1131 return 1;
1132 }
1133
1134 =head2 remove_seq
1135
1136 Title : remove_seq
1137 Usage : $contig->remove_seq($seq);
1138 Function : Removes a single sequence from an alignment
1139 Returns : 1 on success, 0 otherwise
1140 Argument : a Bio::LocatableSeq object
1141
1142 =cut
1143
1144 sub remove_seq {
1145 my ($self,$seq) = @_;
1146
1147 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
1148 $self->throw("Unable to process non locatable sequences [", ref($seq), "]");
1149 }
1150
1151 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
1152 unless (exists $self->{'_elem'}{$seqID} ) {
1153 $self->warn("No sequence named $seqID [$seq]");
1154 return 0;
1155 }
1156
1157 # Updating residue count
1158 $self->{'_nof_residues'} -= $seq->length() +
1159 &_nof_gaps( $self->{'_elem'}{$seqID}{'_gaps'}, $seq->length );
1160
1161 # Remove all references to features of this sequence
1162 my @feats = ();
1163 foreach my $tag (keys %{ $self->{'_elem'}{$seqID}{'_feat'} }) {
1164 push(@feats, $self->{'_elem'}{$seqID}{'_feat'}{$tag});
1165 }
1166 $self->{'_sfc'}->remove_features(\@feats);
1167 delete $self->{'_elem'}{$seqID};
1168
1169 return 1;
1170 }
1171
1172 =head2 purge
1173
1174 Title : purge
1175 Usage : $contig->purge(0.7);
1176 Function:
1177
1178 Removes sequences above whatever %id.
1179
1180 This function will grind on large alignments. Beware!
1181 (perhaps not ideally implemented)
1182
1183 Example :
1184 Returns : An array of the removed sequences
1185 Argument:
1186
1187
1188 =cut
1189
1190 sub purge {
1191 my ($self) = @_;
1192 $self->throw_not_implemented();
1193 }
1194
1195 =head2 sort_alphabetically
1196
1197 Title : sort_alphabetically
1198 Usage : $contig->sort_alphabetically
1199 Function :
1200
1201 Changes the order of the alignemnt to alphabetical on name
1202 followed by numerical by number.
1203
1204 Returns :
1205 Argument :
1206
1207 =cut
1208
1209 sub sort_alphabetically {
1210 my ($self) = @_;
1211 $self->throw_not_implemented();
1212 }
1213
1214 =head2 Sequence selection methods
1215
1216 Methods returning one or more sequences objects.
1217
1218 =head2 each_seq
1219
1220 Title : each_seq
1221 Usage : foreach $seq ( $contig->each_seq() )
1222 Function : Gets an array of Seq objects from the alignment
1223 Returns : an array
1224 Argument :
1225
1226 =cut
1227
1228 sub each_seq {
1229 my ($self) = @_;
1230
1231 my (@arr,$seqID);
1232
1233 foreach $seqID ( map { $self->{'_order'}{$_} } sort { $a <=> $b } keys %{$self->{'_order'}} ) {
1234 push(@arr,$self->{'_elem'}{$seqID}{'_seq'});
1235 }
1236
1237 return @arr;
1238 }
1239
1240 =head2 each_alphabetically
1241
1242 Title : each_alphabetically
1243 Usage : foreach $seq ( $contig->each_alphabetically() )
1244 Function :
1245
1246 Returns an array of sequence object sorted alphabetically
1247 by name and then by start point.
1248 Does not change the order of the alignment
1249
1250 Returns :
1251 Argument :
1252
1253 =cut
1254
1255 sub each_alphabetically {
1256 my($self) = @_;
1257 $self->throw_not_implemented();
1258 }
1259
1260 =head2 each_seq_with_id
1261
1262 Title : each_seq_with_id
1263 Usage : foreach $seq ( $contig->each_seq_with_id() )
1264 Function :
1265
1266 Gets an array of Seq objects from the
1267 alignment, the contents being those sequences
1268 with the given name (there may be more than one)
1269
1270 Returns : an array
1271 Argument : a seq name
1272
1273 =cut
1274
1275 sub each_seq_with_id {
1276 my ($self) = @_;
1277 $self->throw_not_implemented();
1278 }
1279
1280 =head2 get_seq_by_pos
1281
1282 Title : get_seq_by_pos
1283 Usage : $seq = $contig->get_seq_by_pos(3)
1284 Function :
1285
1286 Gets a sequence based on its position in the alignment.
1287 Numbering starts from 1. Sequence positions larger than
1288 no_sequences() will thow an error.
1289
1290 Returns : a Bio::LocatableSeq object
1291 Argument : positive integer for the sequence osition
1292
1293 =cut
1294
1295 sub get_seq_by_pos {
1296 my $self = shift;
1297 my ($pos) = @_;
1298
1299 $self->throw("Sequence position has to be a positive integer, not [$pos]")
1300 unless $pos =~ /^\d+$/ and $pos > 0;
1301 $self->throw("No sequence at position [$pos]")
1302 unless $pos <= $self->no_sequences ;
1303
1304 my $seqID = $self->{'_order'}->{--$pos};
1305 return $self->{'_elem'}{$seqID}{'_seq'};
1306 }
1307
1308 =head2 Create new alignments
1309
1310 The result of these methods are horizontal or vertical subsets of the
1311 current MSE.
1312
1313 =head2 select
1314
1315 Title : select
1316 Usage : $contig2 = $contig->select(1, 3) # three first sequences
1317 Function :
1318
1319 Creates a new alignment from a continuous subset of
1320 sequences. Numbering starts from 1. Sequence positions
1321 larger than no_sequences() will thow an error.
1322
1323 Returns : a Bio::Assembly::Contig object
1324 Argument : positive integer for the first sequence
1325 positive integer for the last sequence to include (optional)
1326
1327 =cut
1328
1329 sub select {
1330 my ($self) = @_;
1331 $self->throw_not_implemented();
1332 }
1333
1334
1335 =head2 select_noncont
1336
1337 Title : select_noncont
1338 Usage : $contig2 = $contig->select_noncont(1, 3) # first and 3rd sequences
1339 Function :
1340
1341 Creates a new alignment from a subset of
1342 sequences. Numbering starts from 1. Sequence positions
1343 larger than no_sequences() will thow an error.
1344
1345 Returns : a Bio::Assembly::Contig object
1346 Args : array of integers for the sequences
1347
1348 =cut
1349
1350 sub select_noncont {
1351 my ($self) = @_;
1352 $self->throw_not_implemented();
1353 }
1354
1355 =head2 slice
1356
1357 Title : slice
1358 Usage : $contig2 = $contig->slice(20, 30)
1359 Function :
1360
1361 Creates a slice from the alignment inclusive of start and
1362 end columns. Sequences with no residues in the slice are
1363 excluded from the new alignment and a warning is printed.
1364 Slice beyond the length of the sequence does not do
1365 padding.
1366
1367 Returns : a Bio::Assembly::Contig object
1368 Argument : positive integer for start column
1369 positive integer for end column
1370
1371 =cut
1372
1373 sub slice {
1374 my ($self) = @_;
1375 $self->throw_not_implemented();
1376 }
1377
1378 =head2 Change sequences within the MSE
1379
1380 These methods affect characters in all sequences without changeing the
1381 alignment.
1382
1383
1384 =head2 map_chars
1385
1386 Title : map_chars
1387 Usage : $contig->map_chars('\.','-')
1388 Function :
1389
1390 Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1391 characters
1392
1393 Notice that the from (arg1) is interpretted as a regex,
1394 so be careful about quoting meta characters (eg
1395 $contig->map_chars('.','-') wont do what you want)
1396
1397 Returns :
1398 Argument : 'from' rexexp
1399 'to' string
1400
1401 =cut
1402
1403 sub map_chars {
1404 my ($self) = @_;
1405 $self->throw_not_implemented();
1406 }
1407
1408 =head2 uppercase
1409
1410 Title : uppercase()
1411 Usage : $contig->uppercase()
1412 Function : Sets all the sequences to uppercase
1413 Returns :
1414 Argument :
1415
1416 =cut
1417
1418 sub uppercase {
1419 my ($self) = @_;
1420 $self->throw_not_implemented();
1421 }
1422
1423 =head2 match_line
1424
1425 Title : match_line()
1426 Usage : $contig->match_line()
1427 Function : Generates a match line - much like consensus string
1428 except that a line indicating the '*' for a match.
1429 Argument : (optional) Match line characters ('*' by default)
1430 (optional) Strong match char (':' by default)
1431 (optional) Weak match char ('.' by default)
1432
1433 =cut
1434
1435 sub match_line {
1436 my ($self) = @_;
1437 $self->throw_not_implemented();
1438 }
1439
1440 =head2 match
1441
1442 Title : match()
1443 Usage : $contig->match()
1444 Function :
1445
1446 Goes through all columns and changes residues that are
1447 identical to residue in first sequence to match '.'
1448 character. Sets match_char.
1449
1450 USE WITH CARE: Most MSE formats do not support match
1451 characters in sequences, so this is mostly for output
1452 only. NEXUS format (Bio::AlignIO::nexus) can handle
1453 it.
1454
1455 Returns : 1
1456 Argument : a match character, optional, defaults to '.'
1457
1458 =cut
1459
1460 sub match {
1461 my ($self) = @_;
1462 $self->throw_not_implemented();
1463 }
1464
1465 =head2 unmatch
1466
1467 Title : unmatch()
1468 Usage : $contig->unmatch()
1469 Function :
1470
1471 Undoes the effect of method match. Unsets match_char.
1472
1473 Returns : 1
1474 Argument : a match character, optional, defaults to '.'
1475
1476 =cut
1477
1478 sub unmatch {
1479 my ($self) = @_;
1480 $self->throw_not_implemented();
1481 }
1482
1483
1484 =head2 MSE attibutes
1485
1486 Methods for setting and reading the MSE attributes.
1487
1488 Note that the methods defining character semantics depend on the user
1489 to set them sensibly. They are needed only by certain input/output
1490 methods. Unset them by setting to an empty string ('').
1491
1492 =head2 id
1493
1494 Title : id
1495 Usage : $contig->id("Ig")
1496 Function : Gets/sets the id field of the alignment
1497 Returns : An id string
1498 Argument : An id string (optional)
1499
1500 =cut
1501
1502 sub id {
1503 my ($self, $contig_name) = @_;
1504
1505 if (defined( $contig_name )) {
1506 $self->{'_id'} = $contig_name;
1507 }
1508
1509 return $self->{'_id'};
1510 }
1511
1512 =head2 missing_char
1513
1514 Title : missing_char
1515 Usage : $contig->missing_char("?")
1516 Function : Gets/sets the missing_char attribute of the alignment
1517 It is generally recommended to set it to 'n' or 'N'
1518 for nucleotides and to 'X' for protein.
1519 Returns : An missing_char string,
1520 Argument : An missing_char string (optional)
1521
1522 =cut
1523
1524 sub missing_char {
1525 my ($self) = @_;
1526 $self->throw_not_implemented();
1527 }
1528
1529 =head2 match_char
1530
1531 Title : match_char
1532 Usage : $contig->match_char('.')
1533 Function : Gets/sets the match_char attribute of the alignment
1534 Returns : An match_char string,
1535 Argument : An match_char string (optional)
1536
1537 =cut
1538
1539 sub match_char {
1540 my ($self) = @_;
1541 $self->throw_not_implemented();
1542 }
1543
1544 =head2 gap_char
1545
1546 Title : gap_char
1547 Usage : $contig->gap_char('-')
1548 Function : Gets/sets the gap_char attribute of the alignment
1549 Returns : An gap_char string, defaults to '-'
1550 Argument : An gap_char string (optional)
1551
1552 =cut
1553
1554 sub gap_char {
1555 my ($self) = @_;
1556 $self->throw_not_implemented();
1557 }
1558
1559 =head2 symbol_chars
1560
1561 Title : symbol_chars
1562 Usage : my @symbolchars = $contig->symbol_chars;
1563 Function: Returns all the seen symbols (other than gaps)
1564 Returns : array of characters that are the seen symbols
1565 Argument: boolean to include the gap/missing/match characters
1566
1567 =cut
1568
1569 sub symbol_chars{
1570 my ($self) = @_;
1571 $self->throw_not_implemented();
1572 }
1573
1574 =head2 Alignment descriptors
1575
1576 These read only methods describe the MSE in various ways.
1577
1578
1579 =head2 consensus_string
1580
1581 Title : consensus_string
1582 Usage : $str = $contig->consensus_string($threshold_percent)
1583 Function : Makes a strict consensus
1584 Returns :
1585 Argument : Optional treshold ranging from 0 to 100.
1586 The consensus residue has to appear at least threshold %
1587 of the sequences at a given location, otherwise a '?'
1588 character will be placed at that location.
1589 (Default value = 0%)
1590
1591 =cut
1592
1593 sub consensus_string {
1594 my ($self) = @_;
1595 $self->throw_not_implemented();
1596 }
1597
1598 =head2 consensus_iupac
1599
1600 Title : consensus_iupac
1601 Usage : $str = $contig->consensus_iupac()
1602 Function :
1603
1604 Makes a consensus using IUPAC ambiguity codes from DNA
1605 and RNA. The output is in upper case except when gaps in
1606 a column force output to be in lower case.
1607
1608 Note that if your alignment sequences contain a lot of
1609 IUPAC ambiquity codes you often have to manually set
1610 alphabet. Bio::PrimarySeq::_guess_type thinks they
1611 indicate a protein sequence.
1612
1613 Returns : consensus string
1614 Argument : none
1615 Throws : on protein sequences
1616
1617
1618 =cut
1619
1620 sub consensus_iupac {
1621 my ($self) = @_;
1622 $self->throw_not_implemented();
1623 }
1624
1625 =head2 is_flush
1626
1627 Title : is_flush
1628 Usage : if( $contig->is_flush() )
1629 :
1630 :
1631 Function : Tells you whether the alignment
1632 : is flush, ie all of the same length
1633 :
1634 :
1635 Returns : 1 or 0
1636 Argument :
1637
1638 =cut
1639
1640 sub is_flush {
1641 my ($self) = @_;
1642 $self->throw_not_implemented();
1643 }
1644
1645 =head2 length
1646
1647 Title : length()
1648 Usage : $len = $contig->length()
1649 Function : Returns the maximum length of the alignment.
1650 To be sure the alignment is a block, use is_flush
1651 Returns :
1652 Argument :
1653
1654 =cut
1655
1656 sub length {
1657 my ($self) = @_;
1658
1659 $self->throw_not_implemented();
1660 }
1661
1662 =head2 maxdisplayname_length
1663
1664 Title : maxdisplayname_length
1665 Usage : $contig->maxdisplayname_length()
1666 Function :
1667
1668 Gets the maximum length of the displayname in the
1669 alignment. Used in writing out various MSE formats.
1670
1671 Returns : integer
1672 Argument :
1673
1674 =cut
1675
1676 sub maxname_length {
1677 my ($self) = @_;
1678 $self->throw_not_implemented();
1679 }
1680
1681 =head2 no_residues
1682
1683 Title : no_residues
1684 Usage : $no = $contig->no_residues
1685 Function : number of residues in total in the alignment
1686 Returns : integer
1687 Argument :
1688
1689 =cut
1690
1691 sub no_residues {
1692 my ($self) = @_;
1693
1694 return $self->{'_nof_residues'};
1695 }
1696
1697 =head2 no_sequences
1698
1699 Title : no_sequences
1700 Usage : $depth = $contig->no_sequences
1701 Function : number of sequence in the sequence alignment
1702 Returns : integer
1703 Argument : None
1704
1705 =cut
1706
1707 sub no_sequences {
1708 my ($self) = @_;
1709
1710 return scalar( keys %{ $self->{'_elem'} } );
1711 }
1712
1713 =head2 percentage_identity
1714
1715 Title : percentage_identity
1716 Usage : $id = $contig->percentage_identity
1717 Function: The function calculates the percentage identity of the alignment
1718 Returns : The percentage identity of the alignment (as defined by the
1719 implementation)
1720 Argument: None
1721
1722 =cut
1723
1724 sub percentage_identity{
1725 my ($self) = @_;
1726
1727 $self->throw_not_implemeneted();
1728 }
1729
1730 =head2 overall_percentage_identity
1731
1732 Title : percentage_identity
1733 Usage : $id = $contig->percentage_identity
1734 Function: The function calculates the percentage identity of
1735 the conserved columns
1736 Returns : The percentage identity of the conserved columns
1737 Args : None
1738
1739 =cut
1740
1741 sub overall_percentage_identity{
1742 my ($self) = @_;
1743 $self->throw_not_implemented();
1744 }
1745
1746
1747 =head2 average_percentage_identity
1748
1749 Title : average_percentage_identity
1750 Usage : $id = $contig->average_percentage_identity
1751 Function: The function uses a fast method to calculate the average
1752 percentage identity of the alignment
1753 Returns : The average percentage identity of the alignment
1754 Args : None
1755
1756 =cut
1757
1758 sub average_percentage_identity {
1759 my ($self) = @_;
1760 $self->throw_not_implemented();
1761 }
1762
1763 =head2 Alignment positions
1764
1765 Methods to map a sequence position into an alignment column and back.
1766 column_from_residue_number() does the former. The latter is really a
1767 property of the sequence object and can done using
1768 L<Bio::LocatableSeq::location_from_column>:
1769
1770 # select somehow a sequence from the alignment, e.g.
1771 my $seq = $contig->get_seq_by_pos(1);
1772 #$loc is undef or Bio::LocationI object
1773 my $loc = $seq->location_from_column(5);
1774
1775
1776 =head2 column_from_residue_number
1777
1778 Title : column_from_residue_number
1779 Usage : $col = $contig->column_from_residue_number( $seqname, $resnumber)
1780 Function:
1781
1782 This function gives the position in the alignment
1783 (i.e. column number) of the given residue number in the
1784 sequence with the given name. For example, for the
1785 alignment
1786
1787 Seq1/91-97 AC..DEF.GH
1788 Seq2/24-30 ACGG.RTY..
1789 Seq3/43-51 AC.DDEFGHI
1790
1791 column_from_residue_number( "Seq1", 94 ) returns 5.
1792 column_from_residue_number( "Seq2", 25 ) returns 2.
1793 column_from_residue_number( "Seq3", 50 ) returns 9.
1794
1795 An exception is thrown if the residue number would lie
1796 outside the length of the aligment
1797 (e.g. column_from_residue_number( "Seq2", 22 )
1798
1799 Note: If the the parent sequence is represented by more than
1800 one alignment sequence and the residue number is present in
1801 them, this method finds only the first one.
1802
1803 Returns : A column number for the position in the alignment of the
1804 given residue in the given sequence (1 = first column)
1805 Args : A sequence id/name (not a name/start-end)
1806 A residue number in the whole sequence (not just that
1807 segment of it in the alignment)
1808
1809 =cut
1810
1811 sub column_from_residue_number {
1812 my ($self) = @_;
1813 $self->throw_not_implemented();
1814 }
1815
1816 =head2 Sequence names
1817
1818 Methods to manipulate the display name. The default name based on the
1819 sequence id and subsequence positions can be overridden in various
1820 ways.
1821
1822 =head2 displayname
1823
1824 Title : displayname
1825 Usage : $contig->displayname("Ig", "IgA")
1826 Function : Gets/sets the display name of a sequence in the alignment
1827 :
1828 Returns : A display name string
1829 Argument : name of the sequence
1830 displayname of the sequence (optional)
1831
1832 =cut
1833
1834 sub displayname { # Do nothing
1835 }
1836
1837 =head2 set_displayname_count
1838
1839 Title : set_displayname_count
1840 Usage : $contig->set_displayname_count
1841 Function :
1842
1843 Sets the names to be name_# where # is the number of
1844 times this name has been used.
1845
1846 Returns : None
1847 Argument : None
1848
1849 =cut
1850
1851 sub set_displayname_count {
1852 my ($self) = @_;
1853 $self->throw_not_implemented();
1854 }
1855
1856 =head2 set_displayname_flat
1857
1858 Title : set_displayname_flat
1859 Usage : $contig->set_displayname_flat()
1860 Function : Makes all the sequences be displayed as just their name,
1861 not name/start-end
1862 Returns : 1
1863 Argument : None
1864
1865 =cut
1866
1867 sub set_displayname_flat { # Do nothing!
1868 }
1869
1870 =head2 set_displayname_normal
1871
1872 Title : set_displayname_normal
1873 Usage : $contig->set_displayname_normal()
1874 Function : Makes all the sequences be displayed as name/start-end
1875 Returns : None
1876 Argument : None
1877
1878 =cut
1879
1880 sub set_displayname_normal { # Do nothing!
1881 }
1882
1883 =head1 Internal Methods
1884
1885 =head2 _binary_search
1886
1887 Title : _binary_search
1888 Usage : _binary_search($list,$query)
1889 Function :
1890
1891 Find a number in a sorted list of numbers. Return values
1892 may be on or two integers. One positive integer or zero
1893 (>=0) is the index of the element that stores the queried
1894 value. Two positive integers (or zero and another
1895 number) are the indexes of elements among which the
1896 queried value should be placed. Negative single values
1897 mean:
1898
1899 -1: $query is smaller than smallest element in list
1900 -2: $query is greater than greatest element in list
1901
1902 Returns : array of integers
1903 Argument :
1904 $list : array reference
1905 $query : integer
1906
1907 =cut
1908
1909 sub _binary_search {
1910 my $list = shift;
1911 my $query = shift;
1912 #
1913 # If there is only one element in list
1914 if (!$#{$list} && ($query == $list->[0])) { return (0) }
1915 # If there are others...
1916 my $start = 0;
1917 my $end = $#{$list};
1918 (&_compare($query,$list->[$start]) == 0) && do { return ($start) };
1919 (&_compare($query,$list->[$end]) == 0) && do { return ($end) };
1920 (&_compare($query,$list->[$start]) < 0) && do { return (-1) };
1921 (&_compare($query,$list->[$end]) > 0) && do { return (-2) };
1922 my $middle = 0;
1923 while ($end - $start > 1) {
1924 $middle = int(($end+$middle)/2);
1925 (&_compare($query,$list->[$middle]) == 0) && return ($middle);
1926 (&_compare($query,$list->[$middle]) < 0) && do { $end = $middle ; $middle = 0; next };
1927 $start = $middle; # If &_compare() > 0, move region beggining
1928 }
1929 return ($start,$end);
1930 }
1931
1932 =head2 _compare
1933
1934 Title : _compare
1935 Usage : _compare($arg1,$arg2)
1936 Function: Perform numeric or string comparisons
1937 Returns : integer (0, 1 or -1)
1938 Args : values to be compared
1939
1940 =cut
1941
1942 sub _compare {
1943 my $arg1 = shift;
1944 my $arg2 = shift;
1945 #
1946 if (($arg1 =~ /^\d+$/) && ($arg2 =~ /^\d+$/)) { return $arg1 <=> $arg2 }
1947 else { return $arg1 cmp $arg2 }
1948 }
1949
1950 =head2 _nof_gaps
1951
1952 Title : _nof_gaps
1953 Usage : _nof_gaps($array_ref, $query)
1954 Function: number of gaps found before position $query
1955 Returns : integer
1956 Args :
1957 $array_ref : gap registry reference
1958 $query : [integer] a position in a sequence
1959
1960 =cut
1961
1962 #' emacs...
1963 sub _nof_gaps {
1964 my $list = shift;
1965 my $query = shift;
1966 # If there are no gaps in this contig
1967 return 0 unless (defined($list) && scalar(@{$list}));
1968 # Locate query index in gap list (if any)
1969 my @index = &_binary_search($list,$query);
1970 # If after all alignments, correct using total number of align
1971 if ($index[0] == -2) { $query = scalar(@{$list}) }
1972 # If before any alignment, return 0
1973 elsif ($index[0] == -1) { $query = 0 }
1974 elsif ($index[0] >= 0) {
1975 # If query is between alignments, translate coordinates
1976 if ($#index > 0) { $query = $index[0] + 1 }
1977 # If query sits upon an alignment, do another correction
1978 elsif ($#index == 0) { $query = $index[0] }
1979 }
1980 #
1981 return $query;
1982 }
1983
1984 =head2 _padded_unpadded
1985
1986 Title : _padded_unpadded
1987 Usage : _padded_unpadded($array_ref, $query)
1988 Function:
1989
1990 Returns a coordinate corresponding to
1991 position $query after gaps were
1992 removed from a sequence.
1993
1994 Returns : integer
1995 Args :
1996 $array_ref : reference to this gap registry
1997 $query : [integer] coordionate to change
1998
1999 =cut
2000
2001 sub _padded_unpadded {
2002 my $list = shift;
2003 my $query = shift;
2004
2005 my $align = &_nof_gaps($list,$query);
2006 $query-- if (defined($list->[$align]) && ($list->[$align] == $query));
2007 $query = $query - $align;
2008 #
2009 return $query;
2010 }
2011
2012 =head2 _unpadded_padded
2013
2014 Title : _unpadded_padded
2015 Usage : _unpadded_padded($array_ref, $query)
2016 Function:
2017
2018 Returns the value corresponding to
2019 ungapped position $query when gaps are
2020 counted as valid sites in a sequence
2021
2022 Returns :
2023 Args : $array_ref = a reference to this sequence's gap registry
2024 $query = [integer] location to change
2025
2026 =cut
2027
2028 #'
2029 sub _unpadded_padded {
2030 my $list = shift;
2031 my $query = shift;
2032
2033 my $align = &_nof_gaps($list,$query);
2034 $query = $query + $align;
2035 my $new_align = &_nof_gaps($list,$query);
2036 while ($new_align - $align > 0) {
2037 $query = $query + $new_align - $align;
2038 $align = $new_align;
2039 $new_align = &_nof_gaps($list,$query);
2040 }
2041 # If current position is also a align, look for the first upstream base
2042 while (defined($list->[$align]) && ($list->[$align] == $query)) {
2043 $query++; $align++;
2044 }
2045 #
2046 return $query;
2047 }
2048
2049 =head2 _register_gaps
2050
2051 Title : _register_gaps
2052 Usage : $self->_register_gaps($seq, $array_ref)
2053 Function: stores gap locations for a sequence
2054 Returns : number of gaps found
2055 Args :
2056 $seq : sequence string
2057 $array_ref : a reference to an array,
2058 where gap locations will
2059 be stored
2060
2061 =cut
2062
2063 sub _register_gaps {
2064 my $self = shift;
2065 my $sequence = shift;
2066 my $dbref = shift;
2067
2068 $self->throw("Not an aligned sequence string to register gaps")
2069 if (ref($sequence));
2070
2071 $self->throw("Not an array reference for gap registry")
2072 unless (ref($dbref) eq 'ARRAY');
2073
2074 # Registering alignments
2075 @{$dbref} = (); # Cleaning registry
2076 if (defined $sequence) {
2077 my $i = -1;
2078 while(1) {
2079 $i = index($sequence,"-",$i+1);
2080 last if ($i == -1);
2081 push(@{$dbref},$i+1);
2082 }
2083 } else {
2084 # $self->warn("Found undefined sequence while registering gaps");
2085 return 0;
2086 }
2087
2088 return scalar(@{$dbref});
2089 }
2090
2091 1;