0
|
1 # $Id: Contig.pm,v 1.1 2002/11/04 11:50:11 heikki Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Assembly::Contig
|
|
4 # Mostly based on Bio::SimpleAlign by Ewan Birney
|
|
5 #
|
|
6 # Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br>
|
|
7 #
|
|
8 # Copyright Robson Francisco de Souza
|
|
9 #
|
|
10 # You may distribute this module under the same terms as perl itself
|
|
11
|
|
12 # POD documentation - main docs before the code
|
|
13
|
|
14 =head1 NAME
|
|
15
|
|
16 Bio::Assembly::Contig - Perl module to hold and manipulate
|
|
17 sequence assembly contigs.
|
|
18
|
|
19 =head1 SYNOPSYS
|
|
20
|
|
21 # Module loading
|
|
22 use Bio::Assembly::IO;
|
|
23
|
|
24 # Assembly loading methods
|
|
25 $aio = new Bio::Assembly::IO(-file=>"test.ace.1",
|
|
26 -format=>'phrap');
|
|
27
|
|
28 $assembly = $aio->next_assembly;
|
|
29 foreach $contig ($assembly->all_contigs) {
|
|
30 # do something
|
|
31 }
|
|
32
|
|
33 # OR, if you want to build the contig yourself,
|
|
34
|
|
35 use Bio::Assembly::Contig;
|
|
36 $c = Bio::Assembly::Contig->new(-id=>"1");
|
|
37
|
|
38 $ls = Bio::LocatableSeq->new(-seq=>"ACCG-T",
|
|
39 -id=>"r1",
|
|
40 -alphabet=>'dna');
|
|
41 $ls2 = Bio::LocatableSeq->new(-seq=>"ACA-CG-T",
|
|
42 -id=>"r2",
|
|
43 -alphabet=>'dna');
|
|
44
|
|
45 $ls_coord = Bio::SeqFeature::Generic->new(-start=>3,
|
|
46 -end=>8,
|
|
47 -strand=>1);
|
|
48 $ls2_coord = Bio::SeqFeature::Generic->new(-start=>1,
|
|
49 -end=>8,
|
|
50 -strand=>1);
|
|
51 $c->add_seq($ls);
|
|
52 $c->add_seq($ls2);
|
|
53 $c->set_seq_coord($ls_coord,$ls);
|
|
54 $c->set_seq_coord($ls2_coord,$ls2);
|
|
55
|
|
56 $con = Bio::LocatableSeq->new(-seq=>"ACACCG-T",
|
|
57 -alphabet=>'dna');
|
|
58 $c->set_consensus_sequence($con);
|
|
59
|
|
60 $l = $c->change_coord('unaligned r2','ungapped consensus',6);
|
|
61 print "6 in unaligned r2 => $l in ungapped consensus\n";
|
|
62
|
|
63
|
|
64 =head1 DESCRIPTION
|
|
65
|
|
66 A contig is as a set of sequences, locally aligned to each other, so
|
|
67 that every sequence has overlapping regions with at least one sequence
|
|
68 in the contig, such that a continuous of overlapping sequences is
|
|
69 formed, allowing the deduction of a consensus sequence which may be
|
|
70 longer than any of the sequences from which it was deduced.
|
|
71
|
|
72 In this documentation we refer to the overlapping sequences used to
|
|
73 build the contig as "aligned sequences" and to the sequence deduced
|
|
74 from the overlap of aligned sequences as the "consensus". Methods to
|
|
75 deduce the consensus sequence from aligned sequences were not yet
|
|
76 implemented in this module, but its posssible to add a consensus
|
|
77 sequence deduced by other means, e.g, by the assembly program used to
|
|
78 build the alignment.
|
|
79
|
|
80 All aligned sequences in a Bio::Assembly::Contig must be Bio::Assembly::Locatable
|
|
81 objects and have a unique ID. The unique ID restriction is due to the
|
|
82 nature of the module's internal data structures and is also a request
|
|
83 of some assembly programs. If two sequences with the same ID are added
|
|
84 to a contig, the first sequence added is replaced by the second one.
|
|
85
|
|
86 =head2 Coordinate_systems
|
|
87
|
|
88 There are four base coordinate systems in Bio::Assembly::Contig. When
|
|
89 you need to access contig elements or data that exists on a certain
|
|
90 range or location, you may be specifying coordinates in relation to
|
|
91 different sequences, which may be either the contig consensus or one
|
|
92 of the aligned sequences that were used to do the assembly.
|
|
93
|
|
94 =========================================================
|
|
95 Name | Referenced sequence
|
|
96 ---------------------------------------------------------
|
|
97 "gapped consensus" | Contig (with gaps)
|
|
98 "ungapped consensus" | Contig (without gaps)
|
|
99 "aligned $seqID" | sequence $seqID (with gaps)
|
|
100 "unaligned $seqID" | sequence $seqID (without gaps)
|
|
101 =========================================================
|
|
102
|
|
103 "gapped consensus" refers to positions in the aligned consensus
|
|
104 sequence, which is the consensus sequence including the gaps inserted
|
|
105 to align it agains the aligned sequences that were used to assemble
|
|
106 the contig. So, its limits are [ 1, (consensus length + number of gaps
|
|
107 in consensus) ]
|
|
108
|
|
109 "ungapped consensus" is a coordinate system based on the consensus
|
|
110 sequence, but excluding consensus gaps. This is just the coordinate
|
|
111 system that you have when considering the consensus sequence alone,
|
|
112 instead of aligned to other sequences.
|
|
113
|
|
114 "aligned $seqID" refers to locations in the sequence $seqID after
|
|
115 alignment of $seqID against the consensus sequence (reverse
|
|
116 complementing the original sequence, if needed). Coordinate 1 in
|
|
117 "aligned $seqID" is equivalent to the start location (first base) of
|
|
118 $seqID in the consensus sequence, just like if the aligned sequence
|
|
119 $seqID was a feature of the consensus sequence.
|
|
120
|
|
121 "unaligned $seqID" is equivalent to a location in the isolated
|
|
122 sequence, just like you would have when considering the sequence
|
|
123 alone, out of an alignment. When changing coordinates from "aligned
|
|
124 $seq2" to "unaligned $seq2", if $seq2 was reverse complemented when
|
|
125 included in the alignment, the output coordinates will be reversed to
|
|
126 fit that fact, i.e. 1 will be changed to length($seq2), 2 will be
|
|
127 length($seq)-1 and so on.
|
|
128
|
|
129 An important note: when you change gap coordinates from a gapped
|
|
130 system ("gapped consensus" or "aligned $seqID") to a system that does
|
|
131 not include gaps ("ungapped consensus" or "unaligned $seqID"), the
|
|
132 position returned will be the first location before all gaps
|
|
133 neighboring the input location.
|
|
134
|
|
135 =head2 Feature_collection
|
|
136
|
|
137 Bio::Assembly::Contig stores much information about a contig in a
|
|
138 Bio::Assembly::SeqFeature::Collection object. Relevant information on the
|
|
139 alignment is accessed by selecting features based on their primary
|
|
140 tags (e.g. all features which have a primary tag of the form
|
|
141 '_aligned_coord:$seqID', where $seqID is an aligned sequence ID, are
|
|
142 coordinates for sequences in the contig alignment) and, by using
|
|
143 methods from Bio::Assembly::SeqFeature::Collection, it's possible to select
|
|
144 features by overlap with other features.
|
|
145
|
|
146 We suggest that you use the primary tags of features as identifiers
|
|
147 for feature classes. By convention, features with primary tags
|
|
148 starting with a '_' are generated by modules that populate the contig
|
|
149 data structure and return the contig object, maybe as part of an
|
|
150 assembly object, e.g. drivers from the Bio::Assembly::IO set.
|
|
151
|
|
152 Features in the features collection may be associated with particular
|
|
153 aligned sequences. To obtain this, you must attach the sequence to the
|
|
154 feature, using attach() seq from Bio::Assembly::SeqFeatureI, before you add the
|
|
155 feature to the feature collection. We also suggest to add the sequence
|
|
156 id to the primary tag, so that is easy to select feature for a
|
|
157 particular sequence.
|
|
158
|
|
159 There is only one feature class that some methods in
|
|
160 Bio::Assembly::Contig expect to find in the feature collection: features
|
|
161 with primary tags of the form '_aligned_coord:$seqID', where $seqID is
|
|
162 the aligned sequence id (like returned by $seq-E<gt>id()). These features
|
|
163 describe the position (in "gapped consensus" coordinates) of aligned
|
|
164 sequences, and the method set_seq_coord() automatically changes a
|
|
165 feature's primary tag to this form whenever the feature is added to
|
|
166 the collection by this method. Only two methods in Bio::Assembly::Contig
|
|
167 will not work unless there are features from this class:
|
|
168 change_coord() and get_seq_coord().
|
|
169
|
|
170 Other feature classes will be automatically available only when
|
|
171 Bio::Assembly::Contig objects are created by a specific module. Such
|
|
172 feature classes are (or should be) documented in the documentation of
|
|
173 the module which create them, to which the user should refer.
|
|
174
|
|
175 =head1 FEEDBACK
|
|
176
|
|
177 =head2 Mailing Lists
|
|
178
|
|
179 User feedback is an integral part of the evolution of this and other
|
|
180 Bioperl modules. Send your comments and suggestions preferably to the
|
|
181 Bioperl mailing lists Your participation is much appreciated.
|
|
182
|
|
183 bioperl-l@bioperl.org - General discussion
|
|
184 http://bio.perl.org/MailList.html - About the mailing lists
|
|
185
|
|
186 =head2 Reporting Bugs
|
|
187
|
|
188 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
189 the bugs and their resolution. Bug reports can be submitted via email
|
|
190 or the web:
|
|
191
|
|
192 bioperl-bugs@bio.perl.org
|
|
193 http://bugzilla.bioperl.org/
|
|
194
|
|
195 =head1 AUTHOR - Robson Francisco de Souza
|
|
196
|
|
197 rfsouza@citri.iq.usp.br
|
|
198
|
|
199 =head1 APPENDIX
|
|
200
|
|
201 The rest of the documentation details each of the object
|
|
202 methods. Internal methods are usually preceded with a _
|
|
203
|
|
204 =cut
|
|
205
|
|
206 #'
|
|
207 package Bio::Assembly::Contig;
|
|
208
|
|
209 use strict;
|
|
210 use vars qw(@ISA $VERSION);
|
|
211
|
|
212 use Bio::Root::Root;
|
|
213 use Bio::Align::AlignI;
|
|
214 use Bio::SeqFeature::Collection;
|
|
215 use Bio::Seq::PrimaryQual;
|
|
216
|
|
217 @ISA = qw(Bio::Root::Root Bio::Align::AlignI);
|
|
218
|
|
219 =head1 Object creator
|
|
220
|
|
221 =head2 new
|
|
222
|
|
223 Title : new
|
|
224 Usage : my $contig = new Bio::Assembly::Contig();
|
|
225 Function : Creates a new contig object
|
|
226 Returns : Bio::Assembly::Contig
|
|
227 Args : -source => string representing the source
|
|
228 program where this contig came
|
|
229 from
|
|
230 -id => contig unique ID
|
|
231
|
|
232 =cut
|
|
233
|
|
234 #-----------
|
|
235 sub new {
|
|
236 #-----------
|
|
237 my ($class,@args) = @_;
|
|
238
|
|
239 my $self = $class->SUPER::new(@args);
|
|
240
|
|
241 my ($src, $id) = $self->_rearrange([qw(SOURCE ID)], @args);
|
|
242 $src && $self->source($src);
|
|
243 ($id && $self->id($id)) || ($self->{'_id'} = 'NoName'); # Alignment (contig) name
|
|
244 ($id && $self->id($id)) || ($self->{'_source'} = 'Unknown'); # Program used to build the contig
|
|
245 # we need to set up internal hashes first!
|
|
246
|
|
247 # Bio::SimpleAlign derived fields (check which ones are needed for AlignI compatibility)
|
|
248 $self->{'_elem'} = {}; # contig elements: aligned sequence objects (keyed by ID)
|
|
249 $self->{'_order'} = {}; # store sequence order
|
|
250 # $self->{'start_end_lists'} = {}; # References to entries in {'_seq'}. Keyed by seq ids.
|
|
251 # $self->{'_dis_name'} = {}; # Display names for each sequence
|
|
252 $self->{'_symbols'} = {}; # List of symbols
|
|
253
|
|
254 #Contig specific slots
|
|
255 $self->{'_consensus_sequence'} = undef;
|
|
256 $self->{'_consensus_quality'} = undef;
|
|
257 $self->{'_nof_residues'} = 0;
|
|
258 $self->{'_nof_seqs'} = 0;
|
|
259 # $self->{'_nof_segments'} = 0; # Let's not make it heavier than needed by now...
|
|
260 $self->{'_sfc'} = Bio::SeqFeature::Collection->new();
|
|
261
|
|
262 # Assembly specifcs
|
|
263 $self->{'_assembly'} = undef; # Reference to a Bio::Assembly::Scaffold object, if contig belongs to one.
|
|
264 $self->{'_strand'} = 0; # Reverse (-1) or forward (1), if contig is in a scaffold. 0 otherwise
|
|
265 $self->{'_neighbor_start'} = undef; # Will hold a reference to another contig
|
|
266 $self->{'_neighbor_end'} = undef; # Will hold a reference to another contig
|
|
267
|
|
268 return $self; # success - we hope!
|
|
269 }
|
|
270
|
|
271 =head1 Assembly related methods
|
|
272
|
|
273 These methods exist to enable adding information about possible
|
|
274 relations among contigs, e.g. when you already have a scaffold for
|
|
275 your assembly, describing the ordering of contigs in the final
|
|
276 assembly, but no sequences covering the gaps between neighboring
|
|
277 contigs.
|
|
278
|
|
279 =head2 source
|
|
280
|
|
281 Title : source
|
|
282 Usage : $contig->source($program);
|
|
283 Function : Get/Set program used to build this contig
|
|
284 Returns : string
|
|
285 Argument : [optional] string
|
|
286
|
|
287 =cut
|
|
288
|
|
289 sub source {
|
|
290 my $self = shift;
|
|
291 my $source = shift;
|
|
292
|
|
293 $self->{'_source'} = $source if (defined $source);
|
|
294 return $self->{'_source'};
|
|
295 }
|
|
296
|
|
297 =head2 assembly
|
|
298
|
|
299 Title : assembly
|
|
300 Usage : $contig->assembly($assembly);
|
|
301 Function : Get/Set assembly object for this contig
|
|
302 Returns : a Bio::Assembly::Scaffold object
|
|
303 Argument : a Bio::Assembly::Scaffold object
|
|
304
|
|
305 =cut
|
|
306
|
|
307 sub assembly {
|
|
308 my $self = shift;
|
|
309 my $assembly = shift;
|
|
310
|
|
311 $self->throw("Using non Bio::Assembly::Scaffold object when assign contig to assembly")
|
|
312 if (defined $assembly && ! $assembly->isa("Bio::Assembly::Scaffold"));
|
|
313
|
|
314 $self->{'_assembly'} = $assembly if (defined $assembly);
|
|
315 return $self->{'_assembly'};
|
|
316 }
|
|
317
|
|
318 =head2 strand
|
|
319
|
|
320 Title : strand
|
|
321 Usage : $contig->strand($num);
|
|
322 Function : Get/Set contig orientation in a scaffold/assembly.
|
|
323 Its equivalent to the strand property of sequence
|
|
324 objects and sets whether the contig consensus should
|
|
325 be reversed and complemented before being added to a
|
|
326 scaffold or assembly.
|
|
327 Returns : integer
|
|
328 Argument : 1 if orientaion is forward, -1 if reverse and
|
|
329 0 if none
|
|
330
|
|
331 =cut
|
|
332
|
|
333 sub strand {
|
|
334 my $self = shift;
|
|
335 my $ori = shift;
|
|
336
|
|
337 $self->throw("Contig strand must be either 1, -1 or 0")
|
|
338 unless (defined $ori && ($ori == 1 || $ori == 0 || $ori == -1));
|
|
339
|
|
340 $self->{'_strand'} = $ori;
|
|
341 return $self->{'_strand'};
|
|
342 }
|
|
343
|
|
344 =head2 upstream_neighbor
|
|
345
|
|
346 Title : upstream_neighbor
|
|
347 Usage : $contig->upstream_neighbor($contig);
|
|
348 Function : Get/Set a contig neighbor for the current contig when
|
|
349 building a scaffold. The upstream neighbor is
|
|
350 located before $contig first base
|
|
351 Returns : nothing
|
|
352 Argument : Bio::Assembly::Contig
|
|
353
|
|
354 =cut
|
|
355
|
|
356 sub upstream_neighbor {
|
|
357 my $self = shift;
|
|
358 my $ref = shift;
|
|
359
|
|
360 $self->throw("Trying to assign a non Bio::Assembly::Contig object to upstream contig")
|
|
361 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig"));
|
|
362
|
|
363 $self->{'_neighbor_start'} = $ref if (defined $ref);
|
|
364 return $self->{'_neighbor_start'};
|
|
365 }
|
|
366
|
|
367 =head2 downstream_neighbor
|
|
368
|
|
369 Title : downstream_neighbor
|
|
370 Usage : $contig->downstream_neighbor($num);
|
|
371 Function : Get/Set a contig neighbor for the current contig when
|
|
372 building a scaffold. The downstream neighbor is
|
|
373 located after $contig last base
|
|
374 Returns : nothing
|
|
375 Argument : Bio::Assembly::Contig
|
|
376
|
|
377 =cut
|
|
378
|
|
379 sub downstream_neighbor {
|
|
380 my $self = shift;
|
|
381 my $ref = shift;
|
|
382
|
|
383 $self->throw("Trying to assign a non Bio::Assembly::Contig object to downstream contig")
|
|
384 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig"));
|
|
385 $self->{'_neighbor_end'} = $ref if (defined $ref);
|
|
386 return $self->{'_neighbor_end'};
|
|
387 }
|
|
388
|
|
389 =head1 Contig feature collection methods
|
|
390
|
|
391 =head2 add_features
|
|
392
|
|
393 Title : add_features
|
|
394 Usage : $contig->add_features($feat,$flag)
|
|
395 Function :
|
|
396
|
|
397 Add an array of features to the contig feature
|
|
398 collection. The consensus sequence may be attached to the
|
|
399 added feature, if $flag is set to 1. If $flag is 0 and
|
|
400 the feature attached to one of the contig aligned
|
|
401 sequences, the feature is registered as an aligned
|
|
402 sequence feature. If $flag is 0 and the feature is not
|
|
403 attched to any sequence in the contig, the feature is
|
|
404 simply added to the feature collection and no attachment
|
|
405 or registration is made.
|
|
406
|
|
407 Note: You must attach aligned sequences to their features
|
|
408 prior to calling add_features, otherwise you won't be
|
|
409 able to access the feature through get_seq_feat_by_tag()
|
|
410 method.
|
|
411
|
|
412 Returns : number of features added.
|
|
413 Argument :
|
|
414 $feat : A reference to an array of Bio::SeqFeatureI
|
|
415 $flag : boolean - true if consensus sequence object
|
|
416 should be attached to this feature, false if
|
|
417 no consensus attachment should be made.
|
|
418 Default: false.
|
|
419
|
|
420 =cut
|
|
421
|
|
422 sub add_features {
|
|
423 my ($self, $args, $flag) = @_;
|
|
424
|
|
425 # Adding shortcuts for aligned sequence features
|
|
426 $flag = 0 unless (defined $flag);
|
|
427 if ($flag && defined $self->{'_consensus_sequence'}) {
|
|
428 foreach my $feat (@$args) {
|
|
429 next if (defined $feat->seq);
|
|
430 $feat->attach_seq($self->{'_consensus_sequence'});
|
|
431 }
|
|
432 } elsif (!$flag) { # Register aligned sequence features
|
|
433 foreach my $feat (@$args) {
|
|
434 if (my $seq = $feat->entire_seq()) {
|
|
435 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
|
|
436 $self->warn("Adding contig feature attached to unknown sequence $seqID!")
|
|
437 unless (exists $self->{'_elem'}{$seqID});
|
|
438 my $tag = $feat->primary_tag;
|
|
439 $self->{'_elem'}{$seqID}{'_feat'}{$tag} = $feat;
|
|
440 }
|
|
441 }
|
|
442 }
|
|
443
|
|
444 # Add feature to feature collection
|
|
445 my $nof_added = $self->{'_sfc'}->add_features($args);
|
|
446
|
|
447 return $nof_added;
|
|
448 }
|
|
449
|
|
450 =head2 remove_features
|
|
451
|
|
452 Title : remove_features
|
|
453 Usage : $contig->remove_features(@feat)
|
|
454 Function : Remove an array of contig features
|
|
455 Returns : number of features removed.
|
|
456 Argument : An array of Bio::SeqFeatureI
|
|
457
|
|
458 =cut
|
|
459
|
|
460 sub remove_features {
|
|
461 my ($self, @args) = @_;
|
|
462
|
|
463 # Removing shortcuts for aligned sequence features
|
|
464 foreach my $feat (@args) {
|
|
465 if (my $seq = $feat->entire_seq()) {
|
|
466 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
|
|
467 my $tag = $feat->primary_tag;
|
|
468 $tag =~ s/:$seqID$/$1/g;
|
|
469 delete( $self->{'_elem'}{$seqID}{'_feat'}{$tag} )
|
|
470 if (exists $self->{'_elem'}{$seqID}{'_feat'}{$tag} &&
|
|
471 $self->{'_elem'}{$seqID}{'_feat'}{$tag} eq $feat);
|
|
472 }
|
|
473 }
|
|
474
|
|
475 return $self->{'_sfc'}->remove_features(\@args);
|
|
476 }
|
|
477
|
|
478 =head2 get_features_collection
|
|
479
|
|
480 Title : get_features_collection
|
|
481 Usage : $contig->get_features_collection()
|
|
482 Function : Get the collection of all contig features
|
|
483 Returns : Bio::SeqFeature::Collection
|
|
484 Argument : none
|
|
485
|
|
486 =cut
|
|
487
|
|
488 sub get_features_collection {
|
|
489 my $self = shift;
|
|
490
|
|
491 return $self->{'_sfc'};
|
|
492 }
|
|
493
|
|
494 =head1 Coordinate system's related methods
|
|
495
|
|
496 See L<Coordinate_Systems> above.
|
|
497
|
|
498 =head2 change_coord
|
|
499
|
|
500 Title : change_coord
|
|
501 Usage : $contig->change_coord($in,$out,$query)
|
|
502 Function :
|
|
503
|
|
504 Change coordinate system for $query. This method
|
|
505 transforms locations between coordinate systems described
|
|
506 in section "Coordinate Systems" of this document.
|
|
507
|
|
508 Note: this method will throw an exception when changing
|
|
509 coordinates between "ungapped consensus" and other
|
|
510 systems if consensus sequence was not set. It will also
|
|
511 throw exceptions when changing coordinates among aligned
|
|
512 sequence, either with or without gaps, and other systems
|
|
513 if sequence locations were not set with set_seq_coord().
|
|
514
|
|
515 Returns : integer
|
|
516 Argument :
|
|
517 $in : [string] input coordinate system
|
|
518 $out : [string] output coordinate system
|
|
519 $query : [integer] a position in a sequence
|
|
520
|
|
521 =cut
|
|
522
|
|
523 sub change_coord {
|
|
524 my $self = shift;
|
|
525 my $type_in = shift;
|
|
526 my $type_out = shift;
|
|
527 my $query = shift;
|
|
528
|
|
529 # Parsing arguments
|
|
530 # Loading read objects (these calls will throw exceptions whether $read_in or
|
|
531 # $read_out is not found
|
|
532 my ($read_in,$read_out) = (undef,undef);
|
|
533 my $in_ID = ( split(' ',$type_in) )[1];
|
|
534 my $out_ID = ( split(' ',$type_out) )[1];
|
|
535
|
|
536 if ($in_ID ne 'consensus') {
|
|
537 $read_in = $self->get_seq_coord( $self->get_seq_by_name($in_ID) );
|
|
538 $self->throw("Can't change coordinates without sequence location for $in_ID")
|
|
539 unless (defined $read_in);
|
|
540 }
|
|
541 if ($out_ID ne 'consensus') {
|
|
542 $read_out = $self->get_seq_coord( $self->get_seq_by_name($out_ID) );
|
|
543 $self->throw("Can't change coordinates without sequence location for $out_ID")
|
|
544 unless (defined $read_out);
|
|
545 }
|
|
546
|
|
547 # Performing transformation between coordinates
|
|
548 SWITCH1: {
|
|
549
|
|
550 # Transformations between contig padded and contig unpadded
|
|
551 (($type_in eq 'gapped consensus') && ($type_out eq 'ungapped consensus')) && do {
|
|
552 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence")
|
|
553 unless (defined $self->{'_consensus_sequence'});
|
|
554 $query = &_padded_unpadded($self->{'_consensus_gaps'}, $query);
|
|
555 last SWITCH1;
|
|
556 };
|
|
557 (($type_in eq 'ungapped consensus') && ($type_out eq 'gapped consensus')) && do {
|
|
558 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence")
|
|
559 unless (defined $self->{'_consensus_sequence'});
|
|
560 $query = &_unpadded_padded($self->{'_consensus_gaps'},$query);
|
|
561 last SWITCH1;
|
|
562 };
|
|
563
|
|
564 # Transformations between contig (padded) and read (padded)
|
|
565 (($type_in eq 'gapped consensus') &&
|
|
566 ($type_out =~ /^aligned /) && defined($read_out)) && do {
|
|
567 $query = $query - $read_out->start() + 1;
|
|
568 last SWITCH1;
|
|
569 };
|
|
570 (($type_in =~ /^aligned /) && defined($read_in) &&
|
|
571 ($type_out eq 'gapped consensus')) && do {
|
|
572 $query = $query + $read_in->start() - 1;
|
|
573 last SWITCH1;
|
|
574 };
|
|
575
|
|
576 # Transformations between contig (unpadded) and read (padded)
|
|
577 (($type_in eq 'ungapped consensus') &&
|
|
578 ($type_out =~ /^aligned /) && defined($read_out)) && do {
|
|
579 $query = $self->change_coord('ungapped consensus','gapped consensus',$query);
|
|
580 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
|
|
581 last SWITCH1;
|
|
582 };
|
|
583 (($type_in =~ /^aligned /) && defined($read_in) &&
|
|
584 ($type_out eq 'ungapped consensus')) && do {
|
|
585 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
|
|
586 $query = $self->change_coord('gapped consensus','ungapped consensus',$query);
|
|
587 last SWITCH1;
|
|
588 };
|
|
589
|
|
590 # Transformations between seq $read_in padded and seq $read_out padded
|
|
591 (defined($read_in) && ($type_in =~ /^aligned /) &&
|
|
592 defined($read_out) && ($type_out =~ /^aligned /)) && do {
|
|
593 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
|
|
594 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
|
|
595 last SWITCH1;
|
|
596 };
|
|
597
|
|
598 # Transformations between seq $read_in padded and seq $read_out unpadded
|
|
599 (defined($read_in) && ($type_in =~ /^aligned /) &&
|
|
600 defined($read_out) && ($type_out =~ /^unaligned /)) && do {
|
|
601 if ($read_in ne $read_out) {
|
|
602 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
|
|
603 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
|
|
604 }
|
|
605 my $list_out = $self->{'_elem'}{$out_ID}{'_gaps'};
|
|
606 $query = &_padded_unpadded($list_out,$query);
|
|
607 # Changing read orientation if read was reverse complemented when aligned
|
|
608 if ($read_out->strand == -1) {
|
|
609 my ($length) = $read_out->length();
|
|
610 $length = $length - &_nof_gaps($list_out,$length);
|
|
611 $query = $length - $query + 1;
|
|
612 }
|
|
613 last SWITCH1;
|
|
614 };
|
|
615 (defined($read_in) && ($type_in =~ /^unaligned /) &&
|
|
616 defined($read_out) && ($type_out =~ /^aligned /)) && do {
|
|
617 my $list_in = $self->{'_elem'}{$in_ID}{'_gaps'};
|
|
618 # Changing read orientation if read was reverse complemented when aligned
|
|
619 if ($read_in->strand == -1) {
|
|
620 my ($length) = $read_in->length();
|
|
621 $length = $length - &_nof_gaps($list_in,$length);
|
|
622 $query = $length - $query + 1;
|
|
623 }
|
|
624 $query = &_unpadded_padded($list_in,$query);
|
|
625 if ($read_in ne $read_out) {
|
|
626 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
|
|
627 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
|
|
628 }
|
|
629 last SWITCH1;
|
|
630 };
|
|
631
|
|
632 # Transformations between seq $read_in unpadded and seq $read_out unpadded
|
|
633 (defined($read_in) && ($type_in =~ /^unaligned /) &&
|
|
634 defined($read_out) && ($type_out =~ /^unaligned /)) && do {
|
|
635 $query = $self->change_coord("unaligned $in_ID","aligned $out_ID",$query);
|
|
636 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query);
|
|
637 last SWITCH1;
|
|
638 };
|
|
639
|
|
640 # Transformations between contig (padded) and read (unpadded)
|
|
641 (($type_in eq 'gapped consensus') &&
|
|
642 ($type_out =~ /^unaligned /) && defined($read_out)) && do {
|
|
643 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
|
|
644 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query);
|
|
645 last SWITCH1;
|
|
646 };
|
|
647 (($type_in =~ /^unaligned /) && defined($read_in) &&
|
|
648 ($type_out eq 'gapped consensus')) && do {
|
|
649 $query = $self->change_coord("unaligned $in_ID","aligned $in_ID",$query);
|
|
650 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
|
|
651 last SWITCH1;
|
|
652 };
|
|
653
|
|
654 # Transformations between contig (unpadded) and read (unpadded)
|
|
655 (($type_in eq 'ungapped consensus') &&
|
|
656 ($type_out =~ /^unaligned /) && defined($read_out)) && do {
|
|
657 $query = $self->change_coord('ungapped consensus','gapped consensus',$query);
|
|
658 $query = $self->change_coord('gapped consensus',"unaligned $out_ID",$query);
|
|
659 last SWITCH1;
|
|
660 };
|
|
661 (($type_in =~ /^unaligned /) && defined($read_in) &&
|
|
662 ($type_out eq 'ungapped consensus')) && do {
|
|
663 $query = $self->change_coord("unaligned $in_ID",'gapped consensus',$query);
|
|
664 $query = $self->change_coord('gapped consensus','ungapped consensus',$query);
|
|
665 last SWITCH1;
|
|
666 };
|
|
667
|
|
668 $self->throw("Unknow coordinate system. Args: $type_in, $type_out.");
|
|
669 $query = undef; # If a coordinate systems just requested is unknown
|
|
670 }
|
|
671
|
|
672 return $query;
|
|
673 }
|
|
674
|
|
675 =head2 get_seq_coord
|
|
676
|
|
677 Title : get_seq_coord
|
|
678 Usage : $contig->get_seq_coord($seq);
|
|
679 Function : Get "gapped consensus" location for aligned sequence
|
|
680 Returns : Bio::SeqFeature::Generic for coordinates or undef.
|
|
681 A warning is printed if sequence coordinates were not set.
|
|
682 Argument : Bio::LocatabaleSeq object
|
|
683
|
|
684 =cut
|
|
685
|
|
686 sub get_seq_coord {
|
|
687 my ($self,$seq) = @_;
|
|
688
|
|
689 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
|
|
690 $self->throw("$seq is not a Bio::LocatableSeq");
|
|
691 }
|
|
692 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
|
|
693
|
|
694 unless (exists( $self->{'_elem'}{$seqID} )) {
|
|
695 $self->warn("No such sequence ($seqID) in contig ".$self->id);
|
|
696 return undef;
|
|
697 }
|
|
698 unless (exists( $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} )) {
|
|
699 $self->warn("Location not set for sequence ($seqID) in contig ".$self->id);
|
|
700 return undef;
|
|
701 }
|
|
702
|
|
703 return $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"};
|
|
704 }
|
|
705
|
|
706 =head2 set_seq_coord
|
|
707
|
|
708 Title : set_seq_coord
|
|
709 Usage : $contig->set_seq_coord($feat,$seq);
|
|
710 Function :
|
|
711
|
|
712 Set "gapped consensus" location for an aligned
|
|
713 sequence. If the sequence was previously added using
|
|
714 add_seq, its coordinates are changed/set. Otherwise,
|
|
715 add_seq is called and the sequence is added to the
|
|
716 contig.
|
|
717
|
|
718 Returns : Bio::SeqFeature::Generic for old coordinates or undef.
|
|
719 Argument :
|
|
720 $feat : a Bio::SeqFeature::Generic object
|
|
721 representing a location for the
|
|
722 aligned sequence, in "gapped
|
|
723 consensus" coordinates.
|
|
724
|
|
725 Note: the original feature primary tag will
|
|
726 be lost.
|
|
727
|
|
728 $seq : a Bio::LocatabaleSeq object
|
|
729
|
|
730 =cut
|
|
731
|
|
732 sub set_seq_coord {
|
|
733 my ($self,$feat,$seq) = @_;
|
|
734
|
|
735 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
|
|
736 $self->throw("Unable to process non locatable sequences [".ref($seq)."]");
|
|
737 }
|
|
738
|
|
739 # Complaining about inadequate feature object
|
|
740 $self->throw("Coordinates must be a Bio::SeqFeature::Generic object!")
|
|
741 unless ( $feat->isa("Bio::SeqFeature::Generic") );
|
|
742 $self->throw("Sequence coordinates must have an end!")
|
|
743 unless (defined $feat->end);
|
|
744 $self->throw("Sequence coordinates must have a start!")
|
|
745 unless (defined $feat->start);
|
|
746
|
|
747 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
|
|
748 if (exists( $self->{'_elem'}{$seqID} ) &&
|
|
749 exists( $self->{'_elem'}{$seqID}{'_seq'} ) &&
|
|
750 defined( $self->{'_elem'}{$seqID}{'_seq'} ) &&
|
|
751 ($seq ne $self->{'_elem'}{$seqID}{'_seq'}) ) {
|
|
752 $self->warn("Replacing sequence $seqID\n");
|
|
753 $self->remove_seq($self->{'_elem'}{$seqID}{'_seq'});
|
|
754 }
|
|
755 $self->add_seq($seq);
|
|
756
|
|
757 # Remove previous coordinates, if any
|
|
758 $self->remove_features($feat);
|
|
759
|
|
760 # Add new Bio::Generic::SeqFeature
|
|
761 $feat->add_tag_value('contig',$self->id)
|
|
762 unless ( $feat->has_tag('contig') );
|
|
763 $feat->primary_tag("_aligned_coord:$seqID");
|
|
764 $feat->attach_seq($seq);
|
|
765 $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} = $feat;
|
|
766 $self->add_features([ $feat ]);
|
|
767 }
|
|
768
|
|
769 =head1 Bio::Assembly::Contig consensus methods
|
|
770
|
|
771 =head2 set_consensus_sequence
|
|
772
|
|
773 Title : set_consensus_sequence
|
|
774 Usage : $contig->set_consensus_sequence($seq)
|
|
775 Function : Set the consensus sequence object for this contig
|
|
776 Returns : consensus length
|
|
777 Argument : Bio::LocatableSeq
|
|
778
|
|
779 =cut
|
|
780
|
|
781 sub set_consensus_sequence {
|
|
782 my $self = shift;
|
|
783 my $seq = shift;
|
|
784
|
|
785 $self->throw("Consensus sequence must be a Bio::LocatableSeq!")
|
|
786 unless ($seq->isa("Bio::LocatableSeq"));
|
|
787
|
|
788 my $con_len = $seq->length;
|
|
789 $seq->start(1); $seq->end($con_len);
|
|
790
|
|
791 $self->{'_consensus_gaps'} = []; # Consensus Gap registry
|
|
792 $self->_register_gaps($seq->seq,
|
|
793 $self->{'_consensus_gaps'});
|
|
794 $self->{'_consensus_sequence'} = $seq;
|
|
795
|
|
796 return $con_len;
|
|
797 }
|
|
798
|
|
799 =head2 set_consensus_quality
|
|
800
|
|
801 Title : set_consensus_quality
|
|
802 Usage : $contig->set_consensus_quality($qual)
|
|
803 Function : Set the quality object for consensus sequence
|
|
804 Returns : nothing
|
|
805 Argument : Bio::Seq::QualI object
|
|
806
|
|
807 =cut
|
|
808
|
|
809 sub set_consensus_quality {
|
|
810 my $self = shift;
|
|
811 my $qual = shift;
|
|
812
|
|
813 $self->throw("Consensus quality must be a Bio::Seq::QualI object!")
|
|
814 unless ( $qual->isa("Bio::Seq::QualI") );
|
|
815
|
|
816 $self->throw("Consensus quality can't be added before you set the consensus sequence!")
|
|
817 unless (defined $self->{'_consensus_sequence'});
|
|
818
|
|
819 $self->{'_consensus_quality'} = $qual;
|
|
820 }
|
|
821
|
|
822 =head2 get_consensus_length
|
|
823
|
|
824 Title : get_consensus_length
|
|
825 Usage : $contig->get_consensus_length()
|
|
826 Function : Get consensus sequence length
|
|
827 Returns : integer
|
|
828 Argument : none
|
|
829
|
|
830 =cut
|
|
831
|
|
832 sub get_consensus_length {
|
|
833 my $self = shift;
|
|
834
|
|
835 return $self->{'_consensus_sequence'}->length();
|
|
836 }
|
|
837
|
|
838 =head2 get_consensus_sequence
|
|
839
|
|
840 Title : get_consensus_sequence
|
|
841 Usage : $contig->get_consensus_sequence()
|
|
842 Function : Get a reference to the consensus sequence object
|
|
843 for this contig
|
|
844 Returns : Bio::SeqI object
|
|
845 Argument : none
|
|
846
|
|
847 =cut
|
|
848
|
|
849 sub get_consensus_sequence {
|
|
850 my ($self, @args) = @_;
|
|
851
|
|
852 return $self->{'_consensus_sequence'};
|
|
853 }
|
|
854
|
|
855 =head2 get_consensus_quality
|
|
856
|
|
857 Title : get_consensus_quality
|
|
858 Usage : $contig->get_consensus_quality()
|
|
859 Function : Get a reference to the consensus quality object
|
|
860 for this contig.
|
|
861 Returns : A Bio::QualI object
|
|
862 Argument : none
|
|
863
|
|
864 =cut
|
|
865
|
|
866 sub get_consensus_quality {
|
|
867 my ($self, @args) = @_;
|
|
868
|
|
869 return $self->{'_consensus_quality'};
|
|
870 }
|
|
871
|
|
872 =head1 Bio::Assembly::Contig aligned sequences methods
|
|
873
|
|
874 =head2 set_seq_qual
|
|
875
|
|
876 Title : set_seq_qual
|
|
877 Usage : $contig->set_seq_qual($seq,$qual);
|
|
878 Function : Adds quality to an aligned sequence.
|
|
879 Returns : nothing
|
|
880 Argument : a Bio::LocatableSeq object and
|
|
881 a Bio::Seq::QualI object
|
|
882
|
|
883 See L<Bio::LocatableSeq> for more information.
|
|
884
|
|
885 =cut
|
|
886
|
|
887 sub set_seq_qual {
|
|
888 my ($self,$seq,$qual) = @_;
|
|
889
|
|
890 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
|
|
891 $self->throw("Unable to process non locatable sequences [", ref($seq), "]");
|
|
892 }
|
|
893 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
|
|
894
|
|
895 $self->throw("Consensus quality must be a Bio::Seq::QualI object!")
|
|
896 unless ( $qual->isa("Bio::Seq::QualI") );
|
|
897 $self->throw("Use add_seq first: aligned sequence qualities can't be added before you load the sequence!")
|
|
898 unless (exists $self->{'_elem'}{$seqID}{'_seq'});
|
|
899 $self->throw("Use set_seq_coord first: aligned sequence qualities can't be added before you add coordinates for the sequence!") unless (defined( $self->get_seq_coord($seq) ));
|
|
900
|
|
901 # Adding gaps to quality object
|
|
902 my $sequence = $self->{'_elem'}{$seqID}{'_seq'}->seq();
|
|
903 my $tmp = $qual->qual();
|
|
904 @{$tmp} = reverse(@{$tmp}) if ($self->get_seq_coord($seq)->strand() == -1);
|
|
905 my @quality = ();
|
|
906 my $previous = 0;
|
|
907 my $next = 0;
|
|
908 my $i = 0; my $j = 0;
|
|
909 while ($i<=$#{$tmp}) {
|
|
910 # IF base is a gap, quality is the average for neighbouring sites
|
|
911 if (substr($sequence,$j,1) eq '-') {
|
|
912 $previous = $tmp->[$i-1] unless ($i == 0);
|
|
913 if ($i < $#{$tmp}) {
|
|
914 $next = $tmp->[$i+1];
|
|
915 } else {
|
|
916 $next = 0;
|
|
917 }
|
|
918 push(@quality,int( ($previous+$next)/2 ));
|
|
919 } else {
|
|
920 push(@quality,$tmp->[$i]);
|
|
921 $i++;
|
|
922 }
|
|
923 $j++;
|
|
924 }
|
|
925
|
|
926 $self->{'_elem'}{$seqID}{'_qual'} = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality),
|
|
927 -id=>$seqID);
|
|
928 }
|
|
929
|
|
930 =head2 get_seq_ids
|
|
931
|
|
932 Title : get_seq_ids
|
|
933 Usage : $contig->get_seq_ids(-start=>$start,
|
|
934 -end=>$end,
|
|
935 -type=>"gapped A0QR67B08.b");
|
|
936 Function : Get list of sequence IDs overlapping inteval [$start, $end]
|
|
937 The default interval is [1,$contig->length]
|
|
938 Default coordinate system is "gapped contig"
|
|
939 Returns : An array
|
|
940 Argument : A hash with optional elements:
|
|
941 -start : consensus subsequence start
|
|
942 -end : consensus subsequence end
|
|
943 -type : the coordinate system type for $start and $end arguments
|
|
944 Coordinate system avaliable are:
|
|
945 "gapped consensus" : consensus coordinates with gaps
|
|
946 "ungapped consensus" : consensus coordinates without gaps
|
|
947 "aligned $ReadID" : read $ReadID coordinates with gaps
|
|
948 "unaligned $ReadID" : read $ReadID coordinates without gaps
|
|
949
|
|
950
|
|
951 =cut
|
|
952
|
|
953 sub get_seq_ids {
|
|
954 my ($self, @args) = @_;
|
|
955
|
|
956 my ($type,$start,$end) =
|
|
957 $self->_rearrange([qw(TYPE START END)], @args);
|
|
958
|
|
959 if (defined($start) && defined($end)) {
|
|
960 if (defined($type) && ($type ne 'gapped consensus')) {
|
|
961 $start = $self->change_coord($type,'gapped consensus',$start);
|
|
962 $end = $self->change_coord($type,'gapped consensus',$end);
|
|
963 }
|
|
964
|
|
965 my @list = grep { $_->isa("Bio::SeqFeature::Generic") &&
|
|
966 ($_->primary_tag =~ /^_aligned_coord:/) }
|
|
967 $self->{'_sfc'}->features_in_range(-start=>$start,
|
|
968 -end=>$end,
|
|
969 -contain=>0,
|
|
970 -strandmatch=>'ignore');
|
|
971 @list = map { $_->entire_seq->id } @list;
|
|
972 return @list;
|
|
973 }
|
|
974
|
|
975 # Entire aligned sequences list
|
|
976 return map { $self->{'_order'}{$_} } sort { $a cmp $b } keys %{ $self->{'_order'} };
|
|
977 }
|
|
978
|
|
979 =head2 get_seq_feat_by_tag
|
|
980
|
|
981 Title : get_seq_feat_by_tag
|
|
982 Usage : $seq = $contig->get_seq_feat_by_tag($seq,"_aligned_coord:$seqID")
|
|
983 Function :
|
|
984
|
|
985 Get a sequence feature based on its primary_tag.
|
|
986 When you add
|
|
987
|
|
988 Returns : a Bio::SeqFeature object
|
|
989 Argument : a Bio::LocatableSeq and a string (feature primary tag)
|
|
990
|
|
991 =cut
|
|
992
|
|
993 sub get_seq_feat_by_tag {
|
|
994 my ($self,$seq,$tag) = @_;
|
|
995
|
|
996 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
|
|
997 $self->throw("Unable to process non locatable sequences [", ref($seq), "]");
|
|
998 }
|
|
999 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
|
|
1000
|
|
1001 return $self->{'_elem'}{$seqID}{'_feat'}{$tag};
|
|
1002 }
|
|
1003
|
|
1004 =head2 get_seq_by_name
|
|
1005
|
|
1006 Title : get_seq_by_name
|
|
1007 Usage : $seq = $contig->get_seq_by_name('Seq1')
|
|
1008 Function : Gets a sequence based on its id.
|
|
1009 Returns : a Bio::LocatableSeq object
|
|
1010 undef if name is not found
|
|
1011 Argument : string
|
|
1012
|
|
1013 =cut
|
|
1014
|
|
1015 sub get_seq_by_name {
|
|
1016 my $self = shift;
|
|
1017 my ($seqID) = @_;
|
|
1018
|
|
1019 unless (exists $self->{'_elem'}{$seqID}{'_seq'}) {
|
|
1020 $self->throw("Could not find sequence $seqID in contig ".$self->id);
|
|
1021 return undef;
|
|
1022 }
|
|
1023
|
|
1024 return $self->{'_elem'}{$seqID}{'_seq'};
|
|
1025 }
|
|
1026
|
|
1027 =head2 get_qual_by_name
|
|
1028
|
|
1029 Title : get_qual_by_name
|
|
1030 Usage : $seq = $contig->get_qual_by_name('Seq1')
|
|
1031 Function :
|
|
1032
|
|
1033 Gets Bio::Seq::QualI object for a sequence
|
|
1034 through its id ( as given by $qual->id() ).
|
|
1035
|
|
1036 Returns : a Bio::Seq::QualI object.
|
|
1037 undef if name is not found
|
|
1038 Argument : string
|
|
1039
|
|
1040 =cut
|
|
1041
|
|
1042 sub get_qual_by_name {
|
|
1043 my $self = shift;
|
|
1044 my ($seqID) = @_;
|
|
1045
|
|
1046 unless (exists $self->{'_elem'}{$seqID}{'_qual'}) {
|
|
1047 $self->warn("Could not find quality for $seqID in contig!");
|
|
1048 return undef;
|
|
1049 }
|
|
1050
|
|
1051 return $self->{'_elem'}{$seqID}{'_qual'};
|
|
1052 }
|
|
1053
|
|
1054 =head1 Bio::Align::AlignI compatible methods
|
|
1055
|
|
1056 =head2 Modifier methods
|
|
1057
|
|
1058 These methods modify the MSE by adding, removing or shuffling complete
|
|
1059 sequences.
|
|
1060
|
|
1061 =head2 add_seq
|
|
1062
|
|
1063 Title : add_seq
|
|
1064 Usage : $contig->add_seq($newseq);
|
|
1065 Function :
|
|
1066
|
|
1067 Adds a sequence to the contig. *Does*
|
|
1068 *not* align it - just adds it to the
|
|
1069 hashes.
|
|
1070
|
|
1071 Returns : nothing
|
|
1072 Argument : a Bio::LocatableSeq object
|
|
1073
|
|
1074 See L<Bio::LocatableSeq> for more information.
|
|
1075
|
|
1076 =cut
|
|
1077
|
|
1078 sub add_seq {
|
|
1079 my $self = shift;
|
|
1080 my $seq = shift;
|
|
1081
|
|
1082 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
|
|
1083 $self->throw("Unable to process non locatable sequences [", ref($seq), "]");
|
|
1084 }
|
|
1085
|
|
1086 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
|
|
1087 $self->{'_elem'}{$seqID} = {} unless (exists $self->{'elem'}{$seqID});
|
|
1088
|
|
1089 if (exists( $self->{'_elem'}{$seqID}{'_seq'} ) &&
|
|
1090 ($seq eq $self->{'_elem'}{$seqID}{'_seq'}) ) {
|
|
1091 $self->warn("Adding sequence $seqID, which has already been added");
|
|
1092 }
|
|
1093
|
|
1094 # Our locatable sequences are always considered to be complete sequences
|
|
1095 $seq->start(1); $seq->end($seq->length());
|
|
1096
|
|
1097 $self->warn("Adding non-nucleotidic sequence ".$seqID)
|
|
1098 if (lc($seq->alphabet) ne 'dna' && lc($seq->alphabet) ne 'rna');
|
|
1099
|
|
1100 # build the symbol list for this sequence,
|
|
1101 # will prune out the gap and missing/match chars
|
|
1102 # when actually asked for the symbol list in the
|
|
1103 # symbol_chars
|
|
1104 if (defined $seq->seq) {
|
|
1105 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
|
|
1106 } else {
|
|
1107 $self->{'_symbols'} = {};
|
|
1108 }
|
|
1109
|
|
1110 my $seq_no = ++$self->{'_nof_seqs'};
|
|
1111
|
|
1112 if (ref( $self->{'_elem'}{$seqID}{'_seq'} )) {
|
|
1113 $self->warn("Replacing one sequence [$seqID]\n");
|
|
1114 } else {
|
|
1115 #print STDERR "Assigning $seqID to $order\n";
|
|
1116 $self->{'_order'}->{$seq_no} = $seqID;
|
|
1117 # $self->{'_start_end_lists'}->{$id} = []
|
|
1118 # unless(exists $self->{'_start_end_lists'}->{$id});
|
|
1119 # push @{$self->{'_start_end_lists'}->{$id}}, $seq;
|
|
1120 }
|
|
1121
|
|
1122 $self->{'_elem'}{$seqID}{'_seq'} = $seq;
|
|
1123 $self->{'_elem'}{$seqID}{'_feat'} = {};
|
|
1124 $self->{'_elem'}{$seqID}{'_gaps'} = [];
|
|
1125 my $dbref = $self->{'_elem'}{$seqID}{'_gaps'};
|
|
1126 my $nofgaps = $self->_register_gaps($seq->seq,$dbref);
|
|
1127
|
|
1128 # Updating residue count
|
|
1129 $self->{'_nof_residues'} += $seq->length - $nofgaps;
|
|
1130
|
|
1131 return 1;
|
|
1132 }
|
|
1133
|
|
1134 =head2 remove_seq
|
|
1135
|
|
1136 Title : remove_seq
|
|
1137 Usage : $contig->remove_seq($seq);
|
|
1138 Function : Removes a single sequence from an alignment
|
|
1139 Returns : 1 on success, 0 otherwise
|
|
1140 Argument : a Bio::LocatableSeq object
|
|
1141
|
|
1142 =cut
|
|
1143
|
|
1144 sub remove_seq {
|
|
1145 my ($self,$seq) = @_;
|
|
1146
|
|
1147 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
|
|
1148 $self->throw("Unable to process non locatable sequences [", ref($seq), "]");
|
|
1149 }
|
|
1150
|
|
1151 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
|
|
1152 unless (exists $self->{'_elem'}{$seqID} ) {
|
|
1153 $self->warn("No sequence named $seqID [$seq]");
|
|
1154 return 0;
|
|
1155 }
|
|
1156
|
|
1157 # Updating residue count
|
|
1158 $self->{'_nof_residues'} -= $seq->length() +
|
|
1159 &_nof_gaps( $self->{'_elem'}{$seqID}{'_gaps'}, $seq->length );
|
|
1160
|
|
1161 # Remove all references to features of this sequence
|
|
1162 my @feats = ();
|
|
1163 foreach my $tag (keys %{ $self->{'_elem'}{$seqID}{'_feat'} }) {
|
|
1164 push(@feats, $self->{'_elem'}{$seqID}{'_feat'}{$tag});
|
|
1165 }
|
|
1166 $self->{'_sfc'}->remove_features(\@feats);
|
|
1167 delete $self->{'_elem'}{$seqID};
|
|
1168
|
|
1169 return 1;
|
|
1170 }
|
|
1171
|
|
1172 =head2 purge
|
|
1173
|
|
1174 Title : purge
|
|
1175 Usage : $contig->purge(0.7);
|
|
1176 Function:
|
|
1177
|
|
1178 Removes sequences above whatever %id.
|
|
1179
|
|
1180 This function will grind on large alignments. Beware!
|
|
1181 (perhaps not ideally implemented)
|
|
1182
|
|
1183 Example :
|
|
1184 Returns : An array of the removed sequences
|
|
1185 Argument:
|
|
1186
|
|
1187
|
|
1188 =cut
|
|
1189
|
|
1190 sub purge {
|
|
1191 my ($self) = @_;
|
|
1192 $self->throw_not_implemented();
|
|
1193 }
|
|
1194
|
|
1195 =head2 sort_alphabetically
|
|
1196
|
|
1197 Title : sort_alphabetically
|
|
1198 Usage : $contig->sort_alphabetically
|
|
1199 Function :
|
|
1200
|
|
1201 Changes the order of the alignemnt to alphabetical on name
|
|
1202 followed by numerical by number.
|
|
1203
|
|
1204 Returns :
|
|
1205 Argument :
|
|
1206
|
|
1207 =cut
|
|
1208
|
|
1209 sub sort_alphabetically {
|
|
1210 my ($self) = @_;
|
|
1211 $self->throw_not_implemented();
|
|
1212 }
|
|
1213
|
|
1214 =head2 Sequence selection methods
|
|
1215
|
|
1216 Methods returning one or more sequences objects.
|
|
1217
|
|
1218 =head2 each_seq
|
|
1219
|
|
1220 Title : each_seq
|
|
1221 Usage : foreach $seq ( $contig->each_seq() )
|
|
1222 Function : Gets an array of Seq objects from the alignment
|
|
1223 Returns : an array
|
|
1224 Argument :
|
|
1225
|
|
1226 =cut
|
|
1227
|
|
1228 sub each_seq {
|
|
1229 my ($self) = @_;
|
|
1230
|
|
1231 my (@arr,$seqID);
|
|
1232
|
|
1233 foreach $seqID ( map { $self->{'_order'}{$_} } sort { $a <=> $b } keys %{$self->{'_order'}} ) {
|
|
1234 push(@arr,$self->{'_elem'}{$seqID}{'_seq'});
|
|
1235 }
|
|
1236
|
|
1237 return @arr;
|
|
1238 }
|
|
1239
|
|
1240 =head2 each_alphabetically
|
|
1241
|
|
1242 Title : each_alphabetically
|
|
1243 Usage : foreach $seq ( $contig->each_alphabetically() )
|
|
1244 Function :
|
|
1245
|
|
1246 Returns an array of sequence object sorted alphabetically
|
|
1247 by name and then by start point.
|
|
1248 Does not change the order of the alignment
|
|
1249
|
|
1250 Returns :
|
|
1251 Argument :
|
|
1252
|
|
1253 =cut
|
|
1254
|
|
1255 sub each_alphabetically {
|
|
1256 my($self) = @_;
|
|
1257 $self->throw_not_implemented();
|
|
1258 }
|
|
1259
|
|
1260 =head2 each_seq_with_id
|
|
1261
|
|
1262 Title : each_seq_with_id
|
|
1263 Usage : foreach $seq ( $contig->each_seq_with_id() )
|
|
1264 Function :
|
|
1265
|
|
1266 Gets an array of Seq objects from the
|
|
1267 alignment, the contents being those sequences
|
|
1268 with the given name (there may be more than one)
|
|
1269
|
|
1270 Returns : an array
|
|
1271 Argument : a seq name
|
|
1272
|
|
1273 =cut
|
|
1274
|
|
1275 sub each_seq_with_id {
|
|
1276 my ($self) = @_;
|
|
1277 $self->throw_not_implemented();
|
|
1278 }
|
|
1279
|
|
1280 =head2 get_seq_by_pos
|
|
1281
|
|
1282 Title : get_seq_by_pos
|
|
1283 Usage : $seq = $contig->get_seq_by_pos(3)
|
|
1284 Function :
|
|
1285
|
|
1286 Gets a sequence based on its position in the alignment.
|
|
1287 Numbering starts from 1. Sequence positions larger than
|
|
1288 no_sequences() will thow an error.
|
|
1289
|
|
1290 Returns : a Bio::LocatableSeq object
|
|
1291 Argument : positive integer for the sequence osition
|
|
1292
|
|
1293 =cut
|
|
1294
|
|
1295 sub get_seq_by_pos {
|
|
1296 my $self = shift;
|
|
1297 my ($pos) = @_;
|
|
1298
|
|
1299 $self->throw("Sequence position has to be a positive integer, not [$pos]")
|
|
1300 unless $pos =~ /^\d+$/ and $pos > 0;
|
|
1301 $self->throw("No sequence at position [$pos]")
|
|
1302 unless $pos <= $self->no_sequences ;
|
|
1303
|
|
1304 my $seqID = $self->{'_order'}->{--$pos};
|
|
1305 return $self->{'_elem'}{$seqID}{'_seq'};
|
|
1306 }
|
|
1307
|
|
1308 =head2 Create new alignments
|
|
1309
|
|
1310 The result of these methods are horizontal or vertical subsets of the
|
|
1311 current MSE.
|
|
1312
|
|
1313 =head2 select
|
|
1314
|
|
1315 Title : select
|
|
1316 Usage : $contig2 = $contig->select(1, 3) # three first sequences
|
|
1317 Function :
|
|
1318
|
|
1319 Creates a new alignment from a continuous subset of
|
|
1320 sequences. Numbering starts from 1. Sequence positions
|
|
1321 larger than no_sequences() will thow an error.
|
|
1322
|
|
1323 Returns : a Bio::Assembly::Contig object
|
|
1324 Argument : positive integer for the first sequence
|
|
1325 positive integer for the last sequence to include (optional)
|
|
1326
|
|
1327 =cut
|
|
1328
|
|
1329 sub select {
|
|
1330 my ($self) = @_;
|
|
1331 $self->throw_not_implemented();
|
|
1332 }
|
|
1333
|
|
1334
|
|
1335 =head2 select_noncont
|
|
1336
|
|
1337 Title : select_noncont
|
|
1338 Usage : $contig2 = $contig->select_noncont(1, 3) # first and 3rd sequences
|
|
1339 Function :
|
|
1340
|
|
1341 Creates a new alignment from a subset of
|
|
1342 sequences. Numbering starts from 1. Sequence positions
|
|
1343 larger than no_sequences() will thow an error.
|
|
1344
|
|
1345 Returns : a Bio::Assembly::Contig object
|
|
1346 Args : array of integers for the sequences
|
|
1347
|
|
1348 =cut
|
|
1349
|
|
1350 sub select_noncont {
|
|
1351 my ($self) = @_;
|
|
1352 $self->throw_not_implemented();
|
|
1353 }
|
|
1354
|
|
1355 =head2 slice
|
|
1356
|
|
1357 Title : slice
|
|
1358 Usage : $contig2 = $contig->slice(20, 30)
|
|
1359 Function :
|
|
1360
|
|
1361 Creates a slice from the alignment inclusive of start and
|
|
1362 end columns. Sequences with no residues in the slice are
|
|
1363 excluded from the new alignment and a warning is printed.
|
|
1364 Slice beyond the length of the sequence does not do
|
|
1365 padding.
|
|
1366
|
|
1367 Returns : a Bio::Assembly::Contig object
|
|
1368 Argument : positive integer for start column
|
|
1369 positive integer for end column
|
|
1370
|
|
1371 =cut
|
|
1372
|
|
1373 sub slice {
|
|
1374 my ($self) = @_;
|
|
1375 $self->throw_not_implemented();
|
|
1376 }
|
|
1377
|
|
1378 =head2 Change sequences within the MSE
|
|
1379
|
|
1380 These methods affect characters in all sequences without changeing the
|
|
1381 alignment.
|
|
1382
|
|
1383
|
|
1384 =head2 map_chars
|
|
1385
|
|
1386 Title : map_chars
|
|
1387 Usage : $contig->map_chars('\.','-')
|
|
1388 Function :
|
|
1389
|
|
1390 Does a s/$arg1/$arg2/ on the sequences. Useful for gap
|
|
1391 characters
|
|
1392
|
|
1393 Notice that the from (arg1) is interpretted as a regex,
|
|
1394 so be careful about quoting meta characters (eg
|
|
1395 $contig->map_chars('.','-') wont do what you want)
|
|
1396
|
|
1397 Returns :
|
|
1398 Argument : 'from' rexexp
|
|
1399 'to' string
|
|
1400
|
|
1401 =cut
|
|
1402
|
|
1403 sub map_chars {
|
|
1404 my ($self) = @_;
|
|
1405 $self->throw_not_implemented();
|
|
1406 }
|
|
1407
|
|
1408 =head2 uppercase
|
|
1409
|
|
1410 Title : uppercase()
|
|
1411 Usage : $contig->uppercase()
|
|
1412 Function : Sets all the sequences to uppercase
|
|
1413 Returns :
|
|
1414 Argument :
|
|
1415
|
|
1416 =cut
|
|
1417
|
|
1418 sub uppercase {
|
|
1419 my ($self) = @_;
|
|
1420 $self->throw_not_implemented();
|
|
1421 }
|
|
1422
|
|
1423 =head2 match_line
|
|
1424
|
|
1425 Title : match_line()
|
|
1426 Usage : $contig->match_line()
|
|
1427 Function : Generates a match line - much like consensus string
|
|
1428 except that a line indicating the '*' for a match.
|
|
1429 Argument : (optional) Match line characters ('*' by default)
|
|
1430 (optional) Strong match char (':' by default)
|
|
1431 (optional) Weak match char ('.' by default)
|
|
1432
|
|
1433 =cut
|
|
1434
|
|
1435 sub match_line {
|
|
1436 my ($self) = @_;
|
|
1437 $self->throw_not_implemented();
|
|
1438 }
|
|
1439
|
|
1440 =head2 match
|
|
1441
|
|
1442 Title : match()
|
|
1443 Usage : $contig->match()
|
|
1444 Function :
|
|
1445
|
|
1446 Goes through all columns and changes residues that are
|
|
1447 identical to residue in first sequence to match '.'
|
|
1448 character. Sets match_char.
|
|
1449
|
|
1450 USE WITH CARE: Most MSE formats do not support match
|
|
1451 characters in sequences, so this is mostly for output
|
|
1452 only. NEXUS format (Bio::AlignIO::nexus) can handle
|
|
1453 it.
|
|
1454
|
|
1455 Returns : 1
|
|
1456 Argument : a match character, optional, defaults to '.'
|
|
1457
|
|
1458 =cut
|
|
1459
|
|
1460 sub match {
|
|
1461 my ($self) = @_;
|
|
1462 $self->throw_not_implemented();
|
|
1463 }
|
|
1464
|
|
1465 =head2 unmatch
|
|
1466
|
|
1467 Title : unmatch()
|
|
1468 Usage : $contig->unmatch()
|
|
1469 Function :
|
|
1470
|
|
1471 Undoes the effect of method match. Unsets match_char.
|
|
1472
|
|
1473 Returns : 1
|
|
1474 Argument : a match character, optional, defaults to '.'
|
|
1475
|
|
1476 =cut
|
|
1477
|
|
1478 sub unmatch {
|
|
1479 my ($self) = @_;
|
|
1480 $self->throw_not_implemented();
|
|
1481 }
|
|
1482
|
|
1483
|
|
1484 =head2 MSE attibutes
|
|
1485
|
|
1486 Methods for setting and reading the MSE attributes.
|
|
1487
|
|
1488 Note that the methods defining character semantics depend on the user
|
|
1489 to set them sensibly. They are needed only by certain input/output
|
|
1490 methods. Unset them by setting to an empty string ('').
|
|
1491
|
|
1492 =head2 id
|
|
1493
|
|
1494 Title : id
|
|
1495 Usage : $contig->id("Ig")
|
|
1496 Function : Gets/sets the id field of the alignment
|
|
1497 Returns : An id string
|
|
1498 Argument : An id string (optional)
|
|
1499
|
|
1500 =cut
|
|
1501
|
|
1502 sub id {
|
|
1503 my ($self, $contig_name) = @_;
|
|
1504
|
|
1505 if (defined( $contig_name )) {
|
|
1506 $self->{'_id'} = $contig_name;
|
|
1507 }
|
|
1508
|
|
1509 return $self->{'_id'};
|
|
1510 }
|
|
1511
|
|
1512 =head2 missing_char
|
|
1513
|
|
1514 Title : missing_char
|
|
1515 Usage : $contig->missing_char("?")
|
|
1516 Function : Gets/sets the missing_char attribute of the alignment
|
|
1517 It is generally recommended to set it to 'n' or 'N'
|
|
1518 for nucleotides and to 'X' for protein.
|
|
1519 Returns : An missing_char string,
|
|
1520 Argument : An missing_char string (optional)
|
|
1521
|
|
1522 =cut
|
|
1523
|
|
1524 sub missing_char {
|
|
1525 my ($self) = @_;
|
|
1526 $self->throw_not_implemented();
|
|
1527 }
|
|
1528
|
|
1529 =head2 match_char
|
|
1530
|
|
1531 Title : match_char
|
|
1532 Usage : $contig->match_char('.')
|
|
1533 Function : Gets/sets the match_char attribute of the alignment
|
|
1534 Returns : An match_char string,
|
|
1535 Argument : An match_char string (optional)
|
|
1536
|
|
1537 =cut
|
|
1538
|
|
1539 sub match_char {
|
|
1540 my ($self) = @_;
|
|
1541 $self->throw_not_implemented();
|
|
1542 }
|
|
1543
|
|
1544 =head2 gap_char
|
|
1545
|
|
1546 Title : gap_char
|
|
1547 Usage : $contig->gap_char('-')
|
|
1548 Function : Gets/sets the gap_char attribute of the alignment
|
|
1549 Returns : An gap_char string, defaults to '-'
|
|
1550 Argument : An gap_char string (optional)
|
|
1551
|
|
1552 =cut
|
|
1553
|
|
1554 sub gap_char {
|
|
1555 my ($self) = @_;
|
|
1556 $self->throw_not_implemented();
|
|
1557 }
|
|
1558
|
|
1559 =head2 symbol_chars
|
|
1560
|
|
1561 Title : symbol_chars
|
|
1562 Usage : my @symbolchars = $contig->symbol_chars;
|
|
1563 Function: Returns all the seen symbols (other than gaps)
|
|
1564 Returns : array of characters that are the seen symbols
|
|
1565 Argument: boolean to include the gap/missing/match characters
|
|
1566
|
|
1567 =cut
|
|
1568
|
|
1569 sub symbol_chars{
|
|
1570 my ($self) = @_;
|
|
1571 $self->throw_not_implemented();
|
|
1572 }
|
|
1573
|
|
1574 =head2 Alignment descriptors
|
|
1575
|
|
1576 These read only methods describe the MSE in various ways.
|
|
1577
|
|
1578
|
|
1579 =head2 consensus_string
|
|
1580
|
|
1581 Title : consensus_string
|
|
1582 Usage : $str = $contig->consensus_string($threshold_percent)
|
|
1583 Function : Makes a strict consensus
|
|
1584 Returns :
|
|
1585 Argument : Optional treshold ranging from 0 to 100.
|
|
1586 The consensus residue has to appear at least threshold %
|
|
1587 of the sequences at a given location, otherwise a '?'
|
|
1588 character will be placed at that location.
|
|
1589 (Default value = 0%)
|
|
1590
|
|
1591 =cut
|
|
1592
|
|
1593 sub consensus_string {
|
|
1594 my ($self) = @_;
|
|
1595 $self->throw_not_implemented();
|
|
1596 }
|
|
1597
|
|
1598 =head2 consensus_iupac
|
|
1599
|
|
1600 Title : consensus_iupac
|
|
1601 Usage : $str = $contig->consensus_iupac()
|
|
1602 Function :
|
|
1603
|
|
1604 Makes a consensus using IUPAC ambiguity codes from DNA
|
|
1605 and RNA. The output is in upper case except when gaps in
|
|
1606 a column force output to be in lower case.
|
|
1607
|
|
1608 Note that if your alignment sequences contain a lot of
|
|
1609 IUPAC ambiquity codes you often have to manually set
|
|
1610 alphabet. Bio::PrimarySeq::_guess_type thinks they
|
|
1611 indicate a protein sequence.
|
|
1612
|
|
1613 Returns : consensus string
|
|
1614 Argument : none
|
|
1615 Throws : on protein sequences
|
|
1616
|
|
1617
|
|
1618 =cut
|
|
1619
|
|
1620 sub consensus_iupac {
|
|
1621 my ($self) = @_;
|
|
1622 $self->throw_not_implemented();
|
|
1623 }
|
|
1624
|
|
1625 =head2 is_flush
|
|
1626
|
|
1627 Title : is_flush
|
|
1628 Usage : if( $contig->is_flush() )
|
|
1629 :
|
|
1630 :
|
|
1631 Function : Tells you whether the alignment
|
|
1632 : is flush, ie all of the same length
|
|
1633 :
|
|
1634 :
|
|
1635 Returns : 1 or 0
|
|
1636 Argument :
|
|
1637
|
|
1638 =cut
|
|
1639
|
|
1640 sub is_flush {
|
|
1641 my ($self) = @_;
|
|
1642 $self->throw_not_implemented();
|
|
1643 }
|
|
1644
|
|
1645 =head2 length
|
|
1646
|
|
1647 Title : length()
|
|
1648 Usage : $len = $contig->length()
|
|
1649 Function : Returns the maximum length of the alignment.
|
|
1650 To be sure the alignment is a block, use is_flush
|
|
1651 Returns :
|
|
1652 Argument :
|
|
1653
|
|
1654 =cut
|
|
1655
|
|
1656 sub length {
|
|
1657 my ($self) = @_;
|
|
1658
|
|
1659 $self->throw_not_implemented();
|
|
1660 }
|
|
1661
|
|
1662 =head2 maxdisplayname_length
|
|
1663
|
|
1664 Title : maxdisplayname_length
|
|
1665 Usage : $contig->maxdisplayname_length()
|
|
1666 Function :
|
|
1667
|
|
1668 Gets the maximum length of the displayname in the
|
|
1669 alignment. Used in writing out various MSE formats.
|
|
1670
|
|
1671 Returns : integer
|
|
1672 Argument :
|
|
1673
|
|
1674 =cut
|
|
1675
|
|
1676 sub maxname_length {
|
|
1677 my ($self) = @_;
|
|
1678 $self->throw_not_implemented();
|
|
1679 }
|
|
1680
|
|
1681 =head2 no_residues
|
|
1682
|
|
1683 Title : no_residues
|
|
1684 Usage : $no = $contig->no_residues
|
|
1685 Function : number of residues in total in the alignment
|
|
1686 Returns : integer
|
|
1687 Argument :
|
|
1688
|
|
1689 =cut
|
|
1690
|
|
1691 sub no_residues {
|
|
1692 my ($self) = @_;
|
|
1693
|
|
1694 return $self->{'_nof_residues'};
|
|
1695 }
|
|
1696
|
|
1697 =head2 no_sequences
|
|
1698
|
|
1699 Title : no_sequences
|
|
1700 Usage : $depth = $contig->no_sequences
|
|
1701 Function : number of sequence in the sequence alignment
|
|
1702 Returns : integer
|
|
1703 Argument : None
|
|
1704
|
|
1705 =cut
|
|
1706
|
|
1707 sub no_sequences {
|
|
1708 my ($self) = @_;
|
|
1709
|
|
1710 return scalar( keys %{ $self->{'_elem'} } );
|
|
1711 }
|
|
1712
|
|
1713 =head2 percentage_identity
|
|
1714
|
|
1715 Title : percentage_identity
|
|
1716 Usage : $id = $contig->percentage_identity
|
|
1717 Function: The function calculates the percentage identity of the alignment
|
|
1718 Returns : The percentage identity of the alignment (as defined by the
|
|
1719 implementation)
|
|
1720 Argument: None
|
|
1721
|
|
1722 =cut
|
|
1723
|
|
1724 sub percentage_identity{
|
|
1725 my ($self) = @_;
|
|
1726
|
|
1727 $self->throw_not_implemeneted();
|
|
1728 }
|
|
1729
|
|
1730 =head2 overall_percentage_identity
|
|
1731
|
|
1732 Title : percentage_identity
|
|
1733 Usage : $id = $contig->percentage_identity
|
|
1734 Function: The function calculates the percentage identity of
|
|
1735 the conserved columns
|
|
1736 Returns : The percentage identity of the conserved columns
|
|
1737 Args : None
|
|
1738
|
|
1739 =cut
|
|
1740
|
|
1741 sub overall_percentage_identity{
|
|
1742 my ($self) = @_;
|
|
1743 $self->throw_not_implemented();
|
|
1744 }
|
|
1745
|
|
1746
|
|
1747 =head2 average_percentage_identity
|
|
1748
|
|
1749 Title : average_percentage_identity
|
|
1750 Usage : $id = $contig->average_percentage_identity
|
|
1751 Function: The function uses a fast method to calculate the average
|
|
1752 percentage identity of the alignment
|
|
1753 Returns : The average percentage identity of the alignment
|
|
1754 Args : None
|
|
1755
|
|
1756 =cut
|
|
1757
|
|
1758 sub average_percentage_identity {
|
|
1759 my ($self) = @_;
|
|
1760 $self->throw_not_implemented();
|
|
1761 }
|
|
1762
|
|
1763 =head2 Alignment positions
|
|
1764
|
|
1765 Methods to map a sequence position into an alignment column and back.
|
|
1766 column_from_residue_number() does the former. The latter is really a
|
|
1767 property of the sequence object and can done using
|
|
1768 L<Bio::LocatableSeq::location_from_column>:
|
|
1769
|
|
1770 # select somehow a sequence from the alignment, e.g.
|
|
1771 my $seq = $contig->get_seq_by_pos(1);
|
|
1772 #$loc is undef or Bio::LocationI object
|
|
1773 my $loc = $seq->location_from_column(5);
|
|
1774
|
|
1775
|
|
1776 =head2 column_from_residue_number
|
|
1777
|
|
1778 Title : column_from_residue_number
|
|
1779 Usage : $col = $contig->column_from_residue_number( $seqname, $resnumber)
|
|
1780 Function:
|
|
1781
|
|
1782 This function gives the position in the alignment
|
|
1783 (i.e. column number) of the given residue number in the
|
|
1784 sequence with the given name. For example, for the
|
|
1785 alignment
|
|
1786
|
|
1787 Seq1/91-97 AC..DEF.GH
|
|
1788 Seq2/24-30 ACGG.RTY..
|
|
1789 Seq3/43-51 AC.DDEFGHI
|
|
1790
|
|
1791 column_from_residue_number( "Seq1", 94 ) returns 5.
|
|
1792 column_from_residue_number( "Seq2", 25 ) returns 2.
|
|
1793 column_from_residue_number( "Seq3", 50 ) returns 9.
|
|
1794
|
|
1795 An exception is thrown if the residue number would lie
|
|
1796 outside the length of the aligment
|
|
1797 (e.g. column_from_residue_number( "Seq2", 22 )
|
|
1798
|
|
1799 Note: If the the parent sequence is represented by more than
|
|
1800 one alignment sequence and the residue number is present in
|
|
1801 them, this method finds only the first one.
|
|
1802
|
|
1803 Returns : A column number for the position in the alignment of the
|
|
1804 given residue in the given sequence (1 = first column)
|
|
1805 Args : A sequence id/name (not a name/start-end)
|
|
1806 A residue number in the whole sequence (not just that
|
|
1807 segment of it in the alignment)
|
|
1808
|
|
1809 =cut
|
|
1810
|
|
1811 sub column_from_residue_number {
|
|
1812 my ($self) = @_;
|
|
1813 $self->throw_not_implemented();
|
|
1814 }
|
|
1815
|
|
1816 =head2 Sequence names
|
|
1817
|
|
1818 Methods to manipulate the display name. The default name based on the
|
|
1819 sequence id and subsequence positions can be overridden in various
|
|
1820 ways.
|
|
1821
|
|
1822 =head2 displayname
|
|
1823
|
|
1824 Title : displayname
|
|
1825 Usage : $contig->displayname("Ig", "IgA")
|
|
1826 Function : Gets/sets the display name of a sequence in the alignment
|
|
1827 :
|
|
1828 Returns : A display name string
|
|
1829 Argument : name of the sequence
|
|
1830 displayname of the sequence (optional)
|
|
1831
|
|
1832 =cut
|
|
1833
|
|
1834 sub displayname { # Do nothing
|
|
1835 }
|
|
1836
|
|
1837 =head2 set_displayname_count
|
|
1838
|
|
1839 Title : set_displayname_count
|
|
1840 Usage : $contig->set_displayname_count
|
|
1841 Function :
|
|
1842
|
|
1843 Sets the names to be name_# where # is the number of
|
|
1844 times this name has been used.
|
|
1845
|
|
1846 Returns : None
|
|
1847 Argument : None
|
|
1848
|
|
1849 =cut
|
|
1850
|
|
1851 sub set_displayname_count {
|
|
1852 my ($self) = @_;
|
|
1853 $self->throw_not_implemented();
|
|
1854 }
|
|
1855
|
|
1856 =head2 set_displayname_flat
|
|
1857
|
|
1858 Title : set_displayname_flat
|
|
1859 Usage : $contig->set_displayname_flat()
|
|
1860 Function : Makes all the sequences be displayed as just their name,
|
|
1861 not name/start-end
|
|
1862 Returns : 1
|
|
1863 Argument : None
|
|
1864
|
|
1865 =cut
|
|
1866
|
|
1867 sub set_displayname_flat { # Do nothing!
|
|
1868 }
|
|
1869
|
|
1870 =head2 set_displayname_normal
|
|
1871
|
|
1872 Title : set_displayname_normal
|
|
1873 Usage : $contig->set_displayname_normal()
|
|
1874 Function : Makes all the sequences be displayed as name/start-end
|
|
1875 Returns : None
|
|
1876 Argument : None
|
|
1877
|
|
1878 =cut
|
|
1879
|
|
1880 sub set_displayname_normal { # Do nothing!
|
|
1881 }
|
|
1882
|
|
1883 =head1 Internal Methods
|
|
1884
|
|
1885 =head2 _binary_search
|
|
1886
|
|
1887 Title : _binary_search
|
|
1888 Usage : _binary_search($list,$query)
|
|
1889 Function :
|
|
1890
|
|
1891 Find a number in a sorted list of numbers. Return values
|
|
1892 may be on or two integers. One positive integer or zero
|
|
1893 (>=0) is the index of the element that stores the queried
|
|
1894 value. Two positive integers (or zero and another
|
|
1895 number) are the indexes of elements among which the
|
|
1896 queried value should be placed. Negative single values
|
|
1897 mean:
|
|
1898
|
|
1899 -1: $query is smaller than smallest element in list
|
|
1900 -2: $query is greater than greatest element in list
|
|
1901
|
|
1902 Returns : array of integers
|
|
1903 Argument :
|
|
1904 $list : array reference
|
|
1905 $query : integer
|
|
1906
|
|
1907 =cut
|
|
1908
|
|
1909 sub _binary_search {
|
|
1910 my $list = shift;
|
|
1911 my $query = shift;
|
|
1912 #
|
|
1913 # If there is only one element in list
|
|
1914 if (!$#{$list} && ($query == $list->[0])) { return (0) }
|
|
1915 # If there are others...
|
|
1916 my $start = 0;
|
|
1917 my $end = $#{$list};
|
|
1918 (&_compare($query,$list->[$start]) == 0) && do { return ($start) };
|
|
1919 (&_compare($query,$list->[$end]) == 0) && do { return ($end) };
|
|
1920 (&_compare($query,$list->[$start]) < 0) && do { return (-1) };
|
|
1921 (&_compare($query,$list->[$end]) > 0) && do { return (-2) };
|
|
1922 my $middle = 0;
|
|
1923 while ($end - $start > 1) {
|
|
1924 $middle = int(($end+$middle)/2);
|
|
1925 (&_compare($query,$list->[$middle]) == 0) && return ($middle);
|
|
1926 (&_compare($query,$list->[$middle]) < 0) && do { $end = $middle ; $middle = 0; next };
|
|
1927 $start = $middle; # If &_compare() > 0, move region beggining
|
|
1928 }
|
|
1929 return ($start,$end);
|
|
1930 }
|
|
1931
|
|
1932 =head2 _compare
|
|
1933
|
|
1934 Title : _compare
|
|
1935 Usage : _compare($arg1,$arg2)
|
|
1936 Function: Perform numeric or string comparisons
|
|
1937 Returns : integer (0, 1 or -1)
|
|
1938 Args : values to be compared
|
|
1939
|
|
1940 =cut
|
|
1941
|
|
1942 sub _compare {
|
|
1943 my $arg1 = shift;
|
|
1944 my $arg2 = shift;
|
|
1945 #
|
|
1946 if (($arg1 =~ /^\d+$/) && ($arg2 =~ /^\d+$/)) { return $arg1 <=> $arg2 }
|
|
1947 else { return $arg1 cmp $arg2 }
|
|
1948 }
|
|
1949
|
|
1950 =head2 _nof_gaps
|
|
1951
|
|
1952 Title : _nof_gaps
|
|
1953 Usage : _nof_gaps($array_ref, $query)
|
|
1954 Function: number of gaps found before position $query
|
|
1955 Returns : integer
|
|
1956 Args :
|
|
1957 $array_ref : gap registry reference
|
|
1958 $query : [integer] a position in a sequence
|
|
1959
|
|
1960 =cut
|
|
1961
|
|
1962 #' emacs...
|
|
1963 sub _nof_gaps {
|
|
1964 my $list = shift;
|
|
1965 my $query = shift;
|
|
1966 # If there are no gaps in this contig
|
|
1967 return 0 unless (defined($list) && scalar(@{$list}));
|
|
1968 # Locate query index in gap list (if any)
|
|
1969 my @index = &_binary_search($list,$query);
|
|
1970 # If after all alignments, correct using total number of align
|
|
1971 if ($index[0] == -2) { $query = scalar(@{$list}) }
|
|
1972 # If before any alignment, return 0
|
|
1973 elsif ($index[0] == -1) { $query = 0 }
|
|
1974 elsif ($index[0] >= 0) {
|
|
1975 # If query is between alignments, translate coordinates
|
|
1976 if ($#index > 0) { $query = $index[0] + 1 }
|
|
1977 # If query sits upon an alignment, do another correction
|
|
1978 elsif ($#index == 0) { $query = $index[0] }
|
|
1979 }
|
|
1980 #
|
|
1981 return $query;
|
|
1982 }
|
|
1983
|
|
1984 =head2 _padded_unpadded
|
|
1985
|
|
1986 Title : _padded_unpadded
|
|
1987 Usage : _padded_unpadded($array_ref, $query)
|
|
1988 Function:
|
|
1989
|
|
1990 Returns a coordinate corresponding to
|
|
1991 position $query after gaps were
|
|
1992 removed from a sequence.
|
|
1993
|
|
1994 Returns : integer
|
|
1995 Args :
|
|
1996 $array_ref : reference to this gap registry
|
|
1997 $query : [integer] coordionate to change
|
|
1998
|
|
1999 =cut
|
|
2000
|
|
2001 sub _padded_unpadded {
|
|
2002 my $list = shift;
|
|
2003 my $query = shift;
|
|
2004
|
|
2005 my $align = &_nof_gaps($list,$query);
|
|
2006 $query-- if (defined($list->[$align]) && ($list->[$align] == $query));
|
|
2007 $query = $query - $align;
|
|
2008 #
|
|
2009 return $query;
|
|
2010 }
|
|
2011
|
|
2012 =head2 _unpadded_padded
|
|
2013
|
|
2014 Title : _unpadded_padded
|
|
2015 Usage : _unpadded_padded($array_ref, $query)
|
|
2016 Function:
|
|
2017
|
|
2018 Returns the value corresponding to
|
|
2019 ungapped position $query when gaps are
|
|
2020 counted as valid sites in a sequence
|
|
2021
|
|
2022 Returns :
|
|
2023 Args : $array_ref = a reference to this sequence's gap registry
|
|
2024 $query = [integer] location to change
|
|
2025
|
|
2026 =cut
|
|
2027
|
|
2028 #'
|
|
2029 sub _unpadded_padded {
|
|
2030 my $list = shift;
|
|
2031 my $query = shift;
|
|
2032
|
|
2033 my $align = &_nof_gaps($list,$query);
|
|
2034 $query = $query + $align;
|
|
2035 my $new_align = &_nof_gaps($list,$query);
|
|
2036 while ($new_align - $align > 0) {
|
|
2037 $query = $query + $new_align - $align;
|
|
2038 $align = $new_align;
|
|
2039 $new_align = &_nof_gaps($list,$query);
|
|
2040 }
|
|
2041 # If current position is also a align, look for the first upstream base
|
|
2042 while (defined($list->[$align]) && ($list->[$align] == $query)) {
|
|
2043 $query++; $align++;
|
|
2044 }
|
|
2045 #
|
|
2046 return $query;
|
|
2047 }
|
|
2048
|
|
2049 =head2 _register_gaps
|
|
2050
|
|
2051 Title : _register_gaps
|
|
2052 Usage : $self->_register_gaps($seq, $array_ref)
|
|
2053 Function: stores gap locations for a sequence
|
|
2054 Returns : number of gaps found
|
|
2055 Args :
|
|
2056 $seq : sequence string
|
|
2057 $array_ref : a reference to an array,
|
|
2058 where gap locations will
|
|
2059 be stored
|
|
2060
|
|
2061 =cut
|
|
2062
|
|
2063 sub _register_gaps {
|
|
2064 my $self = shift;
|
|
2065 my $sequence = shift;
|
|
2066 my $dbref = shift;
|
|
2067
|
|
2068 $self->throw("Not an aligned sequence string to register gaps")
|
|
2069 if (ref($sequence));
|
|
2070
|
|
2071 $self->throw("Not an array reference for gap registry")
|
|
2072 unless (ref($dbref) eq 'ARRAY');
|
|
2073
|
|
2074 # Registering alignments
|
|
2075 @{$dbref} = (); # Cleaning registry
|
|
2076 if (defined $sequence) {
|
|
2077 my $i = -1;
|
|
2078 while(1) {
|
|
2079 $i = index($sequence,"-",$i+1);
|
|
2080 last if ($i == -1);
|
|
2081 push(@{$dbref},$i+1);
|
|
2082 }
|
|
2083 } else {
|
|
2084 # $self->warn("Found undefined sequence while registering gaps");
|
|
2085 return 0;
|
|
2086 }
|
|
2087
|
|
2088 return scalar(@{$dbref});
|
|
2089 }
|
|
2090
|
|
2091 1;
|