Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Assembly/Contig.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: Contig.pm,v 1.1 2002/11/04 11:50:11 heikki Exp $ | |
2 # | |
3 # BioPerl module for Bio::Assembly::Contig | |
4 # Mostly based on Bio::SimpleAlign by Ewan Birney | |
5 # | |
6 # Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br> | |
7 # | |
8 # Copyright Robson Francisco de Souza | |
9 # | |
10 # You may distribute this module under the same terms as perl itself | |
11 | |
12 # POD documentation - main docs before the code | |
13 | |
14 =head1 NAME | |
15 | |
16 Bio::Assembly::Contig - Perl module to hold and manipulate | |
17 sequence assembly contigs. | |
18 | |
19 =head1 SYNOPSYS | |
20 | |
21 # Module loading | |
22 use Bio::Assembly::IO; | |
23 | |
24 # Assembly loading methods | |
25 $aio = new Bio::Assembly::IO(-file=>"test.ace.1", | |
26 -format=>'phrap'); | |
27 | |
28 $assembly = $aio->next_assembly; | |
29 foreach $contig ($assembly->all_contigs) { | |
30 # do something | |
31 } | |
32 | |
33 # OR, if you want to build the contig yourself, | |
34 | |
35 use Bio::Assembly::Contig; | |
36 $c = Bio::Assembly::Contig->new(-id=>"1"); | |
37 | |
38 $ls = Bio::LocatableSeq->new(-seq=>"ACCG-T", | |
39 -id=>"r1", | |
40 -alphabet=>'dna'); | |
41 $ls2 = Bio::LocatableSeq->new(-seq=>"ACA-CG-T", | |
42 -id=>"r2", | |
43 -alphabet=>'dna'); | |
44 | |
45 $ls_coord = Bio::SeqFeature::Generic->new(-start=>3, | |
46 -end=>8, | |
47 -strand=>1); | |
48 $ls2_coord = Bio::SeqFeature::Generic->new(-start=>1, | |
49 -end=>8, | |
50 -strand=>1); | |
51 $c->add_seq($ls); | |
52 $c->add_seq($ls2); | |
53 $c->set_seq_coord($ls_coord,$ls); | |
54 $c->set_seq_coord($ls2_coord,$ls2); | |
55 | |
56 $con = Bio::LocatableSeq->new(-seq=>"ACACCG-T", | |
57 -alphabet=>'dna'); | |
58 $c->set_consensus_sequence($con); | |
59 | |
60 $l = $c->change_coord('unaligned r2','ungapped consensus',6); | |
61 print "6 in unaligned r2 => $l in ungapped consensus\n"; | |
62 | |
63 | |
64 =head1 DESCRIPTION | |
65 | |
66 A contig is as a set of sequences, locally aligned to each other, so | |
67 that every sequence has overlapping regions with at least one sequence | |
68 in the contig, such that a continuous of overlapping sequences is | |
69 formed, allowing the deduction of a consensus sequence which may be | |
70 longer than any of the sequences from which it was deduced. | |
71 | |
72 In this documentation we refer to the overlapping sequences used to | |
73 build the contig as "aligned sequences" and to the sequence deduced | |
74 from the overlap of aligned sequences as the "consensus". Methods to | |
75 deduce the consensus sequence from aligned sequences were not yet | |
76 implemented in this module, but its posssible to add a consensus | |
77 sequence deduced by other means, e.g, by the assembly program used to | |
78 build the alignment. | |
79 | |
80 All aligned sequences in a Bio::Assembly::Contig must be Bio::Assembly::Locatable | |
81 objects and have a unique ID. The unique ID restriction is due to the | |
82 nature of the module's internal data structures and is also a request | |
83 of some assembly programs. If two sequences with the same ID are added | |
84 to a contig, the first sequence added is replaced by the second one. | |
85 | |
86 =head2 Coordinate_systems | |
87 | |
88 There are four base coordinate systems in Bio::Assembly::Contig. When | |
89 you need to access contig elements or data that exists on a certain | |
90 range or location, you may be specifying coordinates in relation to | |
91 different sequences, which may be either the contig consensus or one | |
92 of the aligned sequences that were used to do the assembly. | |
93 | |
94 ========================================================= | |
95 Name | Referenced sequence | |
96 --------------------------------------------------------- | |
97 "gapped consensus" | Contig (with gaps) | |
98 "ungapped consensus" | Contig (without gaps) | |
99 "aligned $seqID" | sequence $seqID (with gaps) | |
100 "unaligned $seqID" | sequence $seqID (without gaps) | |
101 ========================================================= | |
102 | |
103 "gapped consensus" refers to positions in the aligned consensus | |
104 sequence, which is the consensus sequence including the gaps inserted | |
105 to align it agains the aligned sequences that were used to assemble | |
106 the contig. So, its limits are [ 1, (consensus length + number of gaps | |
107 in consensus) ] | |
108 | |
109 "ungapped consensus" is a coordinate system based on the consensus | |
110 sequence, but excluding consensus gaps. This is just the coordinate | |
111 system that you have when considering the consensus sequence alone, | |
112 instead of aligned to other sequences. | |
113 | |
114 "aligned $seqID" refers to locations in the sequence $seqID after | |
115 alignment of $seqID against the consensus sequence (reverse | |
116 complementing the original sequence, if needed). Coordinate 1 in | |
117 "aligned $seqID" is equivalent to the start location (first base) of | |
118 $seqID in the consensus sequence, just like if the aligned sequence | |
119 $seqID was a feature of the consensus sequence. | |
120 | |
121 "unaligned $seqID" is equivalent to a location in the isolated | |
122 sequence, just like you would have when considering the sequence | |
123 alone, out of an alignment. When changing coordinates from "aligned | |
124 $seq2" to "unaligned $seq2", if $seq2 was reverse complemented when | |
125 included in the alignment, the output coordinates will be reversed to | |
126 fit that fact, i.e. 1 will be changed to length($seq2), 2 will be | |
127 length($seq)-1 and so on. | |
128 | |
129 An important note: when you change gap coordinates from a gapped | |
130 system ("gapped consensus" or "aligned $seqID") to a system that does | |
131 not include gaps ("ungapped consensus" or "unaligned $seqID"), the | |
132 position returned will be the first location before all gaps | |
133 neighboring the input location. | |
134 | |
135 =head2 Feature_collection | |
136 | |
137 Bio::Assembly::Contig stores much information about a contig in a | |
138 Bio::Assembly::SeqFeature::Collection object. Relevant information on the | |
139 alignment is accessed by selecting features based on their primary | |
140 tags (e.g. all features which have a primary tag of the form | |
141 '_aligned_coord:$seqID', where $seqID is an aligned sequence ID, are | |
142 coordinates for sequences in the contig alignment) and, by using | |
143 methods from Bio::Assembly::SeqFeature::Collection, it's possible to select | |
144 features by overlap with other features. | |
145 | |
146 We suggest that you use the primary tags of features as identifiers | |
147 for feature classes. By convention, features with primary tags | |
148 starting with a '_' are generated by modules that populate the contig | |
149 data structure and return the contig object, maybe as part of an | |
150 assembly object, e.g. drivers from the Bio::Assembly::IO set. | |
151 | |
152 Features in the features collection may be associated with particular | |
153 aligned sequences. To obtain this, you must attach the sequence to the | |
154 feature, using attach() seq from Bio::Assembly::SeqFeatureI, before you add the | |
155 feature to the feature collection. We also suggest to add the sequence | |
156 id to the primary tag, so that is easy to select feature for a | |
157 particular sequence. | |
158 | |
159 There is only one feature class that some methods in | |
160 Bio::Assembly::Contig expect to find in the feature collection: features | |
161 with primary tags of the form '_aligned_coord:$seqID', where $seqID is | |
162 the aligned sequence id (like returned by $seq-E<gt>id()). These features | |
163 describe the position (in "gapped consensus" coordinates) of aligned | |
164 sequences, and the method set_seq_coord() automatically changes a | |
165 feature's primary tag to this form whenever the feature is added to | |
166 the collection by this method. Only two methods in Bio::Assembly::Contig | |
167 will not work unless there are features from this class: | |
168 change_coord() and get_seq_coord(). | |
169 | |
170 Other feature classes will be automatically available only when | |
171 Bio::Assembly::Contig objects are created by a specific module. Such | |
172 feature classes are (or should be) documented in the documentation of | |
173 the module which create them, to which the user should refer. | |
174 | |
175 =head1 FEEDBACK | |
176 | |
177 =head2 Mailing Lists | |
178 | |
179 User feedback is an integral part of the evolution of this and other | |
180 Bioperl modules. Send your comments and suggestions preferably to the | |
181 Bioperl mailing lists Your participation is much appreciated. | |
182 | |
183 bioperl-l@bioperl.org - General discussion | |
184 http://bio.perl.org/MailList.html - About the mailing lists | |
185 | |
186 =head2 Reporting Bugs | |
187 | |
188 Report bugs to the Bioperl bug tracking system to help us keep track | |
189 the bugs and their resolution. Bug reports can be submitted via email | |
190 or the web: | |
191 | |
192 bioperl-bugs@bio.perl.org | |
193 http://bugzilla.bioperl.org/ | |
194 | |
195 =head1 AUTHOR - Robson Francisco de Souza | |
196 | |
197 rfsouza@citri.iq.usp.br | |
198 | |
199 =head1 APPENDIX | |
200 | |
201 The rest of the documentation details each of the object | |
202 methods. Internal methods are usually preceded with a _ | |
203 | |
204 =cut | |
205 | |
206 #' | |
207 package Bio::Assembly::Contig; | |
208 | |
209 use strict; | |
210 use vars qw(@ISA $VERSION); | |
211 | |
212 use Bio::Root::Root; | |
213 use Bio::Align::AlignI; | |
214 use Bio::SeqFeature::Collection; | |
215 use Bio::Seq::PrimaryQual; | |
216 | |
217 @ISA = qw(Bio::Root::Root Bio::Align::AlignI); | |
218 | |
219 =head1 Object creator | |
220 | |
221 =head2 new | |
222 | |
223 Title : new | |
224 Usage : my $contig = new Bio::Assembly::Contig(); | |
225 Function : Creates a new contig object | |
226 Returns : Bio::Assembly::Contig | |
227 Args : -source => string representing the source | |
228 program where this contig came | |
229 from | |
230 -id => contig unique ID | |
231 | |
232 =cut | |
233 | |
234 #----------- | |
235 sub new { | |
236 #----------- | |
237 my ($class,@args) = @_; | |
238 | |
239 my $self = $class->SUPER::new(@args); | |
240 | |
241 my ($src, $id) = $self->_rearrange([qw(SOURCE ID)], @args); | |
242 $src && $self->source($src); | |
243 ($id && $self->id($id)) || ($self->{'_id'} = 'NoName'); # Alignment (contig) name | |
244 ($id && $self->id($id)) || ($self->{'_source'} = 'Unknown'); # Program used to build the contig | |
245 # we need to set up internal hashes first! | |
246 | |
247 # Bio::SimpleAlign derived fields (check which ones are needed for AlignI compatibility) | |
248 $self->{'_elem'} = {}; # contig elements: aligned sequence objects (keyed by ID) | |
249 $self->{'_order'} = {}; # store sequence order | |
250 # $self->{'start_end_lists'} = {}; # References to entries in {'_seq'}. Keyed by seq ids. | |
251 # $self->{'_dis_name'} = {}; # Display names for each sequence | |
252 $self->{'_symbols'} = {}; # List of symbols | |
253 | |
254 #Contig specific slots | |
255 $self->{'_consensus_sequence'} = undef; | |
256 $self->{'_consensus_quality'} = undef; | |
257 $self->{'_nof_residues'} = 0; | |
258 $self->{'_nof_seqs'} = 0; | |
259 # $self->{'_nof_segments'} = 0; # Let's not make it heavier than needed by now... | |
260 $self->{'_sfc'} = Bio::SeqFeature::Collection->new(); | |
261 | |
262 # Assembly specifcs | |
263 $self->{'_assembly'} = undef; # Reference to a Bio::Assembly::Scaffold object, if contig belongs to one. | |
264 $self->{'_strand'} = 0; # Reverse (-1) or forward (1), if contig is in a scaffold. 0 otherwise | |
265 $self->{'_neighbor_start'} = undef; # Will hold a reference to another contig | |
266 $self->{'_neighbor_end'} = undef; # Will hold a reference to another contig | |
267 | |
268 return $self; # success - we hope! | |
269 } | |
270 | |
271 =head1 Assembly related methods | |
272 | |
273 These methods exist to enable adding information about possible | |
274 relations among contigs, e.g. when you already have a scaffold for | |
275 your assembly, describing the ordering of contigs in the final | |
276 assembly, but no sequences covering the gaps between neighboring | |
277 contigs. | |
278 | |
279 =head2 source | |
280 | |
281 Title : source | |
282 Usage : $contig->source($program); | |
283 Function : Get/Set program used to build this contig | |
284 Returns : string | |
285 Argument : [optional] string | |
286 | |
287 =cut | |
288 | |
289 sub source { | |
290 my $self = shift; | |
291 my $source = shift; | |
292 | |
293 $self->{'_source'} = $source if (defined $source); | |
294 return $self->{'_source'}; | |
295 } | |
296 | |
297 =head2 assembly | |
298 | |
299 Title : assembly | |
300 Usage : $contig->assembly($assembly); | |
301 Function : Get/Set assembly object for this contig | |
302 Returns : a Bio::Assembly::Scaffold object | |
303 Argument : a Bio::Assembly::Scaffold object | |
304 | |
305 =cut | |
306 | |
307 sub assembly { | |
308 my $self = shift; | |
309 my $assembly = shift; | |
310 | |
311 $self->throw("Using non Bio::Assembly::Scaffold object when assign contig to assembly") | |
312 if (defined $assembly && ! $assembly->isa("Bio::Assembly::Scaffold")); | |
313 | |
314 $self->{'_assembly'} = $assembly if (defined $assembly); | |
315 return $self->{'_assembly'}; | |
316 } | |
317 | |
318 =head2 strand | |
319 | |
320 Title : strand | |
321 Usage : $contig->strand($num); | |
322 Function : Get/Set contig orientation in a scaffold/assembly. | |
323 Its equivalent to the strand property of sequence | |
324 objects and sets whether the contig consensus should | |
325 be reversed and complemented before being added to a | |
326 scaffold or assembly. | |
327 Returns : integer | |
328 Argument : 1 if orientaion is forward, -1 if reverse and | |
329 0 if none | |
330 | |
331 =cut | |
332 | |
333 sub strand { | |
334 my $self = shift; | |
335 my $ori = shift; | |
336 | |
337 $self->throw("Contig strand must be either 1, -1 or 0") | |
338 unless (defined $ori && ($ori == 1 || $ori == 0 || $ori == -1)); | |
339 | |
340 $self->{'_strand'} = $ori; | |
341 return $self->{'_strand'}; | |
342 } | |
343 | |
344 =head2 upstream_neighbor | |
345 | |
346 Title : upstream_neighbor | |
347 Usage : $contig->upstream_neighbor($contig); | |
348 Function : Get/Set a contig neighbor for the current contig when | |
349 building a scaffold. The upstream neighbor is | |
350 located before $contig first base | |
351 Returns : nothing | |
352 Argument : Bio::Assembly::Contig | |
353 | |
354 =cut | |
355 | |
356 sub upstream_neighbor { | |
357 my $self = shift; | |
358 my $ref = shift; | |
359 | |
360 $self->throw("Trying to assign a non Bio::Assembly::Contig object to upstream contig") | |
361 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig")); | |
362 | |
363 $self->{'_neighbor_start'} = $ref if (defined $ref); | |
364 return $self->{'_neighbor_start'}; | |
365 } | |
366 | |
367 =head2 downstream_neighbor | |
368 | |
369 Title : downstream_neighbor | |
370 Usage : $contig->downstream_neighbor($num); | |
371 Function : Get/Set a contig neighbor for the current contig when | |
372 building a scaffold. The downstream neighbor is | |
373 located after $contig last base | |
374 Returns : nothing | |
375 Argument : Bio::Assembly::Contig | |
376 | |
377 =cut | |
378 | |
379 sub downstream_neighbor { | |
380 my $self = shift; | |
381 my $ref = shift; | |
382 | |
383 $self->throw("Trying to assign a non Bio::Assembly::Contig object to downstream contig") | |
384 if (defined $ref && ! $ref->isa("Bio::Assembly::Contig")); | |
385 $self->{'_neighbor_end'} = $ref if (defined $ref); | |
386 return $self->{'_neighbor_end'}; | |
387 } | |
388 | |
389 =head1 Contig feature collection methods | |
390 | |
391 =head2 add_features | |
392 | |
393 Title : add_features | |
394 Usage : $contig->add_features($feat,$flag) | |
395 Function : | |
396 | |
397 Add an array of features to the contig feature | |
398 collection. The consensus sequence may be attached to the | |
399 added feature, if $flag is set to 1. If $flag is 0 and | |
400 the feature attached to one of the contig aligned | |
401 sequences, the feature is registered as an aligned | |
402 sequence feature. If $flag is 0 and the feature is not | |
403 attched to any sequence in the contig, the feature is | |
404 simply added to the feature collection and no attachment | |
405 or registration is made. | |
406 | |
407 Note: You must attach aligned sequences to their features | |
408 prior to calling add_features, otherwise you won't be | |
409 able to access the feature through get_seq_feat_by_tag() | |
410 method. | |
411 | |
412 Returns : number of features added. | |
413 Argument : | |
414 $feat : A reference to an array of Bio::SeqFeatureI | |
415 $flag : boolean - true if consensus sequence object | |
416 should be attached to this feature, false if | |
417 no consensus attachment should be made. | |
418 Default: false. | |
419 | |
420 =cut | |
421 | |
422 sub add_features { | |
423 my ($self, $args, $flag) = @_; | |
424 | |
425 # Adding shortcuts for aligned sequence features | |
426 $flag = 0 unless (defined $flag); | |
427 if ($flag && defined $self->{'_consensus_sequence'}) { | |
428 foreach my $feat (@$args) { | |
429 next if (defined $feat->seq); | |
430 $feat->attach_seq($self->{'_consensus_sequence'}); | |
431 } | |
432 } elsif (!$flag) { # Register aligned sequence features | |
433 foreach my $feat (@$args) { | |
434 if (my $seq = $feat->entire_seq()) { | |
435 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
436 $self->warn("Adding contig feature attached to unknown sequence $seqID!") | |
437 unless (exists $self->{'_elem'}{$seqID}); | |
438 my $tag = $feat->primary_tag; | |
439 $self->{'_elem'}{$seqID}{'_feat'}{$tag} = $feat; | |
440 } | |
441 } | |
442 } | |
443 | |
444 # Add feature to feature collection | |
445 my $nof_added = $self->{'_sfc'}->add_features($args); | |
446 | |
447 return $nof_added; | |
448 } | |
449 | |
450 =head2 remove_features | |
451 | |
452 Title : remove_features | |
453 Usage : $contig->remove_features(@feat) | |
454 Function : Remove an array of contig features | |
455 Returns : number of features removed. | |
456 Argument : An array of Bio::SeqFeatureI | |
457 | |
458 =cut | |
459 | |
460 sub remove_features { | |
461 my ($self, @args) = @_; | |
462 | |
463 # Removing shortcuts for aligned sequence features | |
464 foreach my $feat (@args) { | |
465 if (my $seq = $feat->entire_seq()) { | |
466 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
467 my $tag = $feat->primary_tag; | |
468 $tag =~ s/:$seqID$/$1/g; | |
469 delete( $self->{'_elem'}{$seqID}{'_feat'}{$tag} ) | |
470 if (exists $self->{'_elem'}{$seqID}{'_feat'}{$tag} && | |
471 $self->{'_elem'}{$seqID}{'_feat'}{$tag} eq $feat); | |
472 } | |
473 } | |
474 | |
475 return $self->{'_sfc'}->remove_features(\@args); | |
476 } | |
477 | |
478 =head2 get_features_collection | |
479 | |
480 Title : get_features_collection | |
481 Usage : $contig->get_features_collection() | |
482 Function : Get the collection of all contig features | |
483 Returns : Bio::SeqFeature::Collection | |
484 Argument : none | |
485 | |
486 =cut | |
487 | |
488 sub get_features_collection { | |
489 my $self = shift; | |
490 | |
491 return $self->{'_sfc'}; | |
492 } | |
493 | |
494 =head1 Coordinate system's related methods | |
495 | |
496 See L<Coordinate_Systems> above. | |
497 | |
498 =head2 change_coord | |
499 | |
500 Title : change_coord | |
501 Usage : $contig->change_coord($in,$out,$query) | |
502 Function : | |
503 | |
504 Change coordinate system for $query. This method | |
505 transforms locations between coordinate systems described | |
506 in section "Coordinate Systems" of this document. | |
507 | |
508 Note: this method will throw an exception when changing | |
509 coordinates between "ungapped consensus" and other | |
510 systems if consensus sequence was not set. It will also | |
511 throw exceptions when changing coordinates among aligned | |
512 sequence, either with or without gaps, and other systems | |
513 if sequence locations were not set with set_seq_coord(). | |
514 | |
515 Returns : integer | |
516 Argument : | |
517 $in : [string] input coordinate system | |
518 $out : [string] output coordinate system | |
519 $query : [integer] a position in a sequence | |
520 | |
521 =cut | |
522 | |
523 sub change_coord { | |
524 my $self = shift; | |
525 my $type_in = shift; | |
526 my $type_out = shift; | |
527 my $query = shift; | |
528 | |
529 # Parsing arguments | |
530 # Loading read objects (these calls will throw exceptions whether $read_in or | |
531 # $read_out is not found | |
532 my ($read_in,$read_out) = (undef,undef); | |
533 my $in_ID = ( split(' ',$type_in) )[1]; | |
534 my $out_ID = ( split(' ',$type_out) )[1]; | |
535 | |
536 if ($in_ID ne 'consensus') { | |
537 $read_in = $self->get_seq_coord( $self->get_seq_by_name($in_ID) ); | |
538 $self->throw("Can't change coordinates without sequence location for $in_ID") | |
539 unless (defined $read_in); | |
540 } | |
541 if ($out_ID ne 'consensus') { | |
542 $read_out = $self->get_seq_coord( $self->get_seq_by_name($out_ID) ); | |
543 $self->throw("Can't change coordinates without sequence location for $out_ID") | |
544 unless (defined $read_out); | |
545 } | |
546 | |
547 # Performing transformation between coordinates | |
548 SWITCH1: { | |
549 | |
550 # Transformations between contig padded and contig unpadded | |
551 (($type_in eq 'gapped consensus') && ($type_out eq 'ungapped consensus')) && do { | |
552 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence") | |
553 unless (defined $self->{'_consensus_sequence'}); | |
554 $query = &_padded_unpadded($self->{'_consensus_gaps'}, $query); | |
555 last SWITCH1; | |
556 }; | |
557 (($type_in eq 'ungapped consensus') && ($type_out eq 'gapped consensus')) && do { | |
558 $self->throw("Can't use ungapped consensus coordinates without a consensus sequence") | |
559 unless (defined $self->{'_consensus_sequence'}); | |
560 $query = &_unpadded_padded($self->{'_consensus_gaps'},$query); | |
561 last SWITCH1; | |
562 }; | |
563 | |
564 # Transformations between contig (padded) and read (padded) | |
565 (($type_in eq 'gapped consensus') && | |
566 ($type_out =~ /^aligned /) && defined($read_out)) && do { | |
567 $query = $query - $read_out->start() + 1; | |
568 last SWITCH1; | |
569 }; | |
570 (($type_in =~ /^aligned /) && defined($read_in) && | |
571 ($type_out eq 'gapped consensus')) && do { | |
572 $query = $query + $read_in->start() - 1; | |
573 last SWITCH1; | |
574 }; | |
575 | |
576 # Transformations between contig (unpadded) and read (padded) | |
577 (($type_in eq 'ungapped consensus') && | |
578 ($type_out =~ /^aligned /) && defined($read_out)) && do { | |
579 $query = $self->change_coord('ungapped consensus','gapped consensus',$query); | |
580 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); | |
581 last SWITCH1; | |
582 }; | |
583 (($type_in =~ /^aligned /) && defined($read_in) && | |
584 ($type_out eq 'ungapped consensus')) && do { | |
585 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); | |
586 $query = $self->change_coord('gapped consensus','ungapped consensus',$query); | |
587 last SWITCH1; | |
588 }; | |
589 | |
590 # Transformations between seq $read_in padded and seq $read_out padded | |
591 (defined($read_in) && ($type_in =~ /^aligned /) && | |
592 defined($read_out) && ($type_out =~ /^aligned /)) && do { | |
593 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); | |
594 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); | |
595 last SWITCH1; | |
596 }; | |
597 | |
598 # Transformations between seq $read_in padded and seq $read_out unpadded | |
599 (defined($read_in) && ($type_in =~ /^aligned /) && | |
600 defined($read_out) && ($type_out =~ /^unaligned /)) && do { | |
601 if ($read_in ne $read_out) { | |
602 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); | |
603 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); | |
604 } | |
605 my $list_out = $self->{'_elem'}{$out_ID}{'_gaps'}; | |
606 $query = &_padded_unpadded($list_out,$query); | |
607 # Changing read orientation if read was reverse complemented when aligned | |
608 if ($read_out->strand == -1) { | |
609 my ($length) = $read_out->length(); | |
610 $length = $length - &_nof_gaps($list_out,$length); | |
611 $query = $length - $query + 1; | |
612 } | |
613 last SWITCH1; | |
614 }; | |
615 (defined($read_in) && ($type_in =~ /^unaligned /) && | |
616 defined($read_out) && ($type_out =~ /^aligned /)) && do { | |
617 my $list_in = $self->{'_elem'}{$in_ID}{'_gaps'}; | |
618 # Changing read orientation if read was reverse complemented when aligned | |
619 if ($read_in->strand == -1) { | |
620 my ($length) = $read_in->length(); | |
621 $length = $length - &_nof_gaps($list_in,$length); | |
622 $query = $length - $query + 1; | |
623 } | |
624 $query = &_unpadded_padded($list_in,$query); | |
625 if ($read_in ne $read_out) { | |
626 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); | |
627 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); | |
628 } | |
629 last SWITCH1; | |
630 }; | |
631 | |
632 # Transformations between seq $read_in unpadded and seq $read_out unpadded | |
633 (defined($read_in) && ($type_in =~ /^unaligned /) && | |
634 defined($read_out) && ($type_out =~ /^unaligned /)) && do { | |
635 $query = $self->change_coord("unaligned $in_ID","aligned $out_ID",$query); | |
636 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query); | |
637 last SWITCH1; | |
638 }; | |
639 | |
640 # Transformations between contig (padded) and read (unpadded) | |
641 (($type_in eq 'gapped consensus') && | |
642 ($type_out =~ /^unaligned /) && defined($read_out)) && do { | |
643 $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query); | |
644 $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query); | |
645 last SWITCH1; | |
646 }; | |
647 (($type_in =~ /^unaligned /) && defined($read_in) && | |
648 ($type_out eq 'gapped consensus')) && do { | |
649 $query = $self->change_coord("unaligned $in_ID","aligned $in_ID",$query); | |
650 $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query); | |
651 last SWITCH1; | |
652 }; | |
653 | |
654 # Transformations between contig (unpadded) and read (unpadded) | |
655 (($type_in eq 'ungapped consensus') && | |
656 ($type_out =~ /^unaligned /) && defined($read_out)) && do { | |
657 $query = $self->change_coord('ungapped consensus','gapped consensus',$query); | |
658 $query = $self->change_coord('gapped consensus',"unaligned $out_ID",$query); | |
659 last SWITCH1; | |
660 }; | |
661 (($type_in =~ /^unaligned /) && defined($read_in) && | |
662 ($type_out eq 'ungapped consensus')) && do { | |
663 $query = $self->change_coord("unaligned $in_ID",'gapped consensus',$query); | |
664 $query = $self->change_coord('gapped consensus','ungapped consensus',$query); | |
665 last SWITCH1; | |
666 }; | |
667 | |
668 $self->throw("Unknow coordinate system. Args: $type_in, $type_out."); | |
669 $query = undef; # If a coordinate systems just requested is unknown | |
670 } | |
671 | |
672 return $query; | |
673 } | |
674 | |
675 =head2 get_seq_coord | |
676 | |
677 Title : get_seq_coord | |
678 Usage : $contig->get_seq_coord($seq); | |
679 Function : Get "gapped consensus" location for aligned sequence | |
680 Returns : Bio::SeqFeature::Generic for coordinates or undef. | |
681 A warning is printed if sequence coordinates were not set. | |
682 Argument : Bio::LocatabaleSeq object | |
683 | |
684 =cut | |
685 | |
686 sub get_seq_coord { | |
687 my ($self,$seq) = @_; | |
688 | |
689 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
690 $self->throw("$seq is not a Bio::LocatableSeq"); | |
691 } | |
692 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
693 | |
694 unless (exists( $self->{'_elem'}{$seqID} )) { | |
695 $self->warn("No such sequence ($seqID) in contig ".$self->id); | |
696 return undef; | |
697 } | |
698 unless (exists( $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} )) { | |
699 $self->warn("Location not set for sequence ($seqID) in contig ".$self->id); | |
700 return undef; | |
701 } | |
702 | |
703 return $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"}; | |
704 } | |
705 | |
706 =head2 set_seq_coord | |
707 | |
708 Title : set_seq_coord | |
709 Usage : $contig->set_seq_coord($feat,$seq); | |
710 Function : | |
711 | |
712 Set "gapped consensus" location for an aligned | |
713 sequence. If the sequence was previously added using | |
714 add_seq, its coordinates are changed/set. Otherwise, | |
715 add_seq is called and the sequence is added to the | |
716 contig. | |
717 | |
718 Returns : Bio::SeqFeature::Generic for old coordinates or undef. | |
719 Argument : | |
720 $feat : a Bio::SeqFeature::Generic object | |
721 representing a location for the | |
722 aligned sequence, in "gapped | |
723 consensus" coordinates. | |
724 | |
725 Note: the original feature primary tag will | |
726 be lost. | |
727 | |
728 $seq : a Bio::LocatabaleSeq object | |
729 | |
730 =cut | |
731 | |
732 sub set_seq_coord { | |
733 my ($self,$feat,$seq) = @_; | |
734 | |
735 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
736 $self->throw("Unable to process non locatable sequences [".ref($seq)."]"); | |
737 } | |
738 | |
739 # Complaining about inadequate feature object | |
740 $self->throw("Coordinates must be a Bio::SeqFeature::Generic object!") | |
741 unless ( $feat->isa("Bio::SeqFeature::Generic") ); | |
742 $self->throw("Sequence coordinates must have an end!") | |
743 unless (defined $feat->end); | |
744 $self->throw("Sequence coordinates must have a start!") | |
745 unless (defined $feat->start); | |
746 | |
747 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
748 if (exists( $self->{'_elem'}{$seqID} ) && | |
749 exists( $self->{'_elem'}{$seqID}{'_seq'} ) && | |
750 defined( $self->{'_elem'}{$seqID}{'_seq'} ) && | |
751 ($seq ne $self->{'_elem'}{$seqID}{'_seq'}) ) { | |
752 $self->warn("Replacing sequence $seqID\n"); | |
753 $self->remove_seq($self->{'_elem'}{$seqID}{'_seq'}); | |
754 } | |
755 $self->add_seq($seq); | |
756 | |
757 # Remove previous coordinates, if any | |
758 $self->remove_features($feat); | |
759 | |
760 # Add new Bio::Generic::SeqFeature | |
761 $feat->add_tag_value('contig',$self->id) | |
762 unless ( $feat->has_tag('contig') ); | |
763 $feat->primary_tag("_aligned_coord:$seqID"); | |
764 $feat->attach_seq($seq); | |
765 $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} = $feat; | |
766 $self->add_features([ $feat ]); | |
767 } | |
768 | |
769 =head1 Bio::Assembly::Contig consensus methods | |
770 | |
771 =head2 set_consensus_sequence | |
772 | |
773 Title : set_consensus_sequence | |
774 Usage : $contig->set_consensus_sequence($seq) | |
775 Function : Set the consensus sequence object for this contig | |
776 Returns : consensus length | |
777 Argument : Bio::LocatableSeq | |
778 | |
779 =cut | |
780 | |
781 sub set_consensus_sequence { | |
782 my $self = shift; | |
783 my $seq = shift; | |
784 | |
785 $self->throw("Consensus sequence must be a Bio::LocatableSeq!") | |
786 unless ($seq->isa("Bio::LocatableSeq")); | |
787 | |
788 my $con_len = $seq->length; | |
789 $seq->start(1); $seq->end($con_len); | |
790 | |
791 $self->{'_consensus_gaps'} = []; # Consensus Gap registry | |
792 $self->_register_gaps($seq->seq, | |
793 $self->{'_consensus_gaps'}); | |
794 $self->{'_consensus_sequence'} = $seq; | |
795 | |
796 return $con_len; | |
797 } | |
798 | |
799 =head2 set_consensus_quality | |
800 | |
801 Title : set_consensus_quality | |
802 Usage : $contig->set_consensus_quality($qual) | |
803 Function : Set the quality object for consensus sequence | |
804 Returns : nothing | |
805 Argument : Bio::Seq::QualI object | |
806 | |
807 =cut | |
808 | |
809 sub set_consensus_quality { | |
810 my $self = shift; | |
811 my $qual = shift; | |
812 | |
813 $self->throw("Consensus quality must be a Bio::Seq::QualI object!") | |
814 unless ( $qual->isa("Bio::Seq::QualI") ); | |
815 | |
816 $self->throw("Consensus quality can't be added before you set the consensus sequence!") | |
817 unless (defined $self->{'_consensus_sequence'}); | |
818 | |
819 $self->{'_consensus_quality'} = $qual; | |
820 } | |
821 | |
822 =head2 get_consensus_length | |
823 | |
824 Title : get_consensus_length | |
825 Usage : $contig->get_consensus_length() | |
826 Function : Get consensus sequence length | |
827 Returns : integer | |
828 Argument : none | |
829 | |
830 =cut | |
831 | |
832 sub get_consensus_length { | |
833 my $self = shift; | |
834 | |
835 return $self->{'_consensus_sequence'}->length(); | |
836 } | |
837 | |
838 =head2 get_consensus_sequence | |
839 | |
840 Title : get_consensus_sequence | |
841 Usage : $contig->get_consensus_sequence() | |
842 Function : Get a reference to the consensus sequence object | |
843 for this contig | |
844 Returns : Bio::SeqI object | |
845 Argument : none | |
846 | |
847 =cut | |
848 | |
849 sub get_consensus_sequence { | |
850 my ($self, @args) = @_; | |
851 | |
852 return $self->{'_consensus_sequence'}; | |
853 } | |
854 | |
855 =head2 get_consensus_quality | |
856 | |
857 Title : get_consensus_quality | |
858 Usage : $contig->get_consensus_quality() | |
859 Function : Get a reference to the consensus quality object | |
860 for this contig. | |
861 Returns : A Bio::QualI object | |
862 Argument : none | |
863 | |
864 =cut | |
865 | |
866 sub get_consensus_quality { | |
867 my ($self, @args) = @_; | |
868 | |
869 return $self->{'_consensus_quality'}; | |
870 } | |
871 | |
872 =head1 Bio::Assembly::Contig aligned sequences methods | |
873 | |
874 =head2 set_seq_qual | |
875 | |
876 Title : set_seq_qual | |
877 Usage : $contig->set_seq_qual($seq,$qual); | |
878 Function : Adds quality to an aligned sequence. | |
879 Returns : nothing | |
880 Argument : a Bio::LocatableSeq object and | |
881 a Bio::Seq::QualI object | |
882 | |
883 See L<Bio::LocatableSeq> for more information. | |
884 | |
885 =cut | |
886 | |
887 sub set_seq_qual { | |
888 my ($self,$seq,$qual) = @_; | |
889 | |
890 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
891 $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); | |
892 } | |
893 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
894 | |
895 $self->throw("Consensus quality must be a Bio::Seq::QualI object!") | |
896 unless ( $qual->isa("Bio::Seq::QualI") ); | |
897 $self->throw("Use add_seq first: aligned sequence qualities can't be added before you load the sequence!") | |
898 unless (exists $self->{'_elem'}{$seqID}{'_seq'}); | |
899 $self->throw("Use set_seq_coord first: aligned sequence qualities can't be added before you add coordinates for the sequence!") unless (defined( $self->get_seq_coord($seq) )); | |
900 | |
901 # Adding gaps to quality object | |
902 my $sequence = $self->{'_elem'}{$seqID}{'_seq'}->seq(); | |
903 my $tmp = $qual->qual(); | |
904 @{$tmp} = reverse(@{$tmp}) if ($self->get_seq_coord($seq)->strand() == -1); | |
905 my @quality = (); | |
906 my $previous = 0; | |
907 my $next = 0; | |
908 my $i = 0; my $j = 0; | |
909 while ($i<=$#{$tmp}) { | |
910 # IF base is a gap, quality is the average for neighbouring sites | |
911 if (substr($sequence,$j,1) eq '-') { | |
912 $previous = $tmp->[$i-1] unless ($i == 0); | |
913 if ($i < $#{$tmp}) { | |
914 $next = $tmp->[$i+1]; | |
915 } else { | |
916 $next = 0; | |
917 } | |
918 push(@quality,int( ($previous+$next)/2 )); | |
919 } else { | |
920 push(@quality,$tmp->[$i]); | |
921 $i++; | |
922 } | |
923 $j++; | |
924 } | |
925 | |
926 $self->{'_elem'}{$seqID}{'_qual'} = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality), | |
927 -id=>$seqID); | |
928 } | |
929 | |
930 =head2 get_seq_ids | |
931 | |
932 Title : get_seq_ids | |
933 Usage : $contig->get_seq_ids(-start=>$start, | |
934 -end=>$end, | |
935 -type=>"gapped A0QR67B08.b"); | |
936 Function : Get list of sequence IDs overlapping inteval [$start, $end] | |
937 The default interval is [1,$contig->length] | |
938 Default coordinate system is "gapped contig" | |
939 Returns : An array | |
940 Argument : A hash with optional elements: | |
941 -start : consensus subsequence start | |
942 -end : consensus subsequence end | |
943 -type : the coordinate system type for $start and $end arguments | |
944 Coordinate system avaliable are: | |
945 "gapped consensus" : consensus coordinates with gaps | |
946 "ungapped consensus" : consensus coordinates without gaps | |
947 "aligned $ReadID" : read $ReadID coordinates with gaps | |
948 "unaligned $ReadID" : read $ReadID coordinates without gaps | |
949 | |
950 | |
951 =cut | |
952 | |
953 sub get_seq_ids { | |
954 my ($self, @args) = @_; | |
955 | |
956 my ($type,$start,$end) = | |
957 $self->_rearrange([qw(TYPE START END)], @args); | |
958 | |
959 if (defined($start) && defined($end)) { | |
960 if (defined($type) && ($type ne 'gapped consensus')) { | |
961 $start = $self->change_coord($type,'gapped consensus',$start); | |
962 $end = $self->change_coord($type,'gapped consensus',$end); | |
963 } | |
964 | |
965 my @list = grep { $_->isa("Bio::SeqFeature::Generic") && | |
966 ($_->primary_tag =~ /^_aligned_coord:/) } | |
967 $self->{'_sfc'}->features_in_range(-start=>$start, | |
968 -end=>$end, | |
969 -contain=>0, | |
970 -strandmatch=>'ignore'); | |
971 @list = map { $_->entire_seq->id } @list; | |
972 return @list; | |
973 } | |
974 | |
975 # Entire aligned sequences list | |
976 return map { $self->{'_order'}{$_} } sort { $a cmp $b } keys %{ $self->{'_order'} }; | |
977 } | |
978 | |
979 =head2 get_seq_feat_by_tag | |
980 | |
981 Title : get_seq_feat_by_tag | |
982 Usage : $seq = $contig->get_seq_feat_by_tag($seq,"_aligned_coord:$seqID") | |
983 Function : | |
984 | |
985 Get a sequence feature based on its primary_tag. | |
986 When you add | |
987 | |
988 Returns : a Bio::SeqFeature object | |
989 Argument : a Bio::LocatableSeq and a string (feature primary tag) | |
990 | |
991 =cut | |
992 | |
993 sub get_seq_feat_by_tag { | |
994 my ($self,$seq,$tag) = @_; | |
995 | |
996 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
997 $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); | |
998 } | |
999 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
1000 | |
1001 return $self->{'_elem'}{$seqID}{'_feat'}{$tag}; | |
1002 } | |
1003 | |
1004 =head2 get_seq_by_name | |
1005 | |
1006 Title : get_seq_by_name | |
1007 Usage : $seq = $contig->get_seq_by_name('Seq1') | |
1008 Function : Gets a sequence based on its id. | |
1009 Returns : a Bio::LocatableSeq object | |
1010 undef if name is not found | |
1011 Argument : string | |
1012 | |
1013 =cut | |
1014 | |
1015 sub get_seq_by_name { | |
1016 my $self = shift; | |
1017 my ($seqID) = @_; | |
1018 | |
1019 unless (exists $self->{'_elem'}{$seqID}{'_seq'}) { | |
1020 $self->throw("Could not find sequence $seqID in contig ".$self->id); | |
1021 return undef; | |
1022 } | |
1023 | |
1024 return $self->{'_elem'}{$seqID}{'_seq'}; | |
1025 } | |
1026 | |
1027 =head2 get_qual_by_name | |
1028 | |
1029 Title : get_qual_by_name | |
1030 Usage : $seq = $contig->get_qual_by_name('Seq1') | |
1031 Function : | |
1032 | |
1033 Gets Bio::Seq::QualI object for a sequence | |
1034 through its id ( as given by $qual->id() ). | |
1035 | |
1036 Returns : a Bio::Seq::QualI object. | |
1037 undef if name is not found | |
1038 Argument : string | |
1039 | |
1040 =cut | |
1041 | |
1042 sub get_qual_by_name { | |
1043 my $self = shift; | |
1044 my ($seqID) = @_; | |
1045 | |
1046 unless (exists $self->{'_elem'}{$seqID}{'_qual'}) { | |
1047 $self->warn("Could not find quality for $seqID in contig!"); | |
1048 return undef; | |
1049 } | |
1050 | |
1051 return $self->{'_elem'}{$seqID}{'_qual'}; | |
1052 } | |
1053 | |
1054 =head1 Bio::Align::AlignI compatible methods | |
1055 | |
1056 =head2 Modifier methods | |
1057 | |
1058 These methods modify the MSE by adding, removing or shuffling complete | |
1059 sequences. | |
1060 | |
1061 =head2 add_seq | |
1062 | |
1063 Title : add_seq | |
1064 Usage : $contig->add_seq($newseq); | |
1065 Function : | |
1066 | |
1067 Adds a sequence to the contig. *Does* | |
1068 *not* align it - just adds it to the | |
1069 hashes. | |
1070 | |
1071 Returns : nothing | |
1072 Argument : a Bio::LocatableSeq object | |
1073 | |
1074 See L<Bio::LocatableSeq> for more information. | |
1075 | |
1076 =cut | |
1077 | |
1078 sub add_seq { | |
1079 my $self = shift; | |
1080 my $seq = shift; | |
1081 | |
1082 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
1083 $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); | |
1084 } | |
1085 | |
1086 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
1087 $self->{'_elem'}{$seqID} = {} unless (exists $self->{'elem'}{$seqID}); | |
1088 | |
1089 if (exists( $self->{'_elem'}{$seqID}{'_seq'} ) && | |
1090 ($seq eq $self->{'_elem'}{$seqID}{'_seq'}) ) { | |
1091 $self->warn("Adding sequence $seqID, which has already been added"); | |
1092 } | |
1093 | |
1094 # Our locatable sequences are always considered to be complete sequences | |
1095 $seq->start(1); $seq->end($seq->length()); | |
1096 | |
1097 $self->warn("Adding non-nucleotidic sequence ".$seqID) | |
1098 if (lc($seq->alphabet) ne 'dna' && lc($seq->alphabet) ne 'rna'); | |
1099 | |
1100 # build the symbol list for this sequence, | |
1101 # will prune out the gap and missing/match chars | |
1102 # when actually asked for the symbol list in the | |
1103 # symbol_chars | |
1104 if (defined $seq->seq) { | |
1105 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq); | |
1106 } else { | |
1107 $self->{'_symbols'} = {}; | |
1108 } | |
1109 | |
1110 my $seq_no = ++$self->{'_nof_seqs'}; | |
1111 | |
1112 if (ref( $self->{'_elem'}{$seqID}{'_seq'} )) { | |
1113 $self->warn("Replacing one sequence [$seqID]\n"); | |
1114 } else { | |
1115 #print STDERR "Assigning $seqID to $order\n"; | |
1116 $self->{'_order'}->{$seq_no} = $seqID; | |
1117 # $self->{'_start_end_lists'}->{$id} = [] | |
1118 # unless(exists $self->{'_start_end_lists'}->{$id}); | |
1119 # push @{$self->{'_start_end_lists'}->{$id}}, $seq; | |
1120 } | |
1121 | |
1122 $self->{'_elem'}{$seqID}{'_seq'} = $seq; | |
1123 $self->{'_elem'}{$seqID}{'_feat'} = {}; | |
1124 $self->{'_elem'}{$seqID}{'_gaps'} = []; | |
1125 my $dbref = $self->{'_elem'}{$seqID}{'_gaps'}; | |
1126 my $nofgaps = $self->_register_gaps($seq->seq,$dbref); | |
1127 | |
1128 # Updating residue count | |
1129 $self->{'_nof_residues'} += $seq->length - $nofgaps; | |
1130 | |
1131 return 1; | |
1132 } | |
1133 | |
1134 =head2 remove_seq | |
1135 | |
1136 Title : remove_seq | |
1137 Usage : $contig->remove_seq($seq); | |
1138 Function : Removes a single sequence from an alignment | |
1139 Returns : 1 on success, 0 otherwise | |
1140 Argument : a Bio::LocatableSeq object | |
1141 | |
1142 =cut | |
1143 | |
1144 sub remove_seq { | |
1145 my ($self,$seq) = @_; | |
1146 | |
1147 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
1148 $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); | |
1149 } | |
1150 | |
1151 my $seqID = $seq->id() || $seq->display_id || $seq->primary_id; | |
1152 unless (exists $self->{'_elem'}{$seqID} ) { | |
1153 $self->warn("No sequence named $seqID [$seq]"); | |
1154 return 0; | |
1155 } | |
1156 | |
1157 # Updating residue count | |
1158 $self->{'_nof_residues'} -= $seq->length() + | |
1159 &_nof_gaps( $self->{'_elem'}{$seqID}{'_gaps'}, $seq->length ); | |
1160 | |
1161 # Remove all references to features of this sequence | |
1162 my @feats = (); | |
1163 foreach my $tag (keys %{ $self->{'_elem'}{$seqID}{'_feat'} }) { | |
1164 push(@feats, $self->{'_elem'}{$seqID}{'_feat'}{$tag}); | |
1165 } | |
1166 $self->{'_sfc'}->remove_features(\@feats); | |
1167 delete $self->{'_elem'}{$seqID}; | |
1168 | |
1169 return 1; | |
1170 } | |
1171 | |
1172 =head2 purge | |
1173 | |
1174 Title : purge | |
1175 Usage : $contig->purge(0.7); | |
1176 Function: | |
1177 | |
1178 Removes sequences above whatever %id. | |
1179 | |
1180 This function will grind on large alignments. Beware! | |
1181 (perhaps not ideally implemented) | |
1182 | |
1183 Example : | |
1184 Returns : An array of the removed sequences | |
1185 Argument: | |
1186 | |
1187 | |
1188 =cut | |
1189 | |
1190 sub purge { | |
1191 my ($self) = @_; | |
1192 $self->throw_not_implemented(); | |
1193 } | |
1194 | |
1195 =head2 sort_alphabetically | |
1196 | |
1197 Title : sort_alphabetically | |
1198 Usage : $contig->sort_alphabetically | |
1199 Function : | |
1200 | |
1201 Changes the order of the alignemnt to alphabetical on name | |
1202 followed by numerical by number. | |
1203 | |
1204 Returns : | |
1205 Argument : | |
1206 | |
1207 =cut | |
1208 | |
1209 sub sort_alphabetically { | |
1210 my ($self) = @_; | |
1211 $self->throw_not_implemented(); | |
1212 } | |
1213 | |
1214 =head2 Sequence selection methods | |
1215 | |
1216 Methods returning one or more sequences objects. | |
1217 | |
1218 =head2 each_seq | |
1219 | |
1220 Title : each_seq | |
1221 Usage : foreach $seq ( $contig->each_seq() ) | |
1222 Function : Gets an array of Seq objects from the alignment | |
1223 Returns : an array | |
1224 Argument : | |
1225 | |
1226 =cut | |
1227 | |
1228 sub each_seq { | |
1229 my ($self) = @_; | |
1230 | |
1231 my (@arr,$seqID); | |
1232 | |
1233 foreach $seqID ( map { $self->{'_order'}{$_} } sort { $a <=> $b } keys %{$self->{'_order'}} ) { | |
1234 push(@arr,$self->{'_elem'}{$seqID}{'_seq'}); | |
1235 } | |
1236 | |
1237 return @arr; | |
1238 } | |
1239 | |
1240 =head2 each_alphabetically | |
1241 | |
1242 Title : each_alphabetically | |
1243 Usage : foreach $seq ( $contig->each_alphabetically() ) | |
1244 Function : | |
1245 | |
1246 Returns an array of sequence object sorted alphabetically | |
1247 by name and then by start point. | |
1248 Does not change the order of the alignment | |
1249 | |
1250 Returns : | |
1251 Argument : | |
1252 | |
1253 =cut | |
1254 | |
1255 sub each_alphabetically { | |
1256 my($self) = @_; | |
1257 $self->throw_not_implemented(); | |
1258 } | |
1259 | |
1260 =head2 each_seq_with_id | |
1261 | |
1262 Title : each_seq_with_id | |
1263 Usage : foreach $seq ( $contig->each_seq_with_id() ) | |
1264 Function : | |
1265 | |
1266 Gets an array of Seq objects from the | |
1267 alignment, the contents being those sequences | |
1268 with the given name (there may be more than one) | |
1269 | |
1270 Returns : an array | |
1271 Argument : a seq name | |
1272 | |
1273 =cut | |
1274 | |
1275 sub each_seq_with_id { | |
1276 my ($self) = @_; | |
1277 $self->throw_not_implemented(); | |
1278 } | |
1279 | |
1280 =head2 get_seq_by_pos | |
1281 | |
1282 Title : get_seq_by_pos | |
1283 Usage : $seq = $contig->get_seq_by_pos(3) | |
1284 Function : | |
1285 | |
1286 Gets a sequence based on its position in the alignment. | |
1287 Numbering starts from 1. Sequence positions larger than | |
1288 no_sequences() will thow an error. | |
1289 | |
1290 Returns : a Bio::LocatableSeq object | |
1291 Argument : positive integer for the sequence osition | |
1292 | |
1293 =cut | |
1294 | |
1295 sub get_seq_by_pos { | |
1296 my $self = shift; | |
1297 my ($pos) = @_; | |
1298 | |
1299 $self->throw("Sequence position has to be a positive integer, not [$pos]") | |
1300 unless $pos =~ /^\d+$/ and $pos > 0; | |
1301 $self->throw("No sequence at position [$pos]") | |
1302 unless $pos <= $self->no_sequences ; | |
1303 | |
1304 my $seqID = $self->{'_order'}->{--$pos}; | |
1305 return $self->{'_elem'}{$seqID}{'_seq'}; | |
1306 } | |
1307 | |
1308 =head2 Create new alignments | |
1309 | |
1310 The result of these methods are horizontal or vertical subsets of the | |
1311 current MSE. | |
1312 | |
1313 =head2 select | |
1314 | |
1315 Title : select | |
1316 Usage : $contig2 = $contig->select(1, 3) # three first sequences | |
1317 Function : | |
1318 | |
1319 Creates a new alignment from a continuous subset of | |
1320 sequences. Numbering starts from 1. Sequence positions | |
1321 larger than no_sequences() will thow an error. | |
1322 | |
1323 Returns : a Bio::Assembly::Contig object | |
1324 Argument : positive integer for the first sequence | |
1325 positive integer for the last sequence to include (optional) | |
1326 | |
1327 =cut | |
1328 | |
1329 sub select { | |
1330 my ($self) = @_; | |
1331 $self->throw_not_implemented(); | |
1332 } | |
1333 | |
1334 | |
1335 =head2 select_noncont | |
1336 | |
1337 Title : select_noncont | |
1338 Usage : $contig2 = $contig->select_noncont(1, 3) # first and 3rd sequences | |
1339 Function : | |
1340 | |
1341 Creates a new alignment from a subset of | |
1342 sequences. Numbering starts from 1. Sequence positions | |
1343 larger than no_sequences() will thow an error. | |
1344 | |
1345 Returns : a Bio::Assembly::Contig object | |
1346 Args : array of integers for the sequences | |
1347 | |
1348 =cut | |
1349 | |
1350 sub select_noncont { | |
1351 my ($self) = @_; | |
1352 $self->throw_not_implemented(); | |
1353 } | |
1354 | |
1355 =head2 slice | |
1356 | |
1357 Title : slice | |
1358 Usage : $contig2 = $contig->slice(20, 30) | |
1359 Function : | |
1360 | |
1361 Creates a slice from the alignment inclusive of start and | |
1362 end columns. Sequences with no residues in the slice are | |
1363 excluded from the new alignment and a warning is printed. | |
1364 Slice beyond the length of the sequence does not do | |
1365 padding. | |
1366 | |
1367 Returns : a Bio::Assembly::Contig object | |
1368 Argument : positive integer for start column | |
1369 positive integer for end column | |
1370 | |
1371 =cut | |
1372 | |
1373 sub slice { | |
1374 my ($self) = @_; | |
1375 $self->throw_not_implemented(); | |
1376 } | |
1377 | |
1378 =head2 Change sequences within the MSE | |
1379 | |
1380 These methods affect characters in all sequences without changeing the | |
1381 alignment. | |
1382 | |
1383 | |
1384 =head2 map_chars | |
1385 | |
1386 Title : map_chars | |
1387 Usage : $contig->map_chars('\.','-') | |
1388 Function : | |
1389 | |
1390 Does a s/$arg1/$arg2/ on the sequences. Useful for gap | |
1391 characters | |
1392 | |
1393 Notice that the from (arg1) is interpretted as a regex, | |
1394 so be careful about quoting meta characters (eg | |
1395 $contig->map_chars('.','-') wont do what you want) | |
1396 | |
1397 Returns : | |
1398 Argument : 'from' rexexp | |
1399 'to' string | |
1400 | |
1401 =cut | |
1402 | |
1403 sub map_chars { | |
1404 my ($self) = @_; | |
1405 $self->throw_not_implemented(); | |
1406 } | |
1407 | |
1408 =head2 uppercase | |
1409 | |
1410 Title : uppercase() | |
1411 Usage : $contig->uppercase() | |
1412 Function : Sets all the sequences to uppercase | |
1413 Returns : | |
1414 Argument : | |
1415 | |
1416 =cut | |
1417 | |
1418 sub uppercase { | |
1419 my ($self) = @_; | |
1420 $self->throw_not_implemented(); | |
1421 } | |
1422 | |
1423 =head2 match_line | |
1424 | |
1425 Title : match_line() | |
1426 Usage : $contig->match_line() | |
1427 Function : Generates a match line - much like consensus string | |
1428 except that a line indicating the '*' for a match. | |
1429 Argument : (optional) Match line characters ('*' by default) | |
1430 (optional) Strong match char (':' by default) | |
1431 (optional) Weak match char ('.' by default) | |
1432 | |
1433 =cut | |
1434 | |
1435 sub match_line { | |
1436 my ($self) = @_; | |
1437 $self->throw_not_implemented(); | |
1438 } | |
1439 | |
1440 =head2 match | |
1441 | |
1442 Title : match() | |
1443 Usage : $contig->match() | |
1444 Function : | |
1445 | |
1446 Goes through all columns and changes residues that are | |
1447 identical to residue in first sequence to match '.' | |
1448 character. Sets match_char. | |
1449 | |
1450 USE WITH CARE: Most MSE formats do not support match | |
1451 characters in sequences, so this is mostly for output | |
1452 only. NEXUS format (Bio::AlignIO::nexus) can handle | |
1453 it. | |
1454 | |
1455 Returns : 1 | |
1456 Argument : a match character, optional, defaults to '.' | |
1457 | |
1458 =cut | |
1459 | |
1460 sub match { | |
1461 my ($self) = @_; | |
1462 $self->throw_not_implemented(); | |
1463 } | |
1464 | |
1465 =head2 unmatch | |
1466 | |
1467 Title : unmatch() | |
1468 Usage : $contig->unmatch() | |
1469 Function : | |
1470 | |
1471 Undoes the effect of method match. Unsets match_char. | |
1472 | |
1473 Returns : 1 | |
1474 Argument : a match character, optional, defaults to '.' | |
1475 | |
1476 =cut | |
1477 | |
1478 sub unmatch { | |
1479 my ($self) = @_; | |
1480 $self->throw_not_implemented(); | |
1481 } | |
1482 | |
1483 | |
1484 =head2 MSE attibutes | |
1485 | |
1486 Methods for setting and reading the MSE attributes. | |
1487 | |
1488 Note that the methods defining character semantics depend on the user | |
1489 to set them sensibly. They are needed only by certain input/output | |
1490 methods. Unset them by setting to an empty string (''). | |
1491 | |
1492 =head2 id | |
1493 | |
1494 Title : id | |
1495 Usage : $contig->id("Ig") | |
1496 Function : Gets/sets the id field of the alignment | |
1497 Returns : An id string | |
1498 Argument : An id string (optional) | |
1499 | |
1500 =cut | |
1501 | |
1502 sub id { | |
1503 my ($self, $contig_name) = @_; | |
1504 | |
1505 if (defined( $contig_name )) { | |
1506 $self->{'_id'} = $contig_name; | |
1507 } | |
1508 | |
1509 return $self->{'_id'}; | |
1510 } | |
1511 | |
1512 =head2 missing_char | |
1513 | |
1514 Title : missing_char | |
1515 Usage : $contig->missing_char("?") | |
1516 Function : Gets/sets the missing_char attribute of the alignment | |
1517 It is generally recommended to set it to 'n' or 'N' | |
1518 for nucleotides and to 'X' for protein. | |
1519 Returns : An missing_char string, | |
1520 Argument : An missing_char string (optional) | |
1521 | |
1522 =cut | |
1523 | |
1524 sub missing_char { | |
1525 my ($self) = @_; | |
1526 $self->throw_not_implemented(); | |
1527 } | |
1528 | |
1529 =head2 match_char | |
1530 | |
1531 Title : match_char | |
1532 Usage : $contig->match_char('.') | |
1533 Function : Gets/sets the match_char attribute of the alignment | |
1534 Returns : An match_char string, | |
1535 Argument : An match_char string (optional) | |
1536 | |
1537 =cut | |
1538 | |
1539 sub match_char { | |
1540 my ($self) = @_; | |
1541 $self->throw_not_implemented(); | |
1542 } | |
1543 | |
1544 =head2 gap_char | |
1545 | |
1546 Title : gap_char | |
1547 Usage : $contig->gap_char('-') | |
1548 Function : Gets/sets the gap_char attribute of the alignment | |
1549 Returns : An gap_char string, defaults to '-' | |
1550 Argument : An gap_char string (optional) | |
1551 | |
1552 =cut | |
1553 | |
1554 sub gap_char { | |
1555 my ($self) = @_; | |
1556 $self->throw_not_implemented(); | |
1557 } | |
1558 | |
1559 =head2 symbol_chars | |
1560 | |
1561 Title : symbol_chars | |
1562 Usage : my @symbolchars = $contig->symbol_chars; | |
1563 Function: Returns all the seen symbols (other than gaps) | |
1564 Returns : array of characters that are the seen symbols | |
1565 Argument: boolean to include the gap/missing/match characters | |
1566 | |
1567 =cut | |
1568 | |
1569 sub symbol_chars{ | |
1570 my ($self) = @_; | |
1571 $self->throw_not_implemented(); | |
1572 } | |
1573 | |
1574 =head2 Alignment descriptors | |
1575 | |
1576 These read only methods describe the MSE in various ways. | |
1577 | |
1578 | |
1579 =head2 consensus_string | |
1580 | |
1581 Title : consensus_string | |
1582 Usage : $str = $contig->consensus_string($threshold_percent) | |
1583 Function : Makes a strict consensus | |
1584 Returns : | |
1585 Argument : Optional treshold ranging from 0 to 100. | |
1586 The consensus residue has to appear at least threshold % | |
1587 of the sequences at a given location, otherwise a '?' | |
1588 character will be placed at that location. | |
1589 (Default value = 0%) | |
1590 | |
1591 =cut | |
1592 | |
1593 sub consensus_string { | |
1594 my ($self) = @_; | |
1595 $self->throw_not_implemented(); | |
1596 } | |
1597 | |
1598 =head2 consensus_iupac | |
1599 | |
1600 Title : consensus_iupac | |
1601 Usage : $str = $contig->consensus_iupac() | |
1602 Function : | |
1603 | |
1604 Makes a consensus using IUPAC ambiguity codes from DNA | |
1605 and RNA. The output is in upper case except when gaps in | |
1606 a column force output to be in lower case. | |
1607 | |
1608 Note that if your alignment sequences contain a lot of | |
1609 IUPAC ambiquity codes you often have to manually set | |
1610 alphabet. Bio::PrimarySeq::_guess_type thinks they | |
1611 indicate a protein sequence. | |
1612 | |
1613 Returns : consensus string | |
1614 Argument : none | |
1615 Throws : on protein sequences | |
1616 | |
1617 | |
1618 =cut | |
1619 | |
1620 sub consensus_iupac { | |
1621 my ($self) = @_; | |
1622 $self->throw_not_implemented(); | |
1623 } | |
1624 | |
1625 =head2 is_flush | |
1626 | |
1627 Title : is_flush | |
1628 Usage : if( $contig->is_flush() ) | |
1629 : | |
1630 : | |
1631 Function : Tells you whether the alignment | |
1632 : is flush, ie all of the same length | |
1633 : | |
1634 : | |
1635 Returns : 1 or 0 | |
1636 Argument : | |
1637 | |
1638 =cut | |
1639 | |
1640 sub is_flush { | |
1641 my ($self) = @_; | |
1642 $self->throw_not_implemented(); | |
1643 } | |
1644 | |
1645 =head2 length | |
1646 | |
1647 Title : length() | |
1648 Usage : $len = $contig->length() | |
1649 Function : Returns the maximum length of the alignment. | |
1650 To be sure the alignment is a block, use is_flush | |
1651 Returns : | |
1652 Argument : | |
1653 | |
1654 =cut | |
1655 | |
1656 sub length { | |
1657 my ($self) = @_; | |
1658 | |
1659 $self->throw_not_implemented(); | |
1660 } | |
1661 | |
1662 =head2 maxdisplayname_length | |
1663 | |
1664 Title : maxdisplayname_length | |
1665 Usage : $contig->maxdisplayname_length() | |
1666 Function : | |
1667 | |
1668 Gets the maximum length of the displayname in the | |
1669 alignment. Used in writing out various MSE formats. | |
1670 | |
1671 Returns : integer | |
1672 Argument : | |
1673 | |
1674 =cut | |
1675 | |
1676 sub maxname_length { | |
1677 my ($self) = @_; | |
1678 $self->throw_not_implemented(); | |
1679 } | |
1680 | |
1681 =head2 no_residues | |
1682 | |
1683 Title : no_residues | |
1684 Usage : $no = $contig->no_residues | |
1685 Function : number of residues in total in the alignment | |
1686 Returns : integer | |
1687 Argument : | |
1688 | |
1689 =cut | |
1690 | |
1691 sub no_residues { | |
1692 my ($self) = @_; | |
1693 | |
1694 return $self->{'_nof_residues'}; | |
1695 } | |
1696 | |
1697 =head2 no_sequences | |
1698 | |
1699 Title : no_sequences | |
1700 Usage : $depth = $contig->no_sequences | |
1701 Function : number of sequence in the sequence alignment | |
1702 Returns : integer | |
1703 Argument : None | |
1704 | |
1705 =cut | |
1706 | |
1707 sub no_sequences { | |
1708 my ($self) = @_; | |
1709 | |
1710 return scalar( keys %{ $self->{'_elem'} } ); | |
1711 } | |
1712 | |
1713 =head2 percentage_identity | |
1714 | |
1715 Title : percentage_identity | |
1716 Usage : $id = $contig->percentage_identity | |
1717 Function: The function calculates the percentage identity of the alignment | |
1718 Returns : The percentage identity of the alignment (as defined by the | |
1719 implementation) | |
1720 Argument: None | |
1721 | |
1722 =cut | |
1723 | |
1724 sub percentage_identity{ | |
1725 my ($self) = @_; | |
1726 | |
1727 $self->throw_not_implemeneted(); | |
1728 } | |
1729 | |
1730 =head2 overall_percentage_identity | |
1731 | |
1732 Title : percentage_identity | |
1733 Usage : $id = $contig->percentage_identity | |
1734 Function: The function calculates the percentage identity of | |
1735 the conserved columns | |
1736 Returns : The percentage identity of the conserved columns | |
1737 Args : None | |
1738 | |
1739 =cut | |
1740 | |
1741 sub overall_percentage_identity{ | |
1742 my ($self) = @_; | |
1743 $self->throw_not_implemented(); | |
1744 } | |
1745 | |
1746 | |
1747 =head2 average_percentage_identity | |
1748 | |
1749 Title : average_percentage_identity | |
1750 Usage : $id = $contig->average_percentage_identity | |
1751 Function: The function uses a fast method to calculate the average | |
1752 percentage identity of the alignment | |
1753 Returns : The average percentage identity of the alignment | |
1754 Args : None | |
1755 | |
1756 =cut | |
1757 | |
1758 sub average_percentage_identity { | |
1759 my ($self) = @_; | |
1760 $self->throw_not_implemented(); | |
1761 } | |
1762 | |
1763 =head2 Alignment positions | |
1764 | |
1765 Methods to map a sequence position into an alignment column and back. | |
1766 column_from_residue_number() does the former. The latter is really a | |
1767 property of the sequence object and can done using | |
1768 L<Bio::LocatableSeq::location_from_column>: | |
1769 | |
1770 # select somehow a sequence from the alignment, e.g. | |
1771 my $seq = $contig->get_seq_by_pos(1); | |
1772 #$loc is undef or Bio::LocationI object | |
1773 my $loc = $seq->location_from_column(5); | |
1774 | |
1775 | |
1776 =head2 column_from_residue_number | |
1777 | |
1778 Title : column_from_residue_number | |
1779 Usage : $col = $contig->column_from_residue_number( $seqname, $resnumber) | |
1780 Function: | |
1781 | |
1782 This function gives the position in the alignment | |
1783 (i.e. column number) of the given residue number in the | |
1784 sequence with the given name. For example, for the | |
1785 alignment | |
1786 | |
1787 Seq1/91-97 AC..DEF.GH | |
1788 Seq2/24-30 ACGG.RTY.. | |
1789 Seq3/43-51 AC.DDEFGHI | |
1790 | |
1791 column_from_residue_number( "Seq1", 94 ) returns 5. | |
1792 column_from_residue_number( "Seq2", 25 ) returns 2. | |
1793 column_from_residue_number( "Seq3", 50 ) returns 9. | |
1794 | |
1795 An exception is thrown if the residue number would lie | |
1796 outside the length of the aligment | |
1797 (e.g. column_from_residue_number( "Seq2", 22 ) | |
1798 | |
1799 Note: If the the parent sequence is represented by more than | |
1800 one alignment sequence and the residue number is present in | |
1801 them, this method finds only the first one. | |
1802 | |
1803 Returns : A column number for the position in the alignment of the | |
1804 given residue in the given sequence (1 = first column) | |
1805 Args : A sequence id/name (not a name/start-end) | |
1806 A residue number in the whole sequence (not just that | |
1807 segment of it in the alignment) | |
1808 | |
1809 =cut | |
1810 | |
1811 sub column_from_residue_number { | |
1812 my ($self) = @_; | |
1813 $self->throw_not_implemented(); | |
1814 } | |
1815 | |
1816 =head2 Sequence names | |
1817 | |
1818 Methods to manipulate the display name. The default name based on the | |
1819 sequence id and subsequence positions can be overridden in various | |
1820 ways. | |
1821 | |
1822 =head2 displayname | |
1823 | |
1824 Title : displayname | |
1825 Usage : $contig->displayname("Ig", "IgA") | |
1826 Function : Gets/sets the display name of a sequence in the alignment | |
1827 : | |
1828 Returns : A display name string | |
1829 Argument : name of the sequence | |
1830 displayname of the sequence (optional) | |
1831 | |
1832 =cut | |
1833 | |
1834 sub displayname { # Do nothing | |
1835 } | |
1836 | |
1837 =head2 set_displayname_count | |
1838 | |
1839 Title : set_displayname_count | |
1840 Usage : $contig->set_displayname_count | |
1841 Function : | |
1842 | |
1843 Sets the names to be name_# where # is the number of | |
1844 times this name has been used. | |
1845 | |
1846 Returns : None | |
1847 Argument : None | |
1848 | |
1849 =cut | |
1850 | |
1851 sub set_displayname_count { | |
1852 my ($self) = @_; | |
1853 $self->throw_not_implemented(); | |
1854 } | |
1855 | |
1856 =head2 set_displayname_flat | |
1857 | |
1858 Title : set_displayname_flat | |
1859 Usage : $contig->set_displayname_flat() | |
1860 Function : Makes all the sequences be displayed as just their name, | |
1861 not name/start-end | |
1862 Returns : 1 | |
1863 Argument : None | |
1864 | |
1865 =cut | |
1866 | |
1867 sub set_displayname_flat { # Do nothing! | |
1868 } | |
1869 | |
1870 =head2 set_displayname_normal | |
1871 | |
1872 Title : set_displayname_normal | |
1873 Usage : $contig->set_displayname_normal() | |
1874 Function : Makes all the sequences be displayed as name/start-end | |
1875 Returns : None | |
1876 Argument : None | |
1877 | |
1878 =cut | |
1879 | |
1880 sub set_displayname_normal { # Do nothing! | |
1881 } | |
1882 | |
1883 =head1 Internal Methods | |
1884 | |
1885 =head2 _binary_search | |
1886 | |
1887 Title : _binary_search | |
1888 Usage : _binary_search($list,$query) | |
1889 Function : | |
1890 | |
1891 Find a number in a sorted list of numbers. Return values | |
1892 may be on or two integers. One positive integer or zero | |
1893 (>=0) is the index of the element that stores the queried | |
1894 value. Two positive integers (or zero and another | |
1895 number) are the indexes of elements among which the | |
1896 queried value should be placed. Negative single values | |
1897 mean: | |
1898 | |
1899 -1: $query is smaller than smallest element in list | |
1900 -2: $query is greater than greatest element in list | |
1901 | |
1902 Returns : array of integers | |
1903 Argument : | |
1904 $list : array reference | |
1905 $query : integer | |
1906 | |
1907 =cut | |
1908 | |
1909 sub _binary_search { | |
1910 my $list = shift; | |
1911 my $query = shift; | |
1912 # | |
1913 # If there is only one element in list | |
1914 if (!$#{$list} && ($query == $list->[0])) { return (0) } | |
1915 # If there are others... | |
1916 my $start = 0; | |
1917 my $end = $#{$list}; | |
1918 (&_compare($query,$list->[$start]) == 0) && do { return ($start) }; | |
1919 (&_compare($query,$list->[$end]) == 0) && do { return ($end) }; | |
1920 (&_compare($query,$list->[$start]) < 0) && do { return (-1) }; | |
1921 (&_compare($query,$list->[$end]) > 0) && do { return (-2) }; | |
1922 my $middle = 0; | |
1923 while ($end - $start > 1) { | |
1924 $middle = int(($end+$middle)/2); | |
1925 (&_compare($query,$list->[$middle]) == 0) && return ($middle); | |
1926 (&_compare($query,$list->[$middle]) < 0) && do { $end = $middle ; $middle = 0; next }; | |
1927 $start = $middle; # If &_compare() > 0, move region beggining | |
1928 } | |
1929 return ($start,$end); | |
1930 } | |
1931 | |
1932 =head2 _compare | |
1933 | |
1934 Title : _compare | |
1935 Usage : _compare($arg1,$arg2) | |
1936 Function: Perform numeric or string comparisons | |
1937 Returns : integer (0, 1 or -1) | |
1938 Args : values to be compared | |
1939 | |
1940 =cut | |
1941 | |
1942 sub _compare { | |
1943 my $arg1 = shift; | |
1944 my $arg2 = shift; | |
1945 # | |
1946 if (($arg1 =~ /^\d+$/) && ($arg2 =~ /^\d+$/)) { return $arg1 <=> $arg2 } | |
1947 else { return $arg1 cmp $arg2 } | |
1948 } | |
1949 | |
1950 =head2 _nof_gaps | |
1951 | |
1952 Title : _nof_gaps | |
1953 Usage : _nof_gaps($array_ref, $query) | |
1954 Function: number of gaps found before position $query | |
1955 Returns : integer | |
1956 Args : | |
1957 $array_ref : gap registry reference | |
1958 $query : [integer] a position in a sequence | |
1959 | |
1960 =cut | |
1961 | |
1962 #' emacs... | |
1963 sub _nof_gaps { | |
1964 my $list = shift; | |
1965 my $query = shift; | |
1966 # If there are no gaps in this contig | |
1967 return 0 unless (defined($list) && scalar(@{$list})); | |
1968 # Locate query index in gap list (if any) | |
1969 my @index = &_binary_search($list,$query); | |
1970 # If after all alignments, correct using total number of align | |
1971 if ($index[0] == -2) { $query = scalar(@{$list}) } | |
1972 # If before any alignment, return 0 | |
1973 elsif ($index[0] == -1) { $query = 0 } | |
1974 elsif ($index[0] >= 0) { | |
1975 # If query is between alignments, translate coordinates | |
1976 if ($#index > 0) { $query = $index[0] + 1 } | |
1977 # If query sits upon an alignment, do another correction | |
1978 elsif ($#index == 0) { $query = $index[0] } | |
1979 } | |
1980 # | |
1981 return $query; | |
1982 } | |
1983 | |
1984 =head2 _padded_unpadded | |
1985 | |
1986 Title : _padded_unpadded | |
1987 Usage : _padded_unpadded($array_ref, $query) | |
1988 Function: | |
1989 | |
1990 Returns a coordinate corresponding to | |
1991 position $query after gaps were | |
1992 removed from a sequence. | |
1993 | |
1994 Returns : integer | |
1995 Args : | |
1996 $array_ref : reference to this gap registry | |
1997 $query : [integer] coordionate to change | |
1998 | |
1999 =cut | |
2000 | |
2001 sub _padded_unpadded { | |
2002 my $list = shift; | |
2003 my $query = shift; | |
2004 | |
2005 my $align = &_nof_gaps($list,$query); | |
2006 $query-- if (defined($list->[$align]) && ($list->[$align] == $query)); | |
2007 $query = $query - $align; | |
2008 # | |
2009 return $query; | |
2010 } | |
2011 | |
2012 =head2 _unpadded_padded | |
2013 | |
2014 Title : _unpadded_padded | |
2015 Usage : _unpadded_padded($array_ref, $query) | |
2016 Function: | |
2017 | |
2018 Returns the value corresponding to | |
2019 ungapped position $query when gaps are | |
2020 counted as valid sites in a sequence | |
2021 | |
2022 Returns : | |
2023 Args : $array_ref = a reference to this sequence's gap registry | |
2024 $query = [integer] location to change | |
2025 | |
2026 =cut | |
2027 | |
2028 #' | |
2029 sub _unpadded_padded { | |
2030 my $list = shift; | |
2031 my $query = shift; | |
2032 | |
2033 my $align = &_nof_gaps($list,$query); | |
2034 $query = $query + $align; | |
2035 my $new_align = &_nof_gaps($list,$query); | |
2036 while ($new_align - $align > 0) { | |
2037 $query = $query + $new_align - $align; | |
2038 $align = $new_align; | |
2039 $new_align = &_nof_gaps($list,$query); | |
2040 } | |
2041 # If current position is also a align, look for the first upstream base | |
2042 while (defined($list->[$align]) && ($list->[$align] == $query)) { | |
2043 $query++; $align++; | |
2044 } | |
2045 # | |
2046 return $query; | |
2047 } | |
2048 | |
2049 =head2 _register_gaps | |
2050 | |
2051 Title : _register_gaps | |
2052 Usage : $self->_register_gaps($seq, $array_ref) | |
2053 Function: stores gap locations for a sequence | |
2054 Returns : number of gaps found | |
2055 Args : | |
2056 $seq : sequence string | |
2057 $array_ref : a reference to an array, | |
2058 where gap locations will | |
2059 be stored | |
2060 | |
2061 =cut | |
2062 | |
2063 sub _register_gaps { | |
2064 my $self = shift; | |
2065 my $sequence = shift; | |
2066 my $dbref = shift; | |
2067 | |
2068 $self->throw("Not an aligned sequence string to register gaps") | |
2069 if (ref($sequence)); | |
2070 | |
2071 $self->throw("Not an array reference for gap registry") | |
2072 unless (ref($dbref) eq 'ARRAY'); | |
2073 | |
2074 # Registering alignments | |
2075 @{$dbref} = (); # Cleaning registry | |
2076 if (defined $sequence) { | |
2077 my $i = -1; | |
2078 while(1) { | |
2079 $i = index($sequence,"-",$i+1); | |
2080 last if ($i == -1); | |
2081 push(@{$dbref},$i+1); | |
2082 } | |
2083 } else { | |
2084 # $self->warn("Found undefined sequence while registering gaps"); | |
2085 return 0; | |
2086 } | |
2087 | |
2088 return scalar(@{$dbref}); | |
2089 } | |
2090 | |
2091 1; |