0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::BaseAlignFeature - Baseclass providing a common abstract
|
|
24 implmentation for alignment features
|
|
25
|
|
26 =head1 SYNOPSIS
|
|
27
|
|
28 my $feat = new Bio::EnsEMBL::DnaPepAlignFeature(
|
|
29 -slice => $slice,
|
|
30 -start => 100,
|
|
31 -end => 120,
|
|
32 -strand => 1,
|
|
33 -hseqname => 'SP:RF1231',
|
|
34 -hstart => 200,
|
|
35 -hend => 220,
|
|
36 -analysis => $analysis,
|
|
37 -cigar_string => '10M3D5M2I'
|
|
38 );
|
|
39
|
|
40 Alternatively if you have an array of ungapped features
|
|
41
|
|
42 my $feat =
|
|
43 new Bio::EnsEMBL::DnaPepAlignFeature( -features => \@features );
|
|
44
|
|
45 Where @features is an array of Bio::EnsEMBL::FeaturePair
|
|
46
|
|
47 There is a method to manipulate the cigar_string into ungapped features
|
|
48
|
|
49 my @ungapped_features = $feat->ungapped_features();
|
|
50
|
|
51 This converts the cigar string into an array of Bio::EnsEMBL::FeaturePair
|
|
52
|
|
53 $analysis is a Bio::EnsEMBL::Analysis object
|
|
54
|
|
55 Bio::EnsEMBL::Feature methods can be used
|
|
56 Bio::EnsEMBL::FeaturePair methods can be used
|
|
57
|
|
58 The cigar_string contains the ungapped pieces that make up the gapped
|
|
59 alignment
|
|
60
|
|
61 It looks like: n Matches [ x Deletes or Inserts m Matches ]*
|
|
62 but a bit more condensed like "23M4I12M2D1M"
|
|
63 and evenmore condensed as you can ommit 1s "23M4I12M2DM"
|
|
64
|
|
65
|
|
66 To make things clearer this is how a blast HSP would be parsed
|
|
67
|
|
68 >AK014066
|
|
69 Length = 146
|
|
70
|
|
71 Minus Strand HSPs:
|
|
72
|
|
73 Score = 76 (26.8 bits), Expect = 1.4, P = 0.74
|
|
74 Identities = 20/71 (28%), Positives = 29/71 (40%), Frame = -1
|
|
75
|
|
76 Query: 479 GLQAPPPTPQGCRLIPPPPLGLQAPLPTLRAVGSSHHHP*GRQGSSLSSFRSSLASKASA 300
|
|
77 G APPP PQG R P P G + P L + + ++ R +A +
|
|
78 Sbjct: 7 GALAPPPAPQG-RWAFPRPTG-KRPATPLHGTARQDRQVRRSEAAKVTGCRGRVAPHVAP 64
|
|
79
|
|
80 Query: 299 SSPHNPSPLPS 267
|
|
81 H P+P P+
|
|
82 Sbjct: 65 PLTHTPTPTPT 75
|
|
83
|
|
84 The alignment goes from 267 to 479 in sequence 1 and 7 to 75 in sequence 2
|
|
85 and the strand is -1.
|
|
86
|
|
87 The alignment is made up of the following ungapped pieces :
|
|
88
|
|
89 sequence 1 start 447 , sequence 2 start 7 , match length 33 , strand -1
|
|
90 sequence 1 start 417 , sequence 2 start 18 , match length 27 , strand -1
|
|
91 sequence 1 start 267 , sequence 2 start 27 , match length 137 , strand -1
|
|
92
|
|
93 These ungapped pieces are made up into the following string (called a cigar
|
|
94 string) "33M3I27M3I137M" with start 267 end 479 strand -1 hstart 7 hend 75
|
|
95 hstrand 1 and feature type would be DnaPepAlignFeature
|
|
96
|
|
97 =cut
|
|
98
|
|
99
|
|
100 package Bio::EnsEMBL::BaseAlignFeature;
|
|
101
|
|
102 use Bio::EnsEMBL::FeaturePair;
|
|
103 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
|
|
104 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
105 use Scalar::Util qw(weaken isweak);
|
|
106
|
|
107 use vars qw(@ISA);
|
|
108 use strict;
|
|
109
|
|
110 @ISA = qw(Bio::EnsEMBL::FeaturePair);
|
|
111
|
|
112
|
|
113 =head2 new
|
|
114
|
|
115 Arg [..] : List of named arguments. (-cigar_string , -features) defined
|
|
116 in this constructor, others defined in FeaturePair and
|
|
117 SeqFeature superclasses. Either cigar_string or a list
|
|
118 of ungapped features should be provided - not both.
|
|
119 Example : $baf = new BaseAlignFeatureSubclass(-cigar_string => '3M3I12M');
|
|
120 Description: Creates a new BaseAlignFeature using either a cigarstring or
|
|
121 a list of ungapped features. BaseAlignFeature is an abstract
|
|
122 baseclass and should not actually be instantiated - rather its
|
|
123 subclasses should be.
|
|
124 Returntype : Bio::EnsEMBL::BaseAlignFeature
|
|
125 Exceptions : thrown if both feature and cigar string args are provided
|
|
126 thrown if neither feature nor cigar string args are provided
|
|
127 Caller : general
|
|
128 Status : Stable
|
|
129
|
|
130 =cut
|
|
131
|
|
132 sub new {
|
|
133
|
|
134 my $caller = shift;
|
|
135
|
|
136 my $class = ref($caller) || $caller;
|
|
137
|
|
138 my $self = $class->SUPER::new(@_);
|
|
139
|
|
140 my ($cigar_string,$features) = rearrange([qw(CIGAR_STRING FEATURES)], @_);
|
|
141
|
|
142 if (defined($cigar_string) && defined($features)) {
|
|
143 throw("CIGAR_STRING or FEATURES argument is required - not both.");
|
|
144 } elsif (defined($features)) {
|
|
145 $self->_parse_features($features);
|
|
146
|
|
147 } elsif (defined($cigar_string)) {
|
|
148 $self->{'cigar_string'} = $cigar_string;
|
|
149 } else {
|
|
150 throw("CIGAR_STRING or FEATURES argument is required");
|
|
151 }
|
|
152
|
|
153 return $self;
|
|
154 }
|
|
155
|
|
156
|
|
157 =head2 new_fast
|
|
158
|
|
159 Arg [1] : hashref $hashref
|
|
160 A hashref which will be blessed into a PepDnaAlignFeature.
|
|
161 Example : none
|
|
162 Description: This allows for very fast object creation when a large number
|
|
163 of PepDnaAlignFeatures needs to be created. This is a bit of
|
|
164 a hack but necessary when thousands of features need to be
|
|
165 generated within a couple of seconds for web display. It is
|
|
166 not recommended that this method be called unless you know what
|
|
167 you are doing. It requires knowledge of the internals of this
|
|
168 class and its superclasses.
|
|
169 Returntype : Bio::EnsEMBL::BaseAlignFeature
|
|
170 Exceptions : none
|
|
171 Caller : none currently
|
|
172 Status : Stable
|
|
173
|
|
174 =cut
|
|
175
|
|
176 sub new_fast {
|
|
177 my ($class, $hashref) = @_;
|
|
178 my $self = bless $hashref, $class;
|
|
179 weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) );
|
|
180 return $self;
|
|
181 }
|
|
182
|
|
183
|
|
184 =head2 cigar_string
|
|
185
|
|
186 Arg [1] : string $cigar_string
|
|
187 Example : $feature->cigar_string( "12MI3M" );
|
|
188 Description: get/set for attribute cigar_string
|
|
189 cigar_string describes the alignment. "xM" stands for
|
|
190 x matches (mismatches), "xI" for inserts into query sequence
|
|
191 (thats the ensembl sequence), "xD" for deletions
|
|
192 (inserts in the subject). an "x" that is 1 can be omitted.
|
|
193 Returntype : string
|
|
194 Exceptions : none
|
|
195 Caller : general
|
|
196 Status : Stable
|
|
197
|
|
198 =cut
|
|
199
|
|
200 sub cigar_string {
|
|
201 my $self = shift;
|
|
202 $self->{'cigar_string'} = shift if(@_);
|
|
203 return $self->{'cigar_string'};
|
|
204 }
|
|
205
|
|
206
|
|
207 =head2 alignment_length
|
|
208
|
|
209 Arg [1] : None
|
|
210 Description: return the alignment length (including indels) based on the cigar_string
|
|
211 Returntype : int
|
|
212 Exceptions :
|
|
213 Caller :
|
|
214 Status : Stable
|
|
215
|
|
216 =cut
|
|
217
|
|
218 sub alignment_length {
|
|
219 my $self = shift;
|
|
220
|
|
221 if (! defined $self->{'_alignment_length'} && defined $self->cigar_string) {
|
|
222
|
|
223 my @pieces = ( $self->cigar_string =~ /(\d*[MDI])/g );
|
|
224 unless (@pieces) {
|
|
225 print STDERR "Error parsing cigar_string\n";
|
|
226 }
|
|
227 my $alignment_length = 0;
|
|
228 foreach my $piece (@pieces) {
|
|
229 my ($length) = ( $piece =~ /^(\d*)/ );
|
|
230 if (! defined $length || $length eq "") {
|
|
231 $length = 1;
|
|
232 }
|
|
233 $alignment_length += $length;
|
|
234 }
|
|
235 $self->{'_alignment_length'} = $alignment_length;
|
|
236 }
|
|
237 return $self->{'_alignment_length'};
|
|
238 }
|
|
239
|
|
240 =head2 ungapped_features
|
|
241
|
|
242 Args : none
|
|
243 Example : @ungapped_features = $align_feature->get_feature
|
|
244 Description: converts the internal cigar_string into an array of
|
|
245 ungapped feature pairs
|
|
246 Returntype : list of Bio::EnsEMBL::FeaturePair
|
|
247 Exceptions : cigar_string not set internally
|
|
248 Caller : general
|
|
249 Status : Stable
|
|
250
|
|
251 =cut
|
|
252
|
|
253 sub ungapped_features {
|
|
254 my ($self) = @_;
|
|
255
|
|
256 if (!defined($self->{'cigar_string'})) {
|
|
257 throw("No cigar_string defined. Can't return ungapped features");
|
|
258 }
|
|
259
|
|
260 return @{$self->_parse_cigar()};
|
|
261 }
|
|
262
|
|
263 =head2 strands_reversed
|
|
264
|
|
265 Arg [1] : int $strands_reversed
|
|
266 Description: get/set for attribute strands_reversed
|
|
267 0 means that strand and hstrand are the original strands obtained
|
|
268 from the alignment program used
|
|
269 1 means that strand and hstrand have been flipped as compared to
|
|
270 the original result provided by the alignment program used.
|
|
271 You may want to use the reverse_complement method to restore the
|
|
272 original strandness.
|
|
273 Returntype : int
|
|
274 Exceptions : none
|
|
275 Caller : general
|
|
276 Status : Stable
|
|
277
|
|
278 =cut
|
|
279
|
|
280 sub strands_reversed {
|
|
281 my ($self, $arg) = @_;
|
|
282
|
|
283 if ( defined $arg ) {
|
|
284 $self->{'strands_reversed'} = $arg ;
|
|
285 }
|
|
286
|
|
287 $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'});
|
|
288
|
|
289 return $self->{'strands_reversed'};
|
|
290 }
|
|
291
|
|
292 =head2 reverse_complement
|
|
293
|
|
294 Args : none
|
|
295 Description: reverse complement the FeaturePair,
|
|
296 modifing strand, hstrand and cigar_string in consequence
|
|
297 Returntype : none
|
|
298 Exceptions : none
|
|
299 Caller : general
|
|
300 Status : Stable
|
|
301
|
|
302 =cut
|
|
303
|
|
304 sub reverse_complement {
|
|
305 my ($self) = @_;
|
|
306
|
|
307 # reverse strand in both sequences
|
|
308 $self->strand($self->strand * -1);
|
|
309 $self->hstrand($self->hstrand * -1);
|
|
310
|
|
311 # reverse cigar_string as consequence
|
|
312 my $cigar_string = $self->cigar_string;
|
|
313 $cigar_string =~ s/(D|I|M)/$1 /g;
|
|
314 my @cigar_pieces = split / /,$cigar_string;
|
|
315 $cigar_string = "";
|
|
316 while (my $piece = pop @cigar_pieces) {
|
|
317 $cigar_string .= $piece;
|
|
318 }
|
|
319
|
|
320 $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'});
|
|
321
|
|
322 if ($self->strands_reversed) {
|
|
323 $self->strands_reversed(0)
|
|
324 } else {
|
|
325 $self->strands_reversed(1);
|
|
326 }
|
|
327
|
|
328 $self->cigar_string($cigar_string);
|
|
329 }
|
|
330
|
|
331
|
|
332
|
|
333 =head2 transform
|
|
334
|
|
335 Arg 1 : String $coordinate_system_name
|
|
336 Arg [2] : String $coordinate_system_version
|
|
337 Example : $feature = $feature->transform('contig');
|
|
338 $feature = $feature->transform('chromosome', 'NCBI33');
|
|
339 Description: Moves this AlignFeature to the given coordinate system.
|
|
340 If the feature cannot be transformed to the destination
|
|
341 coordinate system undef is returned instead.
|
|
342 Returntype : Bio::EnsEMBL::BaseAlignFeature;
|
|
343 Exceptions : wrong parameters
|
|
344 Caller : general
|
|
345 Status : Medium Risk
|
|
346 : deprecation needs to be removed at some time
|
|
347
|
|
348 =cut
|
|
349
|
|
350 sub transform {
|
|
351 my $self = shift;
|
|
352
|
|
353 # catch for old style transform calls
|
|
354 if( ref $_[0] eq 'HASH') {
|
|
355 deprecate("Calling transform with a hashref is deprecate.\n" .
|
|
356 'Use $feat->transfer($slice) or ' .
|
|
357 '$feat->transform("coordsysname") instead.');
|
|
358 my (undef, $new_feat) = each(%{$_[0]});
|
|
359 return $self->transfer($new_feat->slice);
|
|
360 }
|
|
361
|
|
362 my $new_feature = $self->SUPER::transform(@_);
|
|
363 if ( !defined($new_feature)
|
|
364 || $new_feature->length() != $self->length() )
|
|
365 {
|
|
366 my @segments = @{ $self->project(@_) };
|
|
367
|
|
368 if ( !@segments ) {
|
|
369 return undef;
|
|
370 }
|
|
371
|
|
372 my @ungapped;
|
|
373 foreach my $f ( $self->ungapped_features() ) {
|
|
374 $f = $f->transform(@_);
|
|
375 if ( defined($f) ) {
|
|
376 push( @ungapped, $f );
|
|
377 } else {
|
|
378 warning( "Failed to transform alignment feature; "
|
|
379 . "ungapped component could not be transformed" );
|
|
380 return undef;
|
|
381 }
|
|
382 }
|
|
383
|
|
384 eval { $new_feature = $self->new( -features => \@ungapped ); };
|
|
385
|
|
386 if ($@) {
|
|
387 warning($@);
|
|
388 return undef;
|
|
389 }
|
|
390 } ## end if ( !defined($new_feature...))
|
|
391
|
|
392 return $new_feature;
|
|
393 }
|
|
394
|
|
395
|
|
396 =head2 _parse_cigar
|
|
397
|
|
398 Args : none
|
|
399 Description: PRIVATE (internal) method - creates ungapped features from
|
|
400 internally stored cigar line
|
|
401 Returntype : list of Bio::EnsEMBL::FeaturePair
|
|
402 Exceptions : none
|
|
403 Caller : ungapped_features
|
|
404 Status : Stable
|
|
405
|
|
406 =cut
|
|
407
|
|
408 sub _parse_cigar {
|
|
409 my ( $self ) = @_;
|
|
410
|
|
411 my $query_unit = $self->_query_unit();
|
|
412 my $hit_unit = $self->_hit_unit();
|
|
413
|
|
414 my $string = $self->{'cigar_string'};
|
|
415
|
|
416 throw("No cigar string defined in object") if(!defined($string));
|
|
417
|
|
418 my @pieces = ( $string =~ /(\d*[MDI])/g );
|
|
419 #print "cigar: ",join ( ",", @pieces ),"\n";
|
|
420
|
|
421 my @features;
|
|
422 my $strand1 = $self->{'strand'} || 1;
|
|
423 my $strand2 = $self->{'hstrand'}|| 1;
|
|
424
|
|
425 my ( $start1, $start2 );
|
|
426
|
|
427 if( $strand1 == 1 ) {
|
|
428 $start1 = $self->{'start'};
|
|
429 } else {
|
|
430 $start1 = $self->{'end'};
|
|
431 }
|
|
432
|
|
433 if( $strand2 == 1 ) {
|
|
434 $start2 = $self->{'hstart'};
|
|
435 } else {
|
|
436 $start2 = $self->{'hend'};
|
|
437 }
|
|
438
|
|
439 #
|
|
440 # Construct ungapped blocks as FeaturePairs objects for each MATCH
|
|
441 #
|
|
442 foreach my $piece (@pieces) {
|
|
443
|
|
444 my ($length) = ( $piece =~ /^(\d*)/ );
|
|
445 if( $length eq "" ) { $length = 1 }
|
|
446 my $mapped_length;
|
|
447
|
|
448 # explicit if statements to avoid rounding problems
|
|
449 # and make sure we have sane coordinate systems
|
|
450 if( $query_unit == 1 && $hit_unit == 3 ) {
|
|
451 $mapped_length = $length*3;
|
|
452 } elsif( $query_unit == 3 && $hit_unit == 1 ) {
|
|
453 $mapped_length = $length / 3;
|
|
454 } elsif ( $query_unit == 1 && $hit_unit == 1 ) {
|
|
455 $mapped_length = $length;
|
|
456 } else {
|
|
457 throw("Internal error $query_unit $hit_unit, currently only " .
|
|
458 "allowing 1 or 3 ");
|
|
459 }
|
|
460
|
|
461 if( int($mapped_length) != $mapped_length and
|
|
462 ($piece =~ /M$/ or $piece =~ /D$/)) {
|
|
463 throw("Internal error with mismapped length of hit, query " .
|
|
464 "$query_unit, hit $hit_unit, length $length");
|
|
465 }
|
|
466
|
|
467 if( $piece =~ /M$/ ) {
|
|
468 #
|
|
469 # MATCH
|
|
470 #
|
|
471 my ( $qstart, $qend);
|
|
472 if( $strand1 == 1 ) {
|
|
473 $qstart = $start1;
|
|
474 $qend = $start1 + $length - 1;
|
|
475 $start1 = $qend + 1;
|
|
476 } else {
|
|
477 $qend = $start1;
|
|
478 $qstart = $start1 - $length + 1;
|
|
479 $start1 = $qstart - 1;
|
|
480 }
|
|
481
|
|
482 my ($hstart, $hend);
|
|
483 if( $strand2 == 1 ) {
|
|
484 $hstart = $start2;
|
|
485 $hend = $start2 + $mapped_length - 1;
|
|
486 $start2 = $hend + 1;
|
|
487 } else {
|
|
488 $hend = $start2;
|
|
489 $hstart = $start2 - $mapped_length + 1;
|
|
490 $start2 = $hstart - 1;
|
|
491 }
|
|
492
|
|
493
|
|
494 push @features, Bio::EnsEMBL::FeaturePair->new
|
|
495 (-SLICE => $self->{'slice'},
|
|
496 -SEQNAME => $self->{'seqname'},
|
|
497 -START => $qstart,
|
|
498 -END => $qend,
|
|
499 -STRAND => $strand1,
|
|
500 -HSLICE => $self->{'hslice'},
|
|
501 -HSEQNAME => $self->{'hseqname'},
|
|
502 -HSTART => $hstart,
|
|
503 -HEND => $hend,
|
|
504 -HSTRAND => $strand2,
|
|
505 -SCORE => $self->{'score'},
|
|
506 -PERCENT_ID => $self->{'percent_id'},
|
|
507 -ANALYSIS => $self->{'analysis'},
|
|
508 -P_VALUE => $self->{'p_value'},
|
|
509 -EXTERNAL_DB_ID => $self->{'external_db_id'},
|
|
510 -HCOVERAGE => $self->{'hcoverage'},
|
|
511 -GROUP_ID => $self->{'group_id'},
|
|
512 -LEVEL_ID => $self->{'level_id'});
|
|
513
|
|
514
|
|
515 # end M cigar bits
|
|
516 } elsif( $piece =~ /I$/ ) {
|
|
517 #
|
|
518 # INSERT
|
|
519 #
|
|
520 if( $strand1 == 1 ) {
|
|
521 $start1 += $length;
|
|
522 } else {
|
|
523 $start1 -= $length;
|
|
524 }
|
|
525 } elsif( $piece =~ /D$/ ) {
|
|
526 #
|
|
527 # DELETION
|
|
528 #
|
|
529 if( $strand2 == 1 ) {
|
|
530 $start2 += $mapped_length;
|
|
531 } else {
|
|
532 $start2 -= $mapped_length;
|
|
533 }
|
|
534 } else {
|
|
535 throw( "Illegal cigar line $string!" );
|
|
536 }
|
|
537 }
|
|
538
|
|
539 return \@features;
|
|
540 }
|
|
541
|
|
542
|
|
543
|
|
544
|
|
545 =head2 _parse_features
|
|
546
|
|
547 Arg [1] : listref Bio::EnsEMBL::FeaturePair $ungapped_features
|
|
548 Description: creates internal cigarstring and start,end hstart,hend
|
|
549 entries.
|
|
550 Returntype : none, fills in values of self
|
|
551 Exceptions : argument list undergoes many sanity checks - throws under many
|
|
552 invalid conditions
|
|
553 Caller : new
|
|
554 Status : Stable
|
|
555
|
|
556 =cut
|
|
557
|
|
558 my $message_only_once = 1;
|
|
559
|
|
560 sub _parse_features {
|
|
561 my ($self,$features ) = @_;
|
|
562
|
|
563 my $query_unit = $self->_query_unit();
|
|
564 my $hit_unit = $self->_hit_unit();
|
|
565
|
|
566 if (ref($features) ne "ARRAY") {
|
|
567 throw("features must be an array reference not a [".ref($features)."]");
|
|
568 }
|
|
569
|
|
570 my $strand = $features->[0]->strand;
|
|
571
|
|
572 throw ('FeaturePair needs to have strand == 1 or strand == -1') if(!$strand);
|
|
573
|
|
574 my @f;
|
|
575
|
|
576 #
|
|
577 # Sort the features on their start position
|
|
578 # Ascending order on positive strand, descending on negative strand
|
|
579 #
|
|
580 if( $strand == 1 ) {
|
|
581 @f = sort {$a->start <=> $b->start} @$features;
|
|
582 } else {
|
|
583 @f = sort { $b->start <=> $a->start} @$features;
|
|
584 }
|
|
585
|
|
586 my $hstrand = $f[0]->hstrand;
|
|
587 my $slice = $f[0]->slice();
|
|
588 my $hslice = $f[0]->hslice();
|
|
589 my $name = $slice->name() if($slice);
|
|
590 my $hname = $f[0]->hseqname;
|
|
591 my $score = $f[0]->score;
|
|
592 my $percent = $f[0]->percent_id;
|
|
593 my $analysis = $f[0]->analysis;
|
|
594 my $pvalue = $f[0]->p_value();
|
|
595 my $external_db_id = $f[0]->external_db_id;
|
|
596 my $hcoverage = $f[0]->hcoverage;
|
|
597 my $group_id = $f[0]->group_id;
|
|
598 my $level_id = $f[0]->level_id;
|
|
599
|
|
600 my $seqname = $f[0]->seqname;
|
|
601 # implicit strand 1 for peptide sequences
|
|
602 $strand ||= 1;
|
|
603 $hstrand ||= 1;
|
|
604 my $ori = $strand * $hstrand;
|
|
605
|
|
606 throw("No features in the array to parse") if(scalar(@f) == 0);
|
|
607
|
|
608 my $prev1; # where last feature q part ended
|
|
609 my $prev2; # where last feature s part ended
|
|
610
|
|
611 my $string;
|
|
612
|
|
613 # Use strandedness info of query and hit to make sure both sets of
|
|
614 # start and end coordinates are oriented the right way around.
|
|
615 my $f1start;
|
|
616 my $f1end;
|
|
617 my $f2end;
|
|
618 my $f2start;
|
|
619
|
|
620 if ($strand == 1) {
|
|
621 $f1start = $f[0]->start;
|
|
622 $f1end = $f[-1]->end;
|
|
623 } else {
|
|
624 $f1start = $f[-1]->start;
|
|
625 $f1end = $f[0]->end;
|
|
626 }
|
|
627
|
|
628 if ($hstrand == 1) {
|
|
629 $f2start = $f[0]->hstart;
|
|
630 $f2end = $f[-1]->hend;
|
|
631 } else {
|
|
632 $f2start = $f[-1]->hstart;
|
|
633 $f2end = $f[0]->hend;
|
|
634 }
|
|
635
|
|
636 #
|
|
637 # Loop through each portion of alignment and construct cigar string
|
|
638 #
|
|
639
|
|
640 foreach my $f (@f) {
|
|
641 #
|
|
642 # Sanity checks
|
|
643 #
|
|
644
|
|
645 if (!$f->isa("Bio::EnsEMBL::FeaturePair")) {
|
|
646 throw("Array element [$f] is not a Bio::EnsEMBL::FeaturePair");
|
|
647 }
|
|
648 if( defined($f->hstrand()) && $f->hstrand() != $hstrand ) {
|
|
649 throw("Inconsistent hstrands in feature array");
|
|
650 }
|
|
651 if( defined($f->strand()) && ($f->strand != $strand)) {
|
|
652 throw("Inconsistent strands in feature array");
|
|
653 }
|
|
654 if ( defined($name) && $name ne $f->slice->name()) {
|
|
655 throw("Inconsistent names in feature array [$name - ".
|
|
656 $f->slice->name()."]");
|
|
657 }
|
|
658 if ( defined($hname) && $hname ne $f->hseqname) {
|
|
659 throw("Inconsistent hit names in feature array [$hname - ".
|
|
660 $f->hseqname . "]");
|
|
661 }
|
|
662 if ( defined($score) && $score ne $f->score) {
|
|
663 throw("Inconsisent scores in feature array [$score - " .
|
|
664 $f->score . "]");
|
|
665 }
|
|
666 if (defined($f->percent_id) && $percent ne $f->percent_id) {
|
|
667 throw("Inconsistent pids in feature array [$percent - " .
|
|
668 $f->percent_id . "]");
|
|
669 }
|
|
670 if(defined($pvalue) && $pvalue != $f->p_value()) {
|
|
671 throw("Inconsistant p_values in feature arraw [$pvalue " .
|
|
672 $f->p_value() . "]");
|
|
673 }
|
|
674 if($seqname && $seqname ne $f->seqname){
|
|
675 throw("Inconsistent seqname in feature array [$seqname - ".
|
|
676 $f->seqname . "]");
|
|
677 }
|
|
678 my $start1 = $f->start; #source sequence alignment start
|
|
679 my $start2 = $f->hstart(); #hit sequence alignment start
|
|
680
|
|
681 #
|
|
682 # More sanity checking
|
|
683 #
|
|
684 if (defined($prev1)) {
|
|
685 if ( $strand == 1 ) {
|
|
686 if ($f->start < $prev1) {
|
|
687 throw("Inconsistent coords in feature array (forward strand).\n" .
|
|
688 "Start [".$f->start()."] in current feature should be greater " .
|
|
689 "than previous feature end [$prev1].");
|
|
690 }
|
|
691 } else {
|
|
692 if ($f->end > $prev1) {
|
|
693 throw("Inconsistent coords in feature array (reverse strand).\n" .
|
|
694 "End [".$f->end() ."] should be less than previous feature " .
|
|
695 "start [$prev1].");
|
|
696 }
|
|
697 }
|
|
698 }
|
|
699
|
|
700 my $length = ($f->end - $f->start + 1); #length of source seq alignment
|
|
701 my $hlength = ($f->hend - $f->hstart + 1); #length of hit seq alignment
|
|
702
|
|
703 # using multiplication to avoid rounding errors, hence the
|
|
704 # switch from query to hit for the ratios
|
|
705
|
|
706 #
|
|
707 # Yet more sanity checking
|
|
708 #
|
|
709 if($query_unit > $hit_unit){
|
|
710 # I am going to make the assumption here that this situation will
|
|
711 # only occur with DnaPepAlignFeatures, this may not be true
|
|
712 my $query_p_length = sprintf "%.0f", ($length/$query_unit);
|
|
713 my $hit_p_length = sprintf "%.0f", ($hlength * $hit_unit);
|
|
714 if( $query_p_length != $hit_p_length) {
|
|
715 throw( "Feature lengths not comparable Lengths:" .$length .
|
|
716 " " . $hlength . " Ratios:" . $query_unit . " " .
|
|
717 $hit_unit );
|
|
718 }
|
|
719 } else{
|
|
720 my $query_d_length = sprintf "%.0f", ($length*$hit_unit);
|
|
721 my $hit_d_length = sprintf "%.0f", ($hlength * $query_unit);
|
|
722 if( $length * $hit_unit != $hlength * $query_unit ) {
|
|
723 throw( "Feature lengths not comparable Lengths:" . $length .
|
|
724 " " . $hlength . " Ratios:" . $query_unit . " " .
|
|
725 $hit_unit );
|
|
726 }
|
|
727 }
|
|
728
|
|
729 my $hlengthfactor = ($query_unit/$hit_unit);
|
|
730
|
|
731 #
|
|
732 # Check to see if there is an I type (insertion) gap:
|
|
733 # If there is a space between the end of the last source sequence
|
|
734 # alignment and the start of this one, then this is an insertion
|
|
735 #
|
|
736
|
|
737 my $insertion_flag = 0;
|
|
738 if( $strand == 1 ) {
|
|
739 if( ( defined $prev1 ) && ( $f->start > $prev1 + 1 )) {
|
|
740
|
|
741 #there is an insertion
|
|
742 $insertion_flag = 1;
|
|
743 my $gap = $f->start - $prev1 - 1;
|
|
744 if( $gap == 1 ) {
|
|
745 $gap = ""; # no need for a number if gap length is 1
|
|
746 }
|
|
747 $string .= "$gap"."I";
|
|
748
|
|
749 }
|
|
750
|
|
751 #shift our position in the source seq alignment
|
|
752 $prev1 = $f->end();
|
|
753 } else {
|
|
754
|
|
755 if(( defined $prev1 ) && ($f->end + 1 < $prev1 )) {
|
|
756
|
|
757 #there is an insertion
|
|
758 $insertion_flag = 1;
|
|
759 my $gap = $prev1 - $f->end() - 1;
|
|
760 if( $gap == 1 ) {
|
|
761 $gap = ""; # no need for a number if gap length is 1
|
|
762 }
|
|
763 $string .= "$gap"."I";
|
|
764 }
|
|
765
|
|
766 #shift our position in the source seq alignment
|
|
767 $prev1 = $f->start();
|
|
768 }
|
|
769
|
|
770 #
|
|
771 # Check to see if there is a D type (deletion) gap
|
|
772 # There is a deletion gap if there is a space between the end of the
|
|
773 # last portion of the hit sequence alignment and this one
|
|
774 #
|
|
775 if( $hstrand == 1 ) {
|
|
776 if(( defined $prev2 ) && ( $f->hstart() > $prev2 + 1 )) {
|
|
777
|
|
778 #there is a deletion
|
|
779 my $gap = $f->hstart - $prev2 - 1;
|
|
780 my $gap2 = int( $gap * $hlengthfactor + 0.5 );
|
|
781
|
|
782 if( $gap2 == 1 ) {
|
|
783 $gap2 = ""; # no need for a number if gap length is 1
|
|
784 }
|
|
785 $string .= "$gap2"."D";
|
|
786
|
|
787 #sanity check, Should not be an insertion and deletion
|
|
788 if($insertion_flag) {
|
|
789 if ($message_only_once) {
|
|
790 warning("Should not be an deletion and insertion on the " .
|
|
791 "same alignment region. cigar_line=$string\n");
|
|
792 $message_only_once = 0;
|
|
793 }
|
|
794 }
|
|
795 }
|
|
796 #shift our position in the hit seq alignment
|
|
797 $prev2 = $f->hend();
|
|
798
|
|
799 } else {
|
|
800 if( ( defined $prev2 ) && ( $f->hend() + 1 < $prev2 )) {
|
|
801
|
|
802 #there is a deletion
|
|
803 my $gap = $prev2 - $f->hend - 1;
|
|
804 my $gap2 = int( $gap * $hlengthfactor + 0.5 );
|
|
805
|
|
806 if( $gap2 == 1 ) {
|
|
807 $gap2 = ""; # no need for a number if gap length is 1
|
|
808 }
|
|
809 $string .= "$gap2"."D";
|
|
810
|
|
811 #sanity check, Should not be an insertion and deletion
|
|
812 if($insertion_flag) {
|
|
813 if ($message_only_once) {
|
|
814 warning("Should not be an deletion and insertion on the " .
|
|
815 "same alignment region. prev2 = $prev2; f->hend() = " .
|
|
816 $f->hend() . "; cigar_line = $string;\n");
|
|
817 $message_only_once = 0;
|
|
818 }
|
|
819 }
|
|
820 }
|
|
821 #shift our position in the hit seq alignment
|
|
822 $prev2 = $f->hstart();
|
|
823 }
|
|
824
|
|
825 my $matchlength = $f->end() - $f->start() + 1;
|
|
826 if( $matchlength == 1 ) {
|
|
827 $matchlength = "";
|
|
828 }
|
|
829 $string .= $matchlength."M";
|
|
830 }
|
|
831
|
|
832 $self->{'start'} = $f1start;
|
|
833 $self->{'end'} = $f1end;
|
|
834 $self->{'seqname'} = $seqname;
|
|
835 $self->{'strand'} = $strand;
|
|
836 $self->{'score'} = $score;
|
|
837 $self->{'percent_id'} = $percent;
|
|
838 $self->{'analysis'} = $analysis;
|
|
839 $self->{'slice'} = $slice;
|
|
840 $self->{'hslice'} = $hslice;
|
|
841 $self->{'hstart'} = $f2start;
|
|
842 $self->{'hend'} = $f2end;
|
|
843 $self->{'hstrand'} = $hstrand;
|
|
844 $self->{'hseqname'} = $hname;
|
|
845 $self->{'cigar_string'} = $string;
|
|
846 $self->{'p_value'} = $pvalue;
|
|
847 $self->{'external_db_id'} = $external_db_id;
|
|
848 $self->{'hcoverage'} = $hcoverage;
|
|
849 $self->{'group_id'} = $group_id;
|
|
850 $self->{'level_id'} = $level_id;
|
|
851 }
|
|
852
|
|
853
|
|
854
|
|
855
|
|
856
|
|
857
|
|
858 =head2 _hit_unit
|
|
859
|
|
860 Args : none
|
|
861 Description: abstract method, overwrite with something that returns
|
|
862 one or three
|
|
863 Returntype : int 1,3
|
|
864 Exceptions : none
|
|
865 Caller : internal
|
|
866 Status : Stable
|
|
867
|
|
868 =cut
|
|
869
|
|
870 sub _hit_unit {
|
|
871 my $self = shift;
|
|
872 throw( "Abstract method call!" );
|
|
873 }
|
|
874
|
|
875
|
|
876
|
|
877 =head2 _query_unit
|
|
878
|
|
879 Args : none
|
|
880 Description: abstract method, overwrite with something that returns
|
|
881 one or three
|
|
882 Returntype : int 1,3
|
|
883 Exceptions : none
|
|
884 Caller : internal
|
|
885 Status : Stable
|
|
886
|
|
887 =cut
|
|
888
|
|
889 sub _query_unit {
|
|
890 my $self = shift;
|
|
891 throw( "Abstract method call!" );
|
|
892 }
|
|
893
|
|
894
|
|
895
|
|
896
|
|
897 1;
|