Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/BaseAlignFeature.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::BaseAlignFeature - Baseclass providing a common abstract | |
24 implmentation for alignment features | |
25 | |
26 =head1 SYNOPSIS | |
27 | |
28 my $feat = new Bio::EnsEMBL::DnaPepAlignFeature( | |
29 -slice => $slice, | |
30 -start => 100, | |
31 -end => 120, | |
32 -strand => 1, | |
33 -hseqname => 'SP:RF1231', | |
34 -hstart => 200, | |
35 -hend => 220, | |
36 -analysis => $analysis, | |
37 -cigar_string => '10M3D5M2I' | |
38 ); | |
39 | |
40 Alternatively if you have an array of ungapped features | |
41 | |
42 my $feat = | |
43 new Bio::EnsEMBL::DnaPepAlignFeature( -features => \@features ); | |
44 | |
45 Where @features is an array of Bio::EnsEMBL::FeaturePair | |
46 | |
47 There is a method to manipulate the cigar_string into ungapped features | |
48 | |
49 my @ungapped_features = $feat->ungapped_features(); | |
50 | |
51 This converts the cigar string into an array of Bio::EnsEMBL::FeaturePair | |
52 | |
53 $analysis is a Bio::EnsEMBL::Analysis object | |
54 | |
55 Bio::EnsEMBL::Feature methods can be used | |
56 Bio::EnsEMBL::FeaturePair methods can be used | |
57 | |
58 The cigar_string contains the ungapped pieces that make up the gapped | |
59 alignment | |
60 | |
61 It looks like: n Matches [ x Deletes or Inserts m Matches ]* | |
62 but a bit more condensed like "23M4I12M2D1M" | |
63 and evenmore condensed as you can ommit 1s "23M4I12M2DM" | |
64 | |
65 | |
66 To make things clearer this is how a blast HSP would be parsed | |
67 | |
68 >AK014066 | |
69 Length = 146 | |
70 | |
71 Minus Strand HSPs: | |
72 | |
73 Score = 76 (26.8 bits), Expect = 1.4, P = 0.74 | |
74 Identities = 20/71 (28%), Positives = 29/71 (40%), Frame = -1 | |
75 | |
76 Query: 479 GLQAPPPTPQGCRLIPPPPLGLQAPLPTLRAVGSSHHHP*GRQGSSLSSFRSSLASKASA 300 | |
77 G APPP PQG R P P G + P L + + ++ R +A + | |
78 Sbjct: 7 GALAPPPAPQG-RWAFPRPTG-KRPATPLHGTARQDRQVRRSEAAKVTGCRGRVAPHVAP 64 | |
79 | |
80 Query: 299 SSPHNPSPLPS 267 | |
81 H P+P P+ | |
82 Sbjct: 65 PLTHTPTPTPT 75 | |
83 | |
84 The alignment goes from 267 to 479 in sequence 1 and 7 to 75 in sequence 2 | |
85 and the strand is -1. | |
86 | |
87 The alignment is made up of the following ungapped pieces : | |
88 | |
89 sequence 1 start 447 , sequence 2 start 7 , match length 33 , strand -1 | |
90 sequence 1 start 417 , sequence 2 start 18 , match length 27 , strand -1 | |
91 sequence 1 start 267 , sequence 2 start 27 , match length 137 , strand -1 | |
92 | |
93 These ungapped pieces are made up into the following string (called a cigar | |
94 string) "33M3I27M3I137M" with start 267 end 479 strand -1 hstart 7 hend 75 | |
95 hstrand 1 and feature type would be DnaPepAlignFeature | |
96 | |
97 =cut | |
98 | |
99 | |
100 package Bio::EnsEMBL::BaseAlignFeature; | |
101 | |
102 use Bio::EnsEMBL::FeaturePair; | |
103 use Bio::EnsEMBL::Utils::Argument qw(rearrange); | |
104 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
105 use Scalar::Util qw(weaken isweak); | |
106 | |
107 use vars qw(@ISA); | |
108 use strict; | |
109 | |
110 @ISA = qw(Bio::EnsEMBL::FeaturePair); | |
111 | |
112 | |
113 =head2 new | |
114 | |
115 Arg [..] : List of named arguments. (-cigar_string , -features) defined | |
116 in this constructor, others defined in FeaturePair and | |
117 SeqFeature superclasses. Either cigar_string or a list | |
118 of ungapped features should be provided - not both. | |
119 Example : $baf = new BaseAlignFeatureSubclass(-cigar_string => '3M3I12M'); | |
120 Description: Creates a new BaseAlignFeature using either a cigarstring or | |
121 a list of ungapped features. BaseAlignFeature is an abstract | |
122 baseclass and should not actually be instantiated - rather its | |
123 subclasses should be. | |
124 Returntype : Bio::EnsEMBL::BaseAlignFeature | |
125 Exceptions : thrown if both feature and cigar string args are provided | |
126 thrown if neither feature nor cigar string args are provided | |
127 Caller : general | |
128 Status : Stable | |
129 | |
130 =cut | |
131 | |
132 sub new { | |
133 | |
134 my $caller = shift; | |
135 | |
136 my $class = ref($caller) || $caller; | |
137 | |
138 my $self = $class->SUPER::new(@_); | |
139 | |
140 my ($cigar_string,$features) = rearrange([qw(CIGAR_STRING FEATURES)], @_); | |
141 | |
142 if (defined($cigar_string) && defined($features)) { | |
143 throw("CIGAR_STRING or FEATURES argument is required - not both."); | |
144 } elsif (defined($features)) { | |
145 $self->_parse_features($features); | |
146 | |
147 } elsif (defined($cigar_string)) { | |
148 $self->{'cigar_string'} = $cigar_string; | |
149 } else { | |
150 throw("CIGAR_STRING or FEATURES argument is required"); | |
151 } | |
152 | |
153 return $self; | |
154 } | |
155 | |
156 | |
157 =head2 new_fast | |
158 | |
159 Arg [1] : hashref $hashref | |
160 A hashref which will be blessed into a PepDnaAlignFeature. | |
161 Example : none | |
162 Description: This allows for very fast object creation when a large number | |
163 of PepDnaAlignFeatures needs to be created. This is a bit of | |
164 a hack but necessary when thousands of features need to be | |
165 generated within a couple of seconds for web display. It is | |
166 not recommended that this method be called unless you know what | |
167 you are doing. It requires knowledge of the internals of this | |
168 class and its superclasses. | |
169 Returntype : Bio::EnsEMBL::BaseAlignFeature | |
170 Exceptions : none | |
171 Caller : none currently | |
172 Status : Stable | |
173 | |
174 =cut | |
175 | |
176 sub new_fast { | |
177 my ($class, $hashref) = @_; | |
178 my $self = bless $hashref, $class; | |
179 weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); | |
180 return $self; | |
181 } | |
182 | |
183 | |
184 =head2 cigar_string | |
185 | |
186 Arg [1] : string $cigar_string | |
187 Example : $feature->cigar_string( "12MI3M" ); | |
188 Description: get/set for attribute cigar_string | |
189 cigar_string describes the alignment. "xM" stands for | |
190 x matches (mismatches), "xI" for inserts into query sequence | |
191 (thats the ensembl sequence), "xD" for deletions | |
192 (inserts in the subject). an "x" that is 1 can be omitted. | |
193 Returntype : string | |
194 Exceptions : none | |
195 Caller : general | |
196 Status : Stable | |
197 | |
198 =cut | |
199 | |
200 sub cigar_string { | |
201 my $self = shift; | |
202 $self->{'cigar_string'} = shift if(@_); | |
203 return $self->{'cigar_string'}; | |
204 } | |
205 | |
206 | |
207 =head2 alignment_length | |
208 | |
209 Arg [1] : None | |
210 Description: return the alignment length (including indels) based on the cigar_string | |
211 Returntype : int | |
212 Exceptions : | |
213 Caller : | |
214 Status : Stable | |
215 | |
216 =cut | |
217 | |
218 sub alignment_length { | |
219 my $self = shift; | |
220 | |
221 if (! defined $self->{'_alignment_length'} && defined $self->cigar_string) { | |
222 | |
223 my @pieces = ( $self->cigar_string =~ /(\d*[MDI])/g ); | |
224 unless (@pieces) { | |
225 print STDERR "Error parsing cigar_string\n"; | |
226 } | |
227 my $alignment_length = 0; | |
228 foreach my $piece (@pieces) { | |
229 my ($length) = ( $piece =~ /^(\d*)/ ); | |
230 if (! defined $length || $length eq "") { | |
231 $length = 1; | |
232 } | |
233 $alignment_length += $length; | |
234 } | |
235 $self->{'_alignment_length'} = $alignment_length; | |
236 } | |
237 return $self->{'_alignment_length'}; | |
238 } | |
239 | |
240 =head2 ungapped_features | |
241 | |
242 Args : none | |
243 Example : @ungapped_features = $align_feature->get_feature | |
244 Description: converts the internal cigar_string into an array of | |
245 ungapped feature pairs | |
246 Returntype : list of Bio::EnsEMBL::FeaturePair | |
247 Exceptions : cigar_string not set internally | |
248 Caller : general | |
249 Status : Stable | |
250 | |
251 =cut | |
252 | |
253 sub ungapped_features { | |
254 my ($self) = @_; | |
255 | |
256 if (!defined($self->{'cigar_string'})) { | |
257 throw("No cigar_string defined. Can't return ungapped features"); | |
258 } | |
259 | |
260 return @{$self->_parse_cigar()}; | |
261 } | |
262 | |
263 =head2 strands_reversed | |
264 | |
265 Arg [1] : int $strands_reversed | |
266 Description: get/set for attribute strands_reversed | |
267 0 means that strand and hstrand are the original strands obtained | |
268 from the alignment program used | |
269 1 means that strand and hstrand have been flipped as compared to | |
270 the original result provided by the alignment program used. | |
271 You may want to use the reverse_complement method to restore the | |
272 original strandness. | |
273 Returntype : int | |
274 Exceptions : none | |
275 Caller : general | |
276 Status : Stable | |
277 | |
278 =cut | |
279 | |
280 sub strands_reversed { | |
281 my ($self, $arg) = @_; | |
282 | |
283 if ( defined $arg ) { | |
284 $self->{'strands_reversed'} = $arg ; | |
285 } | |
286 | |
287 $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'}); | |
288 | |
289 return $self->{'strands_reversed'}; | |
290 } | |
291 | |
292 =head2 reverse_complement | |
293 | |
294 Args : none | |
295 Description: reverse complement the FeaturePair, | |
296 modifing strand, hstrand and cigar_string in consequence | |
297 Returntype : none | |
298 Exceptions : none | |
299 Caller : general | |
300 Status : Stable | |
301 | |
302 =cut | |
303 | |
304 sub reverse_complement { | |
305 my ($self) = @_; | |
306 | |
307 # reverse strand in both sequences | |
308 $self->strand($self->strand * -1); | |
309 $self->hstrand($self->hstrand * -1); | |
310 | |
311 # reverse cigar_string as consequence | |
312 my $cigar_string = $self->cigar_string; | |
313 $cigar_string =~ s/(D|I|M)/$1 /g; | |
314 my @cigar_pieces = split / /,$cigar_string; | |
315 $cigar_string = ""; | |
316 while (my $piece = pop @cigar_pieces) { | |
317 $cigar_string .= $piece; | |
318 } | |
319 | |
320 $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'}); | |
321 | |
322 if ($self->strands_reversed) { | |
323 $self->strands_reversed(0) | |
324 } else { | |
325 $self->strands_reversed(1); | |
326 } | |
327 | |
328 $self->cigar_string($cigar_string); | |
329 } | |
330 | |
331 | |
332 | |
333 =head2 transform | |
334 | |
335 Arg 1 : String $coordinate_system_name | |
336 Arg [2] : String $coordinate_system_version | |
337 Example : $feature = $feature->transform('contig'); | |
338 $feature = $feature->transform('chromosome', 'NCBI33'); | |
339 Description: Moves this AlignFeature to the given coordinate system. | |
340 If the feature cannot be transformed to the destination | |
341 coordinate system undef is returned instead. | |
342 Returntype : Bio::EnsEMBL::BaseAlignFeature; | |
343 Exceptions : wrong parameters | |
344 Caller : general | |
345 Status : Medium Risk | |
346 : deprecation needs to be removed at some time | |
347 | |
348 =cut | |
349 | |
350 sub transform { | |
351 my $self = shift; | |
352 | |
353 # catch for old style transform calls | |
354 if( ref $_[0] eq 'HASH') { | |
355 deprecate("Calling transform with a hashref is deprecate.\n" . | |
356 'Use $feat->transfer($slice) or ' . | |
357 '$feat->transform("coordsysname") instead.'); | |
358 my (undef, $new_feat) = each(%{$_[0]}); | |
359 return $self->transfer($new_feat->slice); | |
360 } | |
361 | |
362 my $new_feature = $self->SUPER::transform(@_); | |
363 if ( !defined($new_feature) | |
364 || $new_feature->length() != $self->length() ) | |
365 { | |
366 my @segments = @{ $self->project(@_) }; | |
367 | |
368 if ( !@segments ) { | |
369 return undef; | |
370 } | |
371 | |
372 my @ungapped; | |
373 foreach my $f ( $self->ungapped_features() ) { | |
374 $f = $f->transform(@_); | |
375 if ( defined($f) ) { | |
376 push( @ungapped, $f ); | |
377 } else { | |
378 warning( "Failed to transform alignment feature; " | |
379 . "ungapped component could not be transformed" ); | |
380 return undef; | |
381 } | |
382 } | |
383 | |
384 eval { $new_feature = $self->new( -features => \@ungapped ); }; | |
385 | |
386 if ($@) { | |
387 warning($@); | |
388 return undef; | |
389 } | |
390 } ## end if ( !defined($new_feature...)) | |
391 | |
392 return $new_feature; | |
393 } | |
394 | |
395 | |
396 =head2 _parse_cigar | |
397 | |
398 Args : none | |
399 Description: PRIVATE (internal) method - creates ungapped features from | |
400 internally stored cigar line | |
401 Returntype : list of Bio::EnsEMBL::FeaturePair | |
402 Exceptions : none | |
403 Caller : ungapped_features | |
404 Status : Stable | |
405 | |
406 =cut | |
407 | |
408 sub _parse_cigar { | |
409 my ( $self ) = @_; | |
410 | |
411 my $query_unit = $self->_query_unit(); | |
412 my $hit_unit = $self->_hit_unit(); | |
413 | |
414 my $string = $self->{'cigar_string'}; | |
415 | |
416 throw("No cigar string defined in object") if(!defined($string)); | |
417 | |
418 my @pieces = ( $string =~ /(\d*[MDI])/g ); | |
419 #print "cigar: ",join ( ",", @pieces ),"\n"; | |
420 | |
421 my @features; | |
422 my $strand1 = $self->{'strand'} || 1; | |
423 my $strand2 = $self->{'hstrand'}|| 1; | |
424 | |
425 my ( $start1, $start2 ); | |
426 | |
427 if( $strand1 == 1 ) { | |
428 $start1 = $self->{'start'}; | |
429 } else { | |
430 $start1 = $self->{'end'}; | |
431 } | |
432 | |
433 if( $strand2 == 1 ) { | |
434 $start2 = $self->{'hstart'}; | |
435 } else { | |
436 $start2 = $self->{'hend'}; | |
437 } | |
438 | |
439 # | |
440 # Construct ungapped blocks as FeaturePairs objects for each MATCH | |
441 # | |
442 foreach my $piece (@pieces) { | |
443 | |
444 my ($length) = ( $piece =~ /^(\d*)/ ); | |
445 if( $length eq "" ) { $length = 1 } | |
446 my $mapped_length; | |
447 | |
448 # explicit if statements to avoid rounding problems | |
449 # and make sure we have sane coordinate systems | |
450 if( $query_unit == 1 && $hit_unit == 3 ) { | |
451 $mapped_length = $length*3; | |
452 } elsif( $query_unit == 3 && $hit_unit == 1 ) { | |
453 $mapped_length = $length / 3; | |
454 } elsif ( $query_unit == 1 && $hit_unit == 1 ) { | |
455 $mapped_length = $length; | |
456 } else { | |
457 throw("Internal error $query_unit $hit_unit, currently only " . | |
458 "allowing 1 or 3 "); | |
459 } | |
460 | |
461 if( int($mapped_length) != $mapped_length and | |
462 ($piece =~ /M$/ or $piece =~ /D$/)) { | |
463 throw("Internal error with mismapped length of hit, query " . | |
464 "$query_unit, hit $hit_unit, length $length"); | |
465 } | |
466 | |
467 if( $piece =~ /M$/ ) { | |
468 # | |
469 # MATCH | |
470 # | |
471 my ( $qstart, $qend); | |
472 if( $strand1 == 1 ) { | |
473 $qstart = $start1; | |
474 $qend = $start1 + $length - 1; | |
475 $start1 = $qend + 1; | |
476 } else { | |
477 $qend = $start1; | |
478 $qstart = $start1 - $length + 1; | |
479 $start1 = $qstart - 1; | |
480 } | |
481 | |
482 my ($hstart, $hend); | |
483 if( $strand2 == 1 ) { | |
484 $hstart = $start2; | |
485 $hend = $start2 + $mapped_length - 1; | |
486 $start2 = $hend + 1; | |
487 } else { | |
488 $hend = $start2; | |
489 $hstart = $start2 - $mapped_length + 1; | |
490 $start2 = $hstart - 1; | |
491 } | |
492 | |
493 | |
494 push @features, Bio::EnsEMBL::FeaturePair->new | |
495 (-SLICE => $self->{'slice'}, | |
496 -SEQNAME => $self->{'seqname'}, | |
497 -START => $qstart, | |
498 -END => $qend, | |
499 -STRAND => $strand1, | |
500 -HSLICE => $self->{'hslice'}, | |
501 -HSEQNAME => $self->{'hseqname'}, | |
502 -HSTART => $hstart, | |
503 -HEND => $hend, | |
504 -HSTRAND => $strand2, | |
505 -SCORE => $self->{'score'}, | |
506 -PERCENT_ID => $self->{'percent_id'}, | |
507 -ANALYSIS => $self->{'analysis'}, | |
508 -P_VALUE => $self->{'p_value'}, | |
509 -EXTERNAL_DB_ID => $self->{'external_db_id'}, | |
510 -HCOVERAGE => $self->{'hcoverage'}, | |
511 -GROUP_ID => $self->{'group_id'}, | |
512 -LEVEL_ID => $self->{'level_id'}); | |
513 | |
514 | |
515 # end M cigar bits | |
516 } elsif( $piece =~ /I$/ ) { | |
517 # | |
518 # INSERT | |
519 # | |
520 if( $strand1 == 1 ) { | |
521 $start1 += $length; | |
522 } else { | |
523 $start1 -= $length; | |
524 } | |
525 } elsif( $piece =~ /D$/ ) { | |
526 # | |
527 # DELETION | |
528 # | |
529 if( $strand2 == 1 ) { | |
530 $start2 += $mapped_length; | |
531 } else { | |
532 $start2 -= $mapped_length; | |
533 } | |
534 } else { | |
535 throw( "Illegal cigar line $string!" ); | |
536 } | |
537 } | |
538 | |
539 return \@features; | |
540 } | |
541 | |
542 | |
543 | |
544 | |
545 =head2 _parse_features | |
546 | |
547 Arg [1] : listref Bio::EnsEMBL::FeaturePair $ungapped_features | |
548 Description: creates internal cigarstring and start,end hstart,hend | |
549 entries. | |
550 Returntype : none, fills in values of self | |
551 Exceptions : argument list undergoes many sanity checks - throws under many | |
552 invalid conditions | |
553 Caller : new | |
554 Status : Stable | |
555 | |
556 =cut | |
557 | |
558 my $message_only_once = 1; | |
559 | |
560 sub _parse_features { | |
561 my ($self,$features ) = @_; | |
562 | |
563 my $query_unit = $self->_query_unit(); | |
564 my $hit_unit = $self->_hit_unit(); | |
565 | |
566 if (ref($features) ne "ARRAY") { | |
567 throw("features must be an array reference not a [".ref($features)."]"); | |
568 } | |
569 | |
570 my $strand = $features->[0]->strand; | |
571 | |
572 throw ('FeaturePair needs to have strand == 1 or strand == -1') if(!$strand); | |
573 | |
574 my @f; | |
575 | |
576 # | |
577 # Sort the features on their start position | |
578 # Ascending order on positive strand, descending on negative strand | |
579 # | |
580 if( $strand == 1 ) { | |
581 @f = sort {$a->start <=> $b->start} @$features; | |
582 } else { | |
583 @f = sort { $b->start <=> $a->start} @$features; | |
584 } | |
585 | |
586 my $hstrand = $f[0]->hstrand; | |
587 my $slice = $f[0]->slice(); | |
588 my $hslice = $f[0]->hslice(); | |
589 my $name = $slice->name() if($slice); | |
590 my $hname = $f[0]->hseqname; | |
591 my $score = $f[0]->score; | |
592 my $percent = $f[0]->percent_id; | |
593 my $analysis = $f[0]->analysis; | |
594 my $pvalue = $f[0]->p_value(); | |
595 my $external_db_id = $f[0]->external_db_id; | |
596 my $hcoverage = $f[0]->hcoverage; | |
597 my $group_id = $f[0]->group_id; | |
598 my $level_id = $f[0]->level_id; | |
599 | |
600 my $seqname = $f[0]->seqname; | |
601 # implicit strand 1 for peptide sequences | |
602 $strand ||= 1; | |
603 $hstrand ||= 1; | |
604 my $ori = $strand * $hstrand; | |
605 | |
606 throw("No features in the array to parse") if(scalar(@f) == 0); | |
607 | |
608 my $prev1; # where last feature q part ended | |
609 my $prev2; # where last feature s part ended | |
610 | |
611 my $string; | |
612 | |
613 # Use strandedness info of query and hit to make sure both sets of | |
614 # start and end coordinates are oriented the right way around. | |
615 my $f1start; | |
616 my $f1end; | |
617 my $f2end; | |
618 my $f2start; | |
619 | |
620 if ($strand == 1) { | |
621 $f1start = $f[0]->start; | |
622 $f1end = $f[-1]->end; | |
623 } else { | |
624 $f1start = $f[-1]->start; | |
625 $f1end = $f[0]->end; | |
626 } | |
627 | |
628 if ($hstrand == 1) { | |
629 $f2start = $f[0]->hstart; | |
630 $f2end = $f[-1]->hend; | |
631 } else { | |
632 $f2start = $f[-1]->hstart; | |
633 $f2end = $f[0]->hend; | |
634 } | |
635 | |
636 # | |
637 # Loop through each portion of alignment and construct cigar string | |
638 # | |
639 | |
640 foreach my $f (@f) { | |
641 # | |
642 # Sanity checks | |
643 # | |
644 | |
645 if (!$f->isa("Bio::EnsEMBL::FeaturePair")) { | |
646 throw("Array element [$f] is not a Bio::EnsEMBL::FeaturePair"); | |
647 } | |
648 if( defined($f->hstrand()) && $f->hstrand() != $hstrand ) { | |
649 throw("Inconsistent hstrands in feature array"); | |
650 } | |
651 if( defined($f->strand()) && ($f->strand != $strand)) { | |
652 throw("Inconsistent strands in feature array"); | |
653 } | |
654 if ( defined($name) && $name ne $f->slice->name()) { | |
655 throw("Inconsistent names in feature array [$name - ". | |
656 $f->slice->name()."]"); | |
657 } | |
658 if ( defined($hname) && $hname ne $f->hseqname) { | |
659 throw("Inconsistent hit names in feature array [$hname - ". | |
660 $f->hseqname . "]"); | |
661 } | |
662 if ( defined($score) && $score ne $f->score) { | |
663 throw("Inconsisent scores in feature array [$score - " . | |
664 $f->score . "]"); | |
665 } | |
666 if (defined($f->percent_id) && $percent ne $f->percent_id) { | |
667 throw("Inconsistent pids in feature array [$percent - " . | |
668 $f->percent_id . "]"); | |
669 } | |
670 if(defined($pvalue) && $pvalue != $f->p_value()) { | |
671 throw("Inconsistant p_values in feature arraw [$pvalue " . | |
672 $f->p_value() . "]"); | |
673 } | |
674 if($seqname && $seqname ne $f->seqname){ | |
675 throw("Inconsistent seqname in feature array [$seqname - ". | |
676 $f->seqname . "]"); | |
677 } | |
678 my $start1 = $f->start; #source sequence alignment start | |
679 my $start2 = $f->hstart(); #hit sequence alignment start | |
680 | |
681 # | |
682 # More sanity checking | |
683 # | |
684 if (defined($prev1)) { | |
685 if ( $strand == 1 ) { | |
686 if ($f->start < $prev1) { | |
687 throw("Inconsistent coords in feature array (forward strand).\n" . | |
688 "Start [".$f->start()."] in current feature should be greater " . | |
689 "than previous feature end [$prev1]."); | |
690 } | |
691 } else { | |
692 if ($f->end > $prev1) { | |
693 throw("Inconsistent coords in feature array (reverse strand).\n" . | |
694 "End [".$f->end() ."] should be less than previous feature " . | |
695 "start [$prev1]."); | |
696 } | |
697 } | |
698 } | |
699 | |
700 my $length = ($f->end - $f->start + 1); #length of source seq alignment | |
701 my $hlength = ($f->hend - $f->hstart + 1); #length of hit seq alignment | |
702 | |
703 # using multiplication to avoid rounding errors, hence the | |
704 # switch from query to hit for the ratios | |
705 | |
706 # | |
707 # Yet more sanity checking | |
708 # | |
709 if($query_unit > $hit_unit){ | |
710 # I am going to make the assumption here that this situation will | |
711 # only occur with DnaPepAlignFeatures, this may not be true | |
712 my $query_p_length = sprintf "%.0f", ($length/$query_unit); | |
713 my $hit_p_length = sprintf "%.0f", ($hlength * $hit_unit); | |
714 if( $query_p_length != $hit_p_length) { | |
715 throw( "Feature lengths not comparable Lengths:" .$length . | |
716 " " . $hlength . " Ratios:" . $query_unit . " " . | |
717 $hit_unit ); | |
718 } | |
719 } else{ | |
720 my $query_d_length = sprintf "%.0f", ($length*$hit_unit); | |
721 my $hit_d_length = sprintf "%.0f", ($hlength * $query_unit); | |
722 if( $length * $hit_unit != $hlength * $query_unit ) { | |
723 throw( "Feature lengths not comparable Lengths:" . $length . | |
724 " " . $hlength . " Ratios:" . $query_unit . " " . | |
725 $hit_unit ); | |
726 } | |
727 } | |
728 | |
729 my $hlengthfactor = ($query_unit/$hit_unit); | |
730 | |
731 # | |
732 # Check to see if there is an I type (insertion) gap: | |
733 # If there is a space between the end of the last source sequence | |
734 # alignment and the start of this one, then this is an insertion | |
735 # | |
736 | |
737 my $insertion_flag = 0; | |
738 if( $strand == 1 ) { | |
739 if( ( defined $prev1 ) && ( $f->start > $prev1 + 1 )) { | |
740 | |
741 #there is an insertion | |
742 $insertion_flag = 1; | |
743 my $gap = $f->start - $prev1 - 1; | |
744 if( $gap == 1 ) { | |
745 $gap = ""; # no need for a number if gap length is 1 | |
746 } | |
747 $string .= "$gap"."I"; | |
748 | |
749 } | |
750 | |
751 #shift our position in the source seq alignment | |
752 $prev1 = $f->end(); | |
753 } else { | |
754 | |
755 if(( defined $prev1 ) && ($f->end + 1 < $prev1 )) { | |
756 | |
757 #there is an insertion | |
758 $insertion_flag = 1; | |
759 my $gap = $prev1 - $f->end() - 1; | |
760 if( $gap == 1 ) { | |
761 $gap = ""; # no need for a number if gap length is 1 | |
762 } | |
763 $string .= "$gap"."I"; | |
764 } | |
765 | |
766 #shift our position in the source seq alignment | |
767 $prev1 = $f->start(); | |
768 } | |
769 | |
770 # | |
771 # Check to see if there is a D type (deletion) gap | |
772 # There is a deletion gap if there is a space between the end of the | |
773 # last portion of the hit sequence alignment and this one | |
774 # | |
775 if( $hstrand == 1 ) { | |
776 if(( defined $prev2 ) && ( $f->hstart() > $prev2 + 1 )) { | |
777 | |
778 #there is a deletion | |
779 my $gap = $f->hstart - $prev2 - 1; | |
780 my $gap2 = int( $gap * $hlengthfactor + 0.5 ); | |
781 | |
782 if( $gap2 == 1 ) { | |
783 $gap2 = ""; # no need for a number if gap length is 1 | |
784 } | |
785 $string .= "$gap2"."D"; | |
786 | |
787 #sanity check, Should not be an insertion and deletion | |
788 if($insertion_flag) { | |
789 if ($message_only_once) { | |
790 warning("Should not be an deletion and insertion on the " . | |
791 "same alignment region. cigar_line=$string\n"); | |
792 $message_only_once = 0; | |
793 } | |
794 } | |
795 } | |
796 #shift our position in the hit seq alignment | |
797 $prev2 = $f->hend(); | |
798 | |
799 } else { | |
800 if( ( defined $prev2 ) && ( $f->hend() + 1 < $prev2 )) { | |
801 | |
802 #there is a deletion | |
803 my $gap = $prev2 - $f->hend - 1; | |
804 my $gap2 = int( $gap * $hlengthfactor + 0.5 ); | |
805 | |
806 if( $gap2 == 1 ) { | |
807 $gap2 = ""; # no need for a number if gap length is 1 | |
808 } | |
809 $string .= "$gap2"."D"; | |
810 | |
811 #sanity check, Should not be an insertion and deletion | |
812 if($insertion_flag) { | |
813 if ($message_only_once) { | |
814 warning("Should not be an deletion and insertion on the " . | |
815 "same alignment region. prev2 = $prev2; f->hend() = " . | |
816 $f->hend() . "; cigar_line = $string;\n"); | |
817 $message_only_once = 0; | |
818 } | |
819 } | |
820 } | |
821 #shift our position in the hit seq alignment | |
822 $prev2 = $f->hstart(); | |
823 } | |
824 | |
825 my $matchlength = $f->end() - $f->start() + 1; | |
826 if( $matchlength == 1 ) { | |
827 $matchlength = ""; | |
828 } | |
829 $string .= $matchlength."M"; | |
830 } | |
831 | |
832 $self->{'start'} = $f1start; | |
833 $self->{'end'} = $f1end; | |
834 $self->{'seqname'} = $seqname; | |
835 $self->{'strand'} = $strand; | |
836 $self->{'score'} = $score; | |
837 $self->{'percent_id'} = $percent; | |
838 $self->{'analysis'} = $analysis; | |
839 $self->{'slice'} = $slice; | |
840 $self->{'hslice'} = $hslice; | |
841 $self->{'hstart'} = $f2start; | |
842 $self->{'hend'} = $f2end; | |
843 $self->{'hstrand'} = $hstrand; | |
844 $self->{'hseqname'} = $hname; | |
845 $self->{'cigar_string'} = $string; | |
846 $self->{'p_value'} = $pvalue; | |
847 $self->{'external_db_id'} = $external_db_id; | |
848 $self->{'hcoverage'} = $hcoverage; | |
849 $self->{'group_id'} = $group_id; | |
850 $self->{'level_id'} = $level_id; | |
851 } | |
852 | |
853 | |
854 | |
855 | |
856 | |
857 | |
858 =head2 _hit_unit | |
859 | |
860 Args : none | |
861 Description: abstract method, overwrite with something that returns | |
862 one or three | |
863 Returntype : int 1,3 | |
864 Exceptions : none | |
865 Caller : internal | |
866 Status : Stable | |
867 | |
868 =cut | |
869 | |
870 sub _hit_unit { | |
871 my $self = shift; | |
872 throw( "Abstract method call!" ); | |
873 } | |
874 | |
875 | |
876 | |
877 =head2 _query_unit | |
878 | |
879 Args : none | |
880 Description: abstract method, overwrite with something that returns | |
881 one or three | |
882 Returntype : int 1,3 | |
883 Exceptions : none | |
884 Caller : internal | |
885 Status : Stable | |
886 | |
887 =cut | |
888 | |
889 sub _query_unit { | |
890 my $self = shift; | |
891 throw( "Abstract method call!" ); | |
892 } | |
893 | |
894 | |
895 | |
896 | |
897 1; |