Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/BaseAlignFeature.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 4 Genome Research Limited. All rights reserved. | |
| 5 | |
| 6 This software is distributed under a modified Apache license. | |
| 7 For license details, please see | |
| 8 | |
| 9 http://www.ensembl.org/info/about/code_licence.html | |
| 10 | |
| 11 =head1 CONTACT | |
| 12 | |
| 13 Please email comments or questions to the public Ensembl | |
| 14 developers list at <dev@ensembl.org>. | |
| 15 | |
| 16 Questions may also be sent to the Ensembl help desk at | |
| 17 <helpdesk@ensembl.org>. | |
| 18 | |
| 19 =cut | |
| 20 | |
| 21 =head1 NAME | |
| 22 | |
| 23 Bio::EnsEMBL::BaseAlignFeature - Baseclass providing a common abstract | |
| 24 implmentation for alignment features | |
| 25 | |
| 26 =head1 SYNOPSIS | |
| 27 | |
| 28 my $feat = new Bio::EnsEMBL::DnaPepAlignFeature( | |
| 29 -slice => $slice, | |
| 30 -start => 100, | |
| 31 -end => 120, | |
| 32 -strand => 1, | |
| 33 -hseqname => 'SP:RF1231', | |
| 34 -hstart => 200, | |
| 35 -hend => 220, | |
| 36 -analysis => $analysis, | |
| 37 -cigar_string => '10M3D5M2I' | |
| 38 ); | |
| 39 | |
| 40 Alternatively if you have an array of ungapped features | |
| 41 | |
| 42 my $feat = | |
| 43 new Bio::EnsEMBL::DnaPepAlignFeature( -features => \@features ); | |
| 44 | |
| 45 Where @features is an array of Bio::EnsEMBL::FeaturePair | |
| 46 | |
| 47 There is a method to manipulate the cigar_string into ungapped features | |
| 48 | |
| 49 my @ungapped_features = $feat->ungapped_features(); | |
| 50 | |
| 51 This converts the cigar string into an array of Bio::EnsEMBL::FeaturePair | |
| 52 | |
| 53 $analysis is a Bio::EnsEMBL::Analysis object | |
| 54 | |
| 55 Bio::EnsEMBL::Feature methods can be used | |
| 56 Bio::EnsEMBL::FeaturePair methods can be used | |
| 57 | |
| 58 The cigar_string contains the ungapped pieces that make up the gapped | |
| 59 alignment | |
| 60 | |
| 61 It looks like: n Matches [ x Deletes or Inserts m Matches ]* | |
| 62 but a bit more condensed like "23M4I12M2D1M" | |
| 63 and evenmore condensed as you can ommit 1s "23M4I12M2DM" | |
| 64 | |
| 65 | |
| 66 To make things clearer this is how a blast HSP would be parsed | |
| 67 | |
| 68 >AK014066 | |
| 69 Length = 146 | |
| 70 | |
| 71 Minus Strand HSPs: | |
| 72 | |
| 73 Score = 76 (26.8 bits), Expect = 1.4, P = 0.74 | |
| 74 Identities = 20/71 (28%), Positives = 29/71 (40%), Frame = -1 | |
| 75 | |
| 76 Query: 479 GLQAPPPTPQGCRLIPPPPLGLQAPLPTLRAVGSSHHHP*GRQGSSLSSFRSSLASKASA 300 | |
| 77 G APPP PQG R P P G + P L + + ++ R +A + | |
| 78 Sbjct: 7 GALAPPPAPQG-RWAFPRPTG-KRPATPLHGTARQDRQVRRSEAAKVTGCRGRVAPHVAP 64 | |
| 79 | |
| 80 Query: 299 SSPHNPSPLPS 267 | |
| 81 H P+P P+ | |
| 82 Sbjct: 65 PLTHTPTPTPT 75 | |
| 83 | |
| 84 The alignment goes from 267 to 479 in sequence 1 and 7 to 75 in sequence 2 | |
| 85 and the strand is -1. | |
| 86 | |
| 87 The alignment is made up of the following ungapped pieces : | |
| 88 | |
| 89 sequence 1 start 447 , sequence 2 start 7 , match length 33 , strand -1 | |
| 90 sequence 1 start 417 , sequence 2 start 18 , match length 27 , strand -1 | |
| 91 sequence 1 start 267 , sequence 2 start 27 , match length 137 , strand -1 | |
| 92 | |
| 93 These ungapped pieces are made up into the following string (called a cigar | |
| 94 string) "33M3I27M3I137M" with start 267 end 479 strand -1 hstart 7 hend 75 | |
| 95 hstrand 1 and feature type would be DnaPepAlignFeature | |
| 96 | |
| 97 =cut | |
| 98 | |
| 99 | |
| 100 package Bio::EnsEMBL::BaseAlignFeature; | |
| 101 | |
| 102 use Bio::EnsEMBL::FeaturePair; | |
| 103 use Bio::EnsEMBL::Utils::Argument qw(rearrange); | |
| 104 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
| 105 use Scalar::Util qw(weaken isweak); | |
| 106 | |
| 107 use vars qw(@ISA); | |
| 108 use strict; | |
| 109 | |
| 110 @ISA = qw(Bio::EnsEMBL::FeaturePair); | |
| 111 | |
| 112 | |
| 113 =head2 new | |
| 114 | |
| 115 Arg [..] : List of named arguments. (-cigar_string , -features) defined | |
| 116 in this constructor, others defined in FeaturePair and | |
| 117 SeqFeature superclasses. Either cigar_string or a list | |
| 118 of ungapped features should be provided - not both. | |
| 119 Example : $baf = new BaseAlignFeatureSubclass(-cigar_string => '3M3I12M'); | |
| 120 Description: Creates a new BaseAlignFeature using either a cigarstring or | |
| 121 a list of ungapped features. BaseAlignFeature is an abstract | |
| 122 baseclass and should not actually be instantiated - rather its | |
| 123 subclasses should be. | |
| 124 Returntype : Bio::EnsEMBL::BaseAlignFeature | |
| 125 Exceptions : thrown if both feature and cigar string args are provided | |
| 126 thrown if neither feature nor cigar string args are provided | |
| 127 Caller : general | |
| 128 Status : Stable | |
| 129 | |
| 130 =cut | |
| 131 | |
| 132 sub new { | |
| 133 | |
| 134 my $caller = shift; | |
| 135 | |
| 136 my $class = ref($caller) || $caller; | |
| 137 | |
| 138 my $self = $class->SUPER::new(@_); | |
| 139 | |
| 140 my ($cigar_string,$features) = rearrange([qw(CIGAR_STRING FEATURES)], @_); | |
| 141 | |
| 142 if (defined($cigar_string) && defined($features)) { | |
| 143 throw("CIGAR_STRING or FEATURES argument is required - not both."); | |
| 144 } elsif (defined($features)) { | |
| 145 $self->_parse_features($features); | |
| 146 | |
| 147 } elsif (defined($cigar_string)) { | |
| 148 $self->{'cigar_string'} = $cigar_string; | |
| 149 } else { | |
| 150 throw("CIGAR_STRING or FEATURES argument is required"); | |
| 151 } | |
| 152 | |
| 153 return $self; | |
| 154 } | |
| 155 | |
| 156 | |
| 157 =head2 new_fast | |
| 158 | |
| 159 Arg [1] : hashref $hashref | |
| 160 A hashref which will be blessed into a PepDnaAlignFeature. | |
| 161 Example : none | |
| 162 Description: This allows for very fast object creation when a large number | |
| 163 of PepDnaAlignFeatures needs to be created. This is a bit of | |
| 164 a hack but necessary when thousands of features need to be | |
| 165 generated within a couple of seconds for web display. It is | |
| 166 not recommended that this method be called unless you know what | |
| 167 you are doing. It requires knowledge of the internals of this | |
| 168 class and its superclasses. | |
| 169 Returntype : Bio::EnsEMBL::BaseAlignFeature | |
| 170 Exceptions : none | |
| 171 Caller : none currently | |
| 172 Status : Stable | |
| 173 | |
| 174 =cut | |
| 175 | |
| 176 sub new_fast { | |
| 177 my ($class, $hashref) = @_; | |
| 178 my $self = bless $hashref, $class; | |
| 179 weaken($self->{adaptor}) if ( ! isweak($self->{adaptor}) ); | |
| 180 return $self; | |
| 181 } | |
| 182 | |
| 183 | |
| 184 =head2 cigar_string | |
| 185 | |
| 186 Arg [1] : string $cigar_string | |
| 187 Example : $feature->cigar_string( "12MI3M" ); | |
| 188 Description: get/set for attribute cigar_string | |
| 189 cigar_string describes the alignment. "xM" stands for | |
| 190 x matches (mismatches), "xI" for inserts into query sequence | |
| 191 (thats the ensembl sequence), "xD" for deletions | |
| 192 (inserts in the subject). an "x" that is 1 can be omitted. | |
| 193 Returntype : string | |
| 194 Exceptions : none | |
| 195 Caller : general | |
| 196 Status : Stable | |
| 197 | |
| 198 =cut | |
| 199 | |
| 200 sub cigar_string { | |
| 201 my $self = shift; | |
| 202 $self->{'cigar_string'} = shift if(@_); | |
| 203 return $self->{'cigar_string'}; | |
| 204 } | |
| 205 | |
| 206 | |
| 207 =head2 alignment_length | |
| 208 | |
| 209 Arg [1] : None | |
| 210 Description: return the alignment length (including indels) based on the cigar_string | |
| 211 Returntype : int | |
| 212 Exceptions : | |
| 213 Caller : | |
| 214 Status : Stable | |
| 215 | |
| 216 =cut | |
| 217 | |
| 218 sub alignment_length { | |
| 219 my $self = shift; | |
| 220 | |
| 221 if (! defined $self->{'_alignment_length'} && defined $self->cigar_string) { | |
| 222 | |
| 223 my @pieces = ( $self->cigar_string =~ /(\d*[MDI])/g ); | |
| 224 unless (@pieces) { | |
| 225 print STDERR "Error parsing cigar_string\n"; | |
| 226 } | |
| 227 my $alignment_length = 0; | |
| 228 foreach my $piece (@pieces) { | |
| 229 my ($length) = ( $piece =~ /^(\d*)/ ); | |
| 230 if (! defined $length || $length eq "") { | |
| 231 $length = 1; | |
| 232 } | |
| 233 $alignment_length += $length; | |
| 234 } | |
| 235 $self->{'_alignment_length'} = $alignment_length; | |
| 236 } | |
| 237 return $self->{'_alignment_length'}; | |
| 238 } | |
| 239 | |
| 240 =head2 ungapped_features | |
| 241 | |
| 242 Args : none | |
| 243 Example : @ungapped_features = $align_feature->get_feature | |
| 244 Description: converts the internal cigar_string into an array of | |
| 245 ungapped feature pairs | |
| 246 Returntype : list of Bio::EnsEMBL::FeaturePair | |
| 247 Exceptions : cigar_string not set internally | |
| 248 Caller : general | |
| 249 Status : Stable | |
| 250 | |
| 251 =cut | |
| 252 | |
| 253 sub ungapped_features { | |
| 254 my ($self) = @_; | |
| 255 | |
| 256 if (!defined($self->{'cigar_string'})) { | |
| 257 throw("No cigar_string defined. Can't return ungapped features"); | |
| 258 } | |
| 259 | |
| 260 return @{$self->_parse_cigar()}; | |
| 261 } | |
| 262 | |
| 263 =head2 strands_reversed | |
| 264 | |
| 265 Arg [1] : int $strands_reversed | |
| 266 Description: get/set for attribute strands_reversed | |
| 267 0 means that strand and hstrand are the original strands obtained | |
| 268 from the alignment program used | |
| 269 1 means that strand and hstrand have been flipped as compared to | |
| 270 the original result provided by the alignment program used. | |
| 271 You may want to use the reverse_complement method to restore the | |
| 272 original strandness. | |
| 273 Returntype : int | |
| 274 Exceptions : none | |
| 275 Caller : general | |
| 276 Status : Stable | |
| 277 | |
| 278 =cut | |
| 279 | |
| 280 sub strands_reversed { | |
| 281 my ($self, $arg) = @_; | |
| 282 | |
| 283 if ( defined $arg ) { | |
| 284 $self->{'strands_reversed'} = $arg ; | |
| 285 } | |
| 286 | |
| 287 $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'}); | |
| 288 | |
| 289 return $self->{'strands_reversed'}; | |
| 290 } | |
| 291 | |
| 292 =head2 reverse_complement | |
| 293 | |
| 294 Args : none | |
| 295 Description: reverse complement the FeaturePair, | |
| 296 modifing strand, hstrand and cigar_string in consequence | |
| 297 Returntype : none | |
| 298 Exceptions : none | |
| 299 Caller : general | |
| 300 Status : Stable | |
| 301 | |
| 302 =cut | |
| 303 | |
| 304 sub reverse_complement { | |
| 305 my ($self) = @_; | |
| 306 | |
| 307 # reverse strand in both sequences | |
| 308 $self->strand($self->strand * -1); | |
| 309 $self->hstrand($self->hstrand * -1); | |
| 310 | |
| 311 # reverse cigar_string as consequence | |
| 312 my $cigar_string = $self->cigar_string; | |
| 313 $cigar_string =~ s/(D|I|M)/$1 /g; | |
| 314 my @cigar_pieces = split / /,$cigar_string; | |
| 315 $cigar_string = ""; | |
| 316 while (my $piece = pop @cigar_pieces) { | |
| 317 $cigar_string .= $piece; | |
| 318 } | |
| 319 | |
| 320 $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'}); | |
| 321 | |
| 322 if ($self->strands_reversed) { | |
| 323 $self->strands_reversed(0) | |
| 324 } else { | |
| 325 $self->strands_reversed(1); | |
| 326 } | |
| 327 | |
| 328 $self->cigar_string($cigar_string); | |
| 329 } | |
| 330 | |
| 331 | |
| 332 | |
| 333 =head2 transform | |
| 334 | |
| 335 Arg 1 : String $coordinate_system_name | |
| 336 Arg [2] : String $coordinate_system_version | |
| 337 Example : $feature = $feature->transform('contig'); | |
| 338 $feature = $feature->transform('chromosome', 'NCBI33'); | |
| 339 Description: Moves this AlignFeature to the given coordinate system. | |
| 340 If the feature cannot be transformed to the destination | |
| 341 coordinate system undef is returned instead. | |
| 342 Returntype : Bio::EnsEMBL::BaseAlignFeature; | |
| 343 Exceptions : wrong parameters | |
| 344 Caller : general | |
| 345 Status : Medium Risk | |
| 346 : deprecation needs to be removed at some time | |
| 347 | |
| 348 =cut | |
| 349 | |
| 350 sub transform { | |
| 351 my $self = shift; | |
| 352 | |
| 353 # catch for old style transform calls | |
| 354 if( ref $_[0] eq 'HASH') { | |
| 355 deprecate("Calling transform with a hashref is deprecate.\n" . | |
| 356 'Use $feat->transfer($slice) or ' . | |
| 357 '$feat->transform("coordsysname") instead.'); | |
| 358 my (undef, $new_feat) = each(%{$_[0]}); | |
| 359 return $self->transfer($new_feat->slice); | |
| 360 } | |
| 361 | |
| 362 my $new_feature = $self->SUPER::transform(@_); | |
| 363 if ( !defined($new_feature) | |
| 364 || $new_feature->length() != $self->length() ) | |
| 365 { | |
| 366 my @segments = @{ $self->project(@_) }; | |
| 367 | |
| 368 if ( !@segments ) { | |
| 369 return undef; | |
| 370 } | |
| 371 | |
| 372 my @ungapped; | |
| 373 foreach my $f ( $self->ungapped_features() ) { | |
| 374 $f = $f->transform(@_); | |
| 375 if ( defined($f) ) { | |
| 376 push( @ungapped, $f ); | |
| 377 } else { | |
| 378 warning( "Failed to transform alignment feature; " | |
| 379 . "ungapped component could not be transformed" ); | |
| 380 return undef; | |
| 381 } | |
| 382 } | |
| 383 | |
| 384 eval { $new_feature = $self->new( -features => \@ungapped ); }; | |
| 385 | |
| 386 if ($@) { | |
| 387 warning($@); | |
| 388 return undef; | |
| 389 } | |
| 390 } ## end if ( !defined($new_feature...)) | |
| 391 | |
| 392 return $new_feature; | |
| 393 } | |
| 394 | |
| 395 | |
| 396 =head2 _parse_cigar | |
| 397 | |
| 398 Args : none | |
| 399 Description: PRIVATE (internal) method - creates ungapped features from | |
| 400 internally stored cigar line | |
| 401 Returntype : list of Bio::EnsEMBL::FeaturePair | |
| 402 Exceptions : none | |
| 403 Caller : ungapped_features | |
| 404 Status : Stable | |
| 405 | |
| 406 =cut | |
| 407 | |
| 408 sub _parse_cigar { | |
| 409 my ( $self ) = @_; | |
| 410 | |
| 411 my $query_unit = $self->_query_unit(); | |
| 412 my $hit_unit = $self->_hit_unit(); | |
| 413 | |
| 414 my $string = $self->{'cigar_string'}; | |
| 415 | |
| 416 throw("No cigar string defined in object") if(!defined($string)); | |
| 417 | |
| 418 my @pieces = ( $string =~ /(\d*[MDI])/g ); | |
| 419 #print "cigar: ",join ( ",", @pieces ),"\n"; | |
| 420 | |
| 421 my @features; | |
| 422 my $strand1 = $self->{'strand'} || 1; | |
| 423 my $strand2 = $self->{'hstrand'}|| 1; | |
| 424 | |
| 425 my ( $start1, $start2 ); | |
| 426 | |
| 427 if( $strand1 == 1 ) { | |
| 428 $start1 = $self->{'start'}; | |
| 429 } else { | |
| 430 $start1 = $self->{'end'}; | |
| 431 } | |
| 432 | |
| 433 if( $strand2 == 1 ) { | |
| 434 $start2 = $self->{'hstart'}; | |
| 435 } else { | |
| 436 $start2 = $self->{'hend'}; | |
| 437 } | |
| 438 | |
| 439 # | |
| 440 # Construct ungapped blocks as FeaturePairs objects for each MATCH | |
| 441 # | |
| 442 foreach my $piece (@pieces) { | |
| 443 | |
| 444 my ($length) = ( $piece =~ /^(\d*)/ ); | |
| 445 if( $length eq "" ) { $length = 1 } | |
| 446 my $mapped_length; | |
| 447 | |
| 448 # explicit if statements to avoid rounding problems | |
| 449 # and make sure we have sane coordinate systems | |
| 450 if( $query_unit == 1 && $hit_unit == 3 ) { | |
| 451 $mapped_length = $length*3; | |
| 452 } elsif( $query_unit == 3 && $hit_unit == 1 ) { | |
| 453 $mapped_length = $length / 3; | |
| 454 } elsif ( $query_unit == 1 && $hit_unit == 1 ) { | |
| 455 $mapped_length = $length; | |
| 456 } else { | |
| 457 throw("Internal error $query_unit $hit_unit, currently only " . | |
| 458 "allowing 1 or 3 "); | |
| 459 } | |
| 460 | |
| 461 if( int($mapped_length) != $mapped_length and | |
| 462 ($piece =~ /M$/ or $piece =~ /D$/)) { | |
| 463 throw("Internal error with mismapped length of hit, query " . | |
| 464 "$query_unit, hit $hit_unit, length $length"); | |
| 465 } | |
| 466 | |
| 467 if( $piece =~ /M$/ ) { | |
| 468 # | |
| 469 # MATCH | |
| 470 # | |
| 471 my ( $qstart, $qend); | |
| 472 if( $strand1 == 1 ) { | |
| 473 $qstart = $start1; | |
| 474 $qend = $start1 + $length - 1; | |
| 475 $start1 = $qend + 1; | |
| 476 } else { | |
| 477 $qend = $start1; | |
| 478 $qstart = $start1 - $length + 1; | |
| 479 $start1 = $qstart - 1; | |
| 480 } | |
| 481 | |
| 482 my ($hstart, $hend); | |
| 483 if( $strand2 == 1 ) { | |
| 484 $hstart = $start2; | |
| 485 $hend = $start2 + $mapped_length - 1; | |
| 486 $start2 = $hend + 1; | |
| 487 } else { | |
| 488 $hend = $start2; | |
| 489 $hstart = $start2 - $mapped_length + 1; | |
| 490 $start2 = $hstart - 1; | |
| 491 } | |
| 492 | |
| 493 | |
| 494 push @features, Bio::EnsEMBL::FeaturePair->new | |
| 495 (-SLICE => $self->{'slice'}, | |
| 496 -SEQNAME => $self->{'seqname'}, | |
| 497 -START => $qstart, | |
| 498 -END => $qend, | |
| 499 -STRAND => $strand1, | |
| 500 -HSLICE => $self->{'hslice'}, | |
| 501 -HSEQNAME => $self->{'hseqname'}, | |
| 502 -HSTART => $hstart, | |
| 503 -HEND => $hend, | |
| 504 -HSTRAND => $strand2, | |
| 505 -SCORE => $self->{'score'}, | |
| 506 -PERCENT_ID => $self->{'percent_id'}, | |
| 507 -ANALYSIS => $self->{'analysis'}, | |
| 508 -P_VALUE => $self->{'p_value'}, | |
| 509 -EXTERNAL_DB_ID => $self->{'external_db_id'}, | |
| 510 -HCOVERAGE => $self->{'hcoverage'}, | |
| 511 -GROUP_ID => $self->{'group_id'}, | |
| 512 -LEVEL_ID => $self->{'level_id'}); | |
| 513 | |
| 514 | |
| 515 # end M cigar bits | |
| 516 } elsif( $piece =~ /I$/ ) { | |
| 517 # | |
| 518 # INSERT | |
| 519 # | |
| 520 if( $strand1 == 1 ) { | |
| 521 $start1 += $length; | |
| 522 } else { | |
| 523 $start1 -= $length; | |
| 524 } | |
| 525 } elsif( $piece =~ /D$/ ) { | |
| 526 # | |
| 527 # DELETION | |
| 528 # | |
| 529 if( $strand2 == 1 ) { | |
| 530 $start2 += $mapped_length; | |
| 531 } else { | |
| 532 $start2 -= $mapped_length; | |
| 533 } | |
| 534 } else { | |
| 535 throw( "Illegal cigar line $string!" ); | |
| 536 } | |
| 537 } | |
| 538 | |
| 539 return \@features; | |
| 540 } | |
| 541 | |
| 542 | |
| 543 | |
| 544 | |
| 545 =head2 _parse_features | |
| 546 | |
| 547 Arg [1] : listref Bio::EnsEMBL::FeaturePair $ungapped_features | |
| 548 Description: creates internal cigarstring and start,end hstart,hend | |
| 549 entries. | |
| 550 Returntype : none, fills in values of self | |
| 551 Exceptions : argument list undergoes many sanity checks - throws under many | |
| 552 invalid conditions | |
| 553 Caller : new | |
| 554 Status : Stable | |
| 555 | |
| 556 =cut | |
| 557 | |
| 558 my $message_only_once = 1; | |
| 559 | |
| 560 sub _parse_features { | |
| 561 my ($self,$features ) = @_; | |
| 562 | |
| 563 my $query_unit = $self->_query_unit(); | |
| 564 my $hit_unit = $self->_hit_unit(); | |
| 565 | |
| 566 if (ref($features) ne "ARRAY") { | |
| 567 throw("features must be an array reference not a [".ref($features)."]"); | |
| 568 } | |
| 569 | |
| 570 my $strand = $features->[0]->strand; | |
| 571 | |
| 572 throw ('FeaturePair needs to have strand == 1 or strand == -1') if(!$strand); | |
| 573 | |
| 574 my @f; | |
| 575 | |
| 576 # | |
| 577 # Sort the features on their start position | |
| 578 # Ascending order on positive strand, descending on negative strand | |
| 579 # | |
| 580 if( $strand == 1 ) { | |
| 581 @f = sort {$a->start <=> $b->start} @$features; | |
| 582 } else { | |
| 583 @f = sort { $b->start <=> $a->start} @$features; | |
| 584 } | |
| 585 | |
| 586 my $hstrand = $f[0]->hstrand; | |
| 587 my $slice = $f[0]->slice(); | |
| 588 my $hslice = $f[0]->hslice(); | |
| 589 my $name = $slice->name() if($slice); | |
| 590 my $hname = $f[0]->hseqname; | |
| 591 my $score = $f[0]->score; | |
| 592 my $percent = $f[0]->percent_id; | |
| 593 my $analysis = $f[0]->analysis; | |
| 594 my $pvalue = $f[0]->p_value(); | |
| 595 my $external_db_id = $f[0]->external_db_id; | |
| 596 my $hcoverage = $f[0]->hcoverage; | |
| 597 my $group_id = $f[0]->group_id; | |
| 598 my $level_id = $f[0]->level_id; | |
| 599 | |
| 600 my $seqname = $f[0]->seqname; | |
| 601 # implicit strand 1 for peptide sequences | |
| 602 $strand ||= 1; | |
| 603 $hstrand ||= 1; | |
| 604 my $ori = $strand * $hstrand; | |
| 605 | |
| 606 throw("No features in the array to parse") if(scalar(@f) == 0); | |
| 607 | |
| 608 my $prev1; # where last feature q part ended | |
| 609 my $prev2; # where last feature s part ended | |
| 610 | |
| 611 my $string; | |
| 612 | |
| 613 # Use strandedness info of query and hit to make sure both sets of | |
| 614 # start and end coordinates are oriented the right way around. | |
| 615 my $f1start; | |
| 616 my $f1end; | |
| 617 my $f2end; | |
| 618 my $f2start; | |
| 619 | |
| 620 if ($strand == 1) { | |
| 621 $f1start = $f[0]->start; | |
| 622 $f1end = $f[-1]->end; | |
| 623 } else { | |
| 624 $f1start = $f[-1]->start; | |
| 625 $f1end = $f[0]->end; | |
| 626 } | |
| 627 | |
| 628 if ($hstrand == 1) { | |
| 629 $f2start = $f[0]->hstart; | |
| 630 $f2end = $f[-1]->hend; | |
| 631 } else { | |
| 632 $f2start = $f[-1]->hstart; | |
| 633 $f2end = $f[0]->hend; | |
| 634 } | |
| 635 | |
| 636 # | |
| 637 # Loop through each portion of alignment and construct cigar string | |
| 638 # | |
| 639 | |
| 640 foreach my $f (@f) { | |
| 641 # | |
| 642 # Sanity checks | |
| 643 # | |
| 644 | |
| 645 if (!$f->isa("Bio::EnsEMBL::FeaturePair")) { | |
| 646 throw("Array element [$f] is not a Bio::EnsEMBL::FeaturePair"); | |
| 647 } | |
| 648 if( defined($f->hstrand()) && $f->hstrand() != $hstrand ) { | |
| 649 throw("Inconsistent hstrands in feature array"); | |
| 650 } | |
| 651 if( defined($f->strand()) && ($f->strand != $strand)) { | |
| 652 throw("Inconsistent strands in feature array"); | |
| 653 } | |
| 654 if ( defined($name) && $name ne $f->slice->name()) { | |
| 655 throw("Inconsistent names in feature array [$name - ". | |
| 656 $f->slice->name()."]"); | |
| 657 } | |
| 658 if ( defined($hname) && $hname ne $f->hseqname) { | |
| 659 throw("Inconsistent hit names in feature array [$hname - ". | |
| 660 $f->hseqname . "]"); | |
| 661 } | |
| 662 if ( defined($score) && $score ne $f->score) { | |
| 663 throw("Inconsisent scores in feature array [$score - " . | |
| 664 $f->score . "]"); | |
| 665 } | |
| 666 if (defined($f->percent_id) && $percent ne $f->percent_id) { | |
| 667 throw("Inconsistent pids in feature array [$percent - " . | |
| 668 $f->percent_id . "]"); | |
| 669 } | |
| 670 if(defined($pvalue) && $pvalue != $f->p_value()) { | |
| 671 throw("Inconsistant p_values in feature arraw [$pvalue " . | |
| 672 $f->p_value() . "]"); | |
| 673 } | |
| 674 if($seqname && $seqname ne $f->seqname){ | |
| 675 throw("Inconsistent seqname in feature array [$seqname - ". | |
| 676 $f->seqname . "]"); | |
| 677 } | |
| 678 my $start1 = $f->start; #source sequence alignment start | |
| 679 my $start2 = $f->hstart(); #hit sequence alignment start | |
| 680 | |
| 681 # | |
| 682 # More sanity checking | |
| 683 # | |
| 684 if (defined($prev1)) { | |
| 685 if ( $strand == 1 ) { | |
| 686 if ($f->start < $prev1) { | |
| 687 throw("Inconsistent coords in feature array (forward strand).\n" . | |
| 688 "Start [".$f->start()."] in current feature should be greater " . | |
| 689 "than previous feature end [$prev1]."); | |
| 690 } | |
| 691 } else { | |
| 692 if ($f->end > $prev1) { | |
| 693 throw("Inconsistent coords in feature array (reverse strand).\n" . | |
| 694 "End [".$f->end() ."] should be less than previous feature " . | |
| 695 "start [$prev1]."); | |
| 696 } | |
| 697 } | |
| 698 } | |
| 699 | |
| 700 my $length = ($f->end - $f->start + 1); #length of source seq alignment | |
| 701 my $hlength = ($f->hend - $f->hstart + 1); #length of hit seq alignment | |
| 702 | |
| 703 # using multiplication to avoid rounding errors, hence the | |
| 704 # switch from query to hit for the ratios | |
| 705 | |
| 706 # | |
| 707 # Yet more sanity checking | |
| 708 # | |
| 709 if($query_unit > $hit_unit){ | |
| 710 # I am going to make the assumption here that this situation will | |
| 711 # only occur with DnaPepAlignFeatures, this may not be true | |
| 712 my $query_p_length = sprintf "%.0f", ($length/$query_unit); | |
| 713 my $hit_p_length = sprintf "%.0f", ($hlength * $hit_unit); | |
| 714 if( $query_p_length != $hit_p_length) { | |
| 715 throw( "Feature lengths not comparable Lengths:" .$length . | |
| 716 " " . $hlength . " Ratios:" . $query_unit . " " . | |
| 717 $hit_unit ); | |
| 718 } | |
| 719 } else{ | |
| 720 my $query_d_length = sprintf "%.0f", ($length*$hit_unit); | |
| 721 my $hit_d_length = sprintf "%.0f", ($hlength * $query_unit); | |
| 722 if( $length * $hit_unit != $hlength * $query_unit ) { | |
| 723 throw( "Feature lengths not comparable Lengths:" . $length . | |
| 724 " " . $hlength . " Ratios:" . $query_unit . " " . | |
| 725 $hit_unit ); | |
| 726 } | |
| 727 } | |
| 728 | |
| 729 my $hlengthfactor = ($query_unit/$hit_unit); | |
| 730 | |
| 731 # | |
| 732 # Check to see if there is an I type (insertion) gap: | |
| 733 # If there is a space between the end of the last source sequence | |
| 734 # alignment and the start of this one, then this is an insertion | |
| 735 # | |
| 736 | |
| 737 my $insertion_flag = 0; | |
| 738 if( $strand == 1 ) { | |
| 739 if( ( defined $prev1 ) && ( $f->start > $prev1 + 1 )) { | |
| 740 | |
| 741 #there is an insertion | |
| 742 $insertion_flag = 1; | |
| 743 my $gap = $f->start - $prev1 - 1; | |
| 744 if( $gap == 1 ) { | |
| 745 $gap = ""; # no need for a number if gap length is 1 | |
| 746 } | |
| 747 $string .= "$gap"."I"; | |
| 748 | |
| 749 } | |
| 750 | |
| 751 #shift our position in the source seq alignment | |
| 752 $prev1 = $f->end(); | |
| 753 } else { | |
| 754 | |
| 755 if(( defined $prev1 ) && ($f->end + 1 < $prev1 )) { | |
| 756 | |
| 757 #there is an insertion | |
| 758 $insertion_flag = 1; | |
| 759 my $gap = $prev1 - $f->end() - 1; | |
| 760 if( $gap == 1 ) { | |
| 761 $gap = ""; # no need for a number if gap length is 1 | |
| 762 } | |
| 763 $string .= "$gap"."I"; | |
| 764 } | |
| 765 | |
| 766 #shift our position in the source seq alignment | |
| 767 $prev1 = $f->start(); | |
| 768 } | |
| 769 | |
| 770 # | |
| 771 # Check to see if there is a D type (deletion) gap | |
| 772 # There is a deletion gap if there is a space between the end of the | |
| 773 # last portion of the hit sequence alignment and this one | |
| 774 # | |
| 775 if( $hstrand == 1 ) { | |
| 776 if(( defined $prev2 ) && ( $f->hstart() > $prev2 + 1 )) { | |
| 777 | |
| 778 #there is a deletion | |
| 779 my $gap = $f->hstart - $prev2 - 1; | |
| 780 my $gap2 = int( $gap * $hlengthfactor + 0.5 ); | |
| 781 | |
| 782 if( $gap2 == 1 ) { | |
| 783 $gap2 = ""; # no need for a number if gap length is 1 | |
| 784 } | |
| 785 $string .= "$gap2"."D"; | |
| 786 | |
| 787 #sanity check, Should not be an insertion and deletion | |
| 788 if($insertion_flag) { | |
| 789 if ($message_only_once) { | |
| 790 warning("Should not be an deletion and insertion on the " . | |
| 791 "same alignment region. cigar_line=$string\n"); | |
| 792 $message_only_once = 0; | |
| 793 } | |
| 794 } | |
| 795 } | |
| 796 #shift our position in the hit seq alignment | |
| 797 $prev2 = $f->hend(); | |
| 798 | |
| 799 } else { | |
| 800 if( ( defined $prev2 ) && ( $f->hend() + 1 < $prev2 )) { | |
| 801 | |
| 802 #there is a deletion | |
| 803 my $gap = $prev2 - $f->hend - 1; | |
| 804 my $gap2 = int( $gap * $hlengthfactor + 0.5 ); | |
| 805 | |
| 806 if( $gap2 == 1 ) { | |
| 807 $gap2 = ""; # no need for a number if gap length is 1 | |
| 808 } | |
| 809 $string .= "$gap2"."D"; | |
| 810 | |
| 811 #sanity check, Should not be an insertion and deletion | |
| 812 if($insertion_flag) { | |
| 813 if ($message_only_once) { | |
| 814 warning("Should not be an deletion and insertion on the " . | |
| 815 "same alignment region. prev2 = $prev2; f->hend() = " . | |
| 816 $f->hend() . "; cigar_line = $string;\n"); | |
| 817 $message_only_once = 0; | |
| 818 } | |
| 819 } | |
| 820 } | |
| 821 #shift our position in the hit seq alignment | |
| 822 $prev2 = $f->hstart(); | |
| 823 } | |
| 824 | |
| 825 my $matchlength = $f->end() - $f->start() + 1; | |
| 826 if( $matchlength == 1 ) { | |
| 827 $matchlength = ""; | |
| 828 } | |
| 829 $string .= $matchlength."M"; | |
| 830 } | |
| 831 | |
| 832 $self->{'start'} = $f1start; | |
| 833 $self->{'end'} = $f1end; | |
| 834 $self->{'seqname'} = $seqname; | |
| 835 $self->{'strand'} = $strand; | |
| 836 $self->{'score'} = $score; | |
| 837 $self->{'percent_id'} = $percent; | |
| 838 $self->{'analysis'} = $analysis; | |
| 839 $self->{'slice'} = $slice; | |
| 840 $self->{'hslice'} = $hslice; | |
| 841 $self->{'hstart'} = $f2start; | |
| 842 $self->{'hend'} = $f2end; | |
| 843 $self->{'hstrand'} = $hstrand; | |
| 844 $self->{'hseqname'} = $hname; | |
| 845 $self->{'cigar_string'} = $string; | |
| 846 $self->{'p_value'} = $pvalue; | |
| 847 $self->{'external_db_id'} = $external_db_id; | |
| 848 $self->{'hcoverage'} = $hcoverage; | |
| 849 $self->{'group_id'} = $group_id; | |
| 850 $self->{'level_id'} = $level_id; | |
| 851 } | |
| 852 | |
| 853 | |
| 854 | |
| 855 | |
| 856 | |
| 857 | |
| 858 =head2 _hit_unit | |
| 859 | |
| 860 Args : none | |
| 861 Description: abstract method, overwrite with something that returns | |
| 862 one or three | |
| 863 Returntype : int 1,3 | |
| 864 Exceptions : none | |
| 865 Caller : internal | |
| 866 Status : Stable | |
| 867 | |
| 868 =cut | |
| 869 | |
| 870 sub _hit_unit { | |
| 871 my $self = shift; | |
| 872 throw( "Abstract method call!" ); | |
| 873 } | |
| 874 | |
| 875 | |
| 876 | |
| 877 =head2 _query_unit | |
| 878 | |
| 879 Args : none | |
| 880 Description: abstract method, overwrite with something that returns | |
| 881 one or three | |
| 882 Returntype : int 1,3 | |
| 883 Exceptions : none | |
| 884 Caller : internal | |
| 885 Status : Stable | |
| 886 | |
| 887 =cut | |
| 888 | |
| 889 sub _query_unit { | |
| 890 my $self = shift; | |
| 891 throw( "Abstract method call!" ); | |
| 892 } | |
| 893 | |
| 894 | |
| 895 | |
| 896 | |
| 897 1; |
