diff variant_effect_predictor/Bio/SearchIO/exonerate.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/SearchIO/exonerate.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,578 @@
+# $Id: exonerate.pm,v 1.3.2.3 2003/03/29 20:30:54 jason Exp $
+#
+# BioPerl module for Bio::SearchIO::exonerate
+#
+# Cared for by Jason Stajich <jason@bioperl.org>
+#
+# Copyright Jason Stajich
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::SearchIO::exonerate - parser for Exonerate
+
+=head1 SYNOPSIS
+
+  # do not use this module directly, it is a driver for SearchIO
+
+  use Bio::SearchIO;
+  my $searchio = new Bio::SearchIO(-file => 'file.exonerate',
+                                   -format => 'exonerate');
+
+
+  while( my $r = $searchio->next_result ) {
+    print $r->query_name, "\n";
+  }
+
+=head1 DESCRIPTION
+
+This is a driver for the SearchIO system for parsing Exonerate (Guy
+Slater) output.  You can get Exonerate at
+http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/exonerate/?cvsroot=Ensembl
+[until Guy puts up a Web reference,publication for it.]).
+
+An optional parameter -min_intron is supported by the L<new>
+initialization method.  This is if you run Exonerate with a different
+minimum intron length (default is 30) the parser will be able to
+detect the difference between standard deletions and an intron.  Still
+some room to play with there that might cause this to get
+misinterpreted that has not been fully tested or explored.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org              - General discussion
+  http://bioperl.org/MailList.shtml  - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bioperl.org
+  http://bioperl.org/bioperl-bugs/
+
+=head1 AUTHOR - Jason Stajich
+
+Email jason@bioperl.org
+
+Describe contact details here
+
+=head1 CONTRIBUTORS
+
+Additional contributors names and emails here
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods.
+Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::SearchIO::exonerate;
+use strict;
+use vars qw(@ISA @STATES %MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $MIN_INTRON);
+use Bio::SearchIO;
+
+@ISA = qw(Bio::SearchIO );
+
+use POSIX;
+
+
+%MODEMAP = ('ExonerateOutput' => 'result',
+    'Hit'             => 'hit',
+    'Hsp'             => 'hsp'
+    );
+%MAPPING =
+    (
+    'Hsp_query-from'=>  'HSP-query_start',
+    'Hsp_query-to'  =>  'HSP-query_end',
+    'Hsp_hit-from'  =>  'HSP-hit_start',
+    'Hsp_hit-to'    =>  'HSP-hit_end',
+    'Hsp_qseq'      =>  'HSP-query_seq',
+    'Hsp_hseq'      =>  'HSP-hit_seq',
+    'Hsp_midline'   =>  'HSP-homology_seq',
+    'Hsp_score'     =>  'HSP-score',
+    'Hsp_qlength'   =>  'HSP-query_length',
+    'Hsp_hlength'   =>  'HSP-hit_length',
+    'Hsp_align-len' =>  'HSP-hsp_length',
+    'Hsp_identity'  =>  'HSP-identical',
+    'Hsp_gaps'       => 'HSP-hsp_gaps',
+    'Hsp_hitgaps'    => 'HSP-hit_gaps',
+    'Hsp_querygaps'  => 'HSP-query_gaps',
+
+    'Hit_id'        => 'HIT-name',
+    'Hit_desc'      => 'HIT-description',
+    'Hit_len'       => 'HIT-length',
+    'Hit_score'     => 'HIT-score',
+
+    'ExonerateOutput_program'   => 'RESULT-algorithm_name',
+    'ExonerateOutput_query-def' => 'RESULT-query_name',
+    'ExonerateOutput_query-desc'=> 'RESULT-query_description',
+    'ExonerateOutput_query-len' => 'RESULT-query_length',
+    );
+
+$DEFAULT_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
+
+$MIN_INTRON=30; # This is the minimum intron size
+
+=head2 new
+
+ Title   : new
+ Usage   : my $obj = new Bio::SearchIO::exonerate();
+ Function: Builds a new Bio::SearchIO::exonerate object
+ Returns : an instance of Bio::SearchIO::exonerate
+ Args    :
+
+
+=cut
+
+sub new {
+    my ($class) = shift;
+    my $self = $class->SUPER::new(@_);
+
+    my ($min_intron) = $self->_rearrange([qw(MIN_INTRON)], @_);
+    if( $min_intron ) {
+	$MIN_INTRON = $min_intron;
+    }
+    $self;
+}
+
+=head2 next_result
+
+ Title   : next_result
+ Usage   : my $hit = $searchio->next_result;
+ Function: Returns the next Result from a search
+ Returns : Bio::Search::Result::ResultI object
+ Args    : none
+
+=cut
+
+sub next_result{
+   my ($self) = @_;
+   $self->{'_last_data'} = '';
+   my ($reporttype,$seenquery,$reportline);
+   $self->start_document();
+   my @hit_signifs;
+   my $seentop;
+   my (@q_ex, @m_ex, @h_ex); ## gc addition
+   while( defined($_ = $self->_readline) ) {
+       if( /^Query:\s+(\S+)(\s+(.+))?/ ) {
+	   if( $seentop ) {
+	       $self->end_element({'Name' => 'ExonerateOutput'});
+	       $self->_pushback($_);
+	       return $self->end_document();
+	   }
+	   $seentop = 1;
+	   my ($nm,$desc) = ($1,$2);
+	   chomp($desc) if defined $desc;
+	   $self->{'_result_count'}++;
+	   $self->start_element({'Name' => 'ExonerateOutput'});
+	   $self->element({'Name' => 'ExonerateOutput_query-def',
+			   'Data' => $nm });
+	   $self->element({'Name' => 'ExonerateOutput_query-desc',
+			   'Data' => $desc });
+	   $self->element({'Name' => 'ExonerateOutput_program',
+			    'Data' => 'Exonerate' });
+
+       } elsif ( /^Target:\s+(\S+)(\s+(.+))?/ ) {
+	    my ($nm,$desc) = ($1,$2);
+	   chomp($desc) if defined $desc;
+	   $self->start_element({'Name' => 'Hit'});
+	   $self->element({'Name' => 'Hit_id',
+			   'Data' => $nm});
+	   $self->element({'Name' => 'Hit_desc',
+			   'Data' => $desc});
+       } elsif(  s/^cigar:\s+(\S+)\s+          # query sequence id
+		 (\d+)\s+(\d+)\s+([\-\+])\s+   # query start-end-strand
+		 (\S+)\s+                      # target sequence id
+		 (\d+)\s+(\d+)\s+([\-\+])\s+   # target start-end-strand
+		 (\d+)\s+                      # score
+		 //ox ) {
+	
+	   ## gc note:
+	   ## $qe and $he are no longer used for calculating the ends,
+	   ## just the $qs and $hs values and the alignment and insert lenghts
+	   my ($qs,$qe,$qstrand) = ($2,$3,$4);
+	   my ($hs,$he,$hstrand) = ($6,$7,$8);
+	   my $score = $9;
+#	   $self->element({'Name' => 'ExonerateOutput_query-len',
+#			   'Data' => $qe});
+#	   $self->element({'Name' => 'Hit_len',
+#			   'Data' => $he});
+	
+	   my @rest = split;
+	   if( $qstrand eq '-' ) {
+	       $qstrand = -1;
+	       ($qs,$qe) = ($qe,$qs); # flip-flop if we're on opp strand
+	   		$qs--; $qe++;
+	   } else { $qstrand = 1; }
+	   if( $hstrand eq '-' ) {
+	       $hstrand = -1;
+	       ($hs,$he) = ($he,$hs); # flip-flop if we're on opp strand
+	       $hs--; $he++;
+	   } else { $hstrand = 1; }
+	   # okay let's do this right and generate a set of HSPs
+	   # from the cigar line
+
+		## gc note:
+		## add one because these values are zero-based
+		## this calculation was originally done lower in the code,
+		## but it's clearer to do it just once at the start
+	   $qs++;
+	   $hs++;
+
+	   my ($aln_len,$inserts,$deletes) = (0,0,0);
+	   while( @rest >= 2 ) {
+	       my ($state,$len) = (shift @rest, shift @rest);
+	       if( $state eq 'I' ) {
+		   $inserts+=$len;
+	       } elsif( $state eq 'D' ) {
+		   if( $len >= $MIN_INTRON ) {
+		       $self->start_element({'Name' => 'Hsp'});
+
+		       $self->element({'Name' => 'Hsp_score',
+				       'Data' => $score});
+		       $self->element({'Name' => 'Hsp_align-len',
+				       'Data' => $aln_len});
+		       $self->element({'Name' => 'Hsp_identity',
+				       'Data' => $aln_len -
+					   ($inserts + $deletes)});
+		
+		       # HSP ends where the other begins
+		       $self->element({'Name' => 'Hsp_query-from',
+				       'Data' => $qs});
+		       ## gc note:
+		       ## $qs is now the start of the next hsp
+		       ## the end of this hsp is 1 before this position
+		       ## (or 1 after in case of reverse strand)
+		       $qs += $aln_len*$qstrand;
+		       $self->element({'Name' => 'Hsp_query-to',
+				       'Data' => $qs - ($qstrand*1)});
+		
+		       $hs += $deletes*$hstrand;
+		       $self->element({'Name' => 'Hsp_hit-from',
+				       'Data' => $hs});
+		       $hs += $aln_len*$hstrand;
+		       $self->element({'Name' => 'Hsp_hit-to',
+				       'Data' => $hs-($hstrand*1)});
+		
+		       $self->element({'Name' => 'Hsp_align-len',
+				       'Data' => $aln_len + $inserts
+					   + $deletes});
+		       $self->element({'Name' => 'Hsp_identity',
+				       'Data' => $aln_len });
+
+		       $self->element({'Name' => 'Hsp_gaps',
+				       'Data' => $inserts + $deletes});
+		       $self->element({'Name' => 'Hsp_querygaps',
+				       'Data' => $inserts});
+		       $self->element({'Name' => 'Hsp_hitgaps',
+				       'Data' => $deletes});
+		
+## gc addition start
+		
+		       $self->element({'Name' => 'Hsp_qseq',
+				       'Data' => shift @q_ex,
+				   });
+		       $self->element({'Name' => 'Hsp_hseq',
+				       'Data' => shift @h_ex,
+				   });
+		       $self->element({'Name' => 'Hsp_midline',
+				       'Data' => shift @m_ex,
+				   });
+## gc addition end
+		       $self->end_element({'Name' => 'Hsp'});
+		       		
+		       $aln_len = $inserts = $deletes = 0;
+		   }
+		   $deletes+=$len;		
+	       } else {
+		   $aln_len += $len;
+	       }
+	   }
+	   $self->start_element({'Name' => 'Hsp'});
+	
+## gc addition start
+		
+		       $self->element({'Name' => 'Hsp_qseq',
+				       'Data' => shift @q_ex,
+				   });
+		       $self->element({'Name' => 'Hsp_hseq',
+				       'Data' => shift @h_ex,
+				   });
+		       $self->element({'Name' => 'Hsp_midline',
+				       'Data' => shift @m_ex,
+				   });
+## gc addition end
+
+	   $self->element({'Name' => 'Hsp_score',
+			   'Data' => $score});
+	
+	   $self->element({'Name' => 'Hsp_query-from',
+			   'Data' => $qs});
+
+	   $qs += $aln_len*$qstrand;
+	   $self->element({'Name' => 'Hsp_query-to',
+				       'Data' => $qs - ($qstrand*1)});
+
+	   $hs += $deletes*$hstrand;
+	   $self->element({'Name' => 'Hsp_hit-from',
+			   'Data' => $hs});
+	   $hs += $aln_len*$hstrand;
+	   $self->element({'Name' => 'Hsp_hit-to',
+			   'Data' => $hs -($hstrand*1)});	
+
+	   $self->element({'Name' => 'Hsp_align-len',
+			   'Data' => $aln_len});
+	
+	   $self->element({'Name' => 'Hsp_identity',
+			   'Data' => $aln_len - ($inserts + $deletes)});
+
+	   $self->element({'Name' => 'Hsp_gaps',
+			   'Data' => $inserts + $deletes});
+	
+	   $self->element({'Name' => 'Hsp_querygaps',
+			   'Data' => $inserts});
+	   $self->element({'Name' => 'Hsp_hitgaps',
+			   'Data' => $deletes});	   	
+	   $self->end_element({'Name' => 'Hsp'});
+	   $self->element({'Name' => 'Hit_score',
+			   'Data' => $score});
+	   $self->end_element({'Name' => 'Hit'});
+	   $self->end_element({'Name' => 'ExonerateOutput'});
+
+	   return $self->end_document();	
+       } else {
+       }
+   }
+   return $self->end_document() if( $seentop );
+}
+
+=head2 start_element
+
+ Title   : start_element
+ Usage   : $eventgenerator->start_element
+ Function: Handles a start element event
+ Returns : none
+ Args    : hashref with at least 2 keys 'Data' and 'Name'
+
+
+=cut
+
+sub start_element{
+   my ($self,$data) = @_;
+   # we currently don't care about attributes
+   my $nm = $data->{'Name'};
+   my $type = $MODEMAP{$nm};
+
+   if( $type ) {
+       if( $self->_eventHandler->will_handle($type) ) {
+	   my $func = sprintf("start_%s",lc $type);
+	   $self->_eventHandler->$func($data->{'Attributes'});
+       }
+       unshift @{$self->{'_elements'}}, $type;
+
+       if($type eq 'result') {
+	   $self->{'_values'} = {};
+	   $self->{'_result'}= undef;
+       }
+   }
+
+}
+
+=head2 end_element
+
+ Title   : start_element
+ Usage   : $eventgenerator->end_element
+ Function: Handles an end element event
+ Returns : none
+ Args    : hashref with at least 2 keys 'Data' and 'Name'
+
+
+=cut
+
+sub end_element {
+    my ($self,$data) = @_;
+    my $nm = $data->{'Name'};
+    my $type = $MODEMAP{$nm};
+    my $rc;
+
+    if( $type = $MODEMAP{$nm} ) {
+	if( $self->_eventHandler->will_handle($type) ) {
+	    my $func = sprintf("end_%s",lc $type);
+	    $rc = $self->_eventHandler->$func($self->{'_reporttype'},
+					      $self->{'_values'});
+	}
+	shift @{$self->{'_elements'}};
+
+    } elsif( $MAPPING{$nm} ) {
+
+	if ( ref($MAPPING{$nm}) =~ /hash/i ) {
+	    my $key = (keys %{$MAPPING{$nm}})[0];
+	    $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
+	} else {
+	    $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
+	}
+    } else {
+	$self->debug( "unknown nm $nm, ignoring\n");
+    }
+    $self->{'_last_data'} = ''; # remove read data if we are at
+				# end of an element
+    $self->{'_result'} = $rc if( defined $type && $type eq 'result' );
+    return $rc;
+}
+
+=head2 element
+
+ Title   : element
+ Usage   : $eventhandler->element({'Name' => $name, 'Data' => $str});
+ Function: Convience method that calls start_element, characters, end_element
+ Returns : none
+ Args    : Hash ref with the keys 'Name' and 'Data'
+
+
+=cut
+
+sub element{
+   my ($self,$data) = @_;
+   $self->start_element($data);
+   $self->characters($data);
+   $self->end_element($data);
+}
+
+=head2 characters
+
+ Title   : characters
+ Usage   : $eventgenerator->characters($str)
+ Function: Send a character events
+ Returns : none
+ Args    : string
+
+
+=cut
+
+sub characters{
+   my ($self,$data) = @_;
+
+   return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ );
+
+   $self->{'_last_data'} = $data->{'Data'};
+}
+
+=head2 within_element
+
+ Title   : within_element
+ Usage   : if( $eventgenerator->within_element($element) ) {}
+ Function: Test if we are within a particular element
+           This is different than 'in' because within can be tested
+           for a whole block.
+ Returns : boolean
+ Args    : string element name
+
+
+=cut
+
+sub within_element{
+   my ($self,$name) = @_;
+   return 0 if ( ! defined $name &&
+		 ! defined  $self->{'_elements'} ||
+		 scalar @{$self->{'_elements'}} == 0) ;
+   foreach (  @{$self->{'_elements'}} ) {
+       if( $_ eq $name  ) {
+	   return 1;
+       }
+   }
+   return 0;
+}
+
+
+=head2 in_element
+
+ Title   : in_element
+ Usage   : if( $eventgenerator->in_element($element) ) {}
+ Function: Test if we are in a particular element
+           This is different than 'in' because within can be tested
+           for a whole block.
+ Returns : boolean
+ Args    : string element name
+
+
+=cut
+
+sub in_element{
+   my ($self,$name) = @_;
+   return 0 if ! defined $self->{'_elements'}->[0];
+   return ( $self->{'_elements'}->[0] eq $name)
+}
+
+=head2 start_document
+
+ Title   : start_document
+ Usage   : $eventgenerator->start_document
+ Function: Handle a start document event
+ Returns : none
+ Args    : none
+
+
+=cut
+
+sub start_document{
+    my ($self) = @_;
+    $self->{'_lasttype'} = '';
+    $self->{'_values'} = {};
+    $self->{'_result'}= undef;
+    $self->{'_elements'} = [];
+    $self->{'_reporttype'} = 'exonerate';
+}
+
+
+=head2 end_document
+
+ Title   : end_document
+ Usage   : $eventgenerator->end_document
+ Function: Handles an end document event
+ Returns : Bio::Search::Result::ResultI object
+ Args    : none
+
+
+=cut
+
+sub end_document{
+   my ($self,@args) = @_;
+   return $self->{'_result'};
+}
+
+
+sub write_result {
+   my ($self, $blast, @args) = @_;
+
+   if( not defined($self->writer) ) {
+       $self->warn("Writer not defined. Using a $DEFAULT_WRITER_CLASS");
+       $self->writer( $DEFAULT_WRITER_CLASS->new() );
+   }
+   $self->SUPER::write_result( $blast, @args );
+}
+
+sub result_count {
+    my $self = shift;
+    return $self->{'_result_count'};
+}
+
+sub report_count { shift->result_count }
+
+1;
+