diff variant_effect_predictor/Bio/SearchIO/waba.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/SearchIO/waba.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,517 @@
+# $Id: waba.pm,v 1.8 2002/12/11 22:12:32 jason Exp $
+#
+# BioPerl module for Bio::SearchIO::waba
+#
+# Cared for by Jason Stajich <jason@bioperl.org>
+#
+# Copyright Jason Stajich
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::SearchIO::waba - SearchIO parser for Jim Kent WABA program
+alignment output
+
+=head1 SYNOPSIS
+
+    # do not use this object directly, rather through Bio::SearchIO
+
+    use Bio::SearchIO;
+    my $in = new Bio::SearchIO(-format => 'waba',
+			       -file   => 'output.wab');
+    while( my $result = $in->next_result ) {
+	while( my $hit = $result->next_hit ) {
+	    while( my $hsp = $result->next_hsp ) {
+
+	    }
+	}
+    }
+
+=head1 DESCRIPTION
+
+This parser will process the waba output (NOT the human readable format).
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org              - General discussion
+  http://bioperl.org/MailList.shtml  - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bioperl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Jason Stajich
+
+Email jason@bioperl.org
+
+Describe contact details here
+
+=head1 CONTRIBUTORS
+
+Additional contributors names and emails here
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods.
+Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::SearchIO::waba;
+use vars qw(@ISA  %MODEMAP %MAPPING @STATES);
+use strict;
+
+# Object preamble - inherits from Bio::Root::Root
+
+use Bio::SearchIO;
+
+use POSIX;
+
+BEGIN { 
+    # mapping of NCBI Blast terms to Bioperl hash keys
+    %MODEMAP = ('WABAOutput' => 'result',
+		'Hit'         => 'hit',
+		'Hsp'         => 'hsp'
+		);
+    @STATES = qw(Hsp_qseq Hsp_hseq Hsp_stateseq);
+    %MAPPING = 
+	( 
+	  'Hsp_query-from'=>  'HSP-query_start',
+	  'Hsp_query-to'  =>  'HSP-query_end',
+	  'Hsp_hit-from'  =>  'HSP-hit_start',
+	  'Hsp_hit-to'    =>  'HSP-hit_end',
+	  'Hsp_qseq'      =>  'HSP-query_seq',
+	  'Hsp_hseq'      =>  'HSP-hit_seq',
+	  'Hsp_midline'   =>  'HSP-homology_seq',
+	  'Hsp_stateseq'  =>  'HSP-hmmstate_seq',
+	  'Hsp_align-len' =>  'HSP-hsp_length',
+	  
+	  'Hit_id'        => 'HIT-name',
+	  'Hit_accession' => 'HIT-accession',
+
+	  'WABAOutput_program'  => 'RESULT-algorithm_name',
+	  'WABAOutput_version'  => 'RESULT-algorithm_version',
+	  'WABAOutput_query-def'=> 'RESULT-query_name',
+	  'WABAOutput_query-db' => 'RESULT-query_database',
+ 	  'WABAOutput_db'       => 'RESULT-database_name',
+	  );
+}
+
+
+@ISA = qw(Bio::SearchIO );
+
+=head2 new
+
+ Title   : new
+ Usage   : my $obj = new Bio::SearchIO::waba();
+ Function: Builds a new Bio::SearchIO::waba object 
+ Returns : Bio::SearchIO::waba
+ Args    : see Bio::SearchIO
+
+=cut
+
+sub _initialize {
+    my ($self,@args) = @_;
+    $self->SUPER::_initialize(@args);
+    $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::WABAResult'));
+
+    $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::WABAHSP'));
+}
+
+
+=head2 next_result
+
+ Title   : next_result
+ Usage   : my $hit = $searchio->next_result;
+ Function: Returns the next Result from a search
+ Returns : Bio::Search::Result::ResultI object
+ Args    : none
+
+=cut
+
+sub next_result{
+    my ($self) = @_;
+    
+    my ($curquery,$curhit);
+    my $state = -1;
+    $self->start_document();
+    my @hit_signifs;
+    while( defined ($_ = $self->_readline )) { 
+	
+	if( $state == -1 ) {
+	    my ($qid, $qhspid,$qpercent, $junk,
+		$alnlen,$qdb,$qacc,$qstart,$qend,$qstrand,
+		$hitdb,$hacc,$hstart,$hend,
+		$hstrand) =
+		    ( /^(\S+)\.(\S+)\s+align\s+ # get the queryid
+		      (\d+(\.\d+)?)\%\s+     # get the percentage
+		      of\s+(\d+)\s+  # get the length of the alignment
+		      (\S+)\s+           # this is the query database
+		      (\S+):(\d+)\-(\d+) # The accession:start-end for query
+		      \s+([\-\+])        # query strand
+		      \s+(\S+)\.         # hit db
+		      (\S+):(\d+)\-(\d+) # The accession:start-end for hit
+		      \s+([\-\+])\s*$    # hit strand
+		      /ox );
+	    
+	    # Curses.  Jim's code is 0 based, the following is to readjust
+	    $hstart++; $hend++; $qstart++; $qend++;
+	    
+	    if( ! defined $alnlen ) {
+		$self->warn("Unable to parse the rest of the WABA alignment info for: $_");
+		last;
+	    }
+	    $self->{'_reporttype'} = 'WABA'; # hardcoded - only 
+	                                     # one type of WABA AFAIK	    
+	    if( defined $curquery && 
+		$curquery ne $qid ) { 
+		$self->end_element({'Name' => 'Hit'});
+		$self->_pushback($_);
+		$self->end_element({'Name' => 'WABAOutput'});
+		return $self->end_document();
+	    } 
+	    
+	    if( defined $curhit &&
+		$curhit ne $hacc) {
+		# slight duplication here -- keep these in SYNC
+		$self->end_element({'Name' => 'Hit'});
+		$self->start_element({'Name' => 'Hit'});
+		$self->element({'Name' => 'Hit_id',
+				'Data' => $hacc});
+		$self->element({'Name' => 'Hit_accession',
+				'Data' => $hacc});
+
+	    } elsif ( ! defined $curquery ) {
+		$self->start_element({'Name' => 'WABAOutput'});
+		$self->{'_result_count'}++;
+		$self->element({'Name' => 'WABAOutput_query-def',
+				'Data' => $qid });
+		$self->element({'Name' => 'WABAOutput_program',
+				'Data' => 'WABA'});
+		$self->element({'Name' => 'WABAOutput_query-db',
+				'Data' => $qdb});
+		$self->element({'Name' => 'WABAOutput_db',
+				'Data' => $hitdb});
+		
+		# slight duplication here -- keep these N'SYNC ;-)
+		$self->start_element({'Name' => 'Hit'});
+		$self->element({'Name' => 'Hit_id',
+				'Data' => $hacc});
+		$self->element({'Name' => 'Hit_accession',
+				'Data' => $hacc});
+	    }
+
+	    
+	    # strand is inferred by start,end values
+	    # in the Result Builder
+	    if( $qstrand eq '-' ) {
+		($qstart,$qend) = ($qend,$qstart);
+	    }
+	    if( $hstrand eq '-' ) {
+		($hstart,$hend) = ($hend,$hstart);
+	    }
+
+	    $self->start_element({'Name' => 'Hsp'});
+	    $self->element({'Name' => 'Hsp_query-from',
+			    'Data' => $qstart});
+	    $self->element({'Name' => 'Hsp_query-to',
+			    'Data' => $qend});
+	    $self->element({'Name' => 'Hsp_hit-from',
+			    'Data' => $hstart});
+	    $self->element({'Name' => 'Hsp_hit-to',
+			    'Data' => $hend});
+	    $self->element({'Name' => 'Hsp_align-len',
+			    'Data' => $alnlen});
+	    
+	    $curquery = $qid;
+	    $curhit   = $hacc;
+	    $state = 0;
+	} elsif( ! defined $curquery ) {
+	    $self->warn("skipping because no Hit begin line was recognized\n$_") if( $_ !~ /^\s+$/ );
+	    next;
+	} else { 
+	    chomp;
+	    $self->element({'Name' => $STATES[$state++],
+			    'Data' => $_});
+	    if( $state >= scalar @STATES ) {
+		$state = -1;
+		$self->end_element({'Name' => 'Hsp'});
+	    }
+	}
+    }
+    if( defined $curquery  ) {
+	$self->end_element({'Name' => 'Hit'});
+	$self->end_element({'Name' => 'WABAOutput'});
+	return $self->end_document();
+    }
+    return undef;
+}
+
+=head2 start_element
+
+ Title   : start_element
+ Usage   : $eventgenerator->start_element
+ Function: Handles a start element event
+ Returns : none
+ Args    : hashref with at least 2 keys 'Data' and 'Name'
+
+
+=cut
+
+sub start_element{
+   my ($self,$data) = @_;
+    # we currently don't care about attributes
+    my $nm = $data->{'Name'};    
+   if( my $type = $MODEMAP{$nm} ) {
+	$self->_mode($type);
+	if( $self->_eventHandler->will_handle($type) ) {
+	    my $func = sprintf("start_%s",lc $type);
+	    $self->_eventHandler->$func($data->{'Attributes'});
+	}						 
+	unshift @{$self->{'_elements'}}, $type;
+    }
+    if($nm eq 'WABAOutput') {
+	$self->{'_values'} = {};
+	$self->{'_result'}= undef;
+	$self->{'_mode'} = '';
+    }
+
+}
+
+=head2 end_element
+
+ Title   : start_element
+ Usage   : $eventgenerator->end_element
+ Function: Handles an end element event
+ Returns : none
+ Args    : hashref with at least 2 keys 'Data' and 'Name'
+
+
+=cut
+
+sub end_element {
+    my ($self,$data) = @_;
+    my $nm = $data->{'Name'};
+    my $rc;
+    # Hsp are sort of weird, in that they end when another
+    # object begins so have to detect this in end_element for now
+    if( $nm eq 'Hsp' ) {
+	foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) {
+	    $self->element({'Name' => $_,
+			    'Data' => $self->{'_last_hspdata'}->{$_}});
+	}
+	$self->{'_last_hspdata'} = {}
+    }
+
+    if( my $type = $MODEMAP{$nm} ) {
+	if( $self->_eventHandler->will_handle($type) ) {
+	    my $func = sprintf("end_%s",lc $type);
+	    $rc = $self->_eventHandler->$func($self->{'_reporttype'},
+					      $self->{'_values'});	    
+	}
+	shift @{$self->{'_elements'}};
+
+    } elsif( $MAPPING{$nm} ) { 	
+	if ( ref($MAPPING{$nm}) =~ /hash/i ) {
+	    my $key = (keys %{$MAPPING{$nm}})[0];	    
+	    $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
+	} else {
+	    $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
+	}
+    } else { 
+	$self->warn( "unknown nm $nm ignoring\n");
+    }
+    $self->{'_last_data'} = ''; # remove read data if we are at 
+				# end of an element
+    $self->{'_result'} = $rc if( $nm eq 'WABAOutput' );
+    return $rc;
+
+}
+
+=head2 element
+
+ Title   : element
+ Usage   : $eventhandler->element({'Name' => $name, 'Data' => $str});
+ Function: Convience method that calls start_element, characters, end_element
+ Returns : none
+ Args    : Hash ref with the keys 'Name' and 'Data'
+
+
+=cut
+
+sub element{
+   my ($self,$data) = @_;
+   $self->start_element($data);
+   $self->characters($data);
+   $self->end_element($data);
+}
+
+
+=head2 characters
+
+ Title   : characters
+ Usage   : $eventgenerator->characters($str)
+ Function: Send a character events
+ Returns : none
+ Args    : string
+
+
+=cut
+
+sub characters{
+   my ($self,$data) = @_;   
+
+   return unless ( defined $data->{'Data'} );
+   if( $data->{'Data'} =~ /^\s+$/ ) {
+       return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
+   }
+
+   if( $self->in_element('hsp') && 
+       $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
+       
+       $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
+   }  
+   
+   $self->{'_last_data'} = $data->{'Data'}; 
+}
+
+=head2 _mode
+
+ Title   : _mode
+ Usage   : $obj->_mode($newval)
+ Function: 
+ Example : 
+ Returns : value of _mode
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub _mode{
+    my ($self,$value) = @_;
+    if( defined $value) {
+	$self->{'_mode'} = $value;
+    }
+    return $self->{'_mode'};
+}
+
+=head2 within_element
+
+ Title   : within_element
+ Usage   : if( $eventgenerator->within_element($element) ) {}
+ Function: Test if we are within a particular element
+           This is different than 'in' because within can be tested
+           for a whole block.
+ Returns : boolean
+ Args    : string element name 
+
+
+=cut
+
+sub within_element{
+   my ($self,$name) = @_;  
+   return 0 if ( ! defined $name &&
+		 ! defined  $self->{'_elements'} ||
+		 scalar @{$self->{'_elements'}} == 0) ;
+   foreach (  @{$self->{'_elements'}} ) {
+       if( $_ eq $name  ) {
+	   return 1;
+       } 
+   }
+   return 0;
+}
+
+=head2 in_element
+
+ Title   : in_element
+ Usage   : if( $eventgenerator->in_element($element) ) {}
+ Function: Test if we are in a particular element
+           This is different than 'in' because within can be tested
+           for a whole block.
+ Returns : boolean
+ Args    : string element name 
+
+
+=cut
+
+sub in_element{
+   my ($self,$name) = @_;  
+   return 0 if ! defined $self->{'_elements'}->[0];
+   return ( $self->{'_elements'}->[0] eq $name)
+}
+
+
+=head2 start_document
+
+ Title   : start_document
+ Usage   : $eventgenerator->start_document
+ Function: Handles a start document event
+ Returns : none
+ Args    : none
+
+
+=cut
+
+sub start_document{
+    my ($self) = @_;
+    $self->{'_lasttype'} = '';
+    $self->{'_values'} = {};
+    $self->{'_result'}= undef;
+    $self->{'_mode'} = '';
+    $self->{'_elements'} = [];
+}
+
+
+=head2 end_document
+
+ Title   : end_document
+ Usage   : $eventgenerator->end_document
+ Function: Handles an end document event
+ Returns : Bio::Search::Result::ResultI object
+ Args    : none
+
+
+=cut
+
+sub end_document{
+   my ($self,@args) = @_;
+   return $self->{'_result'};
+}
+
+=head2 result_count
+
+ Title   : result_count
+ Usage   : my $count = $searchio->result_count
+ Function: Returns the number of results we have processed
+ Returns : integer
+ Args    : none
+
+
+=cut
+
+sub result_count {
+    my $self = shift;
+    return $self->{'_result_count'};
+}
+
+sub report_count { shift->result_count }
+
+1;