Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/SearchIO/fasta.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/fasta.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,1066 @@ +# $Id: fasta.pm,v 1.33.2.3 2003/08/28 16:01:03 jason Exp $ +# +# BioPerl module for Bio::SearchIO::fasta +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SearchIO::fasta - A SearchIO parser for FASTA results + +=head1 SYNOPSIS + + # Do not use this object directly, use it through the SearchIO system + use Bio::SearchIO; + my $searchio = new Bio::SearchIO(-format => 'fasta', + -file => 'report.FASTA'); + while( my $result = $searchio->next_result ) { + # .... + } + +=head1 DESCRIPTION + +This object contains the event based parsing code for FASTA format reports. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchIO::fasta; +use vars qw(@ISA %MODEMAP %MAPPING $IDLENGTH); +use strict; + +# Object preamble - inherits from Bio::Root::RootI + +use Bio::SearchIO; +use POSIX; + +BEGIN { + # Set IDLENGTH to a new value if you have + # compile FASTA with a different ID length + # (actually newest FASTA allows the setting of this + # via -C parameter, default is 6) + $IDLENGTH = 6; + + # mapping of NCBI Blast terms to Bioperl hash keys + %MODEMAP = ('FastaOutput' => 'result', + 'Hit' => 'hit', + 'Hsp' => 'hsp' + ); + + # This should really be done more intelligently, like with + # XSLT + + %MAPPING = + ( + 'Hsp_bit-score' => 'HSP-bits', + 'Hsp_score' => 'HSP-score', + 'Hsp_sw-score' => 'HSP-swscore', + 'Hsp_evalue' => 'HSP-evalue', + 'Hsp_query-from'=> 'HSP-query_start', + 'Hsp_query-to' => 'HSP-query_end', + 'Hsp_hit-from' => 'HSP-hit_start', + 'Hsp_hit-to' => 'HSP-hit_end', + 'Hsp_positive' => 'HSP-conserved', + 'Hsp_identity' => 'HSP-identical', + 'Hsp_gaps' => 'HSP-hsp_gaps', + 'Hsp_hitgaps' => 'HSP-hit_gaps', + 'Hsp_querygaps' => 'HSP-query_gaps', + 'Hsp_qseq' => 'HSP-query_seq', + 'Hsp_hseq' => 'HSP-hit_seq', + 'Hsp_midline' => 'HSP-homology_seq', + 'Hsp_align-len' => 'HSP-hsp_length', + 'Hsp_query-frame'=> 'HSP-query_frame', + 'Hsp_hit-frame' => 'HSP-hit_frame', + + 'Hit_id' => 'HIT-name', + 'Hit_len' => 'HIT-length', + 'Hit_accession' => 'HIT-accession', + 'Hit_def' => 'HIT-description', + 'Hit_signif' => 'HIT-significance', + 'Hit_score' => 'HIT-score', + + 'FastaOutput_program' => 'RESULT-algorithm_name', + 'FastaOutput_version' => 'RESULT-algorithm_version', + 'FastaOutput_query-def'=> 'RESULT-query_name', + 'FastaOutput_querydesc'=> 'RESULT-query_description', + 'FastaOutput_query-len'=> 'RESULT-query_length', + 'FastaOutput_db' => 'RESULT-database_name', + 'FastaOutput_db-len' => 'RESULT-database_entries', + 'FastaOutput_db-let' => 'RESULT-database_letters', + + 'Parameters_matrix' => { 'RESULT-parameters' => 'matrix'}, + 'Parameters_expect' => { 'RESULT-parameters' => 'expect'}, + 'Parameters_include' => { 'RESULT-parameters' => 'include'}, + 'Parameters_sc-match' => { 'RESULT-parameters' => 'match'}, + 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch'}, + 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen'}, + 'Parameters_gap-ext' => { 'RESULT-parameters' => 'gapext'}, + 'Parameters_word-size' => { 'RESULT-parameters' => 'wordsize'}, + 'Parameters_ktup' => { 'RESULT-parameters' => 'ktup'}, + 'Parameters_filter' => {'RESULT-parameters' => 'filter'}, + 'Statistics_db-num' => { 'RESULT-statistics' => 'dbentries'}, + 'Statistics_db-len' => { 'RESULT-statistics' => 'dbletters'}, + 'Statistics_hsp-len' => { 'RESULT-statistics' => 'hsplength'}, + 'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace'}, + 'Statistics_kappa' => { 'RESULT-statistics' => 'kappa' }, + 'Statistics_lambda' => { 'RESULT-statistics' => 'lambda' }, + 'Statistics_entropy' => { 'RESULT-statistics' => 'entropy'}, + ); +} + + +@ISA = qw(Bio::SearchIO ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::SearchIO::fasta(); + Function: Builds a new Bio::SearchIO::fasta object + Returns : Bio::SearchIO::fasta + Args : -idlength - set ID length to something other + than the default (7), this is only + necessary if you have compiled FASTA + with a new default id length to display + in the HSP alignment blocks + +=cut + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + return unless @args; + my ($idlength) = $self->_rearrange([qw(IDLENGTH)],@args); + $self->idlength($idlength || $IDLENGTH); + $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::FastaHSP')); + + return 1; +} + +=head2 next_result + + Title : next_result + Usage : my $hit = $searchio->next_result; + Function: Returns the next Result from a search + Returns : Bio::Search::Result::ResultI object + Args : none + +=cut + +sub next_result{ + my ($self) = @_; + + my $data = ''; + my $seentop = 0; + my $current_hsp; + $self->start_document(); + my @hit_signifs; + while( defined ($_ = $self->_readline )) { + next if( ! $self->in_element('hsp') && + /^\s+$/); # skip empty lines + if( /(\S+)\s+searches\s+a\s+((protein\s+or\s+DNA\s+sequence)|(sequence\s+database))/i || /(\S+) compares a/ || + ( m/^# / && ($_ = $self->_readline) && + /(\S+)\s+searches\s+a\s+((protein\s+or\s+DNA\s+sequence)|(sequence\s+database))/i || /(\S+) compares a/ + ) + ) { + if( $seentop ) { + $self->_pushback($_); + $self->end_element({ 'Name' => 'FastaOutput'}); + return $self->end_document(); + } + $self->{'_reporttype'} = $1; + $self->start_element({ 'Name' => 'FastaOutput' } ); + $self->{'_result_count'}++; + $seentop = 1; + + $self->element({ 'Name' => 'FastaOutput_program', + 'Data' => $self->{'_reporttype'}}); + $_ = $self->_readline(); + my ($version) = (/version\s+(\S+)/); + $version = '' unless defined $version; + $self->{'_version'} = $version; + $self->element({ 'Name' => 'FastaOutput_version', + 'Data' => $version}); + + my ($last, $leadin, $type, $querylen, $querytype, $querydef); + + while( defined($_ = $self->_readline()) ) { + if( /^ ( + (?:\s+>) | # fa33 lead-in + (?:\s*\d+\s*>>>) # fa34 mlib lead-in + ) + (.*) + /x + ) { + ($leadin, $querydef) = ($1, $2); + if ($leadin =~ m/>>>/) { + if($querydef =~ /^(.*?)\s+(?:\-\s+)?(\d+)\s+(aa|nt)\s*$/o ) { + ($querydef, $querylen, $querytype) = ($1, $2, $3); + last; + } + } else { + if( $last =~ /(\S+)[:,]\s*(\d+)\s+(aa|nt)/ ) { + ($querylen, $querytype) = ($2, $3); + $querydef ||= $1; + last; + } + } + } elsif ( m/^\s*vs\s+\S+/o ) { + if ( $last =~ /(\S+)[,:]\s+(\d+)\s+(aa|nt)/o) { + ($querydef, $querylen, $querytype) = ($1, $2, $3); + last; + } + } + $last = $_; + } + + if( $self->{'_reporttype'} && + $self->{'_reporttype'} eq 'FASTA' + ) { + if( $querytype eq 'nt') { + $self->{'_reporttype'} = 'FASTN' ; + } elsif( $querytype eq 'aa' ) { + $self->{'_reporttype'} = 'FASTP' ; + } + } + my ($name, $descr) = $querydef =~ m/^(\S+)\s*(.*?)\s*$/o; + $self->element({'Name' => 'FastaOutput_query-def', + 'Data' => $name}); + $self->element({'Name' => 'FastaOutput_querydesc', + 'Data' => $descr}); + if ($querylen) { + $self->element({'Name' => 'FastaOutput_query-len', + 'Data' => $querylen}); + } else { + $self->warn("unable to find and set query length"); + } + + if( $last =~ /^\s*vs\s+(\S+)/ || + ($last =~ /^searching\s+(\S+)\s+library/) || + (defined $_ && /^\s*vs\s+(\S+)/) || + (defined ($_ = $self->_readline()) && /^\s*vs\s+(\S+)/) + ) { + $self->element({'Name' => 'FastaOutput_db', + 'Data' => $1}); + } elsif (m/^\s+opt(?:\s+E\(\))?$/o) { + # histogram ... read over it more rapidly than the larger outer loop: + while (defined($_ = $self->_readline)) { + last if m/^>\d+/; + } + } + + } elsif( /(\d+) residues in\s+(\d+)\s+sequences/ ) { + $self->element({'Name' => 'FastaOutput_db-let', + 'Data' => $1}); + $self->element({'Name' => 'FastaOutput_db-len', + 'Data' => $2}); + $self->element({'Name' => 'Statistics_db-len', + 'Data' => $1}); + $self->element({'Name' => 'Statistics_db-num', + 'Data' => $2}); + } elsif( /Lambda=\s*(\S+)/ ) { + $self->element({'Name' => 'Statistics_lambda', + 'Data' => $1}); + } elsif (/K=\s*(\S+)/) { + $self->element({'Name' => 'Statistics_kappa', + 'Data' => $1}); + } elsif( /^\s*(Smith-Waterman).+(\S+)\s*matrix [^\]]*?(xS)?\]/ ) { + $self->element({'Name' => 'Parameters_matrix', + 'Data' => $2}); + $self->element({'Name' => 'Parameters_filter', + 'Data' => defined $3 ? 1 : 0, + }); + $self->{'_reporttype'} = $1; + + $self->element({ 'Name' => 'FastaOutput_program', + 'Data' => $self->{'_reporttype'}}); + + } elsif( /The best( related| unrelated)? scores are:/ ) { + my $rel = $1; + my @labels = split; + @labels = map { + if ($_ =~ m/^E\((\d+)\)$/o) { + $self->element({'Name' => 'Statistics_eff-space', 'Data' => $1}); + "evalue"; + } else { + $_; + } + } @labels[$rel ? 5 : 4 .. $#labels]; + + while( defined ($_ = $self->_readline() ) && + ! /^\s+$/ ) { + my @line = split; + + if ($line[-1] =~ m/\=/o && $labels[-1] eq 'fs') { + # unlabelled alignment hit; + push @labels, "aln_code"; + } + + my %data; + @data{@labels} = splice(@line, @line - @labels); + if ($line[-1] =~ m/\[([1-6rf])\]/o) { + my $fr = $1; + $data{lframe} = ($fr =~ /\d/o ? + ($fr <= 3 ? "+$fr" : "-@{[$fr-3]}") : + ($fr eq 'f' ? '+1' : '-1') + ); + pop @line; + } else { + $data{lframe} = '0'; + } + + if ($line[-1] =~ m/^\(?(\d+)\)$/) { + $data{hit_len} = $1; + pop @line; + if ($line[-1] =~ m/^\($/) { + pop @line; + } + } else { + $data{hit_len} = 0; + } + + # rebuild the first part of the line, preserving spaces: + ($_) = m/^(\S+(?:\s+\S+){$#line})/; + + my ($id, $desc) = split(/\s+/,$_,2); + my @pieces = split(/\|/,$id); + my $acc = pop @pieces; + $acc =~ s/\.\d+$//; + + @data{qw(id desc acc)} = ($id, $desc, $acc); + + push @hit_signifs, \%data; + } + } elsif( /^\s*([T]?FAST[XYAF]).+,\s*(\S+)\s*matrix[^\]]+?(xS)?\]\s*ktup:\s*(\d+)/ ) { + $self->element({'Name' => 'Parameters_matrix', + 'Data' => $2}); + $self->element({'Name' => 'Parameters_filter', + 'Data' => defined $3 ? 1 : 0, + }); + $self->element({'Name' => 'Parameters_ktup', + 'Data' => $4}); + $self->{'_reporttype'} = $1 if( $self->{'_reporttype'} !~ /FAST[PN]/i ) ; + + $self->element({ 'Name' => 'FastaOutput_program', + 'Data' => $self->{'_reporttype'}}); + + } elsif( /(?:gap\-pen|open\/ext):\s+([\-\+]?\d+)\s*\/\s*([\-\+]?\d+).+width:\s+(\d+)/ ) { + $self->element({'Name' => 'Parameters_gap-open', + 'Data' => $1}); + $self->element({'Name' => 'Parameters_gap-ext', + 'Data' => $2}); + $self->element({'Name' => 'Parameters_word-size', + 'Data' => $3}); + } elsif( /^>>(.+?)\s+\((\d+)\s*(aa|nt)\)$/ ) { + if( $self->in_element('hsp') ) { + $self->end_element({ 'Name' => 'Hsp'}); + } + if( $self->in_element('hit') ) { + $self->end_element({ 'Name' => 'Hit'}); + } + + $self->start_element({'Name' => 'Hit'}); + $self->element({ 'Name' => 'Hit_len', + 'Data' => $2}); + my ($id,$desc) = split(/\s+/,$1,2); + $self->element({ 'Name' => 'Hit_id', + 'Data' => $id}); + my @pieces = split(/\|/,$id); + my $acc = pop @pieces; + $acc =~ s/\.\d+$//; + $self->element({ 'Name' => 'Hit_accession', + 'Data' => $acc}); + $self->element({ 'Name' => 'Hit_def', + 'Data' => $desc}); + + $_ = $self->_readline(); + my ($score,$bits,$e) = /Z-score: \s* (\S+) \s* + (?: bits: \s* (\S+) \s+ )? + (?: E|expect ) \s* \(\) :? \s*(\S+)/x; + $bits = $score unless defined $bits; + + my $v = shift @hit_signifs; + if( defined $v ) { + @{$v}{qw(evalue bits z-sc)} = ($e, $bits, $score); + } + + $self->element({'Name' => 'Hit_signif', + 'Data' => $v ? $v->{evalue} : $e }); + $self->element({'Name' => 'Hit_score', + 'Data' => $v ? $v->{bits} : $bits }); + $self->start_element({'Name' => 'Hsp'}); + + $self->element({'Name' => 'Hsp_score', + 'Data' => $v ? $v->{'z-sc'} : $score }); + $self->element({'Name' => 'Hsp_evalue', + 'Data' => $v ? $v->{evalue} : $e }); + $self->element({'Name' => 'Hsp_bit-score', + 'Data' => $v ? $v->{bits} : $bits }); + $_ = $self->_readline(); + if( /Smith-Waterman score:\s*(\d+)/ ) { + $self->element({'Name' => 'Hsp_sw-score', + 'Data' => $1}); + } + if( / (\S+)\% \s* identity + (?:\s* \( \s* (\S+)\% \s* ungapped \) )? + \s* in \s* (\d+) \s+ (?:aa|nt) \s+ overlap \s* + \( (\d+) \- (\d+) : (\d+) \- (\d+) \) + /x ) { + my ($identper,$gapper,$len,$querystart, + $queryend,$hitstart,$hitend) = ($1,$2,$3,$4,$5,$6,$7); + my $ident = POSIX::ceil(($identper/100) * $len); + my $gaps = ( defined $gapper ) ? POSIX::ceil ( ($gapper/100) * $len) : undef; + + $self->element({'Name' => 'Hsp_gaps', + 'Data' => $gaps}); + $self->element({'Name' => 'Hsp_identity', + 'Data' => $ident}); + $self->element({'Name' => 'Hsp_positive', + 'Data' => $ident}); + $self->element({'Name' => 'Hsp_align-len', + 'Data' => $len}); + + $self->element({'Name' => 'Hsp_query-from', + 'Data' => $querystart}); + $self->element({'Name' => 'Hsp_query-to', + 'Data' => $queryend}); + $self->element({'Name' => 'Hsp_hit-from', + 'Data' => $hitstart}); + $self->element({'Name' => 'Hsp_hit-to', + 'Data' => $hitend}); + + } + + if ($v) { + $self->element({'Name' => 'Hsp_querygaps', 'Data' => $v->{qgaps} }) if exists $v->{qgaps}; + $self->element({'Name' => 'Hsp_hitgaps', 'Data' => $v->{lgaps} }) if exists $v->{lgaps}; + + if ($self->{'_reporttype'} =~ m/^FAST[NXY]$/o) { + if( 8 == scalar grep { exists $v->{$_} } qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) ) { + if ($v->{ax0} < $v->{an0}) { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "-@{[(($v->{px0} - $v->{ax0}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "+@{[(($v->{an0} - $v->{pn0}) % 3) + 1]}" }); + } + if ($v->{ax1} < $v->{an1}) { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "-@{[(($v->{px1} - $v->{ax1}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "+@{[(($v->{an1} - $v->{pn1}) % 3) + 1]}" }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => $v->{lframe} }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => 0 }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => 0 }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => $v->{lframe} }); + } + + } else { + $self->warn( "unable to parse FASTA score line: $_"); + } + } elsif( /\d+\s*residues\s*in\s*\d+\s*query\s*sequences/ ) { + if( $self->in_element('hsp') ) { + $self->end_element({'Name' => 'Hsp'}); + } + if( $self->in_element('hit') ) { + $self->end_element({'Name' => 'Hit'}); + } + +# $_ = $self->_readline(); +# my ( $liblen,$libsize) = /(\d+)\s+residues\s*in(\d+)\s*library/; + # fast forward to the end of the file as there is + # nothing else left to do with this file and want to be sure and + # reset it + while(defined($_ = $self->_readline() ) ) { + last if( /^Function used was/); + if( /(\S+)\s+searches\s+a\s+(protein\s+or\s+DNA\s+sequence)|(sequence\s+database)/ ) { + $self->_pushback($_); + } + } + + if (@hit_signifs) { + # process remaining best hits + for my $h (@hit_signifs) { + # Hsp_score Hsp_evalue Hsp_bit-score + # Hsp_sw-score Hsp_gaps Hsp_identity Hsp_positive + # Hsp_align-len Hsp_query-from Hsp_query-to + # Hsp_hit-from Hsp_hit-to Hsp_qseq Hsp_midline + + $self->start_element({'Name' => 'Hit'}); + $self->element({ 'Name' => 'Hit_len', + 'Data' => $h->{hit_len} + }) if exists $h->{hit_len}; + $self->element({ 'Name' => 'Hit_id', + 'Data' => $h->{id} + }) if exists $h->{id}; + $self->element({ 'Name' => 'Hit_accession', + 'Data' => $h->{acc} + }) if exists $h->{acc}; + $self->element({ 'Name' => 'Hit_def', + 'Data' => $h->{desc} + }) if exists $h->{desc}; + $self->element({'Name' => 'Hit_signif', + 'Data' => $h->{evalue} + }) if exists $h->{evalue}; + $self->element({'Name' => 'Hit_score', + 'Data' => $h->{bits} + }) if exists $h->{bits}; + + $self->start_element({'Name' => 'Hsp'}); + $self->element({'Name' => 'Hsp_score', 'Data' => $h->{'z-sc'} }) if exists $h->{'z-sc'}; + $self->element({'Name' => 'Hsp_evalue', 'Data' => $h->{evalue} }) if exists $h->{evalue}; + $self->element({'Name' => 'Hsp_bit-score', 'Data' => $h->{bits} }) if exists $h->{bits}; + $self->element({'Name' => 'Hsp_sw-score', 'Data' => $h->{sw} }) if exists $h->{sw}; + $self->element({'Name' => 'Hsp_gaps', 'Data' => $h->{'%_gid'} }) if exists $h->{'%_gid'}; + $self->element({'Name' => 'Hsp_identity', 'Data' => POSIX::ceil($h->{'%_id'} * $h->{alen}) }) + if (exists $h->{'%_id'} && exists $h->{alen}); + $self->element({'Name' => 'Hsp_positive', 'Data' => 100 * $h->{'%_id'} }) if exists $h->{'%_id'}; + $self->element({'Name' => 'Hsp_align-len', 'Data' => $h->{alen} }) if exists $h->{alen}; + $self->element({'Name' => 'Hsp_query-from', 'Data' => $h->{an0} }) if exists $h->{an0}; + $self->element({'Name' => 'Hsp_query-to', 'Data' => $h->{ax0} }) if exists $h->{ax0}; + $self->element({'Name' => 'Hsp_hit-from', 'Data' => $h->{an1} }) if exists $h->{an1}; + $self->element({'Name' => 'Hsp_hit-to', 'Data' => $h->{ax1} }) if exists $h->{ax1}; + + $self->element({'Name' => 'Hsp_querygaps', 'Data' => $h->{qgaps} }) if exists $h->{qgaps}; + $self->element({'Name' => 'Hsp_hitgaps', 'Data' => $h->{lgaps} }) if exists $h->{lgaps}; + + if ($self->{'_reporttype'} =~ m/^FAST[NXY]$/o) { + if( 8 == scalar grep { exists $h->{$_} } qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) ) { + if ($h->{ax0} < $h->{an0}) { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "-@{[(($h->{px0} - $h->{ax0}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "+@{[(($h->{an0} - $h->{pn0}) % 3) + 1]}" }); + } + if ($h->{ax1} < $h->{an1}) { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "-@{[(($h->{px1} - $h->{ax1}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "+@{[(($h->{an1} - $h->{pn1}) % 3) + 1]}" }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => $h->{lframe} }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => 0 }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => 0 }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => $h->{lframe} }); + } + + $self->end_element({'Name' => 'Hsp'}); + $self->end_element({'Name' => 'Hit'}); + } + } + + $self->end_element({ 'Name' => 'FastaOutput'}); + return $self->end_document(); + } elsif( /^\s*\d+\s*>>>/) { + if ($self->within_element('FastaOutput')) { + if( $self->in_element('hsp') ) { + $self->end_element({'Name' => 'Hsp'}); + } + if( $self->in_element('hit') ) { + $self->end_element({'Name' => 'Hit'}); + } + + if (@hit_signifs) { + # process remaining best hits + for my $h (@hit_signifs) { + $self->start_element({'Name' => 'Hit'}); + $self->element({ 'Name' => 'Hit_len', + 'Data' => $h->{hit_len} + }) if exists $h->{hit_len}; + $self->element({ 'Name' => 'Hit_id', + 'Data' => $h->{id} + }) if exists $h->{id}; + $self->element({ 'Name' => 'Hit_accession', + 'Data' => $h->{acc} + }) if exists $h->{acc}; + $self->element({ 'Name' => 'Hit_def', + 'Data' => $h->{desc} + }) if exists $h->{desc}; + $self->element({'Name' => 'Hit_signif', + 'Data' => $h->{evalue} + }) if exists $h->{evalue}; + $self->element({'Name' => 'Hit_score', + 'Data' => $h->{bits} + }) if exists $h->{bits}; + + $self->start_element({'Name' => 'Hsp'}); + $self->element({'Name' => 'Hsp_score', 'Data' => $h->{'z-sc'} }) if exists $h->{'z-sc'}; + $self->element({'Name' => 'Hsp_evalue', 'Data' => $h->{evalue} }) if exists $h->{evalue}; + $self->element({'Name' => 'Hsp_bit-score', 'Data' => $h->{bits} }) if exists $h->{bits}; + $self->element({'Name' => 'Hsp_sw-score', 'Data' => $h->{sw} }) if exists $h->{sw}; + $self->element({'Name' => 'Hsp_gaps', 'Data' => $h->{'%_gid'} }) if exists $h->{'%_gid'}; + $self->element({'Name' => 'Hsp_identity', 'Data' => POSIX::ceil($h->{'%_id'} * $h->{alen}) }) + if (exists $h->{'%_id'} && exists $h->{alen}); + $self->element({'Name' => 'Hsp_positive', 'Data' => $h->{'%_id'} }) if exists $h->{'%_id'}; + $self->element({'Name' => 'Hsp_align-len', 'Data' => $h->{alen} }) if exists $h->{alen}; + $self->element({'Name' => 'Hsp_query-from', 'Data' => $h->{an0} }) if exists $h->{an0}; + $self->element({'Name' => 'Hsp_query-to', 'Data' => $h->{ax0} }) if exists $h->{ax0}; + $self->element({'Name' => 'Hsp_hit-from', 'Data' => $h->{an1} }) if exists $h->{an1}; + $self->element({'Name' => 'Hsp_hit-to', 'Data' => $h->{ax1} }) if exists $h->{ax1}; + + $self->element({'Name' => 'Hsp_querygaps', 'Data' => $h->{qgaps} }) if exists $h->{qgaps}; + $self->element({'Name' => 'Hsp_hitgaps', 'Data' => $h->{lgaps} }) if exists $h->{lgaps}; + + if ($self->{'_reporttype'} =~ m/^FAST[NXY]$/o) { + if( 8 == scalar grep { exists $h->{$_} } qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) ) { + if ($h->{ax0} < $h->{an0}) { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "-@{[(($h->{px0} - $h->{ax0}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => "+@{[(($h->{an0} - $h->{pn0}) % 3) + 1]}" }); + } + if ($h->{ax1} < $h->{an1}) { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "-@{[(($h->{px1} - $h->{ax1}) % 3) + 1]}" }); + } else { + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "+@{[(($h->{an1} - $h->{pn1}) % 3) + 1]}" }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => $h->{lframe} }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => 0 }); + } + } else { + $self->element({'Name' => 'Hsp_query-frame', 'Data' => 0 }); + $self->element({'Name' => 'Hsp_hit-frame', 'Data' => $h->{lframe} }); + } + + $self->end_element({'Name' => 'Hsp'}); + $self->end_element({'Name' => 'Hit'}); + } + } + $self->end_element({ 'Name' => 'FastaOutput' }); + $self->_pushback($_); + return $self->end_document(); + } else { + $self->start_element({ 'Name' => 'FastaOutput' }); + $self->{'_result_count'}++; + $seentop = 1; + $self->element({ 'Name' => 'FastaOutput_program', + 'Data' => $self->{'_reporttype'} }); + $self->element({ 'Name' => 'FastaOutput_version', + 'Data' => $self->{'_version'} }); + + my ($type, $querylen, $querytype, $querydef); + + if( /^\s*\d+\s*>>>(.*)/ ) { + $querydef = $1; + if($querydef =~ /^(.*?)\s+(?:\-\s+)?(\d+)\s+(aa|nt)\s*$/o ) { + ($querydef, $querylen, $querytype) = ($1, $2, $3); + } + } + + if( $self->{'_reporttype'} && + $self->{'_reporttype'} eq 'FASTA' + ) { + if( $querytype eq 'nt') { + $self->{'_reporttype'} = 'FASTN' ; + } elsif( $querytype eq 'aa' ) { + $self->{'_reporttype'} = 'FASTP' ; + } + } + my ($name,$descr) = ($querydef =~ m/^(\S+)(?:\s+(.*))?\s*$/o); + $self->element({'Name' => 'FastaOutput_query-def', + 'Data' => $name}); + $self->element({'Name' => 'FastaOutput_querydesc', + 'Data' => $descr}); + if ($querylen) { + $self->element({'Name' => 'FastaOutput_query-len', + 'Data' => $querylen}); + } else { + $self->warn("unable to find and set query length"); + } + + + if( defined ($_ = $self->_readline()) && /^\s*vs\s+(\S+)/ ) { + $self->element({'Name' => 'FastaOutput_db', + 'Data' => $1}); + } + } + } elsif( $self->in_element('hsp' ) ) { + + my @data = ( '','',''); + my $count = 0; + my $len = $self->idlength + 1; + my ($seq1_id); + while( defined($_ ) ) { + chomp; + $self->debug( "$count $_\n"); + + if( /residues in \d+\s+query\s+sequences/) { + $self->_pushback($_); + last; + } elsif( /^>>/ ) { + $self->_pushback($_); + last; + } elsif (/^\s*\d+\s*>>>/) { + $self->_pushback($_); + last; + } + if( $count == 0 ) { + unless( /^\s+\d+/ || /^\s+$/) { + $self->_pushback($_); + $count = 2; + } + } elsif( $count == 1 || $count == 3 ) { + if( /^(\S+)\s+/ ) { + $len = CORE::length($1) if $len < CORE::length($1); + s/\s+$//; # trim trailing spaces,we don't want them + $data[$count-1] = substr($_,$len); + } elsif( /^\s+(\d+)/ ) { + $count = -1; + $self->_pushback($_); + } elsif( /^\s+$/ || length($_) == 0) { + $count = 5; + # going to skip these + } else { + $self->warn("Unrecognized alignment line ($count) '$_'"); + } + } elsif( $count == 2 ) { + if( /^\s+\d+\s+/ ) { + $self->warn("$_\n"); + $count = 4; + } else { + # toss the first IDLENGTH characters of the line + if( length($_) >= $len ) { + $data[$count-1] = substr($_,$len); + } + } + } + last if( $count++ >= 5); + $_ = $self->_readline(); + } + if( length($data[0]) > 0 || length($data[2]) > 0 ) { + $self->characters({'Name' => 'Hsp_qseq', + 'Data' => $data[0] }); + $self->characters({'Name' => 'Hsp_midline', + 'Data' => $data[1]}); + $self->characters({'Name' => 'Hsp_hseq', + 'Data' => $data[2]}); + } + } else { + if( ! $seentop ) { + $self->debug($_); + $self->warn("unrecognized FASTA Family report file!"); + return undef; + } + } + } +} + + +=head2 start_element + + Title : start_element + Usage : $eventgenerator->start_element + Function: Handles a start element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub start_element{ + my ($self,$data) = @_; + # we currently don't care about attributes + my $nm = $data->{'Name'}; + if( my $type = $MODEMAP{$nm} ) { + $self->_mode($type); + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("start_%s",lc $type); + $self->_eventHandler->$func($data->{'Attributes'}); + } + unshift @{$self->{'_elements'}}, $type; + } + if($nm eq 'FastaOutput') { + $self->{'_values'} = {}; + $self->{'_result'}= undef; + $self->{'_mode'} = ''; + } + +} + +=head2 end_element + + Title : start_element + Usage : $eventgenerator->end_element + Function: Handles an end element event + Returns : none + Args : hashref with at least 2 keys 'Data' and 'Name' + + +=cut + +sub end_element { + my ($self,$data) = @_; + my $nm = $data->{'Name'}; + my $rc; + # Hsp are sort of weird, in that they end when another + # object begins so have to detect this in end_element for now + if( $nm eq 'Hsp' ) { + foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) { + $self->element({'Name' => $_, + 'Data' => $self->{'_last_hspdata'}->{$_}}); + } + $self->{'_last_hspdata'} = {} + } + + if( my $type = $MODEMAP{$nm} ) { + if( $self->_eventHandler->will_handle($type) ) { + my $func = sprintf("end_%s",lc $type); + $rc = $self->_eventHandler->$func($self->{'_reporttype'}, + $self->{'_values'}); + } + shift @{$self->{'_elements'}}; + + } elsif( $MAPPING{$nm} ) { + if ( ref($MAPPING{$nm}) =~ /hash/i ) { + my $key = (keys %{$MAPPING{$nm}})[0]; + $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'}; + } else { + $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'}; + } + } else { + $self->warn( "unknown nm $nm, ignoring\n"); + } + $self->{'_last_data'} = ''; # remove read data if we are at + # end of an element + $self->{'_result'} = $rc if( $nm eq 'FastaOutput' ); + return $rc; + +} + +=head2 element + + Title : element + Usage : $eventhandler->element({'Name' => $name, 'Data' => $str}); + Function: Convience method that calls start_element, characters, end_element + Returns : none + Args : Hash ref with the keys 'Name' and 'Data' + + +=cut + +sub element{ + my ($self,$data) = @_; + $self->start_element($data); + $self->characters($data); + $self->end_element($data); +} + + +=head2 characters + + Title : characters + Usage : $eventgenerator->characters($str) + Function: Send a character events + Returns : none + Args : string + + +=cut + +sub characters{ + my ($self,$data) = @_; + + return unless ( defined $data->{'Data'} ); + if( $data->{'Data'} =~ /^\s+$/ ) { + return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/; + } + + if( $self->in_element('hsp') && + $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) { + + $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'}; + } + + $self->{'_last_data'} = $data->{'Data'}; +} + +=head2 _mode + + Title : _mode + Usage : $obj->_mode($newval) + Function: + Example : + Returns : value of _mode + Args : newvalue (optional) + + +=cut + +sub _mode{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_mode'} = $value; + } + return $self->{'_mode'}; +} + +=head2 within_element + + Title : within_element + Usage : if( $eventgenerator->within_element($element) ) {} + Function: Test if we are within a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub within_element{ + my ($self,$name) = @_; + return 0 if ( ! defined $name && + ! defined $self->{'_elements'} || + scalar @{$self->{'_elements'}} == 0) ; + foreach ( @{$self->{'_elements'}} ) { + if( $_ eq $name || $_ eq $MODEMAP{$name} ) { + return 1; + } + } + return 0; +} + +=head2 in_element + + Title : in_element + Usage : if( $eventgenerator->in_element($element) ) {} + Function: Test if we are in a particular element + This is different than 'in' because within can be tested + for a whole block. + Returns : boolean + Args : string element name + + +=cut + +sub in_element{ + my ($self,$name) = @_; + return 0 if ! defined $self->{'_elements'}->[0]; + return ( $self->{'_elements'}->[0] eq $name || + (exists $MODEMAP{$name} && $self->{'_elements'}->[0] eq $MODEMAP{$name}) + ); +} + + +=head2 start_document + + Title : start_document + Usage : $eventgenerator->start_document + Function: Handles a start document event + Returns : none + Args : none + + +=cut + +sub start_document{ + my ($self) = @_; + $self->{'_lasttype'} = ''; + $self->{'_values'} = {}; + $self->{'_result'}= undef; + $self->{'_mode'} = ''; + $self->{'_elements'} = []; +} + + +=head2 end_document + + Title : end_document + Usage : $eventgenerator->end_document + Function: Handles an end document event + Returns : Bio::Search::Result::ResultI object + Args : none + + +=cut + +sub end_document{ + my ($self,@args) = @_; + return $self->{'_result'}; +} + +=head2 idlength + + Title : idlength + Usage : $obj->idlength($newval) + Function: Internal storage of the length of the ID desc + in the HSP alignment blocks. Defaults to + $IDLENGTH class variable value + Returns : value of idlength + Args : newvalue (optional) + + +=cut + +sub idlength{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_idlength'} = $value; + } + return $self->{'_idlength'} || $IDLENGTH; +} + + +=head2 result_count + + Title : result_count + Usage : my $count = $searchio->result_count + Function: Returns the number of results we have processed + Returns : integer + Args : none + + +=cut + +sub result_count { + my $self = shift; + return $self->{'_result_count'}; +} + +1; +