ensembl: variant_effect_predictor/Bio/Tools/Sim4/Results.pm comparison

comparison variant_effect_predictor/Bio/Tools/Sim4/Results.pm @ 0:1f6dce3d34e0

Uploaded

author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:1f6dce3d34e0
+#
+# BioPerl module for Bio::Tools::Sim4::Results
+#
+# Cared for by Ewan Birney <birney@sanger.ac.uk>
+#          and Hilmar Lapp <hlapp@gmx.net>
+#
+# Copyright Ewan Birney and Hilmar Lapp
+#
+# You may distribute this module under the same terms as perl itself
+# POD documentation - main docs before the code
+=head1 NAME
+Bio::Tools::Sim4::Results - Results of one Sim4 run
+=head1 SYNOPSIS
+# to preset the order of EST and genomic file as given on the sim4
+# command line:
+my $sim4 = Bio::Tools::Sim4::Results->new(-file => 'result.sim4',
+-estfirst => 1);
+# to let the order be determined automatically (by length comparison):
+$sim4 = Bio::Tools::Sim4::Results->new( -file => 'sim4.results' );
+# filehandle:
+$sim4 = Bio::Tools::Sim4::Results->new( -fh   => \*INPUT );
+# parse the results
+while(my $exonset = $sim4->next_exonset()) {
+# $exonset is-a Bio::SeqFeature::Generic with Bio::Tools::Sim4::Exons
+# as sub features
+print "Delimited on sequence ", $exonset->seq_id(),
+"from ", $exonset->start(), " to ", $exonset->end(), "\n";
+foreach my $exon ( $exonset->sub_SeqFeature() ) {
+	  # $exon is-a Bio::SeqFeature::FeaturePair
+	  print "Exon from ", $exon->start, " to ", $exon->end,
+" on strand ", $exon->strand(), "\n";
+# you can get out what it matched using the est_hit attribute
+my $homol = $exon->est_hit();
+print "Matched to sequence ", $homol->seq_id,
+" at ", $homol->start," to ", $homol->end, "\n";
+}
+}
+# essential if you gave a filename at initialization (otherwise the file
+# stays open)
+$sim4->close();
+=head1 DESCRIPTION
+The sim4 module provides a parser and results object for sim4 output. The
+sim4 results are specialised types of SeqFeatures, meaning you can add them
+to AnnSeq objects fine, and manipulate them in the "normal" seqfeature manner.
+The sim4 Exon objects are Bio::SeqFeature::FeaturePair inherited objects. The
+$esthit = $exon-E<gt>est_hit() is the alignment as a feature on the matching
+object (normally, an EST), in which the start/end points are where the hit
+lies.
+To make this module work sensibly you need to run
+sim4 genomic.fasta est.database.fasta
+or
+sim4 est.fasta genomic.database.fasta
+To get the sequence identifiers recorded for the first sequence, too, use
+A=4 as output option for sim4.
+One fiddle here is that there are only two real possibilities to the matching
+criteria: either one sequence needs reversing or not. Because of this, it
+is impossible to tell whether the match is in the forward or reverse strand
+of the genomic DNA. We solve this here by assuming that the genomic DNA is
+always forward. As a consequence, the strand attribute of the matching EST is
+unknown, and the strand attribute of the genomic DNA (i.e., the Exon object)
+will reflect the direction of the hit.
+See the documentation of parse_next_alignment() for abilities of the parser
+to deal with the different output format options of sim4.
+=head1 FEEDBACK
+=head2 Mailing Lists
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+bioperl-l@bioperl.org          - General discussion
+http://bio.perl.org/MailList.html             - About the mailing lists
+=head2 Reporting Bugs
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+bioperl-bugs@bio.perl.org
+http://bugzilla.bioperl.org/
+=head1 AUTHOR - Ewan Birney, Hilmar Lapp
+Email birney@sanger.ac.uk
+hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
+Describe contact details here
+=head1 APPENDIX
+The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
+=cut
+# Let the code begin...
+package Bio::Tools::Sim4::Results;
+use vars qw(@ISA);
+use strict;
+# Object preamble - inherits from Bio::Root::Object
+use File::Basename;
+use Bio::Root::Root;
+use Bio::Tools::AnalysisResult;
+use Bio::Tools::Sim4::Exon;
+@ISA = qw(Bio::Tools::AnalysisResult);
+sub _initialize_state {
+my($self,@args) = @_;
+# call the inherited method first
+my $make = $self->SUPER::_initialize_state(@args);
+my ($est_is_first) = $self->_rearrange([qw(ESTFIRST)], @args);
+delete($self->{'_est_is_first'});
+$self->{'_est_is_first'} = $est_is_first if(defined($est_is_first));
+$self->analysis_method("Sim4");
+}
+=head2 analysis_method
+Usage     : $sim4->analysis_method();
+Purpose   : Inherited method. Overridden to ensure that the name matches
+/sim4/i.
+Returns   : String
+Argument  : n/a
+=cut
+#-------------
+sub analysis_method {
+#-------------
+my ($self, $method) = @_;
+if($method && ($method !~ /sim4/i)) {
+	$self->throw("method $method not supported in " . ref($self));
+}
+return $self->SUPER::analysis_method($method);
+}
+=head2 parse_next_alignment
+Title   : parse_next_alignment
+Usage   : @exons = $sim4_result->parse_next_alignment;
+foreach $exon (@exons) {
+# do something
+}
+Function: Parses the next alignment of the Sim4 result file and returns the
+found exons as an array of Bio::Tools::Sim4::Exon objects. Call
+this method repeatedly until an empty array is returned to get the
+results for all alignments.
+The $exon->seq_id() attribute will be set to the identifier of the
+respective sequence for both sequences if A=4 was used in the sim4
+run, and otherwise for the second sequence only. If the output does
+not contain the identifier, the filename stripped of path and
+extension is used instead. In addition, the full filename
+will be recorded for both features ($exon inherits off
+Bio::SeqFeature::SimilarityPair) as tag 'filename'. The length
+is accessible via the seqlength() attribute of $exon->query() and
+$exon->est_hit().
+Note that this method is capable of dealing with outputs generated
+with format 0,1,3, and 4 (via the A=n option to sim4). It
+automatically determines which of the two sequences has been
+reversed, and adjusts the coordinates for that sequence. It will
+also detect whether the EST sequence(s) were given as first or as
+second file to sim4, unless this has been specified at creation
+time of the object.
+Example :
+Returns : An array of Bio::Tools::Sim4::Exon objects
+Args    :
+=cut
+sub parse_next_alignment {
+my ($self) = @_;
+my @exons = ();
+my %seq1props = ();
+my %seq2props = ();
+# we refer to the properties of each seq by reference
+my ($estseq, $genomseq, $to_reverse);
+my $started = 0;
+my $hit_direction = 1;
+my $output_fmt = 3; # same as 0 and 1 (we cannot deal with A=2 produced
+# output yet)
+while(defined($_ = $self->_readline())) {
+#chomp();
+#
+# bascially, each sim4 'hit' starts with seq1...
+#
+/^seq1/ && do {
+	   if($started) {
+	       $self->_pushback($_);
+	       last;
+	   }
+	   $started = 1;
+	   # filename and length of seq 1
+	   /^seq1\s+=\s+(\S+)\,\s+(\d+)/ ||
+	       $self->throw("Sim4 parsing error on seq1 [$_] line. Sorry!");
+	   $seq1props{'filename'} = $1;
+	   $seq1props{'length'} = $2;
+	   next;
+};
+/^seq2/ && do {
+	   # the second hit has also the database name in the >name syntax
+	   # (in brackets).
+	   /^seq2\s+=\s+(\S+)\s+\(>?(\S+)\s*\)\,\s+(\d+)/||
+	       $self->throw("Sim4 parsing error on seq2 [$_] line. Sorry!");
+	   $seq2props{'filename'} = $1;
+	   $seq2props{'seqname'} = $2;
+	   $seq2props{'length'} = $3;
+	   next;
+};
+if(/^>(\S+)\s*(.*)$/) {
+	   # output option was A=4, which not only gives the complete
+	   # description lines, but also causes the longer sequence to be
+	   # reversed if the second file contained one (genomic) sequence
+	   $seq1props{'seqname'} = $1;
+	   $seq1props{'description'} = $2 if $2;
+	   $output_fmt = 4;
+	   # we handle seq1 and seq2 both here
+	   if(defined($_ = $self->_readline()) && (/^>(\S+)\s*(.*)$/)) {
+	       $seq2props{'seqname'} = $1; # redundant, since already set above
+	       $seq2props{'description'} = $2 if $2;
+	   }
+	   next;
+}
+/^\(complement\)/ && do {
+	   $hit_direction = -1;
+	   next;
+};
+# this matches
+# start-end (start-end) pctid%
+if(/(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%/) {
+	   $seq1props{'start'} = $1;
+	   $seq1props{'end'} = $2;
+	   $seq2props{'start'} = $3;
+	   $seq2props{'end'} = $4;
+	   my $pctid   = $5;
+	   if(! defined($estseq)) {
+	       # for the first time here: need to set the references referring
+	       # to seq1 and seq2
+	       if(! exists($self->{'_est_is_first'})) {
+		   # detect which one is the EST by looking at the lengths,
+		   # and assume that this holds throughout the entire result
+		   # file (i.e., when this method is called for the next
+		   # alignment, this will not be checked again)
+		   if($seq1props{'length'} > $seq2props{'length'}) {
+		       $self->{'_est_is_first'} = 0;
+		   } else {
+		       $self->{'_est_is_first'} = 1;
+		   }
+	       }
+	       if($self->{'_est_is_first'}) {
+		   $estseq = \%seq1props;
+		   $genomseq = \%seq2props;
+		   # if the EST is given first, A=4 selects the genomic
+		   # seq for being reversed (reversing the EST is default)
+		   $to_reverse = ($output_fmt == 4) ? $genomseq : $estseq;
+	       } else {
+		   $estseq = \%seq2props;
+		   $genomseq = \%seq1props;
+		   # if the EST is the second, A=4 does not change the
+		   # seq being reversed (always the EST is reversed)
+		   $to_reverse = $estseq;
+	       }
+	   }
+	   if($hit_direction == -1) {
+	       # we have to reverse the coordinates of one of both seqs
+	       my $tmp = $to_reverse->{'start'};
+	       $to_reverse->{'start'} =
+		   $to_reverse->{'length'} - $to_reverse->{'end'} + 1;
+	       $to_reverse->{'end'} = $to_reverse->{'length'} - $tmp + 1;
+	   }
+	   # create and initialize the exon object
+	   my $exon = Bio::Tools::Sim4::Exon->new(
+					    '-start' => $genomseq->{'start'},
+					    '-end'   => $genomseq->{'end'},
+					    '-strand' => $hit_direction);
+	   if(exists($genomseq->{'seqname'})) {
+	       $exon->seq_id($genomseq->{'seqname'});
+	   } else {
+	       # take filename stripped of path as fall back
+	       my ($basename) = &File::Basename::fileparse($genomseq->{'filename'}, '\..*');
+	       $exon->seq_id($basename);
+	   }
+	   $exon->feature1()->add_tag_value('filename',
+					    $genomseq->{'filename'});
+	   # feature1 is supposed to be initialized to a Similarity object,
+# but we provide a safety net
+	   if($exon->feature1()->can('seqlength')) {
+	       $exon->feature1()->seqlength($genomseq->{'length'});
+	   } else {
+	       $exon->feature1()->add_tag_value('SeqLength',
+						$genomseq->{'length'});
+	   }
+	   # create and initialize the feature wrapping the 'hit' (the EST)
+	   my $fea2 = Bio::SeqFeature::Similarity->new(
+'-start' => $estseq->{'start'},
+					    '-end'   => $estseq->{'end'},
+					    '-strand' => 0,
+					    '-primary' => "aligning_EST");
+	   if(exists($estseq->{'seqname'})) {
+	       $fea2->seq_id($estseq->{'seqname'});
+	   } else {
+	       # take filename stripped of path as fall back
+	       my ($basename) =
+		   &File::Basename::fileparse($estseq->{'filename'}, '\..*');
+	       $fea2->seq_id($basename);
+	   }
+	   $fea2->add_tag_value('filename', $estseq->{'filename'});
+	   $fea2->seqlength($estseq->{'length'});
+	   # store
+	   $exon->est_hit($fea2);
+	   # general properties
+	   $exon->source_tag($self->analysis_method());
+	   $exon->percentage_id($pctid);
+	   $exon->score($exon->percentage_id());
+	   # push onto array
+	   push(@exons, $exon);
+	   next; # back to while loop
+}
+}
+return @exons;
+}
+=head2 next_exonset
+Title   : next_exonset
+Usage   : $exonset = $sim4_result->parse_next_exonset;
+print "Exons start at ", $exonset->start(),
+"and end at ", $exonset->end(), "\n";
+foreach $exon ($exonset->sub_SeqFeature()) {
+# do something
+}
+Function: Parses the next alignment of the Sim4 result file and returns the
+set of exons as a container of features. The container is itself
+a Bio::SeqFeature::Generic object, with the Bio::Tools::Sim4::Exon
+objects as sub features. Start, end, and strand of the container
+will represent the total region covered by the exons of this set.
+See the documentation of parse_next_alignment() for further
+reference about parsing and how the information is stored.
+Example :
+Returns : An Bio::SeqFeature::Generic object holding Bio::Tools::Sim4::Exon
+objects as sub features.
+Args    :
+=cut
+sub next_exonset {
+my $self = shift;
+my $exonset;
+# get the next array of exons
+my @exons = $self->parse_next_alignment();
+return if($#exons < 0);
+# create the container of exons as a feature object itself, with the
+# data of the first exon for initialization
+$exonset = Bio::SeqFeature::Generic->new('-start' => $exons[0]->start(),
+					     '-end' => $exons[0]->end(),
+					     '-strand' => $exons[0]->strand(),
+					     '-primary' => "ExonSet");
+$exonset->source_tag($exons[0]->source_tag());
+$exonset->seq_id($exons[0]->seq_id());
+# now add all exons as sub features, with enabling EXPANsion of the region
+# covered in total
+foreach my $exon (@exons) {
+	$exonset->add_sub_SeqFeature($exon, 'EXPAND');
+}
+return $exonset;
+}
+=head2 next_feature
+Title   : next_feature
+Usage   : while($exonset = $sim4->next_feature()) {
+# do something
+}
+Function: Does the same as L<next_exonset()>. See there for documentation of
+the functionality. Call this method repeatedly until FALSE is
+returned.
+The returned object is actually a SeqFeatureI implementing object.
+This method is required for classes implementing the
+SeqAnalysisParserI interface, and is merely an alias for
+next_exonset() at present.
+Example :
+Returns : A Bio::SeqFeature::Generic object.
+Args    :
+=cut
+sub next_feature {
+my ($self,@args) = @_;
+# even though next_exonset doesn't expect any args (and this method
+# does neither), we pass on args in order to be prepared if this changes
+# ever
+return $self->next_exonset(@args);
+}
+1;

Mercurial > repos > mahtabm > ensembl

comparison variant_effect_predictor/Bio/Tools/Sim4/Results.pm @ 0:1f6dce3d34e0