Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/Tools/Sim4/Results.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Sim4/Results.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,435 @@ + +# +# BioPerl module for Bio::Tools::Sim4::Results +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# and Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Ewan Birney and Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Sim4::Results - Results of one Sim4 run + +=head1 SYNOPSIS + + # to preset the order of EST and genomic file as given on the sim4 + # command line: + my $sim4 = Bio::Tools::Sim4::Results->new(-file => 'result.sim4', + -estfirst => 1); + # to let the order be determined automatically (by length comparison): + $sim4 = Bio::Tools::Sim4::Results->new( -file => 'sim4.results' ); + # filehandle: + $sim4 = Bio::Tools::Sim4::Results->new( -fh => \*INPUT ); + + # parse the results + while(my $exonset = $sim4->next_exonset()) { + # $exonset is-a Bio::SeqFeature::Generic with Bio::Tools::Sim4::Exons + # as sub features + print "Delimited on sequence ", $exonset->seq_id(), + "from ", $exonset->start(), " to ", $exonset->end(), "\n"; + foreach my $exon ( $exonset->sub_SeqFeature() ) { + # $exon is-a Bio::SeqFeature::FeaturePair + print "Exon from ", $exon->start, " to ", $exon->end, + " on strand ", $exon->strand(), "\n"; + # you can get out what it matched using the est_hit attribute + my $homol = $exon->est_hit(); + print "Matched to sequence ", $homol->seq_id, + " at ", $homol->start," to ", $homol->end, "\n"; + } + } + + # essential if you gave a filename at initialization (otherwise the file + # stays open) + $sim4->close(); + +=head1 DESCRIPTION + +The sim4 module provides a parser and results object for sim4 output. The +sim4 results are specialised types of SeqFeatures, meaning you can add them +to AnnSeq objects fine, and manipulate them in the "normal" seqfeature manner. + +The sim4 Exon objects are Bio::SeqFeature::FeaturePair inherited objects. The +$esthit = $exon-E<gt>est_hit() is the alignment as a feature on the matching +object (normally, an EST), in which the start/end points are where the hit +lies. + +To make this module work sensibly you need to run + + sim4 genomic.fasta est.database.fasta +or + sim4 est.fasta genomic.database.fasta + +To get the sequence identifiers recorded for the first sequence, too, use +A=4 as output option for sim4. + +One fiddle here is that there are only two real possibilities to the matching +criteria: either one sequence needs reversing or not. Because of this, it +is impossible to tell whether the match is in the forward or reverse strand +of the genomic DNA. We solve this here by assuming that the genomic DNA is +always forward. As a consequence, the strand attribute of the matching EST is +unknown, and the strand attribute of the genomic DNA (i.e., the Exon object) +will reflect the direction of the hit. + +See the documentation of parse_next_alignment() for abilities of the parser +to deal with the different output format options of sim4. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney, Hilmar Lapp + +Email birney@sanger.ac.uk + hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com) + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Sim4::Results; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Object + +use File::Basename; +use Bio::Root::Root; +use Bio::Tools::AnalysisResult; +use Bio::Tools::Sim4::Exon; + +@ISA = qw(Bio::Tools::AnalysisResult); + + +sub _initialize_state { + my($self,@args) = @_; + + # call the inherited method first + my $make = $self->SUPER::_initialize_state(@args); + + my ($est_is_first) = $self->_rearrange([qw(ESTFIRST)], @args); + + delete($self->{'_est_is_first'}); + $self->{'_est_is_first'} = $est_is_first if(defined($est_is_first)); + $self->analysis_method("Sim4"); +} + +=head2 analysis_method + + Usage : $sim4->analysis_method(); + Purpose : Inherited method. Overridden to ensure that the name matches + /sim4/i. + Returns : String + Argument : n/a + +=cut + +#------------- +sub analysis_method { +#------------- + my ($self, $method) = @_; + if($method && ($method !~ /sim4/i)) { + $self->throw("method $method not supported in " . ref($self)); + } + return $self->SUPER::analysis_method($method); +} + +=head2 parse_next_alignment + + Title : parse_next_alignment + Usage : @exons = $sim4_result->parse_next_alignment; + foreach $exon (@exons) { + # do something + } + Function: Parses the next alignment of the Sim4 result file and returns the + found exons as an array of Bio::Tools::Sim4::Exon objects. Call + this method repeatedly until an empty array is returned to get the + results for all alignments. + + The $exon->seq_id() attribute will be set to the identifier of the + respective sequence for both sequences if A=4 was used in the sim4 + run, and otherwise for the second sequence only. If the output does + not contain the identifier, the filename stripped of path and + extension is used instead. In addition, the full filename + will be recorded for both features ($exon inherits off + Bio::SeqFeature::SimilarityPair) as tag 'filename'. The length + is accessible via the seqlength() attribute of $exon->query() and + $exon->est_hit(). + + Note that this method is capable of dealing with outputs generated + with format 0,1,3, and 4 (via the A=n option to sim4). It + automatically determines which of the two sequences has been + reversed, and adjusts the coordinates for that sequence. It will + also detect whether the EST sequence(s) were given as first or as + second file to sim4, unless this has been specified at creation + time of the object. + + Example : + Returns : An array of Bio::Tools::Sim4::Exon objects + Args : + + +=cut + +sub parse_next_alignment { + my ($self) = @_; + my @exons = (); + my %seq1props = (); + my %seq2props = (); + # we refer to the properties of each seq by reference + my ($estseq, $genomseq, $to_reverse); + my $started = 0; + my $hit_direction = 1; + my $output_fmt = 3; # same as 0 and 1 (we cannot deal with A=2 produced + # output yet) + + while(defined($_ = $self->_readline())) { + #chomp(); + + # + # bascially, each sim4 'hit' starts with seq1... + # + /^seq1/ && do { + if($started) { + $self->_pushback($_); + last; + } + $started = 1; + + # filename and length of seq 1 + /^seq1\s+=\s+(\S+)\,\s+(\d+)/ || + $self->throw("Sim4 parsing error on seq1 [$_] line. Sorry!"); + $seq1props{'filename'} = $1; + $seq1props{'length'} = $2; + next; + }; + /^seq2/ && do { + # the second hit has also the database name in the >name syntax + # (in brackets). + /^seq2\s+=\s+(\S+)\s+\(>?(\S+)\s*\)\,\s+(\d+)/|| + $self->throw("Sim4 parsing error on seq2 [$_] line. Sorry!"); + $seq2props{'filename'} = $1; + $seq2props{'seqname'} = $2; + $seq2props{'length'} = $3; + next; + }; + if(/^>(\S+)\s*(.*)$/) { + # output option was A=4, which not only gives the complete + # description lines, but also causes the longer sequence to be + # reversed if the second file contained one (genomic) sequence + $seq1props{'seqname'} = $1; + $seq1props{'description'} = $2 if $2; + $output_fmt = 4; + # we handle seq1 and seq2 both here + if(defined($_ = $self->_readline()) && (/^>(\S+)\s*(.*)$/)) { + $seq2props{'seqname'} = $1; # redundant, since already set above + $seq2props{'description'} = $2 if $2; + } + next; + } + /^\(complement\)/ && do { + $hit_direction = -1; + next; + }; + # this matches + # start-end (start-end) pctid% + if(/(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%/) { + $seq1props{'start'} = $1; + $seq1props{'end'} = $2; + $seq2props{'start'} = $3; + $seq2props{'end'} = $4; + my $pctid = $5; + + if(! defined($estseq)) { + # for the first time here: need to set the references referring + # to seq1 and seq2 + if(! exists($self->{'_est_is_first'})) { + # detect which one is the EST by looking at the lengths, + # and assume that this holds throughout the entire result + # file (i.e., when this method is called for the next + # alignment, this will not be checked again) + if($seq1props{'length'} > $seq2props{'length'}) { + $self->{'_est_is_first'} = 0; + } else { + $self->{'_est_is_first'} = 1; + } + } + if($self->{'_est_is_first'}) { + $estseq = \%seq1props; + $genomseq = \%seq2props; + # if the EST is given first, A=4 selects the genomic + # seq for being reversed (reversing the EST is default) + $to_reverse = ($output_fmt == 4) ? $genomseq : $estseq; + } else { + $estseq = \%seq2props; + $genomseq = \%seq1props; + # if the EST is the second, A=4 does not change the + # seq being reversed (always the EST is reversed) + $to_reverse = $estseq; + } + } + if($hit_direction == -1) { + # we have to reverse the coordinates of one of both seqs + my $tmp = $to_reverse->{'start'}; + $to_reverse->{'start'} = + $to_reverse->{'length'} - $to_reverse->{'end'} + 1; + $to_reverse->{'end'} = $to_reverse->{'length'} - $tmp + 1; + } + # create and initialize the exon object + my $exon = Bio::Tools::Sim4::Exon->new( + '-start' => $genomseq->{'start'}, + '-end' => $genomseq->{'end'}, + '-strand' => $hit_direction); + if(exists($genomseq->{'seqname'})) { + $exon->seq_id($genomseq->{'seqname'}); + } else { + # take filename stripped of path as fall back + my ($basename) = &File::Basename::fileparse($genomseq->{'filename'}, '\..*'); + $exon->seq_id($basename); + } + $exon->feature1()->add_tag_value('filename', + $genomseq->{'filename'}); + # feature1 is supposed to be initialized to a Similarity object, + # but we provide a safety net + if($exon->feature1()->can('seqlength')) { + $exon->feature1()->seqlength($genomseq->{'length'}); + } else { + $exon->feature1()->add_tag_value('SeqLength', + $genomseq->{'length'}); + } + # create and initialize the feature wrapping the 'hit' (the EST) + my $fea2 = Bio::SeqFeature::Similarity->new( + '-start' => $estseq->{'start'}, + '-end' => $estseq->{'end'}, + '-strand' => 0, + '-primary' => "aligning_EST"); + if(exists($estseq->{'seqname'})) { + $fea2->seq_id($estseq->{'seqname'}); + } else { + # take filename stripped of path as fall back + my ($basename) = + &File::Basename::fileparse($estseq->{'filename'}, '\..*'); + $fea2->seq_id($basename); + } + $fea2->add_tag_value('filename', $estseq->{'filename'}); + $fea2->seqlength($estseq->{'length'}); + # store + $exon->est_hit($fea2); + # general properties + $exon->source_tag($self->analysis_method()); + $exon->percentage_id($pctid); + $exon->score($exon->percentage_id()); + # push onto array + push(@exons, $exon); + next; # back to while loop + } + } + return @exons; +} + +=head2 next_exonset + + Title : next_exonset + Usage : $exonset = $sim4_result->parse_next_exonset; + print "Exons start at ", $exonset->start(), + "and end at ", $exonset->end(), "\n"; + foreach $exon ($exonset->sub_SeqFeature()) { + # do something + } + Function: Parses the next alignment of the Sim4 result file and returns the + set of exons as a container of features. The container is itself + a Bio::SeqFeature::Generic object, with the Bio::Tools::Sim4::Exon + objects as sub features. Start, end, and strand of the container + will represent the total region covered by the exons of this set. + + See the documentation of parse_next_alignment() for further + reference about parsing and how the information is stored. + + Example : + Returns : An Bio::SeqFeature::Generic object holding Bio::Tools::Sim4::Exon + objects as sub features. + Args : + +=cut + +sub next_exonset { + my $self = shift; + my $exonset; + + # get the next array of exons + my @exons = $self->parse_next_alignment(); + return if($#exons < 0); + # create the container of exons as a feature object itself, with the + # data of the first exon for initialization + $exonset = Bio::SeqFeature::Generic->new('-start' => $exons[0]->start(), + '-end' => $exons[0]->end(), + '-strand' => $exons[0]->strand(), + '-primary' => "ExonSet"); + $exonset->source_tag($exons[0]->source_tag()); + $exonset->seq_id($exons[0]->seq_id()); + # now add all exons as sub features, with enabling EXPANsion of the region + # covered in total + foreach my $exon (@exons) { + $exonset->add_sub_SeqFeature($exon, 'EXPAND'); + } + return $exonset; +} + +=head2 next_feature + + Title : next_feature + Usage : while($exonset = $sim4->next_feature()) { + # do something + } + Function: Does the same as L<next_exonset()>. See there for documentation of + the functionality. Call this method repeatedly until FALSE is + returned. + + The returned object is actually a SeqFeatureI implementing object. + This method is required for classes implementing the + SeqAnalysisParserI interface, and is merely an alias for + next_exonset() at present. + + Example : + Returns : A Bio::SeqFeature::Generic object. + Args : + +=cut + +sub next_feature { + my ($self,@args) = @_; + # even though next_exonset doesn't expect any args (and this method + # does neither), we pass on args in order to be prepared if this changes + # ever + return $self->next_exonset(@args); +} + +1;