diff variant_effect_predictor/Bio/Tools/Genscan.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Tools/Genscan.pm	Thu Apr 11 06:29:17 2013 -0400
@@ -0,0 +1,491 @@
+# $Id: Genscan.pm,v 1.22 2002/10/22 07:38:46 lapp Exp $
+#
+# BioPerl module for Bio::Tools::Genscan
+#
+# Cared for by Hilmar Lapp <hlapp@gmx.net>
+#
+# Copyright Hilmar Lapp
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Tools::Genscan - Results of one Genscan run
+
+=head1 SYNOPSIS
+
+   $genscan = Bio::Tools::Genscan->new(-file => 'result.genscan');
+   # filehandle:
+   $genscan = Bio::Tools::Genscan->new( -fh  => \*INPUT );
+
+   # parse the results
+   # note: this class is-a Bio::Tools::AnalysisResult which implements
+   # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
+   while($gene = $genscan->next_prediction()) {
+       # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
+       # off Bio::SeqFeature::Gene::Transcript.
+       #
+       # $gene->exons() returns an array of 
+       # Bio::Tools::Prediction::Exon objects
+       # all exons:
+       @exon_arr = $gene->exons();
+
+       # initial exons only
+       @init_exons = $gene->exons('Initial');
+       # internal exons only
+       @intrl_exons = $gene->exons('Internal');
+       # terminal exons only
+       @term_exons = $gene->exons('Terminal');
+       # singleton exons: 
+       ($single_exon) = $gene->exons();
+   }
+
+   # essential if you gave a filename at initialization (otherwise the file
+   # will stay open)
+   $genscan->close();
+
+=head1 DESCRIPTION
+
+The Genscan module provides a parser for Genscan gene structure prediction
+output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript-
+derived object.
+
+This module also implements the Bio::SeqAnalysisParserI interface, and thus
+can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org          - General discussion
+  http://bio.perl.org/MailList.html             - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Hilmar Lapp
+
+Email hlapp@gmx.net
+
+Describe contact details here
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::Tools::Genscan;
+use vars qw(@ISA);
+use strict;
+use Symbol;
+
+use Bio::Root::Root;
+use Bio::Tools::AnalysisResult;
+use Bio::Tools::Prediction::Gene;
+use Bio::Tools::Prediction::Exon;
+
+@ISA = qw(Bio::Tools::AnalysisResult);
+
+sub _initialize_state {
+    my ($self,@args) = @_;
+
+    # first call the inherited method!
+    $self->SUPER::_initialize_state(@args);
+
+    # our private state variables
+    $self->{'_preds_parsed'} = 0;
+    $self->{'_has_cds'} = 0;
+    # array of pre-parsed predictions
+    $self->{'_preds'} = [];
+    # seq stack
+    $self->{'_seqstack'} = [];
+}
+
+=head2 analysis_method
+
+ Usage     : $genscan->analysis_method();
+ Purpose   : Inherited method. Overridden to ensure that the name matches
+             /genscan/i.
+ Returns   : String
+ Argument  : n/a
+
+=cut
+
+#-------------
+sub analysis_method { 
+#-------------
+    my ($self, $method) = @_;  
+    if($method && ($method !~ /genscan/i)) {
+	$self->throw("method $method not supported in " . ref($self));
+    }
+    return $self->SUPER::analysis_method($method);
+}
+
+=head2 next_feature
+
+ Title   : next_feature
+ Usage   : while($gene = $genscan->next_feature()) {
+                  # do something
+           }
+ Function: Returns the next gene structure prediction of the Genscan result
+           file. Call this method repeatedly until FALSE is returned.
+
+           The returned object is actually a SeqFeatureI implementing object.
+           This method is required for classes implementing the
+           SeqAnalysisParserI interface, and is merely an alias for 
+           next_prediction() at present.
+
+ Example :
+ Returns : A Bio::Tools::Prediction::Gene object.
+ Args    :
+
+=cut
+
+sub next_feature {
+    my ($self,@args) = @_;
+    # even though next_prediction doesn't expect any args (and this method
+    # does neither), we pass on args in order to be prepared if this changes
+    # ever
+    return $self->next_prediction(@args);
+}
+
+=head2 next_prediction
+
+ Title   : next_prediction
+ Usage   : while($gene = $genscan->next_prediction()) {
+                  # do something
+           }
+ Function: Returns the next gene structure prediction of the Genscan result
+           file. Call this method repeatedly until FALSE is returned.
+
+ Example :
+ Returns : A Bio::Tools::Prediction::Gene object.
+ Args    :
+
+=cut
+
+sub next_prediction {
+    my ($self) = @_;
+    my $gene;
+
+    # if the prediction section hasn't been parsed yet, we do this now
+    $self->_parse_predictions() unless $self->_predictions_parsed();
+
+    # get next gene structure
+    $gene = $self->_prediction();
+
+    if($gene) {
+	# fill in predicted protein, and if available the predicted CDS
+	#
+	my ($id, $seq);
+	# use the seq stack if there's a seq on it
+	my $seqobj = pop(@{$self->{'_seqstack'}});
+	if(! $seqobj) {
+	    # otherwise read from input stream
+	    ($id, $seq) = $self->_read_fasta_seq();
+	    # there may be no sequence at all, or none any more
+	    if($id && $seq) {
+		$seqobj = Bio::PrimarySeq->new('-seq' => $seq,
+					       '-display_id' => $id,
+					       '-alphabet' => "protein");
+	    }
+	}
+	if($seqobj) {
+	    # check that prediction number matches the prediction number
+	    # indicated in the sequence id (there may be incomplete gene
+	    # predictions that contain only signals with no associated protein
+	    # and CDS, like promoters, poly-A sites etc)
+	    $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
+	    my $prednr = $1;
+	    if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) {
+		# this is not our sequence, so push back for next prediction
+		push(@{$self->{'_seqstack'}}, $seqobj);
+	    } else {
+		$gene->predicted_protein($seqobj);
+		# CDS prediction, too?
+		if($self->_has_cds()) {
+		    ($id, $seq) = $self->_read_fasta_seq();
+		    $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
+						   '-display_id' => $id,
+						   '-alphabet' => "dna");
+		    $gene->predicted_cds($seqobj);
+		}
+	    }
+	}
+    }
+
+    return $gene;
+}
+
+=head2 _parse_predictions
+
+ Title   : _parse_predictions()
+ Usage   : $obj->_parse_predictions()
+ Function: Parses the prediction section. Automatically called by
+           next_prediction() if not yet done.
+ Example :
+ Returns : 
+
+=cut
+
+sub _parse_predictions {
+    my ($self) = @_;
+    my %exontags = ('Init' => 'Initial',
+		    'Intr' => 'Internal',
+		    'Term' => 'Terminal',
+		    'Sngl' => '');
+    my $gene;
+    my $seqname;
+
+    while(defined($_ = $self->_readline())) {
+	if(/^\s*(\d+)\.(\d+)/) {
+	    # exon or signal
+	    my $prednr = $1;
+	    my $signalnr = $2; # not used presently
+	    if(! defined($gene)) {
+		$gene = Bio::Tools::Prediction::Gene->new(
+                                       '-primary' => "GenePrediction$prednr",
+				       '-source' => 'Genscan');
+	    }
+	    # split into fields
+	    chomp();
+	    my @flds = split(' ', $_);
+	    # create the feature object depending on the type of signal
+	    my $predobj;
+	    my $is_exon = grep {$_ eq $flds[1];} (keys(%exontags));
+	    if($is_exon) {
+		$predobj = Bio::Tools::Prediction::Exon->new();
+	    } else {
+		# PolyA site, or Promoter
+		$predobj = Bio::SeqFeature::Generic->new();
+	    }
+	    # set common fields
+	    $predobj->source_tag('Genscan');
+	    $predobj->score($flds[$#flds]);
+	    $predobj->strand((($flds[2] eq '+') ? 1 : -1));
+	    my ($start, $end) = @flds[(3,4)];
+	    if($predobj->strand() == 1) {
+		$predobj->start($start);
+		$predobj->end($end);
+	    } else {
+		$predobj->end($start);
+		$predobj->start($end);
+	    }
+	    # add to gene structure (should be done only when start and end
+	    # are set, in order to allow for proper expansion of the range)
+	    if($is_exon) {
+		# first, set fields unique to exons
+		$predobj->start_signal_score($flds[8]);
+		$predobj->end_signal_score($flds[9]);
+		$predobj->coding_signal_score($flds[10]);
+		$predobj->significance($flds[11]);
+		$predobj->primary_tag($exontags{$flds[1]} . 'Exon');
+		$predobj->is_coding(1);
+		# Figure out the frame of this exon. This is NOT the frame
+		# given by Genscan, which is the absolute frame of the base
+		# starting the first predicted complete codon. By comparing
+		# to the absolute frame of the first base we can compute the
+		# offset of the first complete codon to the first base of the
+		# exon, which determines the frame of the exon.
+		my $cod_offset;
+		if($predobj->strand() == 1) {
+		    $cod_offset = $flds[6] - (($predobj->start()-1) % 3);
+		    # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
+		    # to offsets 2 and 1, resp. Offset 3 is the same as 0.
+		    $cod_offset += 3 if($cod_offset < 1);		    
+		} else {
+		    # On the reverse strand the Genscan frame also refers to
+		    # the first base of the first complete codon, but viewed
+		    # from forward, which is the third base viewed from
+		    # reverse.
+		    $cod_offset = $flds[6] - (($predobj->end()-3) % 3);
+		    # Possible values are -2, -1, 0, 1, 2. Due to the reverse
+		    # situation, {2,-1} and {1,-2} correspond to offsets
+		    # 1 and 2, resp. Offset 3 is the same as 0.
+		    $cod_offset -= 3 if($cod_offset >= 0);
+		    $cod_offset = -$cod_offset;
+		}
+		# Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
+		# is the frame of the first base relative to the exon, or the
+		# number of bases the first codon is missing).
+		$predobj->frame(3 - $cod_offset);
+		# then add to gene structure object
+		$gene->add_exon($predobj, $exontags{$flds[1]});		
+	    } elsif($flds[1] eq 'PlyA') {
+		$predobj->primary_tag("PolyAsite");
+		$gene->poly_A_site($predobj);
+	    } elsif($flds[1] eq 'Prom') {
+		$predobj->primary_tag("Promoter");
+		$gene->add_promoter($predobj);
+	    }
+	    next;
+	}
+	if(/^\s*$/ && defined($gene)) {
+	    # current gene is completed
+	    $gene->seq_id($seqname);
+	    $self->_add_prediction($gene);
+	    $gene = undef;
+	    next;
+	}
+	if(/^(GENSCAN)\s+(\S+)/) {
+	    $self->analysis_method($1);
+	    $self->analysis_method_version($2);
+	    next;
+	}
+	if(/^Sequence\s+(\S+)\s*:/) {
+	    $seqname = $1;
+	    next;
+	}
+        
+	if(/^Parameter matrix:\s+(\S+)/i) {
+	    $self->analysis_subject($1);
+	   next;
+	}
+	
+	if(/^Predicted coding/) {
+	    $self->_has_cds(1);
+	    next;
+	}
+	/^>/ && do {
+	    # section of predicted sequences
+	    $self->_pushback($_);
+	    last;
+	};
+    }
+    $self->_predictions_parsed(1);
+}
+
+=head2 _prediction
+
+ Title   : _prediction()
+ Usage   : $gene = $obj->_prediction()
+ Function: internal
+ Example :
+ Returns : 
+
+=cut
+
+sub _prediction {
+    my ($self) = @_;
+
+    return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
+    return shift(@{$self->{'_preds'}});
+}
+
+=head2 _add_prediction
+
+ Title   : _add_prediction()
+ Usage   : $obj->_add_prediction($gene)
+ Function: internal
+ Example :
+ Returns : 
+
+=cut
+
+sub _add_prediction {
+    my ($self, $gene) = @_;
+
+    if(! exists($self->{'_preds'})) {
+	$self->{'_preds'} = [];
+    }
+    push(@{$self->{'_preds'}}, $gene);
+}
+
+=head2 _predictions_parsed
+
+ Title   : _predictions_parsed
+ Usage   : $obj->_predictions_parsed
+ Function: internal
+ Example :
+ Returns : TRUE or FALSE
+
+=cut
+
+sub _predictions_parsed {
+    my ($self, $val) = @_;
+
+    $self->{'_preds_parsed'} = $val if $val;
+    if(! exists($self->{'_preds_parsed'})) {
+	$self->{'_preds_parsed'} = 0;
+    }
+    return $self->{'_preds_parsed'};
+}
+
+=head2 _has_cds
+
+ Title   : _has_cds()
+ Usage   : $obj->_has_cds()
+ Function: Whether or not the result contains the predicted CDSs, too.
+ Example :
+ Returns : TRUE or FALSE
+
+=cut
+
+sub _has_cds {
+    my ($self, $val) = @_;
+
+    $self->{'_has_cds'} = $val if $val;
+    if(! exists($self->{'_has_cds'})) {
+	$self->{'_has_cds'} = 0;
+    }
+    return $self->{'_has_cds'};
+}
+
+=head2 _read_fasta_seq
+
+ Title   : _read_fasta_seq()
+ Usage   : ($id,$seqstr) = $obj->_read_fasta_seq();
+ Function: Simple but specialised FASTA format sequence reader. Uses
+           $self->_readline() to retrieve input, and is able to strip off
+           the traling description lines.
+ Example :
+ Returns : An array of two elements.
+
+=cut
+
+sub _read_fasta_seq {
+    my ($self) = @_;
+    my ($id, $seq);
+    local $/ = ">";
+    
+    my $entry = $self->_readline();
+    if($entry) {
+	$entry =~ s/^>//;
+	# complete the entry if the first line came from a pushback buffer
+	while($entry !~ />$/) {
+	    last unless $_ = $self->_readline();
+	    $entry .= $_;
+	}
+	# delete everything onwards from an intervening empty line (at the
+	# end there might be statistics stuff)
+	$entry =~ s/\n\n.*$//s;
+	# id and sequence
+	if($entry =~ /^(\S+)\n([^>]+)/) {
+	    $id = $1;
+	    $seq = $2;
+	} else {
+	    $self->throw("Can't parse Genscan predicted sequence entry");
+	}
+	$seq =~ s/\s//g; # Remove whitespace
+    }
+    return ($id, $seq);
+}
+
+1;