diff variant_effect_predictor/Bio/Tools/Genemark.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Tools/Genemark.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,515 @@
+# $Id: Genemark.pm,v 1.11.2.1 2003/04/24 08:51:48 heikki Exp $
+#
+# BioPerl module for Bio::Tools::Genemark
+#
+# Cared for by Mark Fiers <hlapp@gmx.net>
+#
+# Copyright Hilmar Lapp, Mark Fiers
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Tools::Genemark - Results of one Genemark run
+
+=head1 SYNOPSIS
+
+   $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark');
+   # filehandle:
+   $Genemark = Bio::Tools::Genemark->new( -fh  => \*INPUT );
+
+   # parse the results
+   # note: this class is-a Bio::Tools::AnalysisResult which implements
+   # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same
+   while($gene = $Genemark->next_prediction()) {
+       # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
+       # off Bio::SeqFeature::Gene::Transcript.
+       #
+       # $gene->exons() returns an array of 
+       # Bio::Tools::Prediction::Exon objects
+       # all exons:
+       @exon_arr = $gene->exons();
+
+       # initial exons only
+       @init_exons = $gene->exons('Initial');
+       # internal exons only
+       @intrl_exons = $gene->exons('Internal');
+       # terminal exons only
+       @term_exons = $gene->exons('Terminal');
+       # singleton exons: 
+       ($single_exon) = $gene->exons();
+   }
+
+   # essential if you gave a filename at initialization (otherwise the file
+   # will stay open)
+   $Genemark->close();
+
+=head1 DESCRIPTION
+
+The Genemark module provides a parser for Genemark gene structure prediction
+output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript-
+derived object.
+
+This module has been developed around genemark.hmm for eukaryots v2.2a and will 
+probably not work with other versions.
+
+
+This module also implements the Bio::SeqAnalysisParserI interface, and thus
+can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org          - General discussion
+  http://bio.perl.org/MailList.html             - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Hilmar Lapp, Mark Fiers
+
+Email hlapp@gmx.net
+      m.w.e.j.fiers@plant.wag-ur.nl
+
+Describe contact details here
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::Tools::Genemark;
+use vars qw(@ISA);
+use strict;
+use Symbol;
+
+use Bio::Root::Root;
+use Bio::Tools::AnalysisResult;
+use Bio::Tools::Prediction::Gene;
+use Bio::Tools::Prediction::Exon;
+use Bio::Seq;
+
+@ISA = qw(Bio::Tools::AnalysisResult);
+
+sub _initialize_state {
+    my ($self,@args) = @_;
+
+    # first call the inherited method!
+    $self->SUPER::_initialize_state(@args);
+
+    # our private state variables
+    $self->{'_preds_parsed'} = 0;
+    $self->{'_has_cds'} = 0;
+    # array of pre-parsed predictions
+    $self->{'_preds'} = [];
+    # seq stack
+    $self->{'_seqstack'} = [];
+}
+
+=head2 analysis_method
+
+ Usage     : $Genemark->analysis_method();
+ Purpose   : Inherited method. Overridden to ensure that the name matches
+             /GeneMark.hmm/i.
+ Returns   : String
+ Argument  : n/a
+
+=cut
+
+#-------------
+sub analysis_method { 
+#-------------
+    my ($self, $method) = @_;  
+    if($method && ($method !~ /Genemark\.hmm/i)) {
+	$self->throw("method $method not supported in " . ref($self));
+    }
+    return $self->SUPER::analysis_method($method);
+}
+
+=head2 next_feature
+
+ Title   : next_feature
+ Usage   : while($gene = $Genemark->next_feature()) {
+                  # do something
+           }
+ Function: Returns the next gene structure prediction of the Genemark result
+           file. Call this method repeatedly until FALSE is returned.
+
+           The returned object is actually a SeqFeatureI implementing object.
+           This method is required for classes implementing the
+           SeqAnalysisParserI interface, and is merely an alias for 
+           next_prediction() at present.
+
+ Example :
+ Returns : A Bio::Tools::Prediction::Gene object.
+ Args    :
+
+=cut
+
+sub next_feature {
+    my ($self,@args) = @_;
+    # even though next_prediction doesn't expect any args (and this method
+    # does neither), we pass on args in order to be prepared if this changes
+    # ever
+    return $self->next_prediction(@args);
+}
+
+=head2 next_prediction
+
+ Title   : next_prediction
+ Usage   : while($gene = $Genemark->next_prediction()) {
+                  # do something
+           }
+ Function: Returns the next gene structure prediction of the Genemark result
+           file. Call this method repeatedly until FALSE is returned.
+
+ Example :
+ Returns : A Bio::Tools::Prediction::Gene object.
+ Args    :
+
+=cut
+
+sub next_prediction {
+    my ($self) = @_;
+    my $gene;
+
+    # if the prediction section hasn't been parsed yet, we do this now
+    $self->_parse_predictions() unless $self->_predictions_parsed();
+
+    # get next gene structure
+    $gene = $self->_prediction();
+    
+    return $gene;
+}
+
+=head2 _parse_predictions
+
+ Title   : _parse_predictions()
+ Usage   : $obj->_parse_predictions()
+ Function: Parses the prediction section. Automatically called by
+           next_prediction() if not yet done.
+ Example :
+ Returns : 
+
+=cut
+
+sub _parse_predictions {
+    my ($self) = @_;
+    my %exontags = ('Initial' => 'Initial',
+		    'Internal' => 'Internal',
+		    'Terminal' => 'Terminal',
+		    'Single' => '',
+		    '_na_' => '');
+    my $exontag;
+    my $gene;
+    my $seqname;
+    my $exontype;
+    my $current_gene_no = -1;
+
+    while(defined($_ = $self->_readline())) {
+        
+	if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) {
+
+	    #  this is an exon, Genemark doesn't predict anything else
+	    # $prednr corresponds to geneno.
+	    my $prednr = $1;
+
+	    #exon no:
+	    my $signalnr = 0;
+	    if ($2) { my $signalnr = $2; } # used in tag: exon_no
+	    
+	    # split into fields
+	    chomp();
+	    my @flds = split(' ', $_);
+
+	    # create the feature (an exon) object
+	    my $predobj = Bio::Tools::Prediction::Exon->new();
+
+		 
+	    # define info depending on it being eu- or prokaryot
+	    my ($start, $end, $orientation, $prediction_source);
+
+	    if ($self->analysis_method() =~ /PROKARYOTIC/i) {
+	        $prediction_source = "Genemark.hmm.pro";
+	       	$orientation = ($flds[1] eq '+') ? 1 : -1;
+	        ($start, $end) = @flds[(2,3)];
+		$exontag = "_na_";
+
+	    } else {		   
+	        $prediction_source = "Genemark.hmm.eu";
+	       	$orientation = ($flds[2] eq '+') ? 1 : -1;
+	        ($start, $end) = @flds[(4,5)];
+		$exontag = $flds[3];
+	    }
+
+	    #store the data in the exon object
+            $predobj->source_tag($prediction_source);
+	    $predobj->start($start);		
+	    $predobj->end($end);
+	    $predobj->strand($orientation);
+
+	    $predobj->primary_tag($exontags{$exontag} . "Exon");
+
+	    $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr);
+
+    	    $predobj->is_coding(1);
+		
+		
+	    # frame calculation as in the genscan module
+	    # is to be implemented...
+	    
+	    #If the $prednr is not equal to the current gene, we
+	    #need to make a new gene and close the old one
+	    if($prednr != $current_gene_no) {
+ 	        # a new gene, store the old one if it exists
+		if (defined ($gene)) {
+		    $gene->seq_id($seqname);
+		    $gene = undef ;
+		}
+		#and make a new one
+		$gene = Bio::Tools::Prediction::Gene->new
+		    (
+		     '-primary' => "GenePrediction$prednr",
+		     '-source' => $prediction_source);
+                $self->_add_prediction($gene);
+		$current_gene_no = $prednr;
+	    } 
+	    
+	    # Add the exon to the gene
+	    $gene->add_exon($predobj, ($exontag eq "_na_" ?
+				       undef : $exontags{$exontag}));
+
+	}
+
+	if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) {
+	    $self->analysis_method($1);
+
+	    my $gm_version = $2; 
+
+	    $self->analysis_method_version($gm_version);
+	    next;
+	}
+
+       #Matrix file for eukaryot version
+       if (/^Matrices file:\s+(\S+)?/i)  {
+	    $self->analysis_subject($1);
+	    # since the line after the matrix file is always the date
+	    # (in the output file's I have seen!) extract and store this 
+	    # here
+	     if (defined(my $_date = $self->_readline())) {
+	         chomp ($_date);
+	     	 $self->analysis_date($_date);
+	     }
+	}			   
+	
+        #Matrix file for prokaryot version
+       if (/^Model file name:\s+(\S+)/) {
+	    $self->analysis_subject($1);
+	    # since the line after the matrix file is always the date
+	    # (in the output file's I have seen!) extract and store this 
+	    # here
+	    my $_date = $self->_readline() ;
+	    if (defined($_date = $self->_readline())) {
+	         chomp ($_date);
+	     	 $self->analysis_date($_date);
+	     }
+	}
+	
+	if(/^Sequence[ file]? name:\s+(.+)\s*$/i) {
+	    $seqname = $1;
+	    #    $self->analysis_subject($seqname);
+	    next;
+	}
+	
+
+	/^>/ && do {		
+    	    $self->_pushback($_);
+
+	    # section of predicted aa sequences on recognition
+	    # of a fasta start, read all sequences and find the
+	    # appropriate gene
+            while (1) {
+	       my ($aa_id, $seq) = $self->_read_fasta_seq();
+	       last unless ($aa_id);
+	       	       	
+	       #now parse through the predictions to add the pred. protein
+	       FINDPRED: foreach my $gene (@{$self->{'_preds'}}) {
+	            $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
+		    my $geneno = $1;
+		    if ($aa_id =~ /\|gene.$geneno\|/) {
+		          #print "x SEQ : \n $seq \nXXXX\n";
+  			  my $seqobj = Bio::Seq->new('-seq' => $seq,
+	                     		             '-display_id' => $aa_id,
+					              '-alphabet' => "protein");
+			$gene->predicted_protein($seqobj);
+			last FINDPRED;
+		    }	  
+
+	       }
+           }				      
+
+ 	   last;
+	};
+    }
+    
+    # if the analysis query object contains a ref to a Seq of PrimarySeq
+    # object, then extract the predicted sequences and add it to the gene
+    # object.
+    if (defined $self->analysis_query()) { 
+        my $orig_seq = $self->analysis_query();
+        FINDPREDSEQ: foreach my $gene (@{$self->{'_preds'}}) { 
+	   my $predseq = "";
+	   foreach my $exon ($gene->exons()) {
+		#print $exon->start() . " " . $exon->end () . "\n";
+		$predseq .= $orig_seq->subseq($exon->start(), $exon->end());
+	   }
+
+	   my $seqobj = Bio::PrimarySeq->new('-seq' => $predseq,
+	                     		     '-display_id' => "transl");
+	   $gene->predicted_cds($seqobj);
+	}
+    }
+    
+    
+    $self->_predictions_parsed(1);
+}
+
+=head2 _prediction
+
+ Title   : _prediction()
+ Usage   : $gene = $obj->_prediction()
+ Function: internal
+ Example :
+ Returns : 
+
+=cut
+
+sub _prediction {
+    my ($self) = @_;
+
+    return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
+    return shift(@{$self->{'_preds'}});
+}
+
+=head2 _add_prediction
+
+ Title   : _add_prediction()
+ Usage   : $obj->_add_prediction($gene)
+ Function: internal
+ Example :
+ Returns : 
+
+=cut
+
+sub _add_prediction {
+    my ($self, $gene) = @_;
+
+    if(! exists($self->{'_preds'})) {
+	$self->{'_preds'} = [];
+    }
+    push(@{$self->{'_preds'}}, $gene);
+}
+
+=head2 _predictions_parsed
+
+ Title   : _predictions_parsed
+ Usage   : $obj->_predictions_parsed
+ Function: internal
+ Example :
+ Returns : TRUE or FALSE
+
+=cut
+
+sub _predictions_parsed {
+    my ($self, $val) = @_;
+
+    $self->{'_preds_parsed'} = $val if $val;
+    if(! exists($self->{'_preds_parsed'})) {
+	$self->{'_preds_parsed'} = 0;
+    }
+    return $self->{'_preds_parsed'};
+}
+
+=head2 _has_cds
+
+ Title   : _has_cds()
+ Usage   : $obj->_has_cds()
+ Function: Whether or not the result contains the predicted CDSs, too.
+ Example :
+ Returns : TRUE or FALSE
+
+=cut
+
+sub _has_cds {
+    my ($self, $val) = @_;
+
+    $self->{'_has_cds'} = $val if $val;
+    if(! exists($self->{'_has_cds'})) {
+	$self->{'_has_cds'} = 0;
+    }
+    return $self->{'_has_cds'};
+}
+
+=head2 _read_fasta_seq
+
+ Title   : _read_fasta_seq()
+ Usage   : ($id,$seqstr) = $obj->_read_fasta_seq();
+ Function: Simple but specialised FASTA format sequence reader. Uses
+           $self->_readline() to retrieve input, and is able to strip off
+           the traling description lines.
+ Example :
+ Returns : An array of two elements.
+
+=cut
+
+sub _read_fasta_seq {
+    my ($self) = @_;
+    my ($id, $seq);
+    local $/ = ">";
+    
+    return 0 unless (my $entry = $self->_readline());
+    
+    $entry =~ s/^>//;
+    # complete the entry if the first line came from a pushback buffer
+    while(! ($entry =~ />$/)) {
+	last unless ($_ = $self->_readline());
+	$entry .= $_;
+    }
+
+    # delete everything onwards from an new fasta start (>)
+    $entry =~ s/\n>.*$//s;
+    # id and sequence
+    
+    if($entry =~ s/^(.+)\n//) {
+	$id = $1;
+	$id =~ s/ /_/g;
+	$seq = $entry;
+	$seq =~ s/\s//g;	
+	#print "\n@@ $id \n@@ $seq \n##\n";
+    } else {
+	$self->throw("Can't parse Genemark predicted sequence entry");
+    }
+    $seq =~ s/\s//g; # Remove whitespace
+    return ($id, $seq);  
+}
+
+1;
+