diff variant_effect_predictor/Bio/SeqIO/gcg.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/SeqIO/gcg.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,298 @@
+# $Id: gcg.pm,v 1.21 2002/10/25 16:22:01 jason Exp $
+#
+# BioPerl module for Bio::SeqIO::gcg
+#
+# Cared for by Ewan Birney <birney@ebi.ac.uk>
+#          and Lincoln Stein <lstein@cshl.org>
+#
+# Copyright Ewan Birney & Lincoln Stein
+#
+# You may distribute this module under the same terms as perl itself
+#
+# _history
+# October 18, 1999  Largely rewritten by Lincoln Stein
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::SeqIO::gcg - GCG sequence input/output stream
+
+=head1 SYNOPSIS
+
+Do not use this module directly.  Use it via the Bio::SeqIO class.
+
+=head1 DESCRIPTION
+
+This object can transform Bio::Seq objects to and from GCG flat
+file databases.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this
+and other Bioperl modules. Send your comments and suggestions preferably
+ to one of the Bioperl mailing lists.
+Your participation is much appreciated.
+
+  bioperl-l@bioperl.org                 - General discussion
+  http://www.bioperl.org/MailList.shtml - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHORS - Ewan Birney & Lincoln Stein
+
+Email: E<lt>birney@ebi.ac.ukE<gt>
+       E<lt>lstein@cshl.orgE<gt>       
+
+=head1 CONTRIBUTORS
+
+Jason Stajich, jason@bioperl.org
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+# Let the code begin...
+
+package Bio::SeqIO::gcg;
+use vars qw(@ISA);
+use strict;
+
+use Bio::SeqIO;
+use Bio::Seq::SeqFactory;
+
+@ISA = qw(Bio::SeqIO);
+
+sub _initialize {
+  my($self,@args) = @_;
+  $self->SUPER::_initialize(@args);    
+  if( ! defined $self->sequence_factory ) {
+      $self->sequence_factory(new Bio::Seq::SeqFactory
+			      (-verbose => $self->verbose(), 
+			       -type => 'Bio::Seq::RichSeq'));      
+   }
+}
+
+=head2 next_seq
+
+ Title   : next_seq
+ Usage   : $seq = $stream->next_seq()
+ Function: returns the next sequence in the stream
+ Returns : Bio::Seq object
+ Args    :
+
+=cut
+
+sub next_seq {
+   my ($self,@args)    = @_;
+   my($id,$type,$desc,$line,$chksum,$sequence,$date,$len);
+
+   while( defined($_ = $self->_readline()) ) {
+
+       ## Get the descriptive info (anything before the line with '..')
+       unless( /\.\.$/ ) { $desc.= $_; }
+       ## Pull ID, Checksum & Type from the line containing '..'
+       /\.\.$/ && do     { $line = $_; chomp; 
+                           if(/Check\:\s(\d+)\s/) { $chksum = $1; }
+                           if(/Type:\s(\w)\s/)    { $type   = $1; }
+                           if(/(\S+)\s+Length/) 
+			   { $id     = $1; }
+			   if(/Length:\s+(\d+)\s+(\S.+\S)\s+Type/ )
+			   { $len = $1; $date = $2;}
+                           last; 
+                         }
+   }   
+   return if ( !defined $_);
+   chomp($desc);  # remove last "\n"
+
+   while( defined($_ = $self->_readline()) ) {
+
+       ## This is where we grab the sequence info.
+
+       if( /\.\.$/ ) { 
+        $self->throw("Looks like start of another sequence. See documentation. "); 
+       }
+
+       next if($_ eq "\n");       ## skip whitespace lines in formatted seq
+       s/[^a-zA-Z]//g;            ## remove anything that is not alphabet char
+       # $_ = uc($_);               ## uppercase sequence: NO. Keep the case. HL
+       $sequence .= $_;
+   }
+   ##If we parsed out a checksum, we might as well test it
+
+   if(defined $chksum) { 
+       unless(_validate_checksum($sequence,$chksum)) {
+	   $self->throw("Checksum failure on parsed sequence.");
+       }
+   }
+
+   ## Remove whitespace from identifier because the constructor
+   ## will throw a warning otherwise...
+   if(defined $id) { $id =~ s/\s+//g;}
+
+   ## Turn our parsed "Type: N" or "Type: P" (if found) into the appropriate
+   ## keyword that the constructor expects...
+   if(defined $type) {
+       if($type eq "N") { $type = "dna";      }
+       if($type eq "P") { $type = "prot";    }
+   }
+
+   return $self->sequence_factory->create(-seq  => $sequence, 
+					  -id   => $id, 
+					  -desc => $desc, 
+					  -type => $type,
+					  -dates => [ $date ]
+					  );
+}
+
+=head2 write_seq
+
+ Title   : write_seq
+ Usage   : $stream->write_seq(@seq)
+ Function: writes the formatted $seq object into the stream
+ Returns : 1 for success and 0 for error
+ Args    : array of Bio::PrimarySeqI object
+
+
+=cut
+
+sub write_seq {
+    my ($self,@seq) = @_;
+    for my $seq (@seq) {
+	$self->throw("Did not provide a valid Bio::PrimarySeqI object") 
+	    unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
+
+	my $str         = $seq->seq;
+	my $comment     = $seq->desc; 
+	my $id          = $seq->id;
+	my $type        = ( $seq->alphabet() =~ /[dr]na/i ) ? 'N' : 'P';
+	my $timestamp;
+
+	if( $seq->can('get_dates') ) {
+	    ($timestamp) = $seq->get_dates;
+	} else { 
+	    $timestamp = localtime(time);
+	}
+	my($sum,$offset,$len,$i,$j,$cnt,@out);
+
+	$len = length($str);
+	## Set the offset if we have any non-standard numbering going on
+	$offset=1;
+	# checksum
+	$sum = $self->GCG_checksum($seq);
+
+	#Output the sequence header info
+	push(@out,"$comment\n");                        
+	push(@out,"$id  Length: $len  $timestamp  Type: $type  Check: $sum  ..\n\n");
+
+	#Format the sequence
+	$i = $#out + 1;
+	for($j = 0 ; $j < $len ; ) {
+	    if( $j % 50 == 0) {
+		$out[$i] = sprintf("%8d  ",($j+$offset)); #numbering 
+	    }
+	    $out[$i] .= sprintf("%s",substr($str,$j,10));
+	    $j += 10;
+	    if( $j < $len && $j % 50 != 0 ) {
+		$out[$i] .= " ";
+	    }elsif($j % 50 == 0 ) {
+		$out[$i++] .= "\n\n";
+	    }                           
+	}
+	local($^W) = 0;
+	if($j % 50 != 0 ) {
+	    $out[$i] .= "\n";
+	}
+	$out[$i] .= "\n";
+	return unless $self->_print(@out);
+    }
+
+    $self->flush if $self->_flush_on_write && defined $self->_fh;
+    return 1;
+}
+
+=head2 GCG_checksum
+
+ Title     : GCG_checksum
+ Usage     : $cksum = $gcgio->GCG_checksum($seq);
+ Function  : returns a gcg checksum for the sequence specified
+
+             This method can also be called as a class method.
+ Example   : 
+ Returns   : a GCG checksum string
+ Argument  : a Bio::PrimarySeqI implementing object
+
+=cut
+
+sub GCG_checksum {
+    my ($self,$seqobj) = @_;
+    my $index = 0;
+    my $checksum = 0;
+    my $char;
+
+    my $seq = $seqobj->seq();
+    $seq =~ tr/a-z/A-Z/;
+    
+    foreach $char ( split(/[\.\-]*/, $seq)) {
+	$index++;
+	$checksum += ($index * (unpack("c",$char) || 0) );
+	if( $index ==  57 ) {
+	    $index = 0;
+	}
+    }
+
+    return ($checksum % 10000);
+}
+
+=head2 _validate_checksum
+
+ Title   : _validate_checksum
+ Usage   : n/a - internal method
+ Function: if parsed gcg sequence contains a checksum field
+         : we compare it to a value computed here on the parsed
+         : sequence. A checksum mismatch would indicate some
+         : type of parsing failure occured.
+         :
+ Returns : 1 for success, 0 for failure
+ Args    : string containing parsed seq, value of parsed cheksum
+
+
+=cut
+
+sub _validate_checksum {
+    my($seq,$parsed_sum) = @_;
+    my($i,$len,$computed_sum,$cnt);
+
+    $len = length($seq);
+
+    #Generate the GCG Checksum value
+
+    for($i=0; $i<$len ;$i++) {             
+	$cnt++;
+	$computed_sum += $cnt * ord(substr($seq,$i,1));
+	($cnt == 57) && ($cnt=0);
+    }
+    $computed_sum %= 10000;
+
+    ## Compare and decide if success or failure
+
+    if($parsed_sum == $computed_sum) {
+	return 1;
+    } else { return 0; }
+
+
+}
+
+1;