diff variant_effect_predictor/Bio/SeqIO/fasta.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/SeqIO/fasta.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,271 @@
+# $Id: fasta.pm,v 1.41.2.4 2003/09/18 02:43:16 jason Exp $
+# BioPerl module for Bio::SeqIO::fasta
+#
+# Cared for by Ewan Birney <birney@ebi.ac.uk>
+#          and Lincoln Stein <lstein@cshl.org>
+#
+# Copyright Ewan Birney & Lincoln Stein
+#
+# You may distribute this module under the same terms as perl itself
+# _history
+# October 18, 1999  Largely rewritten by Lincoln Stein
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::SeqIO::fasta - fasta sequence input/output stream
+
+=head1 SYNOPSIS
+
+Do not use this module directly.  Use it via the Bio::SeqIO class.
+
+=head1 DESCRIPTION
+
+This object can transform Bio::Seq objects to and from fasta flat
+file databases.
+
+A method L<preferred_id_type()> can be used to specify the type of ID
+we would like to parse from the fasta line.  By default 'display' is
+used, which means it parses everything from the '>' to the first space
+and makes that the 'display_id' for the sequence.
+
+Can be one of:
+ - accession 
+ - accession.version 
+ - display
+ - primary
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org            - General discussion
+  http://bioperl.org/MailList.shtml - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via the
+web:
+
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHORS - Ewan Birney & Lincoln Stein
+
+Email: birney@ebi.ac.uk
+       lstein@cshl.org
+
+=head1 CONTRIBUTORS
+
+Jason Stajich, jason-at-bioperl.org
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+# Let the code begin...
+
+package Bio::SeqIO::fasta;
+use vars qw(@ISA $WIDTH @SEQ_ID_TYPES $DEFAULT_SEQ_ID_TYPE);
+use strict;
+# Object preamble - inherits from Bio::Root::Object
+
+use Bio::SeqIO;
+use Bio::Seq::SeqFactory;
+use Bio::Seq::SeqFastaSpeedFactory;
+
+@ISA = qw(Bio::SeqIO);
+
+@SEQ_ID_TYPES = qw(accession accession.version display primary);
+$DEFAULT_SEQ_ID_TYPE = 'display';
+
+BEGIN { $WIDTH = 60}
+
+sub _initialize {
+  my($self,@args) = @_;
+  $self->SUPER::_initialize(@args);  
+  my ($width) = $self->_rearrange([qw(WIDTH)], @args);
+  $width && $self->width($width);
+  unless ( defined $self->sequence_factory ) {
+      $self->sequence_factory(Bio::Seq::SeqFastaSpeedFactory->new());
+  }
+}
+
+=head2 next_seq
+
+ Title   : next_seq
+ Usage   : $seq = $stream->next_seq()
+ Function: returns the next sequence in the stream
+ Returns : Bio::Seq object
+ Args    : NONE
+
+=cut
+
+sub next_seq {
+    my( $self ) = @_;
+    my $seq;
+    my $alphabet;
+    local $/ = "\n>";
+    return unless my $entry = $self->_readline;
+
+    chomp($entry);
+    if ($entry =~ m/\A\s*\Z/s)  { # very first one
+	return unless $entry = $self->_readline;
+	chomp($entry);
+    }
+    $entry =~ s/^>//;
+
+    my ($top,$sequence) = split(/\n/,$entry,2);
+    defined $sequence && $sequence =~ s/>//g;
+#    my ($top,$sequence) = $entry =~ /^>?(.+?)\n+([^>]*)/s
+#	or $self->throw("Can't parse fasta entry");
+
+    my ($id,$fulldesc);
+    if( $top =~ /^\s*(\S+)\s*(.*)/ ) {
+	($id,$fulldesc) = ($1,$2);
+    }
+    
+    if (defined $id && $id eq '') {$id=$fulldesc;} # FIX incase no space 
+                                                   # between > and name \AE
+    defined $sequence && $sequence =~ s/\s//g;	# Remove whitespace
+
+    # for empty sequences we need to know the mol.type
+    $alphabet = $self->alphabet();
+    if(defined $sequence && length($sequence) == 0) {
+	if(! defined($alphabet)) {
+	    # let's default to dna
+	    $alphabet = "dna";
+	}
+    } else {
+	# we don't need it really, so disable
+	$alphabet = undef;
+    }
+
+    $seq = $self->sequence_factory->create(
+					   -seq         => $sequence,
+					   -id          => $id,
+					   # Ewan's note - I don't think this healthy
+					   # but obviously to taste.
+					   #-primary_id  => $id,
+					   -desc        => $fulldesc,
+					   -alphabet    => $alphabet,
+					   -direct      => 1,
+					   );
+
+
+
+
+    # if there wasn't one before, set the guessed type
+    unless ( defined $alphabet ) {
+	$self->alphabet($seq->alphabet());
+    }
+    return $seq;
+
+}
+
+=head2 write_seq
+
+ Title   : write_seq
+ Usage   : $stream->write_seq(@seq)
+ Function: writes the $seq object into the stream
+ Returns : 1 for success and 0 for error
+ Args    : array of 1 to n Bio::PrimarySeqI objects
+
+
+=cut
+
+sub write_seq {
+   my ($self,@seq) = @_;
+   my $width = $self->width;
+   foreach my $seq (@seq) {
+       $self->throw("Did not provide a valid Bio::PrimarySeqI object") 
+	   unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
+
+       my $str = $seq->seq;
+       my $top;
+
+       # Allow for different ids 
+       my $id_type = $self->preferred_id_type;
+       if( $id_type =~ /^acc/i ) {
+	   $top = $seq->accession_number();
+	   if( $id_type =~ /vers/i ) {
+	       $top .= "." . $seq->version();
+	   }
+       } elsif($id_type =~ /^displ/i ) {
+	   $top = $seq->display_id();
+       } elsif($id_type =~ /^pri/i ) {
+	   $top = $seq->primary_id();
+       }
+
+       if ($seq->can('desc') and my $desc = $seq->desc()) {
+	   $desc =~ s/\n//g;
+	   $top .= " $desc";
+       }
+       if(length($str) > 0) {
+	   $str =~ s/(.{1,$width})/$1\n/g;
+       } else {
+	   $str = "\n";
+       }
+       $self->_print (">",$top,"\n",$str) or return;
+   }
+
+   $self->flush if $self->_flush_on_write && defined $self->_fh;
+   return 1;
+}
+
+=head2 width
+
+ Title   : width
+ Usage   : $obj->width($newval)
+ Function: Get/Set the line width for FASTA output
+ Returns : value of width
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub width{
+   my ($self,$value) = @_;
+   if( defined $value) {
+      $self->{'width'} = $value;
+    }
+    return $self->{'width'} || $WIDTH;
+}
+
+=head2 preferred_id_type
+
+ Title   : preferred_id_type
+ Usage   : $obj->preferred_id_type('accession')
+ Function: Get/Set the preferred type of identifier to use in the ">ID" position
+           for FASTA output.
+ Returns : string, one of values defined in @Bio::SeqIO::fasta::SEQ_ID_TYPES.
+           Default = $Bio::SeqIO::fasta::DEFAULT_SEQ_ID_TYPE ('display').
+ Args    : string when setting. This must be one of values defined in 
+           @Bio::SeqIO::fasta::SEQ_ID_TYPES. Allowable values:
+           accession, accession.version, display, primary
+ Throws  : fatal exception if the supplied id type is not in @SEQ_ID_TYPES.
+
+=cut
+
+sub preferred_id_type {
+    my ($self,$type) = @_;
+    if( defined $type ) {
+	if( ! grep lc($type) eq $_, @SEQ_ID_TYPES) {
+	    $self->throw(-class=>'Bio::Root::BadParameter',
+			 -text=>"Invalid ID type \"$type\". Must be one of: @SEQ_ID_TYPES");
+	}
+	$self->{'_seq_id_type'} = lc($type);
+#	print STDERR "Setting preferred_id_type=$type\n";
+    }
+    $self->{'_seq_id_type'} || $DEFAULT_SEQ_ID_TYPE;
+}
+
+1;