diff variant_effect_predictor/Bio/AlignIO.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/AlignIO.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,505 @@
+# $Id: AlignIO.pm,v 1.28 2002/10/22 07:38:23 lapp Exp $
+#
+# BioPerl module for Bio::AlignIO
+#
+#	based on the Bio::SeqIO module
+#       by Ewan Birney <birney@sanger.ac.uk>
+#       and Lincoln Stein  <lstein@cshl.org>
+#
+# Copyright Peter Schattner
+#
+# You may distribute this module under the same terms as perl itself
+#
+# _history
+# October 18, 1999  SeqIO largely rewritten by Lincoln Stein
+# September, 2000 AlignIO written by Peter Schattner
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::AlignIO - Handler for AlignIO Formats
+
+=head1 SYNOPSIS
+
+    use Bio::AlignIO;
+
+    $inputfilename = "testaln.fasta";
+    $in  = Bio::AlignIO->new(-file => $inputfilename ,
+                             '-format' => 'fasta');
+    $out = Bio::AlignIO->new(-file => ">out.aln.pfam" ,
+                             '-format' => 'pfam');
+    # note: we quote -format to keep older perls from complaining.
+
+    while ( my $aln = $in->next_aln() ) {
+        $out->write_aln($aln);
+    }
+
+or
+
+    use Bio::AlignIO;
+
+    $inputfilename = "testaln.fasta";
+    $in  = Bio::AlignIO->newFh(-file => $inputfilename ,
+                               '-format' => 'fasta');
+    $out = Bio::AlignIO->newFh('-format' => 'pfam');
+
+    # World's shortest Fasta<->pfam format converter:
+    print $out $_ while <$in>;
+
+=head1 DESCRIPTION
+
+Bio::AlignIO is a handler module for the formats in the AlignIO set
+(eg, Bio::AlignIO::fasta). It is the officially sanctioned way of
+getting at the alignment objects, which most people should use. The
+resulting alignment is a Bio::Align::AlignI compliant object. See
+L<Bio::Align::AlignI> for more information.
+
+The idea is that you request a stream object for a particular format.
+All the stream objects have a notion of an internal file that is read
+from or written to. A particular AlignIO object instance is configured
+for either input or output. A specific example of a stream object is
+the Bio::AlignIO::fasta object.
+
+Each stream object has functions
+
+   $stream->next_aln();
+
+and
+
+   $stream->write_aln($aln);
+
+also
+
+   $stream->type() # returns 'INPUT' or 'OUTPUT'
+
+As an added bonus, you can recover a filehandle that is tied to the
+AlignIO object, allowing you to use the standard E<lt>E<gt> and print
+operations to read and write alignment objects:
+
+    use Bio::AlignIO;
+
+     # read from standard input
+    $stream = Bio::AlignIO->newFh(-format => 'Fasta');
+
+    while ( $aln = <$stream> ) {
+	# do something with $aln
+    }
+
+and
+
+    print $stream $aln; # when stream is in output mode
+
+This makes the simplest ever reformatter
+
+    #!/usr/local/bin/perl
+
+    $format1 = shift;
+    $format2 = shift ||
+        die "Usage: reformat format1 format2 < input > output";
+
+    use Bio::AlignIO;
+
+    $in  = Bio::AlignIO->newFh(-format => $format1 );
+    $out = Bio::AlignIO->newFh(-format => $format2 );
+    # note: you might want to quote -format to keep
+    #  older perls from complaining.
+
+    print $out $_ while <$in>;
+
+AlignIO.pm is patterned on the module SeqIO.pm and shares most of the
+SeqIO.pm features.  One significant difference currently is that
+AlignIO.pm usually handles IO for only a single alignment at a time
+(SeqIO.pm handles IO for multiple sequences in a single stream.)  The
+principal reason for this is that whereas simultaneously handling
+multiple sequences is a common requirement, simultaneous handling of
+multiple alignments is not. The only current exception is format
+"bl2seq" which parses results of the Blast bl2seq program and which
+may produce several alignment pairs.  This set of alignment pairs can
+be read using multiple calls to next_aln.
+
+Capability for IO for more than one multiple alignment - other than
+for bl2seq format -(which may be of use for certain applications such
+as IO for Pfam libraries) may be included in the future.  For this
+reason we keep the name "next_aln()" for the alignment input routine,
+even though in most cases only one alignment is read (or written) at a
+time and the name "read_aln()" might be more appropriate.
+
+=head1 CONSTRUCTORS
+
+=head2 Bio::AlignIO-E<gt>new()
+
+   $seqIO = Bio::AlignIO->new(-file => 'filename',   -format=>$format);
+   $seqIO = Bio::AlignIO->new(-fh   => \*FILEHANDLE, -format=>$format);
+   $seqIO = Bio::AlignIO->new(-format => $format);
+
+The new() class method constructs a new Bio::AlignIO object.  The
+returned object can be used to retrieve or print BioAlign
+objects. new() accepts the following parameters:
+
+=over 4
+
+=item -file
+
+A file path to be opened for reading or writing.  The usual Perl
+conventions apply:
+
+   'file'       # open file for reading
+   '>file'      # open file for writing
+   '>>file'     # open file for appending
+   '+<file'     # open file read/write
+   'command |'  # open a pipe from the command
+   '| command'  # open a pipe to the command
+
+=item -fh
+
+You may provide new() with a previously-opened filehandle.  For
+example, to read from STDIN:
+
+   $seqIO = Bio::AlignIO->new(-fh => \*STDIN);
+
+Note that you must pass filehandles as references to globs.
+
+If neither a filehandle nor a filename is specified, then the module
+will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt>
+semantics.
+
+=item -format
+
+Specify the format of the file.  Supported formats include:
+
+   fasta       FASTA format
+   selex       selex (hmmer) format
+   stockholm   stockholm format
+   prodom      prodom (protein domain) format
+   clustalw    clustalw (.aln) format
+   msf         msf (GCG) format
+   mase        mase (seaview) format
+   bl2seq      Bl2seq Blast output
+   nexus       Swofford et al NEXUS format
+   pfam        Pfam sequence alignment format
+   phylip      Felsenstein's PHYLIP format
+   emboss      EMBOSS water and needle format
+   mega        MEGA format
+   meme        MEME format
+   psi         PSI-BLAST format
+
+Currently only those formats which were implemented in L<Bio::SimpleAlign>
+have been incorporated in AlignIO.pm.  Specifically, mase, stockholm
+and prodom have only been implemented for input. See the specific module
+(e.g. L<Bio::AlignIO::meme>) for notes on supported versions.
+
+If no format is specified and a filename is given, then the module
+will attempt to deduce it from the filename suffix.  If this is unsuccessful,
+Fasta format is assumed.
+
+The format name is case insensitive.  'FASTA', 'Fasta' and 'fasta' are
+all supported.
+
+=back
+
+=head2 Bio::AlignIO-E<gt>newFh()
+
+   $fh = Bio::AlignIO->newFh(-fh   => \*FILEHANDLE, -format=>$format);
+   $fh = Bio::AlignIO->newFh(-format => $format);
+   # etc.
+
+This constructor behaves like new(), but returns a tied filehandle
+rather than a Bio::AlignIO object.  You can read sequences from this
+object using the familiar E<lt>E<gt> operator, and write to it using print().
+The usual array and $_ semantics work.  For example, you can read all
+sequence objects into an array like this:
+
+  @sequences = <$fh>;
+
+Other operations, such as read(), sysread(), write(), close(), and printf() 
+are not supported.
+
+=over 1
+
+=item -flush
+
+By default, all files (or filehandles) opened for writing alignments
+will be flushed after each write_aln() (making the file immediately
+usable).  If you don't need this facility and would like to marginally
+improve the efficiency of writing multiple sequences to the same file
+(or filehandle), pass the -flush option '0' or any other value that
+evaluates as defined but false:
+
+  my $clustal = new Bio::AlignIO -file   => "<prot.aln",
+                          -format => "clustalw";
+  my $msf = new Bio::AlignIO -file   => ">prot.msf",
+                          -format => "msf",
+                          -flush  => 0; # go as fast as we can!
+  while($seq = $clustal->next_aln) { $msf->write_aln($seq) }
+
+=back
+
+=head1 OBJECT METHODS
+
+See below for more detailed summaries.  The main methods are:
+
+=head2 $alignment = $AlignIO-E<gt>next_aln()
+
+Fetch an alignment from a formatted file.
+
+=head2 $AlignIO-E<gt>write_aln($aln)
+
+Write the specified alignment to a file.
+
+=head2 TIEHANDLE(), READLINE(), PRINT()
+
+These provide the tie interface.  See L<perltie> for more details.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org               - General discussion
+  http://bio.perl.org/MailList.html   - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution.
+Bug reports can be submitted via email or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Peter Schattner
+
+Email: schattner@alum.mit.edu
+
+=head1 CONTRIBUTORS
+
+Jason Stajich, jason@bioperl.org
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+# 'Let the code begin...
+
+package Bio::AlignIO;
+
+use strict;
+use vars qw(@ISA);
+
+use Bio::Root::Root;
+use Bio::Seq;
+use Bio::LocatableSeq;
+use Bio::SimpleAlign;
+use Bio::Root::IO;
+@ISA = qw(Bio::Root::Root Bio::Root::IO);
+
+=head2 new
+
+ Title   : new
+ Usage   : $stream = Bio::AlignIO->new(-file => $filename,
+                                       '-format' => 'Format')
+ Function: Builds an alignment stream.  Called on Bio::AlignIO itself,
+           it chooses a format and delegates to the new() of the
+           matching Bio::AlignIO::<format> class; called on such a
+           format class, it constructs and initialises the object.
+ Returns : The object built by the format-specific class, or undef
+           if the format module could not be loaded
+ Args    : -file => $filename
+           -format => format
+           -fh => filehandle to attach to
+
+=cut
+
+sub new {
+    my ($caller, @args) = @_;
+    my $class = ref($caller) || $caller;
+
+    # A format-specific class (Bio::AlignIO::something) builds the real
+    # object through the parent constructor and then sets up its IO.
+    if ( $class =~ /Bio::AlignIO::(\S+)/ ) {
+        my ($self) = $class->SUPER::new(@args);
+        $self->_initialize(@args);
+        return $self;
+    }
+
+    # Generic class: decide on a format and hand over to its module.
+    my %param = @args;
+    @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
+
+    # Explicit -format wins, then a guess from the file name (or the
+    # first command-line argument), then the fasta default.
+    my $format = $param{'-format'}
+        || $class->_guess_format( $param{-file} || $ARGV[0] )
+        || 'fasta';
+    $format = lc $format;    # format names are case insensitive
+
+    my $loaded = $class->_load_format_module($format);
+    return undef unless $loaded;
+
+    my $handler_class = "Bio::AlignIO::$format";
+    return $handler_class->new(@args);
+}
+
+
+=head2 newFh
+
+ Title   : newFh
+ Usage   : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format')
+ Function: does a new() followed by an fh()
+ Example : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format')
+           $aln = <$fh>;   # read an alignment object
+           print $fh $aln; # write an alignment object
+ Returns : the tied filehandle produced by fh(), or nothing if new()
+           did not return an object
+ Args    : the same arguments as new(); they are passed through as is
+
+=cut
+
+sub newFh {
+  my ($class, @args) = @_;
+
+  # Give up quietly when no stream object could be constructed.
+  my $stream = $class->new(@args);
+  return unless $stream;
+
+  return $stream->fh;
+}
+
+=head2 fh
+
+ Title   : fh
+ Usage   : $obj->fh
+ Function: Creates an anonymous glob and ties it to the class of $obj,
+           passing $obj to that class's TIEHANDLE, so the stream can
+           be used through E<lt>E<gt> and print
+ Example : $fh = $obj->fh;      # make a tied filehandle
+           $aln = <$fh>;        # read an alignment object
+           print $fh $aln;      # write an alignment object
+ Returns : reference to the tied glob
+ Args    : none
+
+=cut
+
+
+sub fh {
+  my $self = shift;
+  my $class = ref($self) || $self;
+
+  # Load Symbol here rather than relying on another module having
+  # pulled it in; the call needs parentheses because gensym is not
+  # known when this file is compiled.
+  require Symbol;
+  my $s = Symbol::gensym();
+
+  # Tie the glob to the stream's own class; TIEHANDLE receives $self.
+  tie $$s,$class,$self;
+  return $s;
+}
+
+# _initialize is called by new() on a freshly constructed format object.
+# It forwards the constructor arguments to _initialize_io and returns 1.
+
+sub _initialize {
+  my $self = shift;
+
+  $self->_initialize_io(@_);
+  return 1;
+}
+
+=head2 _load_format_module
+
+ Title   : _load_format_module
+ Usage   : *INTERNAL AlignIO stuff*
+ Function: Loads up (like use) the Bio::AlignIO::<format> module at
+           run time on demand, via _load_module
+ Example : $class->_load_format_module('fasta')
+ Returns : 1 if loading raised no exception; otherwise prints a
+           diagnostic to STDERR and returns nothing
+ Args    : format name, appended as is to "Bio::AlignIO::"
+
+=cut
+
+sub _load_format_module {
+  my ($self, $format) = @_;
+  my $module = "Bio::AlignIO::" . $format;
+
+  # The loader's return value is captured but not consulted: only an
+  # exception left in $@ counts as a failure.
+  my $loaded;
+  eval { $loaded = $self->_load_module($module); };
+  return 1 unless $@;
+
+  print STDERR <<END;
+$self: $format cannot be found
+Exception $@
+For more information about the AlignIO system please see the AlignIO docs.
+This includes ways of checking for formats at compile time, not run time
+END
+  return;
+}
+
+=head2 next_aln
+
+ Title   : next_aln
+ Usage   : $aln = stream->next_aln
+ Function: reads the next $aln object from the stream; this generic
+           version only throws, format classes supply the real reader
+ Returns : a Bio::Align::AlignI compliant object
+ Args    : none
+
+=cut
+
+sub next_aln {
+   my $self = shift;
+
+   # Reading is format specific, so the generic class refuses.
+   return $self->throw("Sorry, you cannot read from a generic Bio::AlignIO object.");
+}
+
+=head2 write_aln
+
+ Title   : write_aln
+ Usage   : $stream->write_aln($aln)
+ Function: writes the $aln object into the stream; this generic
+           version only throws, format classes supply the real writer
+ Returns : 1 for success and 0 for error
+ Args    : the alignment object to write (see next_aln for the kind
+           of object a stream yields)
+
+=cut
+
+sub write_aln {
+    my $self = shift;
+
+    # Writing is format specific, so the generic class refuses.
+    return $self->throw("Sorry, you cannot write to a generic Bio::AlignIO object.");
+}
+
+=head2 _guess_format
+
+ Title   : _guess_format
+ Usage   : $obj->_guess_format($filename)
+ Function: Maps a file name suffix to an alignment format name.  The
+           match is case insensitive and anchored at the end of the
+           name.
+ Example : $format = Bio::AlignIO->_guess_format('family.aln');
+           # 'clustalw'
+ Returns : guessed format of filename (lower case); nothing (undef in
+           scalar context, empty list in list context) if the name is
+           missing, false, or has no recognised suffix
+ Args    : file name
+
+=cut
+
+sub _guess_format {
+   my ($class, $filename) = @_;
+   return unless $filename;
+
+   # Match against a lexical instead of the global $_, so a caller's $_
+   # (and any list element it aliases) is never overwritten.
+   return 'fasta'    if $filename =~ /\.(fasta|fast|seq|fa|fsa|nt|aa)$/i;
+   return 'msf'      if $filename =~ /\.(msf|pileup|gcg)$/i;
+   return 'pfam'     if $filename =~ /\.(pfam|pfm)$/i;
+   return 'selex'    if $filename =~ /\.(selex|slx|selx|slex|sx)$/i;
+   return 'phylip'   if $filename =~ /\.(phylip|phlp|phyl|phy|ph)$/i;
+   return 'nexus'    if $filename =~ /\.(nexus|nex)$/i;
+   return 'mega'     if $filename =~ /\.(meg|mega)$/i;
+   return 'clustalw' if $filename =~ /\.aln$/i;
+   return 'meme'     if $filename =~ /\.meme$/i;
+   return 'emboss'   if $filename =~ /\.(water|needle)$/i;
+   return 'psi'      if $filename =~ /\.psi$/i;
+
+   # No suffix matched: say so explicitly rather than leaking the value
+   # of the last condition.
+   return;
+}
+
+# Destructor: calls close() on the object when it is garbage collected.
+sub DESTROY {
+    my ($self) = @_;
+    $self->close();
+}
+
+# Tie constructor (see perltie): wraps the stream object handed over by
+# fh() in a small hash, stored under the key 'alignio', and blesses it
+# into the requested class.
+sub TIEHANDLE {
+  my ($class, $alignio) = @_;
+  my %wrapper = ( 'alignio' => $alignio );
+  return bless \%wrapper, $class;
+}
+
+# Tie read hook behind <$fh>.  In list context it keeps calling
+# next_aln() on the wrapped stream until a false value comes back and
+# returns everything collected; otherwise it returns a single
+# next_aln() result.
+sub READLINE {
+  my ($self) = @_;
+
+  if (wantarray) {
+    my @alignments;
+    while ( my $aln = $self->{'alignio'}->next_aln() ) {
+      push @alignments, $aln;
+    }
+    return @alignments;
+  }
+
+  return $self->{'alignio'}->next_aln();
+}
+
+# Tie write hook behind "print $fh ...": passes everything being printed
+# to write_aln() on the wrapped stream and returns its result.
+sub PRINT {
+  my $stream = shift->{'alignio'};
+  return $stream->write_aln(@_);
+}
+
+1;