Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/SeqIO.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,748 @@ + +# $Id: SeqIO.pm,v 1.59.2.4 2003/09/14 19:16:53 jason Exp $ +# +# BioPerl module for Bio::SeqIO +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# and Lincoln Stein <lstein@cshl.org> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself +# +# _history +# October 18, 1999 Largely rewritten by Lincoln Stein + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO - Handler for SeqIO Formats + +=head1 SYNOPSIS + + use Bio::SeqIO; + + $in = Bio::SeqIO->new(-file => "inputfilename" , '-format' => 'Fasta'); + $out = Bio::SeqIO->new(-file => ">outputfilename" , '-format' => 'EMBL'); + # note: we quote -format to keep older Perls from complaining. + + while ( my $seq = $in->next_seq() ) { + $out->write_seq($seq); + } + +Now, to actually get at the sequence object, use the standard Bio::Seq +methods (look at L<Bio::Seq> if you don't know what they are) + + use Bio::SeqIO; + + $in = Bio::SeqIO->new(-file => "inputfilename" , '-format' => 'genbank'); + + while ( my $seq = $in->next_seq() ) { + print "Sequence ",$seq->id," first 10 bases ",$seq->subseq(1,10),"\n"; + } + + +The SeqIO system does have a filehandle binding. Most people find this +a little confusing, but it does mean you write the world's smallest +reformatter + + use Bio::SeqIO; + + $in = Bio::SeqIO->newFh(-file => "inputfilename" , '-format' => 'Fasta'); + $out = Bio::SeqIO->newFh('-format' => 'EMBL'); + + # World's shortest Fasta<->EMBL format converter: + print $out $_ while <$in>; + + +=head1 DESCRIPTION + +Bio::SeqIO is a handler module for the formats in the SeqIO set (eg, +Bio::SeqIO::fasta). It is the officially sanctioned way of getting at +the format objects, which most people should use. + +The Bio::SeqIO system can be thought of like biological file handles. +They are attached to filehandles with smart formatting rules (eg, +genbank format, or EMBL format, or binary trace file format) and +can either read or write sequence objects (Bio::Seq objects, or +more correctly, Bio::SeqI implementing objects, of which Bio::Seq is +one such object). If you want to know what to do with a Bio::Seq +object, read L<Bio::Seq>. + +The idea is that you request a stream object for a particular format. +All the stream objects have a notion of an internal file that is read +from or written to. A particular SeqIO object instance is configured +for either input or output. A specific example of a stream object is +the Bio::SeqIO::fasta object. + +Each stream object has functions + + $stream->next_seq(); + +and + + $stream->write_seq($seq); + +As an added bonus, you can recover a filehandle that is tied to the +SeqIO object, allowing you to use the standard E<lt>E<gt> and print operations +to read and write sequence objects: + + use Bio::SeqIO; + + $stream = Bio::SeqIO->newFh(-format => 'Fasta'); # read from standard input + + while ( $seq = <$stream> ) { + # do something with $seq + } + +and + + print $stream $seq; # when stream is in output mode + +This makes the simplest ever reformatter + + #!/usr/local/bin/perl + + $format1 = shift; + $format2 = shift || die "Usage: reformat format1 format2 < input > output"; + + use Bio::SeqIO; + + $in = Bio::SeqIO->newFh(-format => $format1 ); + $out = Bio::SeqIO->newFh(-format => $format2 ); + #note: you might want to quote -format to keep older perl's from complaining. + + print $out $_ while <$in>; + + +=head1 CONSTRUCTORS + +=head2 Bio::SeqIO-E<gt>new() + + $seqIO = Bio::SeqIO->new(-file => 'filename', -format=>$format); + $seqIO = Bio::SeqIO->new(-fh => \*FILEHANDLE, -format=>$format); + $seqIO = Bio::SeqIO->new(-format => $format); + +The new() class method constructs a new Bio::SeqIO object. The +returned object can be used to retrieve or print Seq objects. new() +accepts the following parameters: + +=over 4 + +=item -file + +A file path to be opened for reading or writing. The usual Perl +conventions apply: + + 'file' # open file for reading + '>file' # open file for writing + '>>file' # open file for appending + '+<file' # open file read/write + 'command |' # open a pipe from the command + '| command' # open a pipe to the command + +=item -fh + +You may provide new() with a previously-opened filehandle. For +example, to read from STDIN: + + $seqIO = Bio::SeqIO->new(-fh => \*STDIN); + +Note that you must pass filehandles as references to globs. + +If neither a filehandle nor a filename is specified, then the module +will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt> +semantics. + +A string filehandle is handy if you want to modify the output in the +memory, before printing it out. The following program reads in EMBL +formatted entries from a file and prints them out in fasta format with +some HTML tags: + + use Bio::SeqIO; + use IO::String; + my $in = Bio::SeqIO->new('-file' => "emblfile" , + '-format' => 'EMBL'); + while ( my $seq = $in->next_seq() ) { + # the output handle is reset for every file + my $stringio = IO::String->new($string); + my $out = Bio::SeqIO->new('-fh' => $stringio, + '-format' => 'fasta'); + # output goes into $string + $out->write_seq($seq); + # modify $string + $string =~ s|(>)(\w+)|$1<font color="Red">$2</font>|g; + # print into STDOUT + print $string; + } + +=item -format + +Specify the format of the file. Supported formats include: + + Fasta FASTA format + EMBL EMBL format + GenBank GenBank format + swiss Swissprot format + PIR Protein Information Resource format + GCG GCG format + raw Raw format (one sequence per line, no ID) + ace ACeDB sequence format + game GAME XML format + phd phred output + qual Quality values (get a sequence of quality scores) + Fastq Fastq format + SCF SCF tracefile format + ABI ABI tracefile format + ALF ALF tracefile format + CTF CTF tracefile format + ZTR ZTR tracefile format + PLN Staden plain tracefile format + EXP Staden tagged experiment tracefile format + +If no format is specified and a filename is given then the module +will attempt to deduce the format from the filename suffix. If this +is unsuccessful then Fasta format is assumed. + +The format name is case insensitive. 'FASTA', 'Fasta' and 'fasta' are +all valid suffixes. + +Currently, the tracefile formats (except for SCF) require installation +of the external Staden "io_lib" package, as well as the +Bio::SeqIO::staden::read package available from the bioperl-ext +repository. + +=item -flush + +By default, all files (or filehandles) opened for writing sequences +will be flushed after each write_seq() (making the file immediately +usable). If you don't need this facility and would like to marginally +improve the efficiency of writing multiple sequences to the same file +(or filehandle), pass the -flush option '0' or any other value that +evaluates as defined but false: + + my $gb = new Bio::SeqIO -file => "<gball.gbk", + -format => "gb"; + my $fa = new Bio::SeqIO -file => ">gball.fa", + -format => "fasta", + -flush => 0; # go as fast as we can! + while($seq = $gb->next_seq) { $fa->write_seq($seq) } + + +=back + +=head2 Bio::SeqIO-E<gt>newFh() + + $fh = Bio::SeqIO->newFh(-fh => \*FILEHANDLE, -format=>$format); + $fh = Bio::SeqIO->newFh(-format => $format); + # etc. + +This constructor behaves like new(), but returns a tied filehandle +rather than a Bio::SeqIO object. You can read sequences from this +object using the familiar E<lt>E<gt> operator, and write to it using +print(). The usual array and $_ semantics work. For example, you can +read all sequence objects into an array like this: + + @sequences = <$fh>; + +Other operations, such as read(), sysread(), write(), close(), and printf() +are not supported. + +=head1 OBJECT METHODS + +See below for more detailed summaries. The main methods are: + +=head2 $sequence = $seqIO-E<gt>next_seq() + +Fetch the next sequence from the stream. + +=head2 $seqIO-E<gt>write_seq($sequence [,$another_sequence,...]) + +Write the specified sequence(s) to the stream. + +=head2 TIEHANDLE(), READLINE(), PRINT() + +These provide the tie interface. See L<perltie> for more details. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably +to one of the Bioperl mailing lists. + +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney, Lincoln Stein + +Email birney@ebi.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' Let the code begin... + +package Bio::SeqIO; + +use strict; +use vars qw(@ISA); + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::Factory::SequenceStreamI; +use Bio::Factory::FTLocationFactory; +use Bio::Seq::SeqBuilder; +use Symbol(); + +@ISA = qw(Bio::Root::Root Bio::Root::IO Bio::Factory::SequenceStreamI); + +sub BEGIN { + eval { require Bio::SeqIO::staden::read; }; +} + +my %valid_alphabet_cache; + +=head2 new + + Title : new + Usage : $stream = Bio::SeqIO->new(-file => $filename, -format => 'Format') + Function: Returns a new seqstream + Returns : A Bio::SeqIO stream initialised with the appropriate format + Args : Named parameters: + -file => $filename + -fh => filehandle to attach to + -format => format + + Additional arguments may be used to set factories and + builders involved in the sequence object creation. None of + these must be provided, they all have reasonable defaults. + -seqfactory the L<Bio::Factory::SequenceFactoryI> object + -locfactory the L<Bio::Factory::LocationFactoryI> object + -objbuilder the L<Bio::Factory::ObjectBuilderI> object + +See L<Bio::SeqIO::Handler> + +=cut + +my $entry = 0; + +sub new { + my ($caller,@args) = @_; + my $class = ref($caller) || $caller; + + # or do we want to call SUPER on an object if $caller is an + # object? + if( $class =~ /Bio::SeqIO::(\S+)/ ) { + my ($self) = $class->SUPER::new(@args); + $self->_initialize(@args); + return $self; + } else { + + my %param = @args; + @param{ map { lc $_ } keys %param } = values %param; # lowercase keys + my $format = $param{'-format'} || + $class->_guess_format( $param{-file} || $ARGV[0] ) || + 'fasta'; + $format = "\L$format"; # normalize capitalization to lower case + + # normalize capitalization + return undef unless( $class->_load_format_module($format) ); + return "Bio::SeqIO::$format"->new(@args); + } +} + +=head2 newFh + + Title : newFh + Usage : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format') + Function: does a new() followed by an fh() + Example : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format') + $sequence = <$fh>; # read a sequence object + print $fh $sequence; # write a sequence object + Returns : filehandle tied to the Bio::SeqIO::Fh class + Args : + +See L<Bio::SeqIO::Fh> + +=cut + +sub newFh { + my $class = shift; + return unless my $self = $class->new(@_); + return $self->fh; +} + +=head2 fh + + Title : fh + Usage : $obj->fh + Function: + Example : $fh = $obj->fh; # make a tied filehandle + $sequence = <$fh>; # read a sequence object + print $fh $sequence; # write a sequence object + Returns : filehandle tied to Bio::SeqIO class + Args : none + +=cut + + +sub fh { + my $self = shift; + my $class = ref($self) || $self; + my $s = Symbol::gensym; + tie $$s,$class,$self; + return $s; +} + +# _initialize is chained for all SeqIO classes + +sub _initialize { + my($self, @args) = @_; + + # flush is initialized by the Root::IO init + + my ($seqfact,$locfact,$objbuilder) = + $self->_rearrange([qw(SEQFACTORY + LOCFACTORY + OBJBUILDER) + ], @args); + + $locfact = Bio::Factory::FTLocationFactory->new(-verbose => $self->verbose) if ! $locfact; + $objbuilder = Bio::Seq::SeqBuilder->new(-verbose => $self->verbose) unless $objbuilder; + $self->sequence_builder($objbuilder); + $self->location_factory($locfact); + # note that this should come last because it propagates the sequence + # factory to the sequence builder + $seqfact && $self->sequence_factory($seqfact); + + # initialize the IO part + $self->_initialize_io(@args); +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = stream->next_seq + Function: Reads the next sequence object from the stream and returns it. + + Certain driver modules may encounter entries in the stream that + are either misformatted or that use syntax not yet understood + by the driver. If such an incident is recoverable, e.g., by + dismissing a feature of a feature table or some other non-mandatory + part of an entry, the driver will issue a warning. In the case + of a non-recoverable situation an exception will be thrown. + Do not assume that you can resume parsing the same stream after + catching the exception. Note that you can always turn recoverable + errors into exceptions by calling $stream->verbose(2). + Returns : a Bio::Seq sequence object + Args : none + +See L<Bio::Root::RootI>, L<Bio::Factory::SeqStreamI>, L<Bio::Seq> + +=cut + +sub next_seq { + my ($self, $seq) = @_; + $self->throw("Sorry, you cannot read from a generic Bio::SeqIO object."); +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + +=cut + +sub write_seq { + my ($self, $seq) = @_; + $self->throw("Sorry, you cannot write to a generic Bio::SeqIO object."); +} + + +=head2 alphabet + + Title : alphabet + Usage : $self->alphabet($newval) + Function: Set/get the molecule type for the Seq objects to be created. + Example : $seqio->alphabet('protein') + Returns : value of alphabet: 'dna', 'rna', or 'protein' + Args : newvalue (optional) + Throws : Exception if the argument is not one of 'dna', 'rna', or 'protein' + +=cut + +sub alphabet { + my ($self, $value) = @_; + + if ( defined $value) { + $value = lc $value; + unless ($valid_alphabet_cache{$value}) { + # instead of hard-coding the allowed values once more, we check by + # creating a dummy sequence object + eval { + require Bio::PrimarySeq; + my $seq = Bio::PrimarySeq->new('-verbose' => $self->verbose, + '-alphabet' => $value); + + }; + if ($@) { + $self->throw("Invalid alphabet: $value\n. See Bio::PrimarySeq for allowed values."); + } + $valid_alphabet_cache{$value} = 1; + } + $self->{'alphabet'} = $value; + } + return $self->{'alphabet'}; +} + +=head2 _load_format_module + + Title : _load_format_module + Usage : *INTERNAL SeqIO stuff* + Function: Loads up (like use) a module at run time on demand + Example : + Returns : + Args : + +=cut + +sub _load_format_module { + my ($self, $format) = @_; + my $module = "Bio::SeqIO::" . $format; + my $ok; + + eval { + $ok = $self->_load_module($module); + }; + if ( $@ ) { + print STDERR <<END; +$self: $format cannot be found +Exception $@ +For more information about the SeqIO system please see the SeqIO docs. +This includes ways of checking for formats at compile time, not run time +END + ; + } + return $ok; +} + +=head2 _concatenate_lines + + Title : _concatenate_lines + Usage : $s = _concatenate_lines($line, $continuation_line) + Function: Private. Concatenates two strings assuming that the second stems + from a continuation line of the first. Adds a space between both + unless the first ends with a dash. + + Takes care of either arg being empty. + Example : + Returns : A string. + Args : + +=cut + +sub _concatenate_lines { + my ($self, $s1, $s2) = @_; + + $s1 .= " " if($s1 && ($s1 !~ /-$/) && $s2); + return ($s1 ? $s1 : "") . ($s2 ? $s2 : ""); +} + +=head2 _filehandle + + Title : _filehandle + Usage : $obj->_filehandle($newval) + Function: This method is deprecated. Call _fh() instead. + Example : + Returns : value of _filehandle + Args : newvalue (optional) + + +=cut + +sub _filehandle { + my ($self,@args) = @_; + return $self->_fh(@args); +} + +=head2 _guess_format + + Title : _guess_format + Usage : $obj->_guess_format($filename) + Function: guess format based on file suffix + Example : + Returns : guessed format of filename (lower case) + Args : + Notes : formats that _filehandle() will guess include fasta, + genbank, scf, pir, embl, raw, gcg, ace, bsml, swissprot, + fastq and phd/phred + +=cut + +sub _guess_format { + my $class = shift; + return unless $_ = shift; + return 'fasta' if /\.(fasta|fast|seq|fa|fsa|nt|aa)$/i; + return 'genbank' if /\.(gb|gbank|genbank|gbk|gbs)$/i; + return 'scf' if /\.scf$/i; + return 'scf' if /\.scf$/i; + return 'abi' if /\.abi$/i; + return 'alf' if /\.alf$/i; + return 'ctf' if /\.ctf$/i; + return 'ztr' if /\.ztr$/i; + return 'pln' if /\.pln$/i; + return 'exp' if /\.exp$/i; + return 'pir' if /\.pir$/i; + return 'embl' if /\.(embl|ebl|emb|dat)$/i; + return 'raw' if /\.(txt)$/i; + return 'gcg' if /\.gcg$/i; + return 'ace' if /\.ace$/i; + return 'bsml' if /\.(bsm|bsml)$/i; + return 'swiss' if /\.(swiss|sp)$/i; + return 'phd' if /\.(phd|phred)$/i; + return 'fastq' if /\.fastq$/i; +} + +sub DESTROY { + my $self = shift; + + $self->close(); +} + +sub TIEHANDLE { + my ($class,$val) = @_; + return bless {'seqio' => $val}, $class; +} + +sub READLINE { + my $self = shift; + return $self->{'seqio'}->next_seq() unless wantarray; + my (@list, $obj); + push @list, $obj while $obj = $self->{'seqio'}->next_seq(); + return @list; +} + +sub PRINT { + my $self = shift; + $self->{'seqio'}->write_seq(@_); +} + +=head2 sequence_factory + + Title : sequence_factory + Usage : $seqio->sequence_factory($seqfactory) + Function: Get/Set the Bio::Factory::SequenceFactoryI + Returns : Bio::Factory::SequenceFactoryI + Args : [optional] Bio::Factory::SequenceFactoryI + + +=cut + +sub sequence_factory{ + my ($self,$obj) = @_; + if( defined $obj ) { + if( ! ref($obj) || ! $obj->isa('Bio::Factory::SequenceFactoryI') ) { + $self->throw("Must provide a valid Bio::Factory::SequenceFactoryI object to ".ref($self)."::sequence_factory()"); + } + $self->{'_seqio_seqfactory'} = $obj; + my $builder = $self->sequence_builder(); + if($builder && $builder->can('sequence_factory') && + (! $builder->sequence_factory())) { + $builder->sequence_factory($obj); + } + } + $self->{'_seqio_seqfactory'}; +} + +=head2 object_factory + + Title : object_factory + Usage : $obj->object_factory($newval) + Function: This is an alias to sequence_factory with a more generic name. + Example : + Returns : value of object_factory (a scalar) + Args : on set, new value (a scalar or undef, optional) + + +=cut + +sub object_factory{ + return shift->sequence_factory(@_); +} + +=head2 sequence_builder + + Title : sequence_builder + Usage : $seqio->sequence_builder($seqfactory) + Function: Get/Set the L<Bio::Factory::ObjectBuilderI> used to build sequence + objects. + + If you do not set the sequence object builder yourself, it + will in fact be an instance of L<Bio::Seq::SeqBuilder>, and + you may use all methods documented there to configure it. + + Returns : a L<Bio::Factory::ObjectBuilderI> compliant object + Args : [optional] a L<Bio::Factory::ObjectBuilderI> compliant object + + +=cut + +sub sequence_builder{ + my ($self,$obj) = @_; + if( defined $obj ) { + if( ! ref($obj) || ! $obj->isa('Bio::Factory::ObjectBuilderI') ) { + $self->throw("Must provide a valid Bio::Factory::ObjectBuilderI object to ".ref($self)."::sequence_builder()"); + } + $self->{'_object_builder'} = $obj; + } + $self->{'_object_builder'}; +} + +=head2 location_factory + + Title : location_factory + Usage : $seqio->location_factory($locfactory) + Function: Get/Set the Bio::Factory::LocationFactoryI object to be used for + location string parsing + Returns : a L<Bio::Factory::LocationFactoryI> implementing object + Args : [optional] on set, a L<Bio::Factory::LocationFactoryI> implementing + object. + + +=cut + +sub location_factory{ + my ($self,$obj) = @_; + if( defined $obj ) { + if( ! ref($obj) || ! $obj->isa('Bio::Factory::LocationFactoryI') ) { + $self->throw("Must provide a valid Bio::Factory::LocationFactoryI". + " object to ".ref($self)."->location_factory()"); + } + $self->{'_seqio_locfactory'} = $obj; + } + $self->{'_seqio_locfactory'}; +} + +1; +