Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Index/Fasta.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
#
# $Id: Fasta.pm,v 1.20 2002/10/22 07:38:33 lapp Exp $
#
# BioPerl module for Bio::Index::Fasta
#
# Cared for by James Gilbert <jgrg@sanger.ac.uk>
#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::Index::Fasta - Interface for indexing (multiple) fasta files

=head1 SYNOPSIS

    # Complete code for making an index for several
    # fasta files
    use Bio::Index::Fasta;
    use strict;

    my $Index_File_Name = shift;
    my $inx = Bio::Index::Fasta->new(
        '-filename'   => $Index_File_Name,
        '-write_flag' => 1);
    $inx->make_index(@ARGV);

    # Print out several sequences present in the index
    # in Fasta format
    use Bio::Index::Fasta;
    use Bio::SeqIO;
    use strict;

    my $Index_File_Name = shift;
    my $inx = Bio::Index::Fasta->new('-filename' => $Index_File_Name);
    my $out = Bio::SeqIO->new('-format' => 'Fasta','-fh' => \*STDOUT);

    foreach my $id (@ARGV) {
        my $seq = $inx->fetch($id); # Returns Bio::Seq object
        $out->write_seq($seq);
    }

    # or, alternatively

    my $seq = $inx->get_Seq_by_id($id); #identical to fetch

=head1 DESCRIPTION

Inherits functions for managing dbm files from Bio::Index::Abstract.pm,
and provides the basic functionality for indexing fasta files, and
retrieving the sequence from them. Note: for best results 'use strict'.

Bio::Index::Fasta supports the Bio::DB::BioSeqI interface, meaning
it can be used as a sequence database for other parts of bioperl

=head1 FEEDBACK

=head2 Mailing Lists

User feedback is an integral part of the evolution of this and other
Bioperl modules. Send your comments and suggestions preferably to one
of the Bioperl mailing lists. Your participation is much appreciated.

  bioperl-l@bioperl.org             - General discussion
  http://bioperl.org/MailList.shtml - About the mailing lists

=head2 Reporting Bugs

Report bugs to the Bioperl bug tracking system to help us keep track
of the bugs and their resolution. Bug reports can be submitted via
email or the web:

  bioperl-bugs@bio.perl.org
  http://bugzilla.bioperl.org/

=head1 AUTHOR - James Gilbert

Email - jgrg@sanger.ac.uk

=head1 APPENDIX

The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _

=cut


# Let the code begin...


package Bio::Index::Fasta;

use vars qw($VERSION @ISA);
use strict;
use warnings;

use Bio::Index::AbstractSeq;
use Bio::Seq;

@ISA = qw(Bio::Index::AbstractSeq);

#
# Suggested fix by Michael G Schwern <schwern@pobox.com> to
# get around a clash with CPAN shell...
#

BEGIN {
    $VERSION = 0.2;
}

# Returns the version number of this package ($VERSION).
sub _version {
    return $VERSION;
}

=head2 _file_format

  Title   : _file_format
  Function: The file format for this package, which is needed
            by the SeqIO system when reading the sequence.
  Returns : 'Fasta'

=cut

sub _file_format {
    return 'Fasta';
}



=head2 _index_file

  Title   : _index_file
  Usage   : $index->_index_file( $file_name, $i )
  Function: Specialist function to index FASTA format files.
            Is provided with a filename and an integer
            by make_index in its SUPER class.
            For every line starting with '>' the id parser
            (see id_parser) is applied to the line, and
            add_record( $id, $i, $begin ) is called for each
            id it returns, where $begin is the offset of the
            first character after the '>'.
            Calls $self->throw if the file cannot be opened.
  Example :
  Returns : 1
  Args    : file name to index,
            index-number of the file being indexed

=cut

sub _index_file {
    my( $self,
        $file,  # File name
        $i,     # Index-number of file being indexed
        ) = @_;

    my $id_parser = $self->id_parser;

    # Three-argument open with an explicit read mode: the file name is
    # always taken literally, so names with a leading '>' / '<', a
    # trailing '|' or surrounding whitespace can neither change the
    # open mode nor run a command. The lexical handle avoids clobbering
    # a global FASTA handle.
    open my $fasta_fh, '<', $file
        or $self->throw("Can't open file for read : $file ($!)");

    # Main indexing loop
    while (<$fasta_fh>) {
        if (/^>/) {
            # tell() is the offset just past the line that was read;
            # subtracting the line length gives the offset of the '>',
            # so $begin is the position of the first character after
            # the '>'
            my $begin = tell($fasta_fh) - length( $_ ) + 1;

            # The parser may return several ids for one header line;
            # each of them is indexed against the same offset.
            foreach my $id ($id_parser->($_)) {
                $self->add_record($id, $i, $begin);
            }
        }
    }

    close $fasta_fh;
    return 1;
}

=head2 id_parser

  Title   : id_parser
  Usage   : $index->id_parser( CODE )
  Function: Stores or returns the code used by record_id to
            parse the ID for record from a string.  Useful
            for (for instance) specifying a different
            parser for different flavours of FASTA file.
            Returns \&default_id_parser (see below) if not
            set. If you supply your own id_parser
            subroutine, then it should expect a fasta
            description line.  An entry will be added to
            the index for each string in the list returned.
  Example : $index->id_parser( \&my_id_parser )
  Returns : ref to CODE (the stored parser, or
            \&default_id_parser if none has been stored)
  Args    : CODE (optional)

=cut

sub id_parser {
    my( $self, $code ) = @_;

    if ($code) {
        $self->{'_id_parser'} = $code;
    }
    return $self->{'_id_parser'} || \&default_id_parser;
}



=head2 default_id_parser

  Title   : default_id_parser
  Usage   : $id = default_id_parser( $header )
  Function: The default Fasta ID parser for Fasta.pm
            Returns $1 from applying the regexp /^>\s*(\S+)/
            to $header.
  Returns : ID string, or nothing (an empty list in list
            context) if $header does not match
  Args    : a fasta header line string

=cut

sub default_id_parser {
    if ($_[0] =~ /^>\s*(\S+)/) {
        return $1;
    } else {
        # Bare return: yields no ids when called in list context.
        return;
    }
}

1;