Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/DB/BioFetch.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/DB/BioFetch.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,516 @@
+# $Id: BioFetch.pm,v 1.13.2.1 2003/06/25 13:44:18 heikki Exp $
+#
+# BioPerl module for Bio::DB::BioFetch
+#
+# Cared for by Lincoln Stein <lstein@cshl.org>
+#
+# Copyright Lincoln Stein
+#
+# You may distribute this module under the same terms as perl itself
+#
+# POD documentation - main docs before the code
+#
+
+package Bio::DB::BioFetch;
+use strict;
+use Bio::DB::WebDBSeqI;
+use HTTP::Request::Common 'POST';
+
+=head1 NAME
+
+Bio::DB::BioFetch - Database object interface to BioFetch retrieval
+
+=head1 SYNOPSIS
+
+ use Bio::DB::BioFetch;
+
+ $bf = new Bio::DB::BioFetch;
+
+ $seq = $sp->get_Seq_by_id('BUM');  # EMBL or SWALL ID
+
+ # change formats, storage procedures
+ $bf = new Bio::DB::BioFetch(-format        => 'fasta',
+ 			     -retrievaltype => 'tempfile',
+  			     -db            => 'EMBL');
+
+ $stream = $bf->get_Stream_by_id(['BUM','J00231']);
+ while (my $s = $stream->next_seq) {
+    print $s->seq,"\n";
+ }
+ # get a RefSeq entry
+ $bf->db('refseq');
+ eval {
+     $seq = $bf->get_Seq_by_version('NM_006732.1'); # RefSeq VERSION
+ };
+ print "accession is ", $seq->accession_number, "\n" unless $@;
+
+
+=head1 DESCRIPTION
+
+Bio::DB::BioFetch is a guaranteed best effort sequence entry fetching
+method.  It goes to the Web-based dbfetch server located at the EBI
+(http://www.ebi.ac.uk/cgi-bin/dbfetch) to retrieve sequences in the
+EMBL or GenBank sequence repositories.
+
+This module implements all the Bio::DB::RandomAccessI interface, plus
+the get_Stream_by_id() and get_Stream_by_acc() methods that are found
+in the Bio::DB::SwissProt interface.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+
+  bioperl-l@bioperl.org                         - General discussion
+  http://bio.perl.org/MailList.html             - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Lincoln Stein
+
+Email Lincoln Stein  E<lt>lstein@cshl.orgE<lt>
+
+Also thanks to Heikki Lehvaslaiho E<lt>heikki@ebi.ac.ukE<gt> for the
+BioFetch server and interface specification.
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+# Let the code begin...
+use vars qw(@ISA $VERSION %FORMATMAP );
+use Bio::Root::Root;
+@ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root);
+$VERSION = '1.0';
+
+# warning: names used here must map into Bio::SeqIO::* space
+use constant DEFAULT_LOCATION => 'http://www.ebi.ac.uk/cgi-bin/dbfetch';
+
+BEGIN {
+
+  %FORMATMAP = (
+		'embl' => {
+			   default => 'embl', # default BioFetch format/SeqIOmodule pair
+			   embl => 'embl',    # alternative BioFetch format/module pair 
+			   fasta => 'fasta',   # alternative BioFetch format/module pair 
+			   namespace => 'embl',
+			  },
+		'swissprot' => {
+				default => 'swiss',
+				swissprot => 'swiss',
+				fasta => 'fasta',
+				namespace => 'swall',
+			       },
+		'refseq' => {
+			     default => 'genbank',
+			     genbank => 'genbank',
+			     fasta => 'fasta',
+			     namespace => 'RefSeq',
+			    },
+		'swall' => {
+			    default => 'swiss',
+			    swissprot => 'swiss',
+			    fasta => 'fasta',
+			    namespace => 'swall',
+			   },
+		'genbank' => {
+			      default => 'genbank',
+			      genbank => 'genbank',
+			      namespace => 'genbank',
+			     },
+		'genpep' => {
+			     default => 'genbank',
+			     genbank => 'genbank',
+			     namespace => 'genpep',
+			    },
+	       );
+}
+
+=head2 new
+
+ Title   : new
+ Usage   : $bf = Bio::DB::BioFetch->new(@args)
+ Function: Construct a new Bio::DB::BioFetch object
+ Returns : a Bio::DB::BioFetch object
+ Args    : see below
+ Throws  :
+
+@args are standard -name=E<gt>value options as listed in the following
+table. If you do not provide any options, the module assumes reasonable
+defaults.
+
+  Option         Value                            Default
+  ------         -----                            -------
+
+  -baseaddress   location of dbfetch server       http://www.ebi.ac.uk/cgi-bin/dbfetch
+  -retrievaltype "tempfile" or "io_string"        io_string
+  -format        "embl", "fasta", "swissprot",    embl
+                  or "genbank"
+  -db            "embl", "genbank" or "swissprot" embl
+
+=cut
+
+#'
+sub new {
+  my ($class,@args) = @_;
+  my $self = $class->SUPER::new(@args);
+  my ($db) = $self->_rearrange([qw(DB)],@args);
+  $db    ||= $self->default_db;
+  $self->db($db);
+  $self->url_base_address(DEFAULT_LOCATION) unless $self->url_base_address;
+  $self;
+}
+
+=head2 new_from_registry
+
+ Title   : new_from_registry
+ Usage   : $biofetch = $db->new_from_registry(%config)
+ Function: Creates a BioFetch object from the registry config hash
+ Returns : itself
+ Args    : A configuration hash (see Registry.pm)
+ Throws  : 
+
+
+=cut
+
+sub new_from_registry {
+    my ($class,%config)=@_;
+
+    my $self = $class->SUPER::new(
+				  -BASEADDRESS=>$config{'location'}
+				  );
+    $self->db($config{'dbname'}) if $config{dbname};
+    return $self;
+}
+
+# from Bio::DB::RandomAccessI
+
+=head2 get_Seq_by_id
+
+ Title   : get_Seq_by_id
+ Usage   : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
+ Function: Gets a Bio::Seq object by its name
+ Returns : a Bio::Seq object
+ Args    : the id (as a string) of a sequence
+ Throws  : "id does not exist" exception
+
+
+=cut
+
+=head2 get_Seq_by_acc
+
+ Title   : get_Seq_by_acc
+ Usage   : $seq = $db->get_Seq_by_acc('X77802');
+ Function: Gets a Bio::Seq object by accession number
+ Returns : A Bio::Seq object
+ Args    : accession number (as a string)
+ Throws  : "acc does not exist" exception
+
+=cut
+
+=head2 get_Seq_by_gi
+
+ Title   : get_Seq_by_gi
+ Usage   : $seq = $db->get_Seq_by_gi('405830');
+ Function: Gets a Bio::Seq object by gi number
+ Returns : A Bio::Seq object
+ Args    : gi number (as a string)
+ Throws  : "gi does not exist" exception
+
+=cut
+
+=head2 get_Seq_by_version
+
+ Title   : get_Seq_by_version
+ Usage   : $seq = $db->get_Seq_by_version('X77802.1');
+ Function: Gets a Bio::Seq object by sequence version
+ Returns : A Bio::Seq object
+ Args    : accession.version (as a string)
+ Throws  : "acc.version does not exist" exception
+
+=cut
+
+sub get_Seq_by_version {
+   my ($self,$seqid) = @_;
+   return $self->get_Seq_by_acc($seqid);
+}
+
+
+=head2 get_Stream_by_id
+
+  Title   : get_Stream_by_id
+  Usage   : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
+  Function: Gets a series of Seq objects by unique identifiers
+  Returns : a Bio::SeqIO stream object
+  Args    : $ref : a reference to an array of unique identifiers for
+                   the desired sequence entries
+
+=cut
+
+=head2 get_Stream_by_gi
+
+  Title   : get_Stream_by_gi
+  Usage   : $seq = $db->get_Seq_by_gi([$gi1, $gi2]);
+  Function: Gets a series of Seq objects by gi numbers
+  Returns : a Bio::SeqIO stream object
+  Args    : $ref : a reference to an array of gi numbers for
+                   the desired sequence entries
+  Note    : For GenBank, this just calls the same code for get_Stream_by_id()
+
+=cut
+
+=head2 get_Stream_by_batch
+
+  Title   : get_Stream_by_batch
+  Usage   : $seq = $db->get_Stream_by_batch($ref);
+  Function: Get a series of Seq objects by their IDs
+  Example :
+  Returns : a Bio::SeqIO stream object
+  Args    : $ref : an array reference containing a list of unique
+            ids/accession numbers.
+
+In some of the Bio::DB::* moduels, get_Stream_by_id() is called
+get_Stream_by_batch().  Since there seems to be no consensus, this
+is provided as an alias.
+
+=cut
+
+*get_Stream_by_batch = \&Bio::DB::WebDBSeqI::get_Stream_by_id;
+
+=head1 The remainder of these methods are for internal use
+
+=head2 get_request
+
+ Title   : get_request
+ Usage   : my $url = $self->get_request
+ Function: returns a HTTP::Request object
+ Returns : 
+ Args    : %qualifiers = a hash of qualifiers (ids, format, etc)
+
+=cut
+
+
+sub get_request {
+    my ($self, @qualifiers) = @_;
+    my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)],
+					    @qualifiers);
+    my $db     = $self->db;
+    my $namespace = $self->_namespace;
+
+    $self->throw("Must specify a value for UIDs to fetch")
+	unless defined $uids;
+    my $tmp;
+    my $format_string = '';
+
+    $format ||= $self->default_format;
+    ($format, $tmp) = $self->request_format($format);
+
+    my $base = $self->url_base_address;
+    my $uid = join('+', ref $uids ? @$uids : $uids);
+    $self->debug("\n$base$format_string&id=$uid\n");
+    return POST($base,
+		[ db     => $namespace,
+		  id     => join('+',ref $uids ? @$uids : $uids),
+		  format => $format,
+		  style  => 'raw'
+	     ]);
+}
+
+=head2 default_format
+
+ Title   : default_format
+ Usage   : $format = $self->default_format
+ Function: return the default format
+ Returns : a string
+ Args    : 
+
+=cut
+
+sub default_format { 
+    return 'default';
+}
+
+=head2 default_db
+
+ Title   : default_db
+ Usage   : $db = $self->default_db
+ Function: return the default database
+ Returns : a string
+ Args    :
+
+=cut
+
+sub default_db     { 'embl' }
+
+=head2 db
+
+ Title   : db
+ Usage   : $db = $self->db([$db])
+ Function: get/set the database
+ Returns : a string
+ Args    : new database
+
+=cut
+
+sub db {
+  my $self = shift;
+
+  if (@_) {
+
+      my $db = lc shift;
+      $FORMATMAP{$db} or $self->throw("invalid db [$db], must be one of [".
+				     join(' ',keys %FORMATMAP).  "]");
+      $self->{_db} = $db;
+  }
+  return $self->{_db} || $self->default_db ;
+}
+
+sub _namespace {
+  my $self = shift;
+  my $db = $self->db;
+  return $FORMATMAP{$db}{namespace} or $db;
+}
+
+=head2 postprocess_data
+
+ Title   : postprocess_data
+ Usage   : $self->postprocess_data ( 'type' => 'string',
+				     'location' => \$datastr);
+ Function: process downloaded data before loading into a Bio::SeqIO
+ Returns : void
+ Args    : hash with two keys - 'type' can be 'string' or 'file'
+                              - 'location' either file location or string 
+                                 reference containing data
+
+=cut
+
+sub postprocess_data {
+  my ($self,%args) = @_;
+
+  # check for errors in the stream
+  if ($args{'type'} eq 'string') {
+    my $stringref = $args{'location'};
+    if ($$stringref =~ /^ERROR (\d+) (.+)/m) {
+      $self->throw("BioFetch Error $1: $2");
+    }
+  }
+
+  elsif ($args{'type'} eq 'file') {
+    open (F,$args{'location'}) or $self->throw("Couldn't open $args{location}: $!");
+    # this is dumb, but the error may be anywhere on the first three lines because the
+    # CGI headers are sometimes printed out by the server...
+    my @data = (scalar <F>,scalar <F>,scalar <F>);
+    if (join('',@data) =~ /^ERROR (\d+) (.+)/m) {
+      $self->throw("BioFetch Error $1: $2");
+    }
+    close F;
+  }
+
+  else {
+    $self->throw("Don't know how to postprocess data of type $args{'type'}");
+  }
+}
+
+
+=head2 request_format
+
+ Title   : request_format
+ Usage   : my ($req_format, $ioformat) = $self->request_format;
+           $self->request_format("genbank");
+           $self->request_format("fasta");
+ Function: Get/Set sequence format retrieval. The get-form will normally not
+           be used outside of this and derived modules.
+ Returns : Array of two strings, the first representing the format for
+           retrieval, and the second specifying the corresponding SeqIO format.
+ Args    : $format = sequence format
+
+=cut
+
+sub request_format {
+    my ($self, $value) = @_;
+    if ( defined $value ) { 
+	my $db = $self->db;
+	my $namespace = $self->_namespace;
+	my $format = lc $value;
+	print "format:", $format, " module:", $FORMATMAP{$db}->{$format}, " ($namespace)\n" 
+	    if $self->verbose > 0; 
+	$self->throw("Invalid format [$format], must be one of [".
+		     join(' ',keys %{$FORMATMAP{$db}}). "]")
+	    unless  $format eq 'default' || $FORMATMAP{$db}->{$format};
+
+	$self->{'_format'} = [ $format, $FORMATMAP{$db}->{$format}];
+    }
+    return @{$self->{'_format'}};
+}
+
+
+=head2 Bio::DB::WebDBSeqI methods
+
+Overriding WebDBSeqI method to help newbies to retrieve sequences.
+EMBL database is all too often passed RefSeq accessions. This
+redirects those calls. See L<Bio::DB::RefSeq>.
+
+
+=head2 get_Stream_by_acc
+
+  Title   : get_Stream_by_acc
+  Usage   : $seq = $db->get_Seq_by_acc([$acc1, $acc2]);
+  Function: Gets a series of Seq objects by accession numbers
+  Returns : a Bio::SeqIO stream object
+  Args    : $ref : a reference to an array of accession numbers for
+                   the desired sequence entries
+
+=cut
+
+sub get_Stream_by_acc {
+    my ($self, $ids ) = @_;
+    $self->_check_id($ids);
+    return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
+}
+
+
+=head2 _check_id
+
+  Title   : _check_id
+  Usage   : 
+  Function: Throw on whole chromosome NCBI sequences not in sequence databases
+            and redirect RefSeq accession requests sent to EMBL.
+  Returns : 
+  Args    : $id(s), $string
+  Throws  : if accessionn number indicates whole chromosome NCBI sequence
+
+=cut
+
+sub _check_id {
+    my ($self, $id) = @_;
+
+    # NT contigs can not be retrieved
+    $self->throw("NT_ contigs are whole chromosome files which are not part of regular".
+		 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.") 
+	if $id =~ /NT_/;
+
+    # Asking for a RefSeq from EMBL/GenBank
+
+    if ($id =~ /N._/ &&  $self->db ne 'refseq') {
+	$self->warn("[$id] is not a normal sequence database but a RefSeq entry.".
+		   " Redirecting the request.\n")
+	    if $self->verbose >= 0;
+	$self->db('RefSeq');
+    }
+}
+
+1;
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children