diff variant_effect_predictor/Bio/DB/Query/GenBank.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/DB/Query/GenBank.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,286 @@
+# $Id: GenBank.pm,v 1.4.2.1 2003/09/09 21:28:52 lstein Exp $
+#
+# BioPerl module for Bio::DB::Query::GenBank.pm
+#
+# Cared for by Lincoln Stein <lstein@cshl.org>
+#
+# Copyright Lincoln Stein
+#
+# You may distribute this module under the same terms as perl itself
+#
+# POD documentation - main docs before the code
+#
+
+=head1 NAME
+
+Bio::DB::Query::GenBank - Build a GenBank Entrez Query
+
+=head1 SYNOPSIS
+
+   my $query_string = 'Oryza[Organism] AND EST[Keyword]';
+   my $query = Bio::DB::Query::GenBank->new(-db=>'nucleotide',
+                                            -query=>$query_string,
+					    -mindate => '2001',
+					    -maxdate => '2002');
+   my $count = $query->count;
+   my @ids   = $query->ids;
+
+   # get a genbank database handle
+   my $gb = new Bio::DB::GenBank;
+   my $stream = $gb->get_Stream_by_query($query);
+   while (my $seq = $stream->next_seq) {
+      ...
+   }
+
+   # initialize the list yourself
+   my $query = Bio::DB::Query::GenBank->new(-ids=>[195052,2981014,11127914]);
+
+
+=head1 DESCRIPTION
+
+This class encapsulates NCBI Entrez queries.  It can be used to store
+a list of GI numbers, to translate an Entrez query expression into a
+list of GI numbers, or to count the number of terms that would be
+returned by a query.  Once created, the query object can be passed to
+a Bio::DB::GenBank object in order to retrieve the entries
+corresponding to the query.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the
+evolution of this and other Bioperl modules. Send
+your comments and suggestions preferably to one
+of the Bioperl mailing lists. Your participation
+is much appreciated.
+
+  bioperl-l@bioperl.org              - General discussion
+  http://bioperl.org/MailList.shtml  - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to
+help us keep track the bugs and their resolution.
+Bug reports can be submitted via email or the
+web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Lincoln Stein
+
+Email lstein@cshl.org
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the
+object methods. Internal methods are usually
+preceded with a _
+
+=cut
+
+# Let the code begin...
+
+package Bio::DB::Query::GenBank;
+use strict;
+use Bio::DB::Query::WebQuery;
+use URI::Escape 'uri_unescape';
+
+use constant EPOST               => 'http://www.ncbi.nih.gov/entrez/eutils/epost.fcgi';
+use constant ESEARCH             => 'http://www.ncbi.nih.gov/entrez/eutils/esearch.fcgi';
+use constant DEFAULT_DB          => 'protein';
+use constant MAXENTRY            => 100;
+
+use vars qw(@ISA @ATTRIBUTES $VERSION);
+
+@ISA     = 'Bio::DB::Query::WebQuery';
+$VERSION = '0.2';
+
+BEGIN {
+  @ATTRIBUTES = qw(db reldate mindate maxdate datetype);
+  for my $method (@ATTRIBUTES) {
+    eval <<END;
+sub $method {
+   my \$self = shift;
+   my \$d    = \$self->{'_$method'};
+   \$self->{'_$method'} = shift if \@_;
+   \$d;
+}
+END
+  }
+}
+
+=head2 new
+
+ Title   : new
+ Usage   : $db = Bio::DB::Query::GenBank->new(@args)
+ Function: create new query object
+ Returns : new query object
+ Args    : -db       database ('protein' or 'nucleotide')
+           -query    query string
+           -mindate  minimum date to retrieve from
+           -maxdate  maximum date to retrieve from
+           -reldate  relative date to retrieve from (days)
+           -datetype date field to use ('edat' or 'mdat')
+           -ids      array ref of gids (overrides query)
+
+This method creates a new query object.  Typically you will specify a
+-db and a -query argument, possibly modified by -mindate, -maxdate, or
+-reldate.  -mindate and -maxdate specify minimum and maximum dates for
+entries you are interested in retrieving, expressed in the form
+DD/MM/YYYY.  -reldate is used to fetch entries that are more recent
+than the indicated number of days.
+
+If you provide an array reference of IDs in -ids, the query will be
+ignored and the list of IDs will be used when the query is passed to a
+Bio::DB::GenBank object's get_Stream_by_query() method.  A variety of
+IDs are automatically recognized, including GI numbers, Accession
+numbers, Accession.version numbers and locus names.
+
+=cut
+
+sub new {
+  my $class = shift;
+  my $self  = $class->SUPER::new(@_);
+  my ($db,$reldate,$mindate,$maxdate,$datetype,$ids)
+    = $self->_rearrange([qw(DB RELDATE MINDATE MAXDATE DATETYPE IDS)],@_);
+  $self->db($db || DEFAULT_DB);
+  $reldate  && $self->reldate($reldate);
+  $mindate  && $self->mindate($mindate);
+  $maxdate  && $self->maxdate($maxdate);
+  $datetype ||= 'mdat';
+  $datetype && $self->datetype($datetype);
+  $self;
+}
+
+=head2 cookie
+
+ Title   : cookie
+ Usage   : ($cookie,$querynum) = $db->cookie
+ Function: return the NCBI query cookie
+ Returns : list of (cookie,querynum)
+ Args    : none
+
+NOTE: this information is used by Bio::DB::GenBank in
+conjunction with efetch.
+
+=cut
+
+sub cookie {
+  my $self = shift;
+  if (@_) {
+    $self->{'_cookie'}   = shift;
+    $self->{'_querynum'} = shift;
+  }
+
+  else {
+    $self->_run_query;
+    @{$self}{qw(_cookie _querynum)};
+  }
+}
+
+=head2 _request_parameters
+
+ Title   : _request_parameters
+ Usage   : ($method,$base,@params = $db->_request_parameters
+ Function: return information needed to construct the request
+ Returns : list of method, url base and key=>value pairs
+ Args    : none
+
+=cut
+
+sub _request_parameters {
+  my $self = shift;
+  my ($method,$base);
+  my @params = map {eval("\$self->$_") ? ($_ => eval("\$self->$_")) : () } @ATTRIBUTES;
+  push @params,('usehistory'=>'y','tool'=>'bioperl');
+  $method = 'get';
+  $base   = ESEARCH;
+  push @params,('term'   => $self->query);
+  push @params,('retmax' => $self->{'_count'} || MAXENTRY);
+  ($method,$base,@params);
+}
+
+
+=head2 count
+
+ Title   : count
+ Usage   : $count = $db->count;
+ Function: return count of number of entries retrieved by query
+ Returns : integer
+ Args    : none
+
+Returns the number of entries that are matched by the query.
+
+=cut
+
+sub count   {
+  my $self = shift;
+  if (@_) {
+    my $d = $self->{'_count'};
+    $self->{'_count'}   = shift;
+    return $d;
+  }
+  else {
+    $self->_run_query;
+    return $self->{'_count'};
+  }
+}
+
+=head2 ids
+
+ Title   : ids
+ Usage   : @ids = $db->ids([@ids])
+ Function: get/set matching ids
+ Returns : array of sequence ids
+ Args    : (optional) array ref with new set of ids
+
+=cut
+
+=head2 query
+
+ Title   : query
+ Usage   : $query = $db->query([$query])
+ Function: get/set query string
+ Returns : string
+ Args    : (optional) new query string
+
+=cut
+
+=head2 _parse_response
+
+ Title   : _parse_response
+ Usage   : $db->_parse_response($content)
+ Function: parse out response
+ Returns : empty
+ Args    : none
+ Throws  : 'unparseable output exception'
+
+=cut
+
+sub _parse_response {
+  my $self    = shift;
+  my $content = shift;
+  if (my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s) {
+    warn "Warning(s) from GenBank: $warning\n";
+  }
+  if (my ($error) = $content =~ /<OutputMessage>([^<]+)/) {
+    $self->throw("Error from Genbank: $error");
+  }
+
+  my ($count) = $content =~  /<Count>(\d+)/;
+  my ($max)   = $content =~  /<RetMax>(\d+)/;
+  my $truncated = $count > $max;
+  $self->count($count);
+  if (!$truncated) {
+    my @ids = $content =~ /<Id>(\d+)/g;
+    $self->ids(\@ids);
+  }
+  $self->_truncated($truncated);
+  my ($cookie)    = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
+  my ($querykey)  = $content =~ m!<QueryKey>(\d+)!;
+  $self->cookie(uri_unescape($cookie),$querykey);
+}
+
+1;