0
|
1 # $Id: GenBank.pm,v 1.4.2.1 2003/09/09 21:28:52 lstein Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::DB::Query::GenBank.pm
|
|
4 #
|
|
5 # Cared for by Lincoln Stein <lstein@cshl.org>
|
|
6 #
|
|
7 # Copyright Lincoln Stein
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 #
|
|
11 # POD documentation - main docs before the code
|
|
12 #
|
|
13
|
|
14 =head1 NAME
|
|
15
|
|
16 Bio::DB::Query::GenBank - Build a GenBank Entrez Query
|
|
17
|
|
18 =head1 SYNOPSIS
|
|
19
|
|
20 my $query_string = 'Oryza[Organism] AND EST[Keyword]';
|
|
21 my $query = Bio::DB::Query::GenBank->new(-db=>'nucleotide',
|
|
22 -query=>$query_string,
|
|
23 -mindate => '2001',
|
|
24 -maxdate => '2002');
|
|
25 my $count = $query->count;
|
|
26 my @ids = $query->ids;
|
|
27
|
|
28 # get a genbank database handle
|
|
29 my $gb = new Bio::DB::GenBank;
|
|
30 my $stream = $gb->get_Stream_by_query($query);
|
|
31 while (my $seq = $stream->next_seq) {
|
|
32 ...
|
|
33 }
|
|
34
|
|
35 # initialize the list yourself
|
|
36 my $query = Bio::DB::Query::GenBank->new(-ids=>[195052,2981014,11127914]);
|
|
37
|
|
38
|
|
39 =head1 DESCRIPTION
|
|
40
|
|
41 This class encapsulates NCBI Entrez queries. It can be used to store
|
|
42 a list of GI numbers, to translate an Entrez query expression into a
|
|
43 list of GI numbers, or to count the number of terms that would be
|
|
44 returned by a query. Once created, the query object can be passed to
|
|
45 a Bio::DB::GenBank object in order to retrieve the entries
|
|
46 corresponding to the query.
|
|
47
|
|
48 =head1 FEEDBACK
|
|
49
|
|
50 =head2 Mailing Lists
|
|
51
|
|
52 User feedback is an integral part of the
|
|
53 evolution of this and other Bioperl modules. Send
|
|
54 your comments and suggestions preferably to one
|
|
55 of the Bioperl mailing lists. Your participation
|
|
56 is much appreciated.
|
|
57
|
|
58 bioperl-l@bioperl.org - General discussion
|
|
59 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
60
|
|
61 =head2 Reporting Bugs
|
|
62
|
|
63 Report bugs to the Bioperl bug tracking system to
|
|
64 help us keep track the bugs and their resolution.
|
|
65 Bug reports can be submitted via email or the
|
|
66 web:
|
|
67
|
|
68 bioperl-bugs@bio.perl.org
|
|
69 http://bugzilla.bioperl.org/
|
|
70
|
|
71 =head1 AUTHOR - Lincoln Stein
|
|
72
|
|
73 Email lstein@cshl.org
|
|
74
|
|
75 =head1 APPENDIX
|
|
76
|
|
77 The rest of the documentation details each of the
|
|
78 object methods. Internal methods are usually
|
|
79 preceded with a _
|
|
80
|
|
81 =cut
|
|
82
|
|
83 # Let the code begin...
|
|
84
|
|
85 package Bio::DB::Query::GenBank;
|
|
86 use strict;
|
|
87 use Bio::DB::Query::WebQuery;
|
|
88 use URI::Escape 'uri_unescape';
|
|
89
|
|
90 use constant EPOST => 'http://www.ncbi.nih.gov/entrez/eutils/epost.fcgi';
|
|
91 use constant ESEARCH => 'http://www.ncbi.nih.gov/entrez/eutils/esearch.fcgi';
|
|
92 use constant DEFAULT_DB => 'protein';
|
|
93 use constant MAXENTRY => 100;
|
|
94
|
|
95 use vars qw(@ISA @ATTRIBUTES $VERSION);
|
|
96
|
|
97 @ISA = 'Bio::DB::Query::WebQuery';
|
|
98 $VERSION = '0.2';
|
|
99
|
|
100 BEGIN {
|
|
101 @ATTRIBUTES = qw(db reldate mindate maxdate datetype);
|
|
102 for my $method (@ATTRIBUTES) {
|
|
103 eval <<END;
|
|
104 sub $method {
|
|
105 my \$self = shift;
|
|
106 my \$d = \$self->{'_$method'};
|
|
107 \$self->{'_$method'} = shift if \@_;
|
|
108 \$d;
|
|
109 }
|
|
110 END
|
|
111 }
|
|
112 }
|
|
113
|
|
114 =head2 new
|
|
115
|
|
116 Title : new
|
|
117 Usage : $db = Bio::DB::Query::GenBank->new(@args)
|
|
118 Function: create new query object
|
|
119 Returns : new query object
|
|
120 Args : -db database ('protein' or 'nucleotide')
|
|
121 -query query string
|
|
122 -mindate minimum date to retrieve from
|
|
123 -maxdate maximum date to retrieve from
|
|
124 -reldate relative date to retrieve from (days)
|
|
125 -datetype date field to use ('edat' or 'mdat')
|
|
126 -ids array ref of gids (overrides query)
|
|
127
|
|
128 This method creates a new query object. Typically you will specify a
|
|
129 -db and a -query argument, possibly modified by -mindate, -maxdate, or
|
|
130 -reldate. -mindate and -maxdate specify minimum and maximum dates for
|
|
131 entries you are interested in retrieving, expressed in the form
|
|
132 DD/MM/YYYY. -reldate is used to fetch entries that are more recent
|
|
133 than the indicated number of days.
|
|
134
|
|
135 If you provide an array reference of IDs in -ids, the query will be
|
|
136 ignored and the list of IDs will be used when the query is passed to a
|
|
137 Bio::DB::GenBank object's get_Stream_by_query() method. A variety of
|
|
138 IDs are automatically recognized, including GI numbers, Accession
|
|
139 numbers, Accession.version numbers and locus names.
|
|
140
|
|
141 =cut
|
|
142
|
|
143 sub new {
|
|
144 my $class = shift;
|
|
145 my $self = $class->SUPER::new(@_);
|
|
146 my ($db,$reldate,$mindate,$maxdate,$datetype,$ids)
|
|
147 = $self->_rearrange([qw(DB RELDATE MINDATE MAXDATE DATETYPE IDS)],@_);
|
|
148 $self->db($db || DEFAULT_DB);
|
|
149 $reldate && $self->reldate($reldate);
|
|
150 $mindate && $self->mindate($mindate);
|
|
151 $maxdate && $self->maxdate($maxdate);
|
|
152 $datetype ||= 'mdat';
|
|
153 $datetype && $self->datetype($datetype);
|
|
154 $self;
|
|
155 }
|
|
156
|
|
157 =head2 cookie
|
|
158
|
|
159 Title : cookie
|
|
160 Usage : ($cookie,$querynum) = $db->cookie
|
|
161 Function: return the NCBI query cookie
|
|
162 Returns : list of (cookie,querynum)
|
|
163 Args : none
|
|
164
|
|
165 NOTE: this information is used by Bio::DB::GenBank in
|
|
166 conjunction with efetch.
|
|
167
|
|
168 =cut
|
|
169
|
|
170 sub cookie {
|
|
171 my $self = shift;
|
|
172 if (@_) {
|
|
173 $self->{'_cookie'} = shift;
|
|
174 $self->{'_querynum'} = shift;
|
|
175 }
|
|
176
|
|
177 else {
|
|
178 $self->_run_query;
|
|
179 @{$self}{qw(_cookie _querynum)};
|
|
180 }
|
|
181 }
|
|
182
|
|
183 =head2 _request_parameters
|
|
184
|
|
185 Title : _request_parameters
|
|
186 Usage : ($method,$base,@params = $db->_request_parameters
|
|
187 Function: return information needed to construct the request
|
|
188 Returns : list of method, url base and key=>value pairs
|
|
189 Args : none
|
|
190
|
|
191 =cut
|
|
192
|
|
193 sub _request_parameters {
|
|
194 my $self = shift;
|
|
195 my ($method,$base);
|
|
196 my @params = map {eval("\$self->$_") ? ($_ => eval("\$self->$_")) : () } @ATTRIBUTES;
|
|
197 push @params,('usehistory'=>'y','tool'=>'bioperl');
|
|
198 $method = 'get';
|
|
199 $base = ESEARCH;
|
|
200 push @params,('term' => $self->query);
|
|
201 push @params,('retmax' => $self->{'_count'} || MAXENTRY);
|
|
202 ($method,$base,@params);
|
|
203 }
|
|
204
|
|
205
|
|
206 =head2 count
|
|
207
|
|
208 Title : count
|
|
209 Usage : $count = $db->count;
|
|
210 Function: return count of number of entries retrieved by query
|
|
211 Returns : integer
|
|
212 Args : none
|
|
213
|
|
214 Returns the number of entries that are matched by the query.
|
|
215
|
|
216 =cut
|
|
217
|
|
218 sub count {
|
|
219 my $self = shift;
|
|
220 if (@_) {
|
|
221 my $d = $self->{'_count'};
|
|
222 $self->{'_count'} = shift;
|
|
223 return $d;
|
|
224 }
|
|
225 else {
|
|
226 $self->_run_query;
|
|
227 return $self->{'_count'};
|
|
228 }
|
|
229 }
|
|
230
|
|
231 =head2 ids
|
|
232
|
|
233 Title : ids
|
|
234 Usage : @ids = $db->ids([@ids])
|
|
235 Function: get/set matching ids
|
|
236 Returns : array of sequence ids
|
|
237 Args : (optional) array ref with new set of ids
|
|
238
|
|
239 =cut
|
|
240
|
|
241 =head2 query
|
|
242
|
|
243 Title : query
|
|
244 Usage : $query = $db->query([$query])
|
|
245 Function: get/set query string
|
|
246 Returns : string
|
|
247 Args : (optional) new query string
|
|
248
|
|
249 =cut
|
|
250
|
|
251 =head2 _parse_response
|
|
252
|
|
253 Title : _parse_response
|
|
254 Usage : $db->_parse_response($content)
|
|
255 Function: parse out response
|
|
256 Returns : empty
|
|
257 Args : none
|
|
258 Throws : 'unparseable output exception'
|
|
259
|
|
260 =cut
|
|
261
|
|
262 sub _parse_response {
|
|
263 my $self = shift;
|
|
264 my $content = shift;
|
|
265 if (my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s) {
|
|
266 warn "Warning(s) from GenBank: $warning\n";
|
|
267 }
|
|
268 if (my ($error) = $content =~ /<OutputMessage>([^<]+)/) {
|
|
269 $self->throw("Error from Genbank: $error");
|
|
270 }
|
|
271
|
|
272 my ($count) = $content =~ /<Count>(\d+)/;
|
|
273 my ($max) = $content =~ /<RetMax>(\d+)/;
|
|
274 my $truncated = $count > $max;
|
|
275 $self->count($count);
|
|
276 if (!$truncated) {
|
|
277 my @ids = $content =~ /<Id>(\d+)/g;
|
|
278 $self->ids(\@ids);
|
|
279 }
|
|
280 $self->_truncated($truncated);
|
|
281 my ($cookie) = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
|
|
282 my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
|
|
283 $self->cookie(uri_unescape($cookie),$querykey);
|
|
284 }
|
|
285
|
|
286 1;
|