0
|
1 # $Id: NCBIHelper.pm,v 1.24.2.2 2003/06/12 09:29:38 heikki Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::DB::NCBIHelper
|
|
4 #
|
|
5 # Cared for by Jason Stajich
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 #
|
|
11 # POD documentation - main docs before the code
|
|
12 #
|
|
13 # Interfaces with new WebDBSeqI interface
|
|
14
|
|
15 =head1 NAME
|
|
16
|
|
17 Bio::DB::NCBIHelper - A collection of routines useful for queries to
|
|
18 NCBI databases.
|
|
19
|
|
20 =head1 SYNOPSIS
|
|
21
|
|
22 #Do not use this module directly.
|
|
23
|
|
24 # get a Bio::DB::NCBIHelper object somehow
|
|
25 my $seqio = $db->get_Stream_by_acc(['MUSIGHBA1']);
|
|
26 foreach my $seq ( $seqio->next_seq ) {
|
|
27 # process seq
|
|
28 }
|
|
29
|
|
30 =head1 DESCRIPTION
|
|
31
|
|
32 Provides a single place to setup some common methods for querying NCBI
|
|
33 web databases. This module just centralizes the methods for
|
|
34 constructing a URL for querying NCBI GenBank and NCBI GenPept and the
|
|
35 common HTML stripping done in L<postprocess_data>().
|
|
36
|
|
37 The base NCBI query URL used is
|
|
38 http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi.
|
|
39
|
|
40 =head1 FEEDBACK
|
|
41
|
|
42 =head2 Mailing Lists
|
|
43
|
|
44 User feedback is an integral part of the
|
|
45 evolution of this and other Bioperl modules. Send
|
|
46 your comments and suggestions preferably to one
|
|
47 of the Bioperl mailing lists. Your participation
|
|
48 is much appreciated.
|
|
49
|
|
50 bioperl-l@bioperl.org - General discussion
|
|
51 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
52
|
|
53 =head2 Reporting Bugs
|
|
54
|
|
55 Report bugs to the Bioperl bug tracking system to
|
|
56 help us keep track the bugs and their resolution.
|
|
57 Bug reports can be submitted via email or the
|
|
58 web:
|
|
59
|
|
60 bioperl-bugs@bio.perl.org
|
|
61 http://bugzilla.bioperl.org/
|
|
62
|
|
63 =head1 AUTHOR - Jason Stajich
|
|
64
|
|
65 Email jason@bioperl.org
|
|
66
|
|
67 =head1 APPENDIX
|
|
68
|
|
69 The rest of the documentation details each of the
|
|
70 object methods. Internal methods are usually
|
|
71 preceded with a _
|
|
72
|
|
73 =cut
|
|
74
|
|
75 # Let the code begin...
|
|
76
|
|
77 package Bio::DB::NCBIHelper;
|
|
78 use strict;
|
|
79 use vars qw(@ISA $HOSTBASE %CGILOCATION %FORMATMAP
|
|
80 $DEFAULTFORMAT $MAX_ENTRIES $VERSION);
|
|
81
|
|
82 use Bio::DB::WebDBSeqI;
|
|
83 use Bio::DB::Query::GenBank;
|
|
84 use HTTP::Request::Common;
|
|
85 use URI;
|
|
86 use Bio::Root::IO;
|
|
87 use Bio::DB::RefSeq;
|
|
88 use Bio::Root::Root;
|
|
89
|
|
90 @ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root);
|
|
91 $VERSION = '0.8';
|
|
92
|
|
93 BEGIN {
|
|
94 $MAX_ENTRIES = 19000;
|
|
95 $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov';
|
|
96 %CGILOCATION = (
|
|
97 'batch' => ['post' => '/entrez/eutils/efetch.fcgi'],
|
|
98 'query' => ['get' => '/entrez/eutils/efetch.fcgi'],
|
|
99 'single' => ['get' => '/entrez/eutils/efetch.fcgi'],
|
|
100 'version'=> ['get' => '/entrez/eutils/efetch.fcgi'],
|
|
101 'gi' => ['get' => '/entrez/eutils/efetch.fcgi'],
|
|
102 );
|
|
103
|
|
104 %FORMATMAP = ( 'gb' => 'genbank',
|
|
105 'gp' => 'genbank',
|
|
106 'fasta' => 'fasta',
|
|
107 );
|
|
108
|
|
109 $DEFAULTFORMAT = 'gb';
|
|
110 }
|
|
111
|
|
112 # the new way to make modules a little more lightweight
|
|
113
|
|
114 sub new {
|
|
115 my ($class, @args ) = @_;
|
|
116 my $self = $class->SUPER::new(@args);
|
|
117 return $self;
|
|
118 }
|
|
119
|
|
120
|
|
121 =head2 get_params
|
|
122
|
|
123 Title : get_params
|
|
124 Usage : my %params = $self->get_params($mode)
|
|
125 Function: Returns key,value pairs to be passed to NCBI database
|
|
126 for either 'batch' or 'single' sequence retrieval method
|
|
127 Returns : a key,value pair hash
|
|
128 Args : 'single' or 'batch' mode for retrieval
|
|
129
|
|
130 =cut
|
|
131
|
|
132 sub get_params {
|
|
133 my ($self, $mode) = @_;
|
|
134 $self->throw("subclass did not implement get_params");
|
|
135 }
|
|
136
|
|
137 =head2 default_format
|
|
138
|
|
139 Title : default_format
|
|
140 Usage : my $format = $self->default_format
|
|
141 Function: Returns default sequence format for this module
|
|
142 Returns : string
|
|
143 Args : none
|
|
144
|
|
145 =cut
|
|
146
|
|
147 sub default_format {
|
|
148 return $DEFAULTFORMAT;
|
|
149 }
|
|
150
|
|
151 =head2 get_request
|
|
152
|
|
153 Title : get_request
|
|
154 Usage : my $url = $self->get_request
|
|
155 Function: HTTP::Request
|
|
156 Returns :
|
|
157 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
|
|
158
|
|
159 =cut
|
|
160
|
|
161 sub get_request {
|
|
162 my ($self, @qualifiers) = @_;
|
|
163 my ($mode, $uids, $format, $query) = $self->_rearrange([qw(MODE UIDS
|
|
164 FORMAT QUERY)],
|
|
165 @qualifiers);
|
|
166
|
|
167 $mode = lc $mode;
|
|
168 ($format) = $self->request_format() unless ( defined $format);
|
|
169 if( !defined $mode || $mode eq '' ) { $mode = 'single'; }
|
|
170 my %params = $self->get_params($mode);
|
|
171 if( ! %params ) {
|
|
172 $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'")
|
|
173 }
|
|
174 my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]);
|
|
175
|
|
176 unless( defined $uids or defined $query) {
|
|
177 $self->throw("Must specify a query or list of uids to fetch");
|
|
178 }
|
|
179
|
|
180 if ($uids) {
|
|
181 if( ref($uids) =~ /array/i ) {
|
|
182 $uids = join(",", @$uids);
|
|
183 }
|
|
184 $params{'id'} = $uids;
|
|
185 }
|
|
186
|
|
187 elsif ($query && $query->can('cookie')) {
|
|
188 @params{'WebEnv','query_key'} = $query->cookie;
|
|
189 $params{'db'} = $query->db;
|
|
190 }
|
|
191
|
|
192 elsif ($query) {
|
|
193 $params{'id'} = join ',',$query->ids;
|
|
194 }
|
|
195
|
|
196 $params{'rettype'} = $format;
|
|
197 if ($CGILOCATION{$mode}[0] eq 'post') {
|
|
198 return POST $url,[%params];
|
|
199 } else {
|
|
200 $url->query_form(%params);
|
|
201 $self->debug("url is $url \n");
|
|
202 return GET $url;
|
|
203 }
|
|
204 }
|
|
205
|
|
206 =head2 get_Stream_by_batch
|
|
207
|
|
208 Title : get_Stream_by_batch
|
|
209 Usage : $seq = $db->get_Stream_by_batch($ref);
|
|
210 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
|
|
211 at a time. For large numbers of sequences, this is far superior
|
|
212 than get_Stream_by_[id/acc]().
|
|
213 Example :
|
|
214 Returns : a Bio::SeqIO stream object
|
|
215 Args : $ref : either an array reference, a filename, or a filehandle
|
|
216 from which to get the list of unique ids/accession numbers.
|
|
217
|
|
218 NOTE: deprecated API. Use get_Stream_by_id() instead.
|
|
219
|
|
220 =cut
|
|
221
|
|
222 *get_Stream_by_batch = sub {
|
|
223 my $self = shift;
|
|
224 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
|
|
225 $self->get_Stream_by_id(@_)
|
|
226 };
|
|
227
|
|
228 =head2 get_Stream_by_query
|
|
229
|
|
230 Title : get_Stream_by_query
|
|
231 Usage : $seq = $db->get_Stream_by_query($query);
|
|
232 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
|
|
233 at a time. For large numbers of sequences, this is far superior
|
|
234 than get_Stream_by_[id/acc]().
|
|
235 Example :
|
|
236 Returns : a Bio::SeqIO stream object
|
|
237 Args : $query : An Entrez query string or a
|
|
238 Bio::DB::Query::GenBank object. It is suggested that you
|
|
239 create a Bio::DB::Query::GenBank object and get the entry
|
|
240 count before you fetch a potentially large stream.
|
|
241
|
|
242 =cut
|
|
243
|
|
244 sub get_Stream_by_query {
|
|
245 my ($self, $query) = @_;
|
|
246 unless (ref $query && $query->can('query')) {
|
|
247 $query = Bio::DB::Query::GenBank->new($query);
|
|
248 }
|
|
249 return $self->get_seq_stream('-query' => $query, '-mode'=>'query');
|
|
250 }
|
|
251
|
|
252 =head2 postprocess_data
|
|
253
|
|
254 Title : postprocess_data
|
|
255 Usage : $self->postprocess_data ( 'type' => 'string',
|
|
256 'location' => \$datastr);
|
|
257 Function: process downloaded data before loading into a Bio::SeqIO
|
|
258 Returns : void
|
|
259 Args : hash with two keys - 'type' can be 'string' or 'file'
|
|
260 - 'location' either file location or string
|
|
261 reference containing data
|
|
262
|
|
263 =cut
|
|
264
|
|
265 # the default method, works for genbank/genpept, other classes should
|
|
266 # override it with their own method.
|
|
267
|
|
268 sub postprocess_data {
|
|
269 my ($self, %args) = @_;
|
|
270 my $data;
|
|
271 my $type = uc $args{'type'};
|
|
272 my $location = $args{'location'};
|
|
273 if( !defined $type || $type eq '' || !defined $location) {
|
|
274 return;
|
|
275 } elsif( $type eq 'STRING' ) {
|
|
276 $data = $$location;
|
|
277 } elsif ( $type eq 'FILE' ) {
|
|
278 open(TMP, $location) or $self->throw("could not open file $location");
|
|
279 my @in = <TMP>;
|
|
280 close TMP;
|
|
281 $data = join("", @in);
|
|
282 }
|
|
283
|
|
284 # transform links to appropriate descriptions
|
|
285 if ($data =~ /\nCONTIG\s+/) {
|
|
286 $self->warn("CONTIG found. GenBank get_Stream_by_acc about to run.");
|
|
287 my(@batch,@accession,%accessions,@location,$id,
|
|
288 $contig,$stream,$aCount,$cCount,$gCount,$tCount);
|
|
289
|
|
290 # process GenBank CONTIG join(...) into two arrays
|
|
291 $data =~ /(?:CONTIG\s+join\()((?:.+\n)+)(?:\/\/)/;
|
|
292 $contig = $1;
|
|
293 $contig =~ s/\n|\)//g;
|
|
294 foreach (split /\s*,\s*/,$contig){
|
|
295 if (/>(.+)<.+>:(.+)/) {
|
|
296 ($id) = split /\./, $1;
|
|
297 push @accession, $id;
|
|
298 push @location, $2;
|
|
299 $accessions{$id}->{'count'}++;
|
|
300 } elsif( /([\w\.]+):(.+)/ ) {
|
|
301 ($id) = split /\./, $1;
|
|
302 $accessions{$id}->{'count'}++;
|
|
303 push @accession, $id;
|
|
304 push @location, $2;
|
|
305 }
|
|
306 }
|
|
307
|
|
308 # grab multiple sequences by batch and join based location variable
|
|
309 my @unique_accessions = keys %accessions;
|
|
310 $stream = $self->get_Stream_by_acc(\@unique_accessions);
|
|
311 $contig = "";
|
|
312 my $ct = 0;
|
|
313 while( my $seq = $stream->next_seq() ) {
|
|
314 if( $seq->accession_number !~ /$unique_accessions[$ct]/ ) {
|
|
315 printf STDERR "warning, %s does not match %s\n",
|
|
316 $seq->accession_number, $unique_accessions[$ct];
|
|
317 }
|
|
318 $accessions{$unique_accessions[$ct]}->{'seq'} = $seq;
|
|
319 $ct++;
|
|
320 }
|
|
321 for (my $i = 0; $i < @accession; $i++) {
|
|
322 my $seq = $accessions{$accession[$i]}->{'seq'};
|
|
323 unless( defined $seq ) {
|
|
324 # seq not cached, get next sequence
|
|
325 $self->warn("unable to find sequence $accession[$i]\n");
|
|
326 return undef;
|
|
327 }
|
|
328 my($start,$end) = split(/\.\./, $location[$i]);
|
|
329 $contig .= $seq->subseq($start,$end-$start);
|
|
330 }
|
|
331
|
|
332 # count number of each letter in sequence
|
|
333 $aCount = () = $contig =~ /a/ig;
|
|
334 $cCount = () = $contig =~ /c/ig;
|
|
335 $gCount = () = $contig =~ /g/ig;
|
|
336 $tCount = () = $contig =~ /t/ig;
|
|
337
|
|
338 # remove everything after and including CONTIG
|
|
339 $data =~ s/(CONTIG[\s\S]+)$//i;
|
|
340
|
|
341 # build ORIGIN part of data file using sequence and counts
|
|
342 $data .= "BASE COUNT $aCount a $cCount c $gCount g $tCount t\n";
|
|
343 $data .= "ORIGIN \n";
|
|
344 $data .= "$contig\n//";
|
|
345 }
|
|
346 else {
|
|
347 $data =~ s/<a\s+href\s*=.+>\s*(\S+)\s*<\s*\/a\s*\>/$1/ig;
|
|
348 }
|
|
349
|
|
350 # fix gt and lt
|
|
351 $data =~ s/>/>/ig;
|
|
352 $data =~ s/</</ig;
|
|
353 if( $type eq 'FILE' ) {
|
|
354 open(TMP, ">$location") or $self->throw("couldn't overwrite file $location");
|
|
355 print TMP $data;
|
|
356 close TMP;
|
|
357 } elsif ( $type eq 'STRING' ) {
|
|
358 ${$args{'location'}} = $data;
|
|
359 }
|
|
360 $self->debug("format is ". join(',',$self->request_format()).
|
|
361 " data is\n$data\n");
|
|
362 }
|
|
363
|
|
364
|
|
365 =head2 request_format
|
|
366
|
|
367 Title : request_format
|
|
368 Usage : my ($req_format, $ioformat) = $self->request_format;
|
|
369 $self->request_format("genbank");
|
|
370 $self->request_format("fasta");
|
|
371 Function: Get/Set sequence format retrieval. The get-form will normally not
|
|
372 be used outside of this and derived modules.
|
|
373 Returns : Array of two strings, the first representing the format for
|
|
374 retrieval, and the second specifying the corresponding SeqIO format.
|
|
375 Args : $format = sequence format
|
|
376
|
|
377 =cut
|
|
378
|
|
379 sub request_format {
|
|
380 my ($self, $value) = @_;
|
|
381 if( defined $value ) {
|
|
382 $value = lc $value;
|
|
383 if( defined $FORMATMAP{$value} ) {
|
|
384 $self->{'_format'} = [ $value, $FORMATMAP{$value}];
|
|
385 } else {
|
|
386 # Try to fall back to a default. Alternatively, we could throw
|
|
387 # an exception
|
|
388 $self->{'_format'} = [ $value, $value ];
|
|
389 }
|
|
390 }
|
|
391 return @{$self->{'_format'}};
|
|
392 }
|
|
393
|
|
394 =head2 Bio::DB::WebDBSeqI methods
|
|
395
|
|
396 Overriding WebDBSeqI method to help newbies to retrieve sequences
|
|
397
|
|
398 =head2 get_Stream_by_acc
|
|
399
|
|
400 Title : get_Stream_by_acc
|
|
401 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
|
|
402 Function: Gets a series of Seq objects by accession numbers
|
|
403 Returns : a Bio::SeqIO stream object
|
|
404 Args : $ref : a reference to an array of accession numbers for
|
|
405 the desired sequence entries
|
|
406 Note : For GenBank, this just calls the same code for get_Stream_by_id()
|
|
407
|
|
408 =cut
|
|
409
|
|
410 sub get_Stream_by_acc {
|
|
411 my ($self, $ids ) = @_;
|
|
412 my $newdb = $self->_check_id($ids);
|
|
413 if (defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq')) {
|
|
414 return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single');
|
|
415 } else {
|
|
416 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
|
|
417 }
|
|
418 }
|
|
419
|
|
420
|
|
421 =head2 _check_id
|
|
422
|
|
423 Title : _check_id
|
|
424 Usage :
|
|
425 Function:
|
|
426 Returns : A Bio::DB::RefSeq reference or throws
|
|
427 Args : $id(s), $string
|
|
428
|
|
429 =cut
|
|
430
|
|
431 sub _check_id {
|
|
432 my ($self, $ids) = @_;
|
|
433
|
|
434 # NT contigs can not be retrieved
|
|
435 $self->throw("NT_ contigs are whole chromosome files which are not part of regular".
|
|
436 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
|
|
437 if $ids =~ /NT_/;
|
|
438
|
|
439 # Asking for a RefSeq from EMBL/GenBank
|
|
440
|
|
441 if ($ids =~ /N._/) {
|
|
442 $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.".
|
|
443 " Redirecting the request.\n")
|
|
444 if $self->verbose >= 0;
|
|
445 return new Bio::DB::RefSeq;
|
|
446 }
|
|
447 }
|
|
448
|
|
449 =head2 delay_policy
|
|
450
|
|
451 Title : delay_policy
|
|
452 Usage : $secs = $self->delay_policy
|
|
453 Function: return number of seconds to delay between calls to remote db
|
|
454 Returns : number of seconds to delay
|
|
455 Args : none
|
|
456
|
|
457 NOTE: NCBI requests a delay of 3s between requests. This method
|
|
458 implements that policy.
|
|
459
|
|
460 =cut
|
|
461
|
|
462 sub delay_policy {
|
|
463 my $self = shift;
|
|
464 return 3;
|
|
465 }
|
|
466
|
|
467 1;
|
|
468 __END__
|