comparison variant_effect_predictor/Bio/DB/NCBIHelper.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: NCBIHelper.pm,v 1.24.2.2 2003/06/12 09:29:38 heikki Exp $
2 #
3 # BioPerl module for Bio::DB::NCBIHelper
4 #
5 # Cared for by Jason Stajich
6 #
7 # Copyright Jason Stajich
8 #
9 # You may distribute this module under the same terms as perl itself
10 #
11 # POD documentation - main docs before the code
12 #
13 # Interfaces with new WebDBSeqI interface
14
15 =head1 NAME
16
17 Bio::DB::NCBIHelper - A collection of routines useful for queries to
18 NCBI databases.
19
20 =head1 SYNOPSIS
21
22 #Do not use this module directly.
23
24 # get a Bio::DB::NCBIHelper object somehow
25 my $seqio = $db->get_Stream_by_acc(['MUSIGHBA1']);
26 foreach my $seq ( $seqio->next_seq ) {
27 # process seq
28 }
29
30 =head1 DESCRIPTION
31
32 Provides a single place to setup some common methods for querying NCBI
33 web databases. This module just centralizes the methods for
34 constructing a URL for querying NCBI GenBank and NCBI GenPept and the
35 common HTML stripping done in L<postprocess_data>().
36
37 The base NCBI query URL used is
38 http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi.
39
40 =head1 FEEDBACK
41
42 =head2 Mailing Lists
43
44 User feedback is an integral part of the
45 evolution of this and other Bioperl modules. Send
46 your comments and suggestions preferably to one
47 of the Bioperl mailing lists. Your participation
48 is much appreciated.
49
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/MailList.shtml - About the mailing lists
52
53 =head2 Reporting Bugs
54
55 Report bugs to the Bioperl bug tracking system to
56 help us keep track the bugs and their resolution.
57 Bug reports can be submitted via email or the
58 web:
59
60 bioperl-bugs@bio.perl.org
61 http://bugzilla.bioperl.org/
62
63 =head1 AUTHOR - Jason Stajich
64
65 Email jason@bioperl.org
66
67 =head1 APPENDIX
68
69 The rest of the documentation details each of the
70 object methods. Internal methods are usually
71 preceded with a _
72
73 =cut
74
75 # Let the code begin...
76
77 package Bio::DB::NCBIHelper;
78 use strict;
79 use vars qw(@ISA $HOSTBASE %CGILOCATION %FORMATMAP
80 $DEFAULTFORMAT $MAX_ENTRIES $VERSION);
81
82 use Bio::DB::WebDBSeqI;
83 use Bio::DB::Query::GenBank;
84 use HTTP::Request::Common;
85 use URI;
86 use Bio::Root::IO;
87 use Bio::DB::RefSeq;
88 use Bio::Root::Root;
89
90 @ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root);
91 $VERSION = '0.8';
92
93 BEGIN {
94 $MAX_ENTRIES = 19000;
95 $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov';
96 %CGILOCATION = (
97 'batch' => ['post' => '/entrez/eutils/efetch.fcgi'],
98 'query' => ['get' => '/entrez/eutils/efetch.fcgi'],
99 'single' => ['get' => '/entrez/eutils/efetch.fcgi'],
100 'version'=> ['get' => '/entrez/eutils/efetch.fcgi'],
101 'gi' => ['get' => '/entrez/eutils/efetch.fcgi'],
102 );
103
104 %FORMATMAP = ( 'gb' => 'genbank',
105 'gp' => 'genbank',
106 'fasta' => 'fasta',
107 );
108
109 $DEFAULTFORMAT = 'gb';
110 }
111
112 # the new way to make modules a little more lightweight
113
114 sub new {
115 my ($class, @args ) = @_;
116 my $self = $class->SUPER::new(@args);
117 return $self;
118 }
119
120
121 =head2 get_params
122
123 Title : get_params
124 Usage : my %params = $self->get_params($mode)
125 Function: Returns key,value pairs to be passed to NCBI database
126 for either 'batch' or 'single' sequence retrieval method
127 Returns : a key,value pair hash
128 Args : 'single' or 'batch' mode for retrieval
129
130 =cut
131
132 sub get_params {
133 my ($self, $mode) = @_;
134 $self->throw("subclass did not implement get_params");
135 }
136
137 =head2 default_format
138
139 Title : default_format
140 Usage : my $format = $self->default_format
141 Function: Returns default sequence format for this module
142 Returns : string
143 Args : none
144
145 =cut
146
147 sub default_format {
148 return $DEFAULTFORMAT;
149 }
150
151 =head2 get_request
152
153 Title : get_request
154 Usage : my $url = $self->get_request
155 Function: HTTP::Request
156 Returns :
157 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
158
159 =cut
160
161 sub get_request {
162 my ($self, @qualifiers) = @_;
163 my ($mode, $uids, $format, $query) = $self->_rearrange([qw(MODE UIDS
164 FORMAT QUERY)],
165 @qualifiers);
166
167 $mode = lc $mode;
168 ($format) = $self->request_format() unless ( defined $format);
169 if( !defined $mode || $mode eq '' ) { $mode = 'single'; }
170 my %params = $self->get_params($mode);
171 if( ! %params ) {
172 $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'")
173 }
174 my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]);
175
176 unless( defined $uids or defined $query) {
177 $self->throw("Must specify a query or list of uids to fetch");
178 }
179
180 if ($uids) {
181 if( ref($uids) =~ /array/i ) {
182 $uids = join(",", @$uids);
183 }
184 $params{'id'} = $uids;
185 }
186
187 elsif ($query && $query->can('cookie')) {
188 @params{'WebEnv','query_key'} = $query->cookie;
189 $params{'db'} = $query->db;
190 }
191
192 elsif ($query) {
193 $params{'id'} = join ',',$query->ids;
194 }
195
196 $params{'rettype'} = $format;
197 if ($CGILOCATION{$mode}[0] eq 'post') {
198 return POST $url,[%params];
199 } else {
200 $url->query_form(%params);
201 $self->debug("url is $url \n");
202 return GET $url;
203 }
204 }
205
206 =head2 get_Stream_by_batch
207
208 Title : get_Stream_by_batch
209 Usage : $seq = $db->get_Stream_by_batch($ref);
210 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
211 at a time. For large numbers of sequences, this is far superior
212 than get_Stream_by_[id/acc]().
213 Example :
214 Returns : a Bio::SeqIO stream object
215 Args : $ref : either an array reference, a filename, or a filehandle
216 from which to get the list of unique ids/accession numbers.
217
218 NOTE: deprecated API. Use get_Stream_by_id() instead.
219
220 =cut
221
222 *get_Stream_by_batch = sub {
223 my $self = shift;
224 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
225 $self->get_Stream_by_id(@_)
226 };
227
228 =head2 get_Stream_by_query
229
230 Title : get_Stream_by_query
231 Usage : $seq = $db->get_Stream_by_query($query);
232 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
233 at a time. For large numbers of sequences, this is far superior
234 than get_Stream_by_[id/acc]().
235 Example :
236 Returns : a Bio::SeqIO stream object
237 Args : $query : An Entrez query string or a
238 Bio::DB::Query::GenBank object. It is suggested that you
239 create a Bio::DB::Query::GenBank object and get the entry
240 count before you fetch a potentially large stream.
241
242 =cut
243
244 sub get_Stream_by_query {
245 my ($self, $query) = @_;
246 unless (ref $query && $query->can('query')) {
247 $query = Bio::DB::Query::GenBank->new($query);
248 }
249 return $self->get_seq_stream('-query' => $query, '-mode'=>'query');
250 }
251
252 =head2 postprocess_data
253
254 Title : postprocess_data
255 Usage : $self->postprocess_data ( 'type' => 'string',
256 'location' => \$datastr);
257 Function: process downloaded data before loading into a Bio::SeqIO
258 Returns : void
259 Args : hash with two keys - 'type' can be 'string' or 'file'
260 - 'location' either file location or string
261 reference containing data
262
263 =cut
264
265 # the default method, works for genbank/genpept, other classes should
266 # override it with their own method.
267
268 sub postprocess_data {
269 my ($self, %args) = @_;
270 my $data;
271 my $type = uc $args{'type'};
272 my $location = $args{'location'};
273 if( !defined $type || $type eq '' || !defined $location) {
274 return;
275 } elsif( $type eq 'STRING' ) {
276 $data = $$location;
277 } elsif ( $type eq 'FILE' ) {
278 open(TMP, $location) or $self->throw("could not open file $location");
279 my @in = <TMP>;
280 close TMP;
281 $data = join("", @in);
282 }
283
284 # transform links to appropriate descriptions
285 if ($data =~ /\nCONTIG\s+/) {
286 $self->warn("CONTIG found. GenBank get_Stream_by_acc about to run.");
287 my(@batch,@accession,%accessions,@location,$id,
288 $contig,$stream,$aCount,$cCount,$gCount,$tCount);
289
290 # process GenBank CONTIG join(...) into two arrays
291 $data =~ /(?:CONTIG\s+join\()((?:.+\n)+)(?:\/\/)/;
292 $contig = $1;
293 $contig =~ s/\n|\)//g;
294 foreach (split /\s*,\s*/,$contig){
295 if (/>(.+)<.+>:(.+)/) {
296 ($id) = split /\./, $1;
297 push @accession, $id;
298 push @location, $2;
299 $accessions{$id}->{'count'}++;
300 } elsif( /([\w\.]+):(.+)/ ) {
301 ($id) = split /\./, $1;
302 $accessions{$id}->{'count'}++;
303 push @accession, $id;
304 push @location, $2;
305 }
306 }
307
308 # grab multiple sequences by batch and join based location variable
309 my @unique_accessions = keys %accessions;
310 $stream = $self->get_Stream_by_acc(\@unique_accessions);
311 $contig = "";
312 my $ct = 0;
313 while( my $seq = $stream->next_seq() ) {
314 if( $seq->accession_number !~ /$unique_accessions[$ct]/ ) {
315 printf STDERR "warning, %s does not match %s\n",
316 $seq->accession_number, $unique_accessions[$ct];
317 }
318 $accessions{$unique_accessions[$ct]}->{'seq'} = $seq;
319 $ct++;
320 }
321 for (my $i = 0; $i < @accession; $i++) {
322 my $seq = $accessions{$accession[$i]}->{'seq'};
323 unless( defined $seq ) {
324 # seq not cached, get next sequence
325 $self->warn("unable to find sequence $accession[$i]\n");
326 return undef;
327 }
328 my($start,$end) = split(/\.\./, $location[$i]);
329 $contig .= $seq->subseq($start,$end-$start);
330 }
331
332 # count number of each letter in sequence
333 $aCount = () = $contig =~ /a/ig;
334 $cCount = () = $contig =~ /c/ig;
335 $gCount = () = $contig =~ /g/ig;
336 $tCount = () = $contig =~ /t/ig;
337
338 # remove everything after and including CONTIG
339 $data =~ s/(CONTIG[\s\S]+)$//i;
340
341 # build ORIGIN part of data file using sequence and counts
342 $data .= "BASE COUNT $aCount a $cCount c $gCount g $tCount t\n";
343 $data .= "ORIGIN \n";
344 $data .= "$contig\n//";
345 }
346 else {
347 $data =~ s/<a\s+href\s*=.+>\s*(\S+)\s*<\s*\/a\s*\>/$1/ig;
348 }
349
350 # fix gt and lt
351 $data =~ s/&gt;/>/ig;
352 $data =~ s/&lt;/</ig;
353 if( $type eq 'FILE' ) {
354 open(TMP, ">$location") or $self->throw("couldn't overwrite file $location");
355 print TMP $data;
356 close TMP;
357 } elsif ( $type eq 'STRING' ) {
358 ${$args{'location'}} = $data;
359 }
360 $self->debug("format is ". join(',',$self->request_format()).
361 " data is\n$data\n");
362 }
363
364
365 =head2 request_format
366
367 Title : request_format
368 Usage : my ($req_format, $ioformat) = $self->request_format;
369 $self->request_format("genbank");
370 $self->request_format("fasta");
371 Function: Get/Set sequence format retrieval. The get-form will normally not
372 be used outside of this and derived modules.
373 Returns : Array of two strings, the first representing the format for
374 retrieval, and the second specifying the corresponding SeqIO format.
375 Args : $format = sequence format
376
377 =cut
378
379 sub request_format {
380 my ($self, $value) = @_;
381 if( defined $value ) {
382 $value = lc $value;
383 if( defined $FORMATMAP{$value} ) {
384 $self->{'_format'} = [ $value, $FORMATMAP{$value}];
385 } else {
386 # Try to fall back to a default. Alternatively, we could throw
387 # an exception
388 $self->{'_format'} = [ $value, $value ];
389 }
390 }
391 return @{$self->{'_format'}};
392 }
393
394 =head2 Bio::DB::WebDBSeqI methods
395
396 Overriding WebDBSeqI method to help newbies to retrieve sequences
397
398 =head2 get_Stream_by_acc
399
400 Title : get_Stream_by_acc
401 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
402 Function: Gets a series of Seq objects by accession numbers
403 Returns : a Bio::SeqIO stream object
404 Args : $ref : a reference to an array of accession numbers for
405 the desired sequence entries
406 Note : For GenBank, this just calls the same code for get_Stream_by_id()
407
408 =cut
409
410 sub get_Stream_by_acc {
411 my ($self, $ids ) = @_;
412 my $newdb = $self->_check_id($ids);
413 if (defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq')) {
414 return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single');
415 } else {
416 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
417 }
418 }
419
420
421 =head2 _check_id
422
423 Title : _check_id
424 Usage :
425 Function:
426 Returns : A Bio::DB::RefSeq reference or throws
427 Args : $id(s), $string
428
429 =cut
430
431 sub _check_id {
432 my ($self, $ids) = @_;
433
434 # NT contigs can not be retrieved
435 $self->throw("NT_ contigs are whole chromosome files which are not part of regular".
436 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
437 if $ids =~ /NT_/;
438
439 # Asking for a RefSeq from EMBL/GenBank
440
441 if ($ids =~ /N._/) {
442 $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.".
443 " Redirecting the request.\n")
444 if $self->verbose >= 0;
445 return new Bio::DB::RefSeq;
446 }
447 }
448
449 =head2 delay_policy
450
451 Title : delay_policy
452 Usage : $secs = $self->delay_policy
453 Function: return number of seconds to delay between calls to remote db
454 Returns : number of seconds to delay
455 Args : none
456
457 NOTE: NCBI requests a delay of 3s between requests. This method
458 implements that policy.
459
460 =cut
461
462 sub delay_policy {
463 my $self = shift;
464 return 3;
465 }
466
467 1;
468 __END__