Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/BioFetch.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: BioFetch.pm,v 1.13.2.1 2003/06/25 13:44:18 heikki Exp $ | |
2 # | |
3 # BioPerl module for Bio::DB::BioFetch | |
4 # | |
5 # Cared for by Lincoln Stein <lstein@cshl.org> | |
6 # | |
7 # Copyright Lincoln Stein | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 # | |
11 # POD documentation - main docs before the code | |
12 # | |
13 | |
14 package Bio::DB::BioFetch; | |
15 use strict; | |
16 use Bio::DB::WebDBSeqI; | |
17 use HTTP::Request::Common 'POST'; | |
18 | |
19 =head1 NAME | |
20 | |
21 Bio::DB::BioFetch - Database object interface to BioFetch retrieval | |
22 | |
23 =head1 SYNOPSIS | |
24 | |
25 use Bio::DB::BioFetch; | |
26 | |
27 $bf = new Bio::DB::BioFetch; | |
28 | |
29 $seq = $sp->get_Seq_by_id('BUM'); # EMBL or SWALL ID | |
30 | |
31 # change formats, storage procedures | |
32 $bf = new Bio::DB::BioFetch(-format => 'fasta', | |
33 -retrievaltype => 'tempfile', | |
34 -db => 'EMBL'); | |
35 | |
36 $stream = $bf->get_Stream_by_id(['BUM','J00231']); | |
37 while (my $s = $stream->next_seq) { | |
38 print $s->seq,"\n"; | |
39 } | |
40 # get a RefSeq entry | |
41 $bf->db('refseq'); | |
42 eval { | |
43 $seq = $bf->get_Seq_by_version('NM_006732.1'); # RefSeq VERSION | |
44 }; | |
45 print "accession is ", $seq->accession_number, "\n" unless $@; | |
46 | |
47 | |
48 =head1 DESCRIPTION | |
49 | |
50 Bio::DB::BioFetch is a guaranteed best effort sequence entry fetching | |
51 method. It goes to the Web-based dbfetch server located at the EBI | |
52 (http://www.ebi.ac.uk/cgi-bin/dbfetch) to retrieve sequences in the | |
53 EMBL or GenBank sequence repositories. | |
54 | |
55 This module implements all the Bio::DB::RandomAccessI interface, plus | |
56 the get_Stream_by_id() and get_Stream_by_acc() methods that are found | |
57 in the Bio::DB::SwissProt interface. | |
58 | |
59 =head1 FEEDBACK | |
60 | |
61 =head2 Mailing Lists | |
62 | |
63 User feedback is an integral part of the evolution of this and other | |
64 Bioperl modules. Send your comments and suggestions preferably to one | |
65 of the Bioperl mailing lists. Your participation is much appreciated. | |
66 | |
67 | |
68 bioperl-l@bioperl.org - General discussion | |
69 http://bio.perl.org/MailList.html - About the mailing lists | |
70 | |
71 =head2 Reporting Bugs | |
72 | |
73 Report bugs to the Bioperl bug tracking system to help us keep track | |
74 the bugs and their resolution. Bug reports can be submitted via email | |
75 or the web: | |
76 | |
77 bioperl-bugs@bio.perl.org | |
78 http://bugzilla.bioperl.org/ | |
79 | |
80 =head1 AUTHOR - Lincoln Stein | |
81 | |
82 Email Lincoln Stein E<lt>lstein@cshl.orgE<lt> | |
83 | |
84 Also thanks to Heikki Lehvaslaiho E<lt>heikki@ebi.ac.ukE<gt> for the | |
85 BioFetch server and interface specification. | |
86 | |
87 =head1 APPENDIX | |
88 | |
89 The rest of the documentation details each of the object | |
90 methods. Internal methods are usually preceded with a _ | |
91 | |
92 =cut | |
93 | |
94 # Let the code begin... | |
95 use vars qw(@ISA $VERSION %FORMATMAP ); | |
96 use Bio::Root::Root; | |
97 @ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root); | |
98 $VERSION = '1.0'; | |
99 | |
100 # warning: names used here must map into Bio::SeqIO::* space | |
101 use constant DEFAULT_LOCATION => 'http://www.ebi.ac.uk/cgi-bin/dbfetch'; | |
102 | |
103 BEGIN { | |
104 | |
105 %FORMATMAP = ( | |
106 'embl' => { | |
107 default => 'embl', # default BioFetch format/SeqIOmodule pair | |
108 embl => 'embl', # alternative BioFetch format/module pair | |
109 fasta => 'fasta', # alternative BioFetch format/module pair | |
110 namespace => 'embl', | |
111 }, | |
112 'swissprot' => { | |
113 default => 'swiss', | |
114 swissprot => 'swiss', | |
115 fasta => 'fasta', | |
116 namespace => 'swall', | |
117 }, | |
118 'refseq' => { | |
119 default => 'genbank', | |
120 genbank => 'genbank', | |
121 fasta => 'fasta', | |
122 namespace => 'RefSeq', | |
123 }, | |
124 'swall' => { | |
125 default => 'swiss', | |
126 swissprot => 'swiss', | |
127 fasta => 'fasta', | |
128 namespace => 'swall', | |
129 }, | |
130 'genbank' => { | |
131 default => 'genbank', | |
132 genbank => 'genbank', | |
133 namespace => 'genbank', | |
134 }, | |
135 'genpep' => { | |
136 default => 'genbank', | |
137 genbank => 'genbank', | |
138 namespace => 'genpep', | |
139 }, | |
140 ); | |
141 } | |
142 | |
143 =head2 new | |
144 | |
145 Title : new | |
146 Usage : $bf = Bio::DB::BioFetch->new(@args) | |
147 Function: Construct a new Bio::DB::BioFetch object | |
148 Returns : a Bio::DB::BioFetch object | |
149 Args : see below | |
150 Throws : | |
151 | |
152 @args are standard -name=E<gt>value options as listed in the following | |
153 table. If you do not provide any options, the module assumes reasonable | |
154 defaults. | |
155 | |
156 Option Value Default | |
157 ------ ----- ------- | |
158 | |
159 -baseaddress location of dbfetch server http://www.ebi.ac.uk/cgi-bin/dbfetch | |
160 -retrievaltype "tempfile" or "io_string" io_string | |
161 -format "embl", "fasta", "swissprot", embl | |
162 or "genbank" | |
163 -db "embl", "genbank" or "swissprot" embl | |
164 | |
165 =cut | |
166 | |
167 #' | |
168 sub new { | |
169 my ($class,@args) = @_; | |
170 my $self = $class->SUPER::new(@args); | |
171 my ($db) = $self->_rearrange([qw(DB)],@args); | |
172 $db ||= $self->default_db; | |
173 $self->db($db); | |
174 $self->url_base_address(DEFAULT_LOCATION) unless $self->url_base_address; | |
175 $self; | |
176 } | |
177 | |
178 =head2 new_from_registry | |
179 | |
180 Title : new_from_registry | |
181 Usage : $biofetch = $db->new_from_registry(%config) | |
182 Function: Creates a BioFetch object from the registry config hash | |
183 Returns : itself | |
184 Args : A configuration hash (see Registry.pm) | |
185 Throws : | |
186 | |
187 | |
188 =cut | |
189 | |
190 sub new_from_registry { | |
191 my ($class,%config)=@_; | |
192 | |
193 my $self = $class->SUPER::new( | |
194 -BASEADDRESS=>$config{'location'} | |
195 ); | |
196 $self->db($config{'dbname'}) if $config{dbname}; | |
197 return $self; | |
198 } | |
199 | |
200 # from Bio::DB::RandomAccessI | |
201 | |
202 =head2 get_Seq_by_id | |
203 | |
204 Title : get_Seq_by_id | |
205 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') | |
206 Function: Gets a Bio::Seq object by its name | |
207 Returns : a Bio::Seq object | |
208 Args : the id (as a string) of a sequence | |
209 Throws : "id does not exist" exception | |
210 | |
211 | |
212 =cut | |
213 | |
214 =head2 get_Seq_by_acc | |
215 | |
216 Title : get_Seq_by_acc | |
217 Usage : $seq = $db->get_Seq_by_acc('X77802'); | |
218 Function: Gets a Bio::Seq object by accession number | |
219 Returns : A Bio::Seq object | |
220 Args : accession number (as a string) | |
221 Throws : "acc does not exist" exception | |
222 | |
223 =cut | |
224 | |
225 =head2 get_Seq_by_gi | |
226 | |
227 Title : get_Seq_by_gi | |
228 Usage : $seq = $db->get_Seq_by_gi('405830'); | |
229 Function: Gets a Bio::Seq object by gi number | |
230 Returns : A Bio::Seq object | |
231 Args : gi number (as a string) | |
232 Throws : "gi does not exist" exception | |
233 | |
234 =cut | |
235 | |
236 =head2 get_Seq_by_version | |
237 | |
238 Title : get_Seq_by_version | |
239 Usage : $seq = $db->get_Seq_by_version('X77802.1'); | |
240 Function: Gets a Bio::Seq object by sequence version | |
241 Returns : A Bio::Seq object | |
242 Args : accession.version (as a string) | |
243 Throws : "acc.version does not exist" exception | |
244 | |
245 =cut | |
246 | |
247 sub get_Seq_by_version { | |
248 my ($self,$seqid) = @_; | |
249 return $self->get_Seq_by_acc($seqid); | |
250 } | |
251 | |
252 | |
253 =head2 get_Stream_by_id | |
254 | |
255 Title : get_Stream_by_id | |
256 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); | |
257 Function: Gets a series of Seq objects by unique identifiers | |
258 Returns : a Bio::SeqIO stream object | |
259 Args : $ref : a reference to an array of unique identifiers for | |
260 the desired sequence entries | |
261 | |
262 =cut | |
263 | |
264 =head2 get_Stream_by_gi | |
265 | |
266 Title : get_Stream_by_gi | |
267 Usage : $seq = $db->get_Seq_by_gi([$gi1, $gi2]); | |
268 Function: Gets a series of Seq objects by gi numbers | |
269 Returns : a Bio::SeqIO stream object | |
270 Args : $ref : a reference to an array of gi numbers for | |
271 the desired sequence entries | |
272 Note : For GenBank, this just calls the same code for get_Stream_by_id() | |
273 | |
274 =cut | |
275 | |
276 =head2 get_Stream_by_batch | |
277 | |
278 Title : get_Stream_by_batch | |
279 Usage : $seq = $db->get_Stream_by_batch($ref); | |
280 Function: Get a series of Seq objects by their IDs | |
281 Example : | |
282 Returns : a Bio::SeqIO stream object | |
283 Args : $ref : an array reference containing a list of unique | |
284 ids/accession numbers. | |
285 | |
286 In some of the Bio::DB::* moduels, get_Stream_by_id() is called | |
287 get_Stream_by_batch(). Since there seems to be no consensus, this | |
288 is provided as an alias. | |
289 | |
290 =cut | |
291 | |
292 *get_Stream_by_batch = \&Bio::DB::WebDBSeqI::get_Stream_by_id; | |
293 | |
294 =head1 The remainder of these methods are for internal use | |
295 | |
296 =head2 get_request | |
297 | |
298 Title : get_request | |
299 Usage : my $url = $self->get_request | |
300 Function: returns a HTTP::Request object | |
301 Returns : | |
302 Args : %qualifiers = a hash of qualifiers (ids, format, etc) | |
303 | |
304 =cut | |
305 | |
306 | |
307 sub get_request { | |
308 my ($self, @qualifiers) = @_; | |
309 my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)], | |
310 @qualifiers); | |
311 my $db = $self->db; | |
312 my $namespace = $self->_namespace; | |
313 | |
314 $self->throw("Must specify a value for UIDs to fetch") | |
315 unless defined $uids; | |
316 my $tmp; | |
317 my $format_string = ''; | |
318 | |
319 $format ||= $self->default_format; | |
320 ($format, $tmp) = $self->request_format($format); | |
321 | |
322 my $base = $self->url_base_address; | |
323 my $uid = join('+', ref $uids ? @$uids : $uids); | |
324 $self->debug("\n$base$format_string&id=$uid\n"); | |
325 return POST($base, | |
326 [ db => $namespace, | |
327 id => join('+',ref $uids ? @$uids : $uids), | |
328 format => $format, | |
329 style => 'raw' | |
330 ]); | |
331 } | |
332 | |
333 =head2 default_format | |
334 | |
335 Title : default_format | |
336 Usage : $format = $self->default_format | |
337 Function: return the default format | |
338 Returns : a string | |
339 Args : | |
340 | |
341 =cut | |
342 | |
343 sub default_format { | |
344 return 'default'; | |
345 } | |
346 | |
347 =head2 default_db | |
348 | |
349 Title : default_db | |
350 Usage : $db = $self->default_db | |
351 Function: return the default database | |
352 Returns : a string | |
353 Args : | |
354 | |
355 =cut | |
356 | |
357 sub default_db { 'embl' } | |
358 | |
359 =head2 db | |
360 | |
361 Title : db | |
362 Usage : $db = $self->db([$db]) | |
363 Function: get/set the database | |
364 Returns : a string | |
365 Args : new database | |
366 | |
367 =cut | |
368 | |
369 sub db { | |
370 my $self = shift; | |
371 | |
372 if (@_) { | |
373 | |
374 my $db = lc shift; | |
375 $FORMATMAP{$db} or $self->throw("invalid db [$db], must be one of [". | |
376 join(' ',keys %FORMATMAP). "]"); | |
377 $self->{_db} = $db; | |
378 } | |
379 return $self->{_db} || $self->default_db ; | |
380 } | |
381 | |
382 sub _namespace { | |
383 my $self = shift; | |
384 my $db = $self->db; | |
385 return $FORMATMAP{$db}{namespace} or $db; | |
386 } | |
387 | |
388 =head2 postprocess_data | |
389 | |
390 Title : postprocess_data | |
391 Usage : $self->postprocess_data ( 'type' => 'string', | |
392 'location' => \$datastr); | |
393 Function: process downloaded data before loading into a Bio::SeqIO | |
394 Returns : void | |
395 Args : hash with two keys - 'type' can be 'string' or 'file' | |
396 - 'location' either file location or string | |
397 reference containing data | |
398 | |
399 =cut | |
400 | |
401 sub postprocess_data { | |
402 my ($self,%args) = @_; | |
403 | |
404 # check for errors in the stream | |
405 if ($args{'type'} eq 'string') { | |
406 my $stringref = $args{'location'}; | |
407 if ($$stringref =~ /^ERROR (\d+) (.+)/m) { | |
408 $self->throw("BioFetch Error $1: $2"); | |
409 } | |
410 } | |
411 | |
412 elsif ($args{'type'} eq 'file') { | |
413 open (F,$args{'location'}) or $self->throw("Couldn't open $args{location}: $!"); | |
414 # this is dumb, but the error may be anywhere on the first three lines because the | |
415 # CGI headers are sometimes printed out by the server... | |
416 my @data = (scalar <F>,scalar <F>,scalar <F>); | |
417 if (join('',@data) =~ /^ERROR (\d+) (.+)/m) { | |
418 $self->throw("BioFetch Error $1: $2"); | |
419 } | |
420 close F; | |
421 } | |
422 | |
423 else { | |
424 $self->throw("Don't know how to postprocess data of type $args{'type'}"); | |
425 } | |
426 } | |
427 | |
428 | |
429 =head2 request_format | |
430 | |
431 Title : request_format | |
432 Usage : my ($req_format, $ioformat) = $self->request_format; | |
433 $self->request_format("genbank"); | |
434 $self->request_format("fasta"); | |
435 Function: Get/Set sequence format retrieval. The get-form will normally not | |
436 be used outside of this and derived modules. | |
437 Returns : Array of two strings, the first representing the format for | |
438 retrieval, and the second specifying the corresponding SeqIO format. | |
439 Args : $format = sequence format | |
440 | |
441 =cut | |
442 | |
443 sub request_format { | |
444 my ($self, $value) = @_; | |
445 if ( defined $value ) { | |
446 my $db = $self->db; | |
447 my $namespace = $self->_namespace; | |
448 my $format = lc $value; | |
449 print "format:", $format, " module:", $FORMATMAP{$db}->{$format}, " ($namespace)\n" | |
450 if $self->verbose > 0; | |
451 $self->throw("Invalid format [$format], must be one of [". | |
452 join(' ',keys %{$FORMATMAP{$db}}). "]") | |
453 unless $format eq 'default' || $FORMATMAP{$db}->{$format}; | |
454 | |
455 $self->{'_format'} = [ $format, $FORMATMAP{$db}->{$format}]; | |
456 } | |
457 return @{$self->{'_format'}}; | |
458 } | |
459 | |
460 | |
461 =head2 Bio::DB::WebDBSeqI methods | |
462 | |
463 Overriding WebDBSeqI method to help newbies to retrieve sequences. | |
464 EMBL database is all too often passed RefSeq accessions. This | |
465 redirects those calls. See L<Bio::DB::RefSeq>. | |
466 | |
467 | |
468 =head2 get_Stream_by_acc | |
469 | |
470 Title : get_Stream_by_acc | |
471 Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]); | |
472 Function: Gets a series of Seq objects by accession numbers | |
473 Returns : a Bio::SeqIO stream object | |
474 Args : $ref : a reference to an array of accession numbers for | |
475 the desired sequence entries | |
476 | |
477 =cut | |
478 | |
479 sub get_Stream_by_acc { | |
480 my ($self, $ids ) = @_; | |
481 $self->_check_id($ids); | |
482 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); | |
483 } | |
484 | |
485 | |
486 =head2 _check_id | |
487 | |
488 Title : _check_id | |
489 Usage : | |
490 Function: Throw on whole chromosome NCBI sequences not in sequence databases | |
491 and redirect RefSeq accession requests sent to EMBL. | |
492 Returns : | |
493 Args : $id(s), $string | |
494 Throws : if accessionn number indicates whole chromosome NCBI sequence | |
495 | |
496 =cut | |
497 | |
498 sub _check_id { | |
499 my ($self, $id) = @_; | |
500 | |
501 # NT contigs can not be retrieved | |
502 $self->throw("NT_ contigs are whole chromosome files which are not part of regular". | |
503 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.") | |
504 if $id =~ /NT_/; | |
505 | |
506 # Asking for a RefSeq from EMBL/GenBank | |
507 | |
508 if ($id =~ /N._/ && $self->db ne 'refseq') { | |
509 $self->warn("[$id] is not a normal sequence database but a RefSeq entry.". | |
510 " Redirecting the request.\n") | |
511 if $self->verbose >= 0; | |
512 $self->db('RefSeq'); | |
513 } | |
514 } | |
515 | |
516 1; |