Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/BioFetch.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: BioFetch.pm,v 1.13.2.1 2003/06/25 13:44:18 heikki Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::DB::BioFetch | |
| 4 # | |
| 5 # Cared for by Lincoln Stein <lstein@cshl.org> | |
| 6 # | |
| 7 # Copyright Lincoln Stein | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 # | |
| 11 # POD documentation - main docs before the code | |
| 12 # | |
| 13 | |
| 14 package Bio::DB::BioFetch; | |
| 15 use strict; | |
| 16 use Bio::DB::WebDBSeqI; | |
| 17 use HTTP::Request::Common 'POST'; | |
| 18 | |
| 19 =head1 NAME | |
| 20 | |
| 21 Bio::DB::BioFetch - Database object interface to BioFetch retrieval | |
| 22 | |
| 23 =head1 SYNOPSIS | |
| 24 | |
| 25 use Bio::DB::BioFetch; | |
| 26 | |
| 27 $bf = new Bio::DB::BioFetch; | |
| 28 | |
| 29 $seq = $sp->get_Seq_by_id('BUM'); # EMBL or SWALL ID | |
| 30 | |
| 31 # change formats, storage procedures | |
| 32 $bf = new Bio::DB::BioFetch(-format => 'fasta', | |
| 33 -retrievaltype => 'tempfile', | |
| 34 -db => 'EMBL'); | |
| 35 | |
| 36 $stream = $bf->get_Stream_by_id(['BUM','J00231']); | |
| 37 while (my $s = $stream->next_seq) { | |
| 38 print $s->seq,"\n"; | |
| 39 } | |
| 40 # get a RefSeq entry | |
| 41 $bf->db('refseq'); | |
| 42 eval { | |
| 43 $seq = $bf->get_Seq_by_version('NM_006732.1'); # RefSeq VERSION | |
| 44 }; | |
| 45 print "accession is ", $seq->accession_number, "\n" unless $@; | |
| 46 | |
| 47 | |
| 48 =head1 DESCRIPTION | |
| 49 | |
| 50 Bio::DB::BioFetch is a guaranteed best effort sequence entry fetching | |
| 51 method. It goes to the Web-based dbfetch server located at the EBI | |
| 52 (http://www.ebi.ac.uk/cgi-bin/dbfetch) to retrieve sequences in the | |
| 53 EMBL or GenBank sequence repositories. | |
| 54 | |
| 55 This module implements all the Bio::DB::RandomAccessI interface, plus | |
| 56 the get_Stream_by_id() and get_Stream_by_acc() methods that are found | |
| 57 in the Bio::DB::SwissProt interface. | |
| 58 | |
| 59 =head1 FEEDBACK | |
| 60 | |
| 61 =head2 Mailing Lists | |
| 62 | |
| 63 User feedback is an integral part of the evolution of this and other | |
| 64 Bioperl modules. Send your comments and suggestions preferably to one | |
| 65 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 66 | |
| 67 | |
| 68 bioperl-l@bioperl.org - General discussion | |
| 69 http://bio.perl.org/MailList.html - About the mailing lists | |
| 70 | |
| 71 =head2 Reporting Bugs | |
| 72 | |
| 73 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 74 the bugs and their resolution. Bug reports can be submitted via email | |
| 75 or the web: | |
| 76 | |
| 77 bioperl-bugs@bio.perl.org | |
| 78 http://bugzilla.bioperl.org/ | |
| 79 | |
| 80 =head1 AUTHOR - Lincoln Stein | |
| 81 | |
| 82 Email Lincoln Stein E<lt>lstein@cshl.orgE<lt> | |
| 83 | |
| 84 Also thanks to Heikki Lehvaslaiho E<lt>heikki@ebi.ac.ukE<gt> for the | |
| 85 BioFetch server and interface specification. | |
| 86 | |
| 87 =head1 APPENDIX | |
| 88 | |
| 89 The rest of the documentation details each of the object | |
| 90 methods. Internal methods are usually preceded with a _ | |
| 91 | |
| 92 =cut | |
| 93 | |
| 94 # Let the code begin... | |
| 95 use vars qw(@ISA $VERSION %FORMATMAP ); | |
| 96 use Bio::Root::Root; | |
| 97 @ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root); | |
| 98 $VERSION = '1.0'; | |
| 99 | |
| 100 # warning: names used here must map into Bio::SeqIO::* space | |
| 101 use constant DEFAULT_LOCATION => 'http://www.ebi.ac.uk/cgi-bin/dbfetch'; | |
| 102 | |
| 103 BEGIN { | |
| 104 | |
| 105 %FORMATMAP = ( | |
| 106 'embl' => { | |
| 107 default => 'embl', # default BioFetch format/SeqIOmodule pair | |
| 108 embl => 'embl', # alternative BioFetch format/module pair | |
| 109 fasta => 'fasta', # alternative BioFetch format/module pair | |
| 110 namespace => 'embl', | |
| 111 }, | |
| 112 'swissprot' => { | |
| 113 default => 'swiss', | |
| 114 swissprot => 'swiss', | |
| 115 fasta => 'fasta', | |
| 116 namespace => 'swall', | |
| 117 }, | |
| 118 'refseq' => { | |
| 119 default => 'genbank', | |
| 120 genbank => 'genbank', | |
| 121 fasta => 'fasta', | |
| 122 namespace => 'RefSeq', | |
| 123 }, | |
| 124 'swall' => { | |
| 125 default => 'swiss', | |
| 126 swissprot => 'swiss', | |
| 127 fasta => 'fasta', | |
| 128 namespace => 'swall', | |
| 129 }, | |
| 130 'genbank' => { | |
| 131 default => 'genbank', | |
| 132 genbank => 'genbank', | |
| 133 namespace => 'genbank', | |
| 134 }, | |
| 135 'genpep' => { | |
| 136 default => 'genbank', | |
| 137 genbank => 'genbank', | |
| 138 namespace => 'genpep', | |
| 139 }, | |
| 140 ); | |
| 141 } | |
| 142 | |
| 143 =head2 new | |
| 144 | |
| 145 Title : new | |
| 146 Usage : $bf = Bio::DB::BioFetch->new(@args) | |
| 147 Function: Construct a new Bio::DB::BioFetch object | |
| 148 Returns : a Bio::DB::BioFetch object | |
| 149 Args : see below | |
| 150 Throws : | |
| 151 | |
| 152 @args are standard -name=E<gt>value options as listed in the following | |
| 153 table. If you do not provide any options, the module assumes reasonable | |
| 154 defaults. | |
| 155 | |
| 156 Option Value Default | |
| 157 ------ ----- ------- | |
| 158 | |
| 159 -baseaddress location of dbfetch server http://www.ebi.ac.uk/cgi-bin/dbfetch | |
| 160 -retrievaltype "tempfile" or "io_string" io_string | |
| 161 -format "embl", "fasta", "swissprot", embl | |
| 162 or "genbank" | |
| 163 -db "embl", "genbank" or "swissprot" embl | |
| 164 | |
| 165 =cut | |
| 166 | |
| 167 #' | |
| 168 sub new { | |
| 169 my ($class,@args) = @_; | |
| 170 my $self = $class->SUPER::new(@args); | |
| 171 my ($db) = $self->_rearrange([qw(DB)],@args); | |
| 172 $db ||= $self->default_db; | |
| 173 $self->db($db); | |
| 174 $self->url_base_address(DEFAULT_LOCATION) unless $self->url_base_address; | |
| 175 $self; | |
| 176 } | |
| 177 | |
| 178 =head2 new_from_registry | |
| 179 | |
| 180 Title : new_from_registry | |
| 181 Usage : $biofetch = $db->new_from_registry(%config) | |
| 182 Function: Creates a BioFetch object from the registry config hash | |
| 183 Returns : itself | |
| 184 Args : A configuration hash (see Registry.pm) | |
| 185 Throws : | |
| 186 | |
| 187 | |
| 188 =cut | |
| 189 | |
| 190 sub new_from_registry { | |
| 191 my ($class,%config)=@_; | |
| 192 | |
| 193 my $self = $class->SUPER::new( | |
| 194 -BASEADDRESS=>$config{'location'} | |
| 195 ); | |
| 196 $self->db($config{'dbname'}) if $config{dbname}; | |
| 197 return $self; | |
| 198 } | |
| 199 | |
| 200 # from Bio::DB::RandomAccessI | |
| 201 | |
| 202 =head2 get_Seq_by_id | |
| 203 | |
| 204 Title : get_Seq_by_id | |
| 205 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') | |
| 206 Function: Gets a Bio::Seq object by its name | |
| 207 Returns : a Bio::Seq object | |
| 208 Args : the id (as a string) of a sequence | |
| 209 Throws : "id does not exist" exception | |
| 210 | |
| 211 | |
| 212 =cut | |
| 213 | |
| 214 =head2 get_Seq_by_acc | |
| 215 | |
| 216 Title : get_Seq_by_acc | |
| 217 Usage : $seq = $db->get_Seq_by_acc('X77802'); | |
| 218 Function: Gets a Bio::Seq object by accession number | |
| 219 Returns : A Bio::Seq object | |
| 220 Args : accession number (as a string) | |
| 221 Throws : "acc does not exist" exception | |
| 222 | |
| 223 =cut | |
| 224 | |
| 225 =head2 get_Seq_by_gi | |
| 226 | |
| 227 Title : get_Seq_by_gi | |
| 228 Usage : $seq = $db->get_Seq_by_gi('405830'); | |
| 229 Function: Gets a Bio::Seq object by gi number | |
| 230 Returns : A Bio::Seq object | |
| 231 Args : gi number (as a string) | |
| 232 Throws : "gi does not exist" exception | |
| 233 | |
| 234 =cut | |
| 235 | |
| 236 =head2 get_Seq_by_version | |
| 237 | |
| 238 Title : get_Seq_by_version | |
| 239 Usage : $seq = $db->get_Seq_by_version('X77802.1'); | |
| 240 Function: Gets a Bio::Seq object by sequence version | |
| 241 Returns : A Bio::Seq object | |
| 242 Args : accession.version (as a string) | |
| 243 Throws : "acc.version does not exist" exception | |
| 244 | |
| 245 =cut | |
| 246 | |
| 247 sub get_Seq_by_version { | |
| 248 my ($self,$seqid) = @_; | |
| 249 return $self->get_Seq_by_acc($seqid); | |
| 250 } | |
| 251 | |
| 252 | |
| 253 =head2 get_Stream_by_id | |
| 254 | |
| 255 Title : get_Stream_by_id | |
| 256 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); | |
| 257 Function: Gets a series of Seq objects by unique identifiers | |
| 258 Returns : a Bio::SeqIO stream object | |
| 259 Args : $ref : a reference to an array of unique identifiers for | |
| 260 the desired sequence entries | |
| 261 | |
| 262 =cut | |
| 263 | |
| 264 =head2 get_Stream_by_gi | |
| 265 | |
| 266 Title : get_Stream_by_gi | |
| 267 Usage : $seq = $db->get_Seq_by_gi([$gi1, $gi2]); | |
| 268 Function: Gets a series of Seq objects by gi numbers | |
| 269 Returns : a Bio::SeqIO stream object | |
| 270 Args : $ref : a reference to an array of gi numbers for | |
| 271 the desired sequence entries | |
| 272 Note : For GenBank, this just calls the same code for get_Stream_by_id() | |
| 273 | |
| 274 =cut | |
| 275 | |
| 276 =head2 get_Stream_by_batch | |
| 277 | |
| 278 Title : get_Stream_by_batch | |
| 279 Usage : $seq = $db->get_Stream_by_batch($ref); | |
| 280 Function: Get a series of Seq objects by their IDs | |
| 281 Example : | |
| 282 Returns : a Bio::SeqIO stream object | |
| 283 Args : $ref : an array reference containing a list of unique | |
| 284 ids/accession numbers. | |
| 285 | |
| 286 In some of the Bio::DB::* moduels, get_Stream_by_id() is called | |
| 287 get_Stream_by_batch(). Since there seems to be no consensus, this | |
| 288 is provided as an alias. | |
| 289 | |
| 290 =cut | |
| 291 | |
| 292 *get_Stream_by_batch = \&Bio::DB::WebDBSeqI::get_Stream_by_id; | |
| 293 | |
| 294 =head1 The remainder of these methods are for internal use | |
| 295 | |
| 296 =head2 get_request | |
| 297 | |
| 298 Title : get_request | |
| 299 Usage : my $url = $self->get_request | |
| 300 Function: returns a HTTP::Request object | |
| 301 Returns : | |
| 302 Args : %qualifiers = a hash of qualifiers (ids, format, etc) | |
| 303 | |
| 304 =cut | |
| 305 | |
| 306 | |
| 307 sub get_request { | |
| 308 my ($self, @qualifiers) = @_; | |
| 309 my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)], | |
| 310 @qualifiers); | |
| 311 my $db = $self->db; | |
| 312 my $namespace = $self->_namespace; | |
| 313 | |
| 314 $self->throw("Must specify a value for UIDs to fetch") | |
| 315 unless defined $uids; | |
| 316 my $tmp; | |
| 317 my $format_string = ''; | |
| 318 | |
| 319 $format ||= $self->default_format; | |
| 320 ($format, $tmp) = $self->request_format($format); | |
| 321 | |
| 322 my $base = $self->url_base_address; | |
| 323 my $uid = join('+', ref $uids ? @$uids : $uids); | |
| 324 $self->debug("\n$base$format_string&id=$uid\n"); | |
| 325 return POST($base, | |
| 326 [ db => $namespace, | |
| 327 id => join('+',ref $uids ? @$uids : $uids), | |
| 328 format => $format, | |
| 329 style => 'raw' | |
| 330 ]); | |
| 331 } | |
| 332 | |
| 333 =head2 default_format | |
| 334 | |
| 335 Title : default_format | |
| 336 Usage : $format = $self->default_format | |
| 337 Function: return the default format | |
| 338 Returns : a string | |
| 339 Args : | |
| 340 | |
| 341 =cut | |
| 342 | |
| 343 sub default_format { | |
| 344 return 'default'; | |
| 345 } | |
| 346 | |
| 347 =head2 default_db | |
| 348 | |
| 349 Title : default_db | |
| 350 Usage : $db = $self->default_db | |
| 351 Function: return the default database | |
| 352 Returns : a string | |
| 353 Args : | |
| 354 | |
| 355 =cut | |
| 356 | |
| 357 sub default_db { 'embl' } | |
| 358 | |
| 359 =head2 db | |
| 360 | |
| 361 Title : db | |
| 362 Usage : $db = $self->db([$db]) | |
| 363 Function: get/set the database | |
| 364 Returns : a string | |
| 365 Args : new database | |
| 366 | |
| 367 =cut | |
| 368 | |
| 369 sub db { | |
| 370 my $self = shift; | |
| 371 | |
| 372 if (@_) { | |
| 373 | |
| 374 my $db = lc shift; | |
| 375 $FORMATMAP{$db} or $self->throw("invalid db [$db], must be one of [". | |
| 376 join(' ',keys %FORMATMAP). "]"); | |
| 377 $self->{_db} = $db; | |
| 378 } | |
| 379 return $self->{_db} || $self->default_db ; | |
| 380 } | |
| 381 | |
| 382 sub _namespace { | |
| 383 my $self = shift; | |
| 384 my $db = $self->db; | |
| 385 return $FORMATMAP{$db}{namespace} or $db; | |
| 386 } | |
| 387 | |
| 388 =head2 postprocess_data | |
| 389 | |
| 390 Title : postprocess_data | |
| 391 Usage : $self->postprocess_data ( 'type' => 'string', | |
| 392 'location' => \$datastr); | |
| 393 Function: process downloaded data before loading into a Bio::SeqIO | |
| 394 Returns : void | |
| 395 Args : hash with two keys - 'type' can be 'string' or 'file' | |
| 396 - 'location' either file location or string | |
| 397 reference containing data | |
| 398 | |
| 399 =cut | |
| 400 | |
| 401 sub postprocess_data { | |
| 402 my ($self,%args) = @_; | |
| 403 | |
| 404 # check for errors in the stream | |
| 405 if ($args{'type'} eq 'string') { | |
| 406 my $stringref = $args{'location'}; | |
| 407 if ($$stringref =~ /^ERROR (\d+) (.+)/m) { | |
| 408 $self->throw("BioFetch Error $1: $2"); | |
| 409 } | |
| 410 } | |
| 411 | |
| 412 elsif ($args{'type'} eq 'file') { | |
| 413 open (F,$args{'location'}) or $self->throw("Couldn't open $args{location}: $!"); | |
| 414 # this is dumb, but the error may be anywhere on the first three lines because the | |
| 415 # CGI headers are sometimes printed out by the server... | |
| 416 my @data = (scalar <F>,scalar <F>,scalar <F>); | |
| 417 if (join('',@data) =~ /^ERROR (\d+) (.+)/m) { | |
| 418 $self->throw("BioFetch Error $1: $2"); | |
| 419 } | |
| 420 close F; | |
| 421 } | |
| 422 | |
| 423 else { | |
| 424 $self->throw("Don't know how to postprocess data of type $args{'type'}"); | |
| 425 } | |
| 426 } | |
| 427 | |
| 428 | |
| 429 =head2 request_format | |
| 430 | |
| 431 Title : request_format | |
| 432 Usage : my ($req_format, $ioformat) = $self->request_format; | |
| 433 $self->request_format("genbank"); | |
| 434 $self->request_format("fasta"); | |
| 435 Function: Get/Set sequence format retrieval. The get-form will normally not | |
| 436 be used outside of this and derived modules. | |
| 437 Returns : Array of two strings, the first representing the format for | |
| 438 retrieval, and the second specifying the corresponding SeqIO format. | |
| 439 Args : $format = sequence format | |
| 440 | |
| 441 =cut | |
| 442 | |
| 443 sub request_format { | |
| 444 my ($self, $value) = @_; | |
| 445 if ( defined $value ) { | |
| 446 my $db = $self->db; | |
| 447 my $namespace = $self->_namespace; | |
| 448 my $format = lc $value; | |
| 449 print "format:", $format, " module:", $FORMATMAP{$db}->{$format}, " ($namespace)\n" | |
| 450 if $self->verbose > 0; | |
| 451 $self->throw("Invalid format [$format], must be one of [". | |
| 452 join(' ',keys %{$FORMATMAP{$db}}). "]") | |
| 453 unless $format eq 'default' || $FORMATMAP{$db}->{$format}; | |
| 454 | |
| 455 $self->{'_format'} = [ $format, $FORMATMAP{$db}->{$format}]; | |
| 456 } | |
| 457 return @{$self->{'_format'}}; | |
| 458 } | |
| 459 | |
| 460 | |
| 461 =head2 Bio::DB::WebDBSeqI methods | |
| 462 | |
| 463 Overriding WebDBSeqI method to help newbies to retrieve sequences. | |
| 464 EMBL database is all too often passed RefSeq accessions. This | |
| 465 redirects those calls. See L<Bio::DB::RefSeq>. | |
| 466 | |
| 467 | |
| 468 =head2 get_Stream_by_acc | |
| 469 | |
| 470 Title : get_Stream_by_acc | |
| 471 Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]); | |
| 472 Function: Gets a series of Seq objects by accession numbers | |
| 473 Returns : a Bio::SeqIO stream object | |
| 474 Args : $ref : a reference to an array of accession numbers for | |
| 475 the desired sequence entries | |
| 476 | |
| 477 =cut | |
| 478 | |
| 479 sub get_Stream_by_acc { | |
| 480 my ($self, $ids ) = @_; | |
| 481 $self->_check_id($ids); | |
| 482 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); | |
| 483 } | |
| 484 | |
| 485 | |
| 486 =head2 _check_id | |
| 487 | |
| 488 Title : _check_id | |
| 489 Usage : | |
| 490 Function: Throw on whole chromosome NCBI sequences not in sequence databases | |
| 491 and redirect RefSeq accession requests sent to EMBL. | |
| 492 Returns : | |
| 493 Args : $id(s), $string | |
| 494 Throws : if accessionn number indicates whole chromosome NCBI sequence | |
| 495 | |
| 496 =cut | |
| 497 | |
| 498 sub _check_id { | |
| 499 my ($self, $id) = @_; | |
| 500 | |
| 501 # NT contigs can not be retrieved | |
| 502 $self->throw("NT_ contigs are whole chromosome files which are not part of regular". | |
| 503 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.") | |
| 504 if $id =~ /NT_/; | |
| 505 | |
| 506 # Asking for a RefSeq from EMBL/GenBank | |
| 507 | |
| 508 if ($id =~ /N._/ && $self->db ne 'refseq') { | |
| 509 $self->warn("[$id] is not a normal sequence database but a RefSeq entry.". | |
| 510 " Redirecting the request.\n") | |
| 511 if $self->verbose >= 0; | |
| 512 $self->db('RefSeq'); | |
| 513 } | |
| 514 } | |
| 515 | |
| 516 1; |
