Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/NCBIHelper.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: NCBIHelper.pm,v 1.24.2.2 2003/06/12 09:29:38 heikki Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::DB::NCBIHelper | |
| 4 # | |
| 5 # Cared for by Jason Stajich | |
| 6 # | |
| 7 # Copyright Jason Stajich | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 # | |
| 11 # POD documentation - main docs before the code | |
| 12 # | |
| 13 # Interfaces with new WebDBSeqI interface | |
| 14 | |
| 15 =head1 NAME | |
| 16 | |
| 17 Bio::DB::NCBIHelper - A collection of routines useful for queries to | |
| 18 NCBI databases. | |
| 19 | |
| 20 =head1 SYNOPSIS | |
| 21 | |
| 22 #Do not use this module directly. | |
| 23 | |
| 24 # get a Bio::DB::NCBIHelper object somehow | |
| 25 my $seqio = $db->get_Stream_by_acc(['MUSIGHBA1']); | |
| 26 foreach my $seq ( $seqio->next_seq ) { | |
| 27 # process seq | |
| 28 } | |
| 29 | |
| 30 =head1 DESCRIPTION | |
| 31 | |
| 32 Provides a single place to setup some common methods for querying NCBI | |
| 33 web databases. This module just centralizes the methods for | |
| 34 constructing a URL for querying NCBI GenBank and NCBI GenPept and the | |
| 35 common HTML stripping done in L<postprocess_data>(). | |
| 36 | |
| 37 The base NCBI query URL used is | |
| 38 http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi. | |
| 39 | |
| 40 =head1 FEEDBACK | |
| 41 | |
| 42 =head2 Mailing Lists | |
| 43 | |
| 44 User feedback is an integral part of the | |
| 45 evolution of this and other Bioperl modules. Send | |
| 46 your comments and suggestions preferably to one | |
| 47 of the Bioperl mailing lists. Your participation | |
| 48 is much appreciated. | |
| 49 | |
| 50 bioperl-l@bioperl.org - General discussion | |
| 51 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 52 | |
| 53 =head2 Reporting Bugs | |
| 54 | |
| 55 Report bugs to the Bioperl bug tracking system to | |
| 56 help us keep track the bugs and their resolution. | |
| 57 Bug reports can be submitted via email or the | |
| 58 web: | |
| 59 | |
| 60 bioperl-bugs@bio.perl.org | |
| 61 http://bugzilla.bioperl.org/ | |
| 62 | |
| 63 =head1 AUTHOR - Jason Stajich | |
| 64 | |
| 65 Email jason@bioperl.org | |
| 66 | |
| 67 =head1 APPENDIX | |
| 68 | |
| 69 The rest of the documentation details each of the | |
| 70 object methods. Internal methods are usually | |
| 71 preceded with a _ | |
| 72 | |
| 73 =cut | |
| 74 | |
| 75 # Let the code begin... | |
| 76 | |
| 77 package Bio::DB::NCBIHelper; | |
| 78 use strict; | |
| 79 use vars qw(@ISA $HOSTBASE %CGILOCATION %FORMATMAP | |
| 80 $DEFAULTFORMAT $MAX_ENTRIES $VERSION); | |
| 81 | |
| 82 use Bio::DB::WebDBSeqI; | |
| 83 use Bio::DB::Query::GenBank; | |
| 84 use HTTP::Request::Common; | |
| 85 use URI; | |
| 86 use Bio::Root::IO; | |
| 87 use Bio::DB::RefSeq; | |
| 88 use Bio::Root::Root; | |
| 89 | |
| 90 @ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root); | |
| 91 $VERSION = '0.8'; | |
| 92 | |
| 93 BEGIN { | |
| 94 $MAX_ENTRIES = 19000; | |
| 95 $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov'; | |
| 96 %CGILOCATION = ( | |
| 97 'batch' => ['post' => '/entrez/eutils/efetch.fcgi'], | |
| 98 'query' => ['get' => '/entrez/eutils/efetch.fcgi'], | |
| 99 'single' => ['get' => '/entrez/eutils/efetch.fcgi'], | |
| 100 'version'=> ['get' => '/entrez/eutils/efetch.fcgi'], | |
| 101 'gi' => ['get' => '/entrez/eutils/efetch.fcgi'], | |
| 102 ); | |
| 103 | |
| 104 %FORMATMAP = ( 'gb' => 'genbank', | |
| 105 'gp' => 'genbank', | |
| 106 'fasta' => 'fasta', | |
| 107 ); | |
| 108 | |
| 109 $DEFAULTFORMAT = 'gb'; | |
| 110 } | |
| 111 | |
| 112 # the new way to make modules a little more lightweight | |
| 113 | |
| 114 sub new { | |
| 115 my ($class, @args ) = @_; | |
| 116 my $self = $class->SUPER::new(@args); | |
| 117 return $self; | |
| 118 } | |
| 119 | |
| 120 | |
| 121 =head2 get_params | |
| 122 | |
| 123 Title : get_params | |
| 124 Usage : my %params = $self->get_params($mode) | |
| 125 Function: Returns key,value pairs to be passed to NCBI database | |
| 126 for either 'batch' or 'single' sequence retrieval method | |
| 127 Returns : a key,value pair hash | |
| 128 Args : 'single' or 'batch' mode for retrieval | |
| 129 | |
| 130 =cut | |
| 131 | |
| 132 sub get_params { | |
| 133 my ($self, $mode) = @_; | |
| 134 $self->throw("subclass did not implement get_params"); | |
| 135 } | |
| 136 | |
| 137 =head2 default_format | |
| 138 | |
| 139 Title : default_format | |
| 140 Usage : my $format = $self->default_format | |
| 141 Function: Returns default sequence format for this module | |
| 142 Returns : string | |
| 143 Args : none | |
| 144 | |
| 145 =cut | |
| 146 | |
| 147 sub default_format { | |
| 148 return $DEFAULTFORMAT; | |
| 149 } | |
| 150 | |
| 151 =head2 get_request | |
| 152 | |
| 153 Title : get_request | |
| 154 Usage : my $url = $self->get_request | |
| 155 Function: HTTP::Request | |
| 156 Returns : | |
| 157 Args : %qualifiers = a hash of qualifiers (ids, format, etc) | |
| 158 | |
| 159 =cut | |
| 160 | |
| 161 sub get_request { | |
| 162 my ($self, @qualifiers) = @_; | |
| 163 my ($mode, $uids, $format, $query) = $self->_rearrange([qw(MODE UIDS | |
| 164 FORMAT QUERY)], | |
| 165 @qualifiers); | |
| 166 | |
| 167 $mode = lc $mode; | |
| 168 ($format) = $self->request_format() unless ( defined $format); | |
| 169 if( !defined $mode || $mode eq '' ) { $mode = 'single'; } | |
| 170 my %params = $self->get_params($mode); | |
| 171 if( ! %params ) { | |
| 172 $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'") | |
| 173 } | |
| 174 my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]); | |
| 175 | |
| 176 unless( defined $uids or defined $query) { | |
| 177 $self->throw("Must specify a query or list of uids to fetch"); | |
| 178 } | |
| 179 | |
| 180 if ($uids) { | |
| 181 if( ref($uids) =~ /array/i ) { | |
| 182 $uids = join(",", @$uids); | |
| 183 } | |
| 184 $params{'id'} = $uids; | |
| 185 } | |
| 186 | |
| 187 elsif ($query && $query->can('cookie')) { | |
| 188 @params{'WebEnv','query_key'} = $query->cookie; | |
| 189 $params{'db'} = $query->db; | |
| 190 } | |
| 191 | |
| 192 elsif ($query) { | |
| 193 $params{'id'} = join ',',$query->ids; | |
| 194 } | |
| 195 | |
| 196 $params{'rettype'} = $format; | |
| 197 if ($CGILOCATION{$mode}[0] eq 'post') { | |
| 198 return POST $url,[%params]; | |
| 199 } else { | |
| 200 $url->query_form(%params); | |
| 201 $self->debug("url is $url \n"); | |
| 202 return GET $url; | |
| 203 } | |
| 204 } | |
| 205 | |
| 206 =head2 get_Stream_by_batch | |
| 207 | |
| 208 Title : get_Stream_by_batch | |
| 209 Usage : $seq = $db->get_Stream_by_batch($ref); | |
| 210 Function: Retrieves Seq objects from Entrez 'en masse', rather than one | |
| 211 at a time. For large numbers of sequences, this is far superior | |
| 212 than get_Stream_by_[id/acc](). | |
| 213 Example : | |
| 214 Returns : a Bio::SeqIO stream object | |
| 215 Args : $ref : either an array reference, a filename, or a filehandle | |
| 216 from which to get the list of unique ids/accession numbers. | |
| 217 | |
| 218 NOTE: deprecated API. Use get_Stream_by_id() instead. | |
| 219 | |
| 220 =cut | |
| 221 | |
| 222 *get_Stream_by_batch = sub { | |
| 223 my $self = shift; | |
| 224 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead'); | |
| 225 $self->get_Stream_by_id(@_) | |
| 226 }; | |
| 227 | |
| 228 =head2 get_Stream_by_query | |
| 229 | |
| 230 Title : get_Stream_by_query | |
| 231 Usage : $seq = $db->get_Stream_by_query($query); | |
| 232 Function: Retrieves Seq objects from Entrez 'en masse', rather than one | |
| 233 at a time. For large numbers of sequences, this is far superior | |
| 234 than get_Stream_by_[id/acc](). | |
| 235 Example : | |
| 236 Returns : a Bio::SeqIO stream object | |
| 237 Args : $query : An Entrez query string or a | |
| 238 Bio::DB::Query::GenBank object. It is suggested that you | |
| 239 create a Bio::DB::Query::GenBank object and get the entry | |
| 240 count before you fetch a potentially large stream. | |
| 241 | |
| 242 =cut | |
| 243 | |
| 244 sub get_Stream_by_query { | |
| 245 my ($self, $query) = @_; | |
| 246 unless (ref $query && $query->can('query')) { | |
| 247 $query = Bio::DB::Query::GenBank->new($query); | |
| 248 } | |
| 249 return $self->get_seq_stream('-query' => $query, '-mode'=>'query'); | |
| 250 } | |
| 251 | |
| 252 =head2 postprocess_data | |
| 253 | |
| 254 Title : postprocess_data | |
| 255 Usage : $self->postprocess_data ( 'type' => 'string', | |
| 256 'location' => \$datastr); | |
| 257 Function: process downloaded data before loading into a Bio::SeqIO | |
| 258 Returns : void | |
| 259 Args : hash with two keys - 'type' can be 'string' or 'file' | |
| 260 - 'location' either file location or string | |
| 261 reference containing data | |
| 262 | |
| 263 =cut | |
| 264 | |
| 265 # the default method, works for genbank/genpept, other classes should | |
| 266 # override it with their own method. | |
| 267 | |
| 268 sub postprocess_data { | |
| 269 my ($self, %args) = @_; | |
| 270 my $data; | |
| 271 my $type = uc $args{'type'}; | |
| 272 my $location = $args{'location'}; | |
| 273 if( !defined $type || $type eq '' || !defined $location) { | |
| 274 return; | |
| 275 } elsif( $type eq 'STRING' ) { | |
| 276 $data = $$location; | |
| 277 } elsif ( $type eq 'FILE' ) { | |
| 278 open(TMP, $location) or $self->throw("could not open file $location"); | |
| 279 my @in = <TMP>; | |
| 280 close TMP; | |
| 281 $data = join("", @in); | |
| 282 } | |
| 283 | |
| 284 # transform links to appropriate descriptions | |
| 285 if ($data =~ /\nCONTIG\s+/) { | |
| 286 $self->warn("CONTIG found. GenBank get_Stream_by_acc about to run."); | |
| 287 my(@batch,@accession,%accessions,@location,$id, | |
| 288 $contig,$stream,$aCount,$cCount,$gCount,$tCount); | |
| 289 | |
| 290 # process GenBank CONTIG join(...) into two arrays | |
| 291 $data =~ /(?:CONTIG\s+join\()((?:.+\n)+)(?:\/\/)/; | |
| 292 $contig = $1; | |
| 293 $contig =~ s/\n|\)//g; | |
| 294 foreach (split /\s*,\s*/,$contig){ | |
| 295 if (/>(.+)<.+>:(.+)/) { | |
| 296 ($id) = split /\./, $1; | |
| 297 push @accession, $id; | |
| 298 push @location, $2; | |
| 299 $accessions{$id}->{'count'}++; | |
| 300 } elsif( /([\w\.]+):(.+)/ ) { | |
| 301 ($id) = split /\./, $1; | |
| 302 $accessions{$id}->{'count'}++; | |
| 303 push @accession, $id; | |
| 304 push @location, $2; | |
| 305 } | |
| 306 } | |
| 307 | |
| 308 # grab multiple sequences by batch and join based location variable | |
| 309 my @unique_accessions = keys %accessions; | |
| 310 $stream = $self->get_Stream_by_acc(\@unique_accessions); | |
| 311 $contig = ""; | |
| 312 my $ct = 0; | |
| 313 while( my $seq = $stream->next_seq() ) { | |
| 314 if( $seq->accession_number !~ /$unique_accessions[$ct]/ ) { | |
| 315 printf STDERR "warning, %s does not match %s\n", | |
| 316 $seq->accession_number, $unique_accessions[$ct]; | |
| 317 } | |
| 318 $accessions{$unique_accessions[$ct]}->{'seq'} = $seq; | |
| 319 $ct++; | |
| 320 } | |
| 321 for (my $i = 0; $i < @accession; $i++) { | |
| 322 my $seq = $accessions{$accession[$i]}->{'seq'}; | |
| 323 unless( defined $seq ) { | |
| 324 # seq not cached, get next sequence | |
| 325 $self->warn("unable to find sequence $accession[$i]\n"); | |
| 326 return undef; | |
| 327 } | |
| 328 my($start,$end) = split(/\.\./, $location[$i]); | |
| 329 $contig .= $seq->subseq($start,$end-$start); | |
| 330 } | |
| 331 | |
| 332 # count number of each letter in sequence | |
| 333 $aCount = () = $contig =~ /a/ig; | |
| 334 $cCount = () = $contig =~ /c/ig; | |
| 335 $gCount = () = $contig =~ /g/ig; | |
| 336 $tCount = () = $contig =~ /t/ig; | |
| 337 | |
| 338 # remove everything after and including CONTIG | |
| 339 $data =~ s/(CONTIG[\s\S]+)$//i; | |
| 340 | |
| 341 # build ORIGIN part of data file using sequence and counts | |
| 342 $data .= "BASE COUNT $aCount a $cCount c $gCount g $tCount t\n"; | |
| 343 $data .= "ORIGIN \n"; | |
| 344 $data .= "$contig\n//"; | |
| 345 } | |
| 346 else { | |
| 347 $data =~ s/<a\s+href\s*=.+>\s*(\S+)\s*<\s*\/a\s*\>/$1/ig; | |
| 348 } | |
| 349 | |
| 350 # fix gt and lt | |
| 351 $data =~ s/>/>/ig; | |
| 352 $data =~ s/</</ig; | |
| 353 if( $type eq 'FILE' ) { | |
| 354 open(TMP, ">$location") or $self->throw("couldn't overwrite file $location"); | |
| 355 print TMP $data; | |
| 356 close TMP; | |
| 357 } elsif ( $type eq 'STRING' ) { | |
| 358 ${$args{'location'}} = $data; | |
| 359 } | |
| 360 $self->debug("format is ". join(',',$self->request_format()). | |
| 361 " data is\n$data\n"); | |
| 362 } | |
| 363 | |
| 364 | |
| 365 =head2 request_format | |
| 366 | |
| 367 Title : request_format | |
| 368 Usage : my ($req_format, $ioformat) = $self->request_format; | |
| 369 $self->request_format("genbank"); | |
| 370 $self->request_format("fasta"); | |
| 371 Function: Get/Set sequence format retrieval. The get-form will normally not | |
| 372 be used outside of this and derived modules. | |
| 373 Returns : Array of two strings, the first representing the format for | |
| 374 retrieval, and the second specifying the corresponding SeqIO format. | |
| 375 Args : $format = sequence format | |
| 376 | |
| 377 =cut | |
| 378 | |
| 379 sub request_format { | |
| 380 my ($self, $value) = @_; | |
| 381 if( defined $value ) { | |
| 382 $value = lc $value; | |
| 383 if( defined $FORMATMAP{$value} ) { | |
| 384 $self->{'_format'} = [ $value, $FORMATMAP{$value}]; | |
| 385 } else { | |
| 386 # Try to fall back to a default. Alternatively, we could throw | |
| 387 # an exception | |
| 388 $self->{'_format'} = [ $value, $value ]; | |
| 389 } | |
| 390 } | |
| 391 return @{$self->{'_format'}}; | |
| 392 } | |
| 393 | |
| 394 =head2 Bio::DB::WebDBSeqI methods | |
| 395 | |
| 396 Overriding WebDBSeqI method to help newbies to retrieve sequences | |
| 397 | |
| 398 =head2 get_Stream_by_acc | |
| 399 | |
| 400 Title : get_Stream_by_acc | |
| 401 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]); | |
| 402 Function: Gets a series of Seq objects by accession numbers | |
| 403 Returns : a Bio::SeqIO stream object | |
| 404 Args : $ref : a reference to an array of accession numbers for | |
| 405 the desired sequence entries | |
| 406 Note : For GenBank, this just calls the same code for get_Stream_by_id() | |
| 407 | |
| 408 =cut | |
| 409 | |
| 410 sub get_Stream_by_acc { | |
| 411 my ($self, $ids ) = @_; | |
| 412 my $newdb = $self->_check_id($ids); | |
| 413 if (defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq')) { | |
| 414 return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single'); | |
| 415 } else { | |
| 416 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); | |
| 417 } | |
| 418 } | |
| 419 | |
| 420 | |
| 421 =head2 _check_id | |
| 422 | |
| 423 Title : _check_id | |
| 424 Usage : | |
| 425 Function: | |
| 426 Returns : A Bio::DB::RefSeq reference or throws | |
| 427 Args : $id(s), $string | |
| 428 | |
| 429 =cut | |
| 430 | |
| 431 sub _check_id { | |
| 432 my ($self, $ids) = @_; | |
| 433 | |
| 434 # NT contigs can not be retrieved | |
| 435 $self->throw("NT_ contigs are whole chromosome files which are not part of regular". | |
| 436 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.") | |
| 437 if $ids =~ /NT_/; | |
| 438 | |
| 439 # Asking for a RefSeq from EMBL/GenBank | |
| 440 | |
| 441 if ($ids =~ /N._/) { | |
| 442 $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.". | |
| 443 " Redirecting the request.\n") | |
| 444 if $self->verbose >= 0; | |
| 445 return new Bio::DB::RefSeq; | |
| 446 } | |
| 447 } | |
| 448 | |
| 449 =head2 delay_policy | |
| 450 | |
| 451 Title : delay_policy | |
| 452 Usage : $secs = $self->delay_policy | |
| 453 Function: return number of seconds to delay between calls to remote db | |
| 454 Returns : number of seconds to delay | |
| 455 Args : none | |
| 456 | |
| 457 NOTE: NCBI requests a delay of 3s between requests. This method | |
| 458 implements that policy. | |
| 459 | |
| 460 =cut | |
| 461 | |
| 462 sub delay_policy { | |
| 463 my $self = shift; | |
| 464 return 3; | |
| 465 } | |
| 466 | |
| 467 1; | |
| 468 __END__ |
