Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/DB/NCBIHelper.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: NCBIHelper.pm,v 1.24.2.2 2003/06/12 09:29:38 heikki Exp $ | |
2 # | |
3 # BioPerl module for Bio::DB::NCBIHelper | |
4 # | |
5 # Cared for by Jason Stajich | |
6 # | |
7 # Copyright Jason Stajich | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 # | |
11 # POD documentation - main docs before the code | |
12 # | |
13 # Interfaces with new WebDBSeqI interface | |
14 | |
15 =head1 NAME | |
16 | |
17 Bio::DB::NCBIHelper - A collection of routines useful for queries to | |
18 NCBI databases. | |
19 | |
20 =head1 SYNOPSIS | |
21 | |
22 #Do not use this module directly. | |
23 | |
24 # get a Bio::DB::NCBIHelper object somehow | |
25 my $seqio = $db->get_Stream_by_acc(['MUSIGHBA1']); | |
26 foreach my $seq ( $seqio->next_seq ) { | |
27 # process seq | |
28 } | |
29 | |
30 =head1 DESCRIPTION | |
31 | |
32 Provides a single place to setup some common methods for querying NCBI | |
33 web databases. This module just centralizes the methods for | |
34 constructing a URL for querying NCBI GenBank and NCBI GenPept and the | |
35 common HTML stripping done in L<postprocess_data>(). | |
36 | |
37 The base NCBI query URL used is | |
38 http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi. | |
39 | |
40 =head1 FEEDBACK | |
41 | |
42 =head2 Mailing Lists | |
43 | |
44 User feedback is an integral part of the | |
45 evolution of this and other Bioperl modules. Send | |
46 your comments and suggestions preferably to one | |
47 of the Bioperl mailing lists. Your participation | |
48 is much appreciated. | |
49 | |
50 bioperl-l@bioperl.org - General discussion | |
51 http://bioperl.org/MailList.shtml - About the mailing lists | |
52 | |
53 =head2 Reporting Bugs | |
54 | |
55 Report bugs to the Bioperl bug tracking system to | |
56 help us keep track the bugs and their resolution. | |
57 Bug reports can be submitted via email or the | |
58 web: | |
59 | |
60 bioperl-bugs@bio.perl.org | |
61 http://bugzilla.bioperl.org/ | |
62 | |
63 =head1 AUTHOR - Jason Stajich | |
64 | |
65 Email jason@bioperl.org | |
66 | |
67 =head1 APPENDIX | |
68 | |
69 The rest of the documentation details each of the | |
70 object methods. Internal methods are usually | |
71 preceded with a _ | |
72 | |
73 =cut | |
74 | |
75 # Let the code begin... | |
76 | |
77 package Bio::DB::NCBIHelper; | |
78 use strict; | |
79 use vars qw(@ISA $HOSTBASE %CGILOCATION %FORMATMAP | |
80 $DEFAULTFORMAT $MAX_ENTRIES $VERSION); | |
81 | |
82 use Bio::DB::WebDBSeqI; | |
83 use Bio::DB::Query::GenBank; | |
84 use HTTP::Request::Common; | |
85 use URI; | |
86 use Bio::Root::IO; | |
87 use Bio::DB::RefSeq; | |
88 use Bio::Root::Root; | |
89 | |
90 @ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root); | |
91 $VERSION = '0.8'; | |
92 | |
93 BEGIN { | |
94 $MAX_ENTRIES = 19000; | |
95 $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov'; | |
96 %CGILOCATION = ( | |
97 'batch' => ['post' => '/entrez/eutils/efetch.fcgi'], | |
98 'query' => ['get' => '/entrez/eutils/efetch.fcgi'], | |
99 'single' => ['get' => '/entrez/eutils/efetch.fcgi'], | |
100 'version'=> ['get' => '/entrez/eutils/efetch.fcgi'], | |
101 'gi' => ['get' => '/entrez/eutils/efetch.fcgi'], | |
102 ); | |
103 | |
104 %FORMATMAP = ( 'gb' => 'genbank', | |
105 'gp' => 'genbank', | |
106 'fasta' => 'fasta', | |
107 ); | |
108 | |
109 $DEFAULTFORMAT = 'gb'; | |
110 } | |
111 | |
112 # the new way to make modules a little more lightweight | |
113 | |
114 sub new { | |
115 my ($class, @args ) = @_; | |
116 my $self = $class->SUPER::new(@args); | |
117 return $self; | |
118 } | |
119 | |
120 | |
121 =head2 get_params | |
122 | |
123 Title : get_params | |
124 Usage : my %params = $self->get_params($mode) | |
125 Function: Returns key,value pairs to be passed to NCBI database | |
126 for either 'batch' or 'single' sequence retrieval method | |
127 Returns : a key,value pair hash | |
128 Args : 'single' or 'batch' mode for retrieval | |
129 | |
130 =cut | |
131 | |
132 sub get_params { | |
133 my ($self, $mode) = @_; | |
134 $self->throw("subclass did not implement get_params"); | |
135 } | |
136 | |
137 =head2 default_format | |
138 | |
139 Title : default_format | |
140 Usage : my $format = $self->default_format | |
141 Function: Returns default sequence format for this module | |
142 Returns : string | |
143 Args : none | |
144 | |
145 =cut | |
146 | |
147 sub default_format { | |
148 return $DEFAULTFORMAT; | |
149 } | |
150 | |
151 =head2 get_request | |
152 | |
153 Title : get_request | |
154 Usage : my $url = $self->get_request | |
155 Function: HTTP::Request | |
156 Returns : | |
157 Args : %qualifiers = a hash of qualifiers (ids, format, etc) | |
158 | |
159 =cut | |
160 | |
161 sub get_request { | |
162 my ($self, @qualifiers) = @_; | |
163 my ($mode, $uids, $format, $query) = $self->_rearrange([qw(MODE UIDS | |
164 FORMAT QUERY)], | |
165 @qualifiers); | |
166 | |
167 $mode = lc $mode; | |
168 ($format) = $self->request_format() unless ( defined $format); | |
169 if( !defined $mode || $mode eq '' ) { $mode = 'single'; } | |
170 my %params = $self->get_params($mode); | |
171 if( ! %params ) { | |
172 $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'") | |
173 } | |
174 my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]); | |
175 | |
176 unless( defined $uids or defined $query) { | |
177 $self->throw("Must specify a query or list of uids to fetch"); | |
178 } | |
179 | |
180 if ($uids) { | |
181 if( ref($uids) =~ /array/i ) { | |
182 $uids = join(",", @$uids); | |
183 } | |
184 $params{'id'} = $uids; | |
185 } | |
186 | |
187 elsif ($query && $query->can('cookie')) { | |
188 @params{'WebEnv','query_key'} = $query->cookie; | |
189 $params{'db'} = $query->db; | |
190 } | |
191 | |
192 elsif ($query) { | |
193 $params{'id'} = join ',',$query->ids; | |
194 } | |
195 | |
196 $params{'rettype'} = $format; | |
197 if ($CGILOCATION{$mode}[0] eq 'post') { | |
198 return POST $url,[%params]; | |
199 } else { | |
200 $url->query_form(%params); | |
201 $self->debug("url is $url \n"); | |
202 return GET $url; | |
203 } | |
204 } | |
205 | |
206 =head2 get_Stream_by_batch | |
207 | |
208 Title : get_Stream_by_batch | |
209 Usage : $seq = $db->get_Stream_by_batch($ref); | |
210 Function: Retrieves Seq objects from Entrez 'en masse', rather than one | |
211 at a time. For large numbers of sequences, this is far superior | |
212 than get_Stream_by_[id/acc](). | |
213 Example : | |
214 Returns : a Bio::SeqIO stream object | |
215 Args : $ref : either an array reference, a filename, or a filehandle | |
216 from which to get the list of unique ids/accession numbers. | |
217 | |
218 NOTE: deprecated API. Use get_Stream_by_id() instead. | |
219 | |
220 =cut | |
221 | |
222 *get_Stream_by_batch = sub { | |
223 my $self = shift; | |
224 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead'); | |
225 $self->get_Stream_by_id(@_) | |
226 }; | |
227 | |
228 =head2 get_Stream_by_query | |
229 | |
230 Title : get_Stream_by_query | |
231 Usage : $seq = $db->get_Stream_by_query($query); | |
232 Function: Retrieves Seq objects from Entrez 'en masse', rather than one | |
233 at a time. For large numbers of sequences, this is far superior | |
234 than get_Stream_by_[id/acc](). | |
235 Example : | |
236 Returns : a Bio::SeqIO stream object | |
237 Args : $query : An Entrez query string or a | |
238 Bio::DB::Query::GenBank object. It is suggested that you | |
239 create a Bio::DB::Query::GenBank object and get the entry | |
240 count before you fetch a potentially large stream. | |
241 | |
242 =cut | |
243 | |
244 sub get_Stream_by_query { | |
245 my ($self, $query) = @_; | |
246 unless (ref $query && $query->can('query')) { | |
247 $query = Bio::DB::Query::GenBank->new($query); | |
248 } | |
249 return $self->get_seq_stream('-query' => $query, '-mode'=>'query'); | |
250 } | |
251 | |
252 =head2 postprocess_data | |
253 | |
254 Title : postprocess_data | |
255 Usage : $self->postprocess_data ( 'type' => 'string', | |
256 'location' => \$datastr); | |
257 Function: process downloaded data before loading into a Bio::SeqIO | |
258 Returns : void | |
259 Args : hash with two keys - 'type' can be 'string' or 'file' | |
260 - 'location' either file location or string | |
261 reference containing data | |
262 | |
263 =cut | |
264 | |
265 # the default method, works for genbank/genpept, other classes should | |
266 # override it with their own method. | |
267 | |
268 sub postprocess_data { | |
269 my ($self, %args) = @_; | |
270 my $data; | |
271 my $type = uc $args{'type'}; | |
272 my $location = $args{'location'}; | |
273 if( !defined $type || $type eq '' || !defined $location) { | |
274 return; | |
275 } elsif( $type eq 'STRING' ) { | |
276 $data = $$location; | |
277 } elsif ( $type eq 'FILE' ) { | |
278 open(TMP, $location) or $self->throw("could not open file $location"); | |
279 my @in = <TMP>; | |
280 close TMP; | |
281 $data = join("", @in); | |
282 } | |
283 | |
284 # transform links to appropriate descriptions | |
285 if ($data =~ /\nCONTIG\s+/) { | |
286 $self->warn("CONTIG found. GenBank get_Stream_by_acc about to run."); | |
287 my(@batch,@accession,%accessions,@location,$id, | |
288 $contig,$stream,$aCount,$cCount,$gCount,$tCount); | |
289 | |
290 # process GenBank CONTIG join(...) into two arrays | |
291 $data =~ /(?:CONTIG\s+join\()((?:.+\n)+)(?:\/\/)/; | |
292 $contig = $1; | |
293 $contig =~ s/\n|\)//g; | |
294 foreach (split /\s*,\s*/,$contig){ | |
295 if (/>(.+)<.+>:(.+)/) { | |
296 ($id) = split /\./, $1; | |
297 push @accession, $id; | |
298 push @location, $2; | |
299 $accessions{$id}->{'count'}++; | |
300 } elsif( /([\w\.]+):(.+)/ ) { | |
301 ($id) = split /\./, $1; | |
302 $accessions{$id}->{'count'}++; | |
303 push @accession, $id; | |
304 push @location, $2; | |
305 } | |
306 } | |
307 | |
308 # grab multiple sequences by batch and join based location variable | |
309 my @unique_accessions = keys %accessions; | |
310 $stream = $self->get_Stream_by_acc(\@unique_accessions); | |
311 $contig = ""; | |
312 my $ct = 0; | |
313 while( my $seq = $stream->next_seq() ) { | |
314 if( $seq->accession_number !~ /$unique_accessions[$ct]/ ) { | |
315 printf STDERR "warning, %s does not match %s\n", | |
316 $seq->accession_number, $unique_accessions[$ct]; | |
317 } | |
318 $accessions{$unique_accessions[$ct]}->{'seq'} = $seq; | |
319 $ct++; | |
320 } | |
321 for (my $i = 0; $i < @accession; $i++) { | |
322 my $seq = $accessions{$accession[$i]}->{'seq'}; | |
323 unless( defined $seq ) { | |
324 # seq not cached, get next sequence | |
325 $self->warn("unable to find sequence $accession[$i]\n"); | |
326 return undef; | |
327 } | |
328 my($start,$end) = split(/\.\./, $location[$i]); | |
329 $contig .= $seq->subseq($start,$end-$start); | |
330 } | |
331 | |
332 # count number of each letter in sequence | |
333 $aCount = () = $contig =~ /a/ig; | |
334 $cCount = () = $contig =~ /c/ig; | |
335 $gCount = () = $contig =~ /g/ig; | |
336 $tCount = () = $contig =~ /t/ig; | |
337 | |
338 # remove everything after and including CONTIG | |
339 $data =~ s/(CONTIG[\s\S]+)$//i; | |
340 | |
341 # build ORIGIN part of data file using sequence and counts | |
342 $data .= "BASE COUNT $aCount a $cCount c $gCount g $tCount t\n"; | |
343 $data .= "ORIGIN \n"; | |
344 $data .= "$contig\n//"; | |
345 } | |
346 else { | |
347 $data =~ s/<a\s+href\s*=.+>\s*(\S+)\s*<\s*\/a\s*\>/$1/ig; | |
348 } | |
349 | |
350 # fix gt and lt | |
351 $data =~ s/>/>/ig; | |
352 $data =~ s/</</ig; | |
353 if( $type eq 'FILE' ) { | |
354 open(TMP, ">$location") or $self->throw("couldn't overwrite file $location"); | |
355 print TMP $data; | |
356 close TMP; | |
357 } elsif ( $type eq 'STRING' ) { | |
358 ${$args{'location'}} = $data; | |
359 } | |
360 $self->debug("format is ". join(',',$self->request_format()). | |
361 " data is\n$data\n"); | |
362 } | |
363 | |
364 | |
365 =head2 request_format | |
366 | |
367 Title : request_format | |
368 Usage : my ($req_format, $ioformat) = $self->request_format; | |
369 $self->request_format("genbank"); | |
370 $self->request_format("fasta"); | |
371 Function: Get/Set sequence format retrieval. The get-form will normally not | |
372 be used outside of this and derived modules. | |
373 Returns : Array of two strings, the first representing the format for | |
374 retrieval, and the second specifying the corresponding SeqIO format. | |
375 Args : $format = sequence format | |
376 | |
377 =cut | |
378 | |
379 sub request_format { | |
380 my ($self, $value) = @_; | |
381 if( defined $value ) { | |
382 $value = lc $value; | |
383 if( defined $FORMATMAP{$value} ) { | |
384 $self->{'_format'} = [ $value, $FORMATMAP{$value}]; | |
385 } else { | |
386 # Try to fall back to a default. Alternatively, we could throw | |
387 # an exception | |
388 $self->{'_format'} = [ $value, $value ]; | |
389 } | |
390 } | |
391 return @{$self->{'_format'}}; | |
392 } | |
393 | |
394 =head2 Bio::DB::WebDBSeqI methods | |
395 | |
396 Overriding WebDBSeqI method to help newbies to retrieve sequences | |
397 | |
398 =head2 get_Stream_by_acc | |
399 | |
400 Title : get_Stream_by_acc | |
401 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]); | |
402 Function: Gets a series of Seq objects by accession numbers | |
403 Returns : a Bio::SeqIO stream object | |
404 Args : $ref : a reference to an array of accession numbers for | |
405 the desired sequence entries | |
406 Note : For GenBank, this just calls the same code for get_Stream_by_id() | |
407 | |
408 =cut | |
409 | |
410 sub get_Stream_by_acc { | |
411 my ($self, $ids ) = @_; | |
412 my $newdb = $self->_check_id($ids); | |
413 if (defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq')) { | |
414 return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single'); | |
415 } else { | |
416 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); | |
417 } | |
418 } | |
419 | |
420 | |
421 =head2 _check_id | |
422 | |
423 Title : _check_id | |
424 Usage : | |
425 Function: | |
426 Returns : A Bio::DB::RefSeq reference or throws | |
427 Args : $id(s), $string | |
428 | |
429 =cut | |
430 | |
431 sub _check_id { | |
432 my ($self, $ids) = @_; | |
433 | |
434 # NT contigs can not be retrieved | |
435 $self->throw("NT_ contigs are whole chromosome files which are not part of regular". | |
436 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.") | |
437 if $ids =~ /NT_/; | |
438 | |
439 # Asking for a RefSeq from EMBL/GenBank | |
440 | |
441 if ($ids =~ /N._/) { | |
442 $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.". | |
443 " Redirecting the request.\n") | |
444 if $self->verbose >= 0; | |
445 return new Bio::DB::RefSeq; | |
446 } | |
447 } | |
448 | |
449 =head2 delay_policy | |
450 | |
451 Title : delay_policy | |
452 Usage : $secs = $self->delay_policy | |
453 Function: return number of seconds to delay between calls to remote db | |
454 Returns : number of seconds to delay | |
455 Args : none | |
456 | |
457 NOTE: NCBI requests a delay of 3s between requests. This method | |
458 implements that policy. | |
459 | |
460 =cut | |
461 | |
462 sub delay_policy { | |
463 my $self = shift; | |
464 return 3; | |
465 } | |
466 | |
467 1; | |
468 __END__ |