Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/GenBank.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: GenBank.pm,v 1.47.2.2 2003/07/03 12:31:31 heikki Exp $ | |
2 # | |
3 # BioPerl module for Bio::DB::GenBank | |
4 # | |
5 # Cared for by Aaron Mackey <amackey@virginia.edu> | |
6 # | |
7 # Copyright Aaron Mackey | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 # | |
11 # POD documentation - main docs before the code | |
12 # | |
13 # Added LWP support - Jason Stajich 2000-11-6 | |
14 # completely reworked by Jason Stajich 2000-12-8 | |
15 # to use WebDBSeqI | |
16 | |
17 # Added batch entrez back when determined that new entrez cgi will | |
18 # essentially work (there is a limit to the number of characters in a | |
19 # GET request so I am not sure how we can get around this). The NCBI | |
20 # Batch Entrez form has changed some and it does not support retrieval | |
21 # of text only data. Still should investigate POST-ing (tried and | |
22 # failed) a message to the entrez cgi to get around the GET | |
23 # limitations. | |
24 | |
25 =head1 NAME | |
26 | |
27 Bio::DB::GenBank - Database object interface to GenBank | |
28 | |
29 =head1 SYNOPSIS | |
30 | |
31 use Bio::DB::GenBank; | |
32 $gb = new Bio::DB::GenBank; | |
33 | |
34 $seq = $gb->get_Seq_by_id('MUSIGHBA1'); # Unique ID | |
35 | |
36 # or ... | |
37 | |
38 $seq = $gb->get_Seq_by_acc('J00522'); # Accession Number | |
39 $seq = $gb->get_Seq_by_version('J00522.1'); # Accession.version | |
40 $seq = $gb->get_Seq_by_gi('405830'); # GI Number | |
41 | |
42 # get a stream via a query string | |
43 my $query = Bio::DB::Query::GenBank->new | |
44 (-query =>'Oryza sativa[Organism] AND EST', | |
45 -reldate => '30', | |
46 -db => 'nucleotide'); | |
47 my $seqio = $gb->get_Stream_by_query($query); | |
48 | |
49 while( my $seq = $seqio->next_seq ) { | |
50 print "seq length is ", $seq->length,"\n"; | |
51 } | |
52 | |
53 # or ... best when downloading very large files, prevents | |
54 # keeping all of the file in memory | |
55 | |
56 # also don't want features, just sequence so let's save bandwidth | |
57 # and request Fasta sequence | |
58 $gb = new Bio::DB::GenBank(-retrievaltype => 'tempfile' , | |
59 -format => 'Fasta'); | |
60 my $seqio = $gb->get_Stream_by_acc(['AC013798', 'AC021953'] ); | |
61 while( my $clone = $seqio->next_seq ) { | |
62 print "cloneid is ", $clone->display_id, " ", | |
63 $clone->accession_number, "\n"; | |
64 } | |
65 # note that get_Stream_by_version is not implemented | |
66 | |
67 =head1 DESCRIPTION | |
68 | |
69 Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the | |
70 GenBank database at NCBI, via an Entrez query. | |
71 | |
72 WARNING: Please do NOT spam the Entrez web server with multiple | |
73 requests. NCBI offers Batch Entrez for this purpose. | |
74 | |
75 Note that when querying for GenBank accessions starting with 'NT_' you | |
76 will need to call $gb-E<gt>request_format('fasta') beforehand, because | |
77 in GenBank format (the default) the sequence part will be left out | |
78 (the reason is that NT contigs are rather annotation with references | |
79 to clones). | |
80 | |
81 Some work has been done to automatically detect and retrieve whole NT_ | |
82 clones when the data is in that format (NCBI RefSeq clones). More | |
83 testing and feedback from users is needed to achieve a good fit of | |
84 functionality and ease of use. | |
85 | |
86 =head1 FEEDBACK | |
87 | |
88 =head2 Mailing Lists | |
89 | |
90 User feedback is an integral part of the evolution of this and other | |
91 Bioperl modules. Send your comments and suggestions preferably to one | |
92 of the Bioperl mailing lists. Your participation is much appreciated. | |
93 | |
94 bioperl-l@bioperl.org - General discussion | |
95 http://bioperl.org/MailList.shtml - About the mailing lists | |
96 | |
97 =head2 Reporting Bugs | |
98 | |
99 Report bugs to the Bioperl bug tracking system to help us keep track | |
100 of the bugs and their resolution. Bug reports can be submitted via email | |
101 or the web: | |
102 | |
103 bioperl-bugs@bio.perl.org | |
104 http://bugzilla.bioperl.org/ | |
105 | |
106 =head1 AUTHOR - Aaron Mackey, Jason Stajich | |
107 | |
108 Email amackey@virginia.edu | |
109 Email jason@bioperl.org | |
110 | |
111 =head1 APPENDIX | |
112 | |
113 The rest of the documentation details each of the | |
114 object methods. Internal methods are usually | |
115 preceded with a _ | |
116 | |
117 =cut | |
118 | |
119 # Let the code begin... | |
120 | |
121 package Bio::DB::GenBank; | |
122 use strict; | |
123 use vars qw(@ISA %PARAMSTRING $DEFAULTFORMAT $DEFAULTMODE); | |
124 use Bio::DB::NCBIHelper; | |
125 | |
126 @ISA = qw(Bio::DB::NCBIHelper); | |
# Compile-time setup of the package-wide defaults.
#
# %PARAMSTRING is keyed by retrieval mode; get_params() looks a mode up
# here and falls back to the $DEFAULTMODE entry when the mode is unknown.
# Each value is a hash of key/value pairs to be passed to the NCBI
# database.
BEGIN {
    $DEFAULTFORMAT = 'gp';
    $DEFAULTMODE   = 'single';

    # Parameter set shared by every mode that does not use the history
    # server ('usehistory' => 'n').
    my %direct_fetch = (
        'db'         => 'nucleotide',
        'usehistory' => 'n',
        'tool'       => 'bioperl',
        'retmode'    => 'text',
    );

    %PARAMSTRING = (
        # Query mode is the only one that enables 'usehistory' and it
        # does not pin a 'db' value.
        'query' => {
            'usehistory' => 'y',
            'tool'       => 'bioperl',
            'retmode'    => 'text',
        },

        # Every remaining mode receives its own fresh copy of the shared
        # set, so the entries stay independent of each other.
        map { ( $_ => +{ %direct_fetch } ) } qw(batch gi version single),
    );
}
152 | |
153 # new is in NCBIHelper | |
154 | |
155 # helper method to get db specific options | |
156 | |
157 =head2 new | |
158 | |
159 Title : new | |
160 Usage : $gb = Bio::DB::GenBank->new(@options) | |
161 Function: Creates a new genbank handle | |
162 Returns : New genbank handle | |
163 Args : -delay number of seconds to delay between fetches (3s) | |
164 | |
165 NOTE: There are other options that are used internally. By NCBI policy, this | |
166 module introduces a 3s delay between fetches. If you are fetching multiple genbank | |
167 ids, it is a good idea to use get_Stream_by_id() rather than repeated single fetches. | |
168 | |
169 =cut | |
170 | |
171 =head2 get_params | |
172 | |
173 Title : get_params | |
174 Usage : my %params = $self->get_params($mode) | |
175 Function: Returns key,value pairs to be passed to NCBI database | |
176 for either 'batch' or 'single' sequence retrieval method | |
177 Returns : a key,value pair hash | |
178 Args : retrieval mode: 'single', 'batch', 'query', 'gi' or 'version' (any other value falls back to 'single') | |
179 | |
180 =cut | |
181 | |
# Return the key/value pairs registered in %PARAMSTRING for $mode.
#
# Args    : $mode - name of a retrieval mode (a key of %PARAMSTRING)
# Returns : the parameter hash for $mode, flattened into a key,value
#           list; when $mode has no defined entry, the parameters of
#           $DEFAULTMODE are returned instead.
sub get_params {
    my ($self, $mode) = @_;

    my $settings = $PARAMSTRING{$mode};

    # Unknown (or missing) mode: fall back to the package default.
    $settings = $PARAMSTRING{$DEFAULTMODE} unless defined $settings;

    return %{$settings};
}
187 | |
188 # from Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI | |
189 | |
190 =head1 Routines Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI | |
191 | |
192 =head2 get_Seq_by_id | |
193 | |
194 Title : get_Seq_by_id | |
195 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') | |
196 Function: Gets a Bio::Seq object by its name | |
197 Returns : a Bio::Seq object | |
198 Args : the id (as a string) of a sequence | |
199 Throws : "id does not exist" exception | |
200 | |
201 =head2 get_Seq_by_acc | |
202 | |
203 Title : get_Seq_by_acc | |
204 Usage : $seq = $db->get_Seq_by_acc($acc); | |
205 Function: Gets a Seq object by accession numbers | |
206 Returns : a Bio::Seq object | |
207 Args : the accession number as a string | |
208 Note : For GenBank, this just calls the same code for get_Seq_by_id() | |
209 Throws : "id does not exist" exception | |
210 | |
211 =cut | |
212 | |
213 | |
# Fetch a sequence by accession number.
#
# Args    : $seqid - the accession number as a string
# Returns : whatever the parent class' get_Seq_by_acc() returns for the
#           prefixed accession (documented above as a Bio::Seq object)
sub get_Seq_by_acc {
    my $self  = shift;
    my $seqid = shift;

    # The accession is handed to the parent implementation with a 'gb|'
    # prefix.  NOTE(review): presumably this tags the id as a GenBank
    # accession for the Entrez lookup - confirm against NCBIHelper.
    my $prefixed_acc = 'gb|' . $seqid;

    return $self->SUPER::get_Seq_by_acc($prefixed_acc);
}
218 | |
219 =head2 get_Seq_by_gi | |
220 | |
221 Title : get_Seq_by_gi | |
222 Usage : $seq = $db->get_Seq_by_gi('405830'); | |
223 Function: Gets a Bio::Seq object by gi number | |
224 Returns : A Bio::Seq object | |
225 Args : gi number (as a string) | |
226 Throws : "gi does not exist" exception | |
227 | |
228 =head2 get_Seq_by_version | |
229 | |
230 Title : get_Seq_by_version | |
231 Usage : $seq = $db->get_Seq_by_version('X77802.1'); | |
232 Function: Gets a Bio::Seq object by sequence version | |
233 Returns : A Bio::Seq object | |
234 Args : accession.version (as a string) | |
235 Throws : "acc.version does not exist" exception | |
236 | |
237 =head1 Routines implemented by Bio::DB::NCBIHelper | |
238 | |
239 =head2 get_Stream_by_query | |
240 | |
241 Title : get_Stream_by_query | |
242 Usage : $seq = $db->get_Stream_by_query($query); | |
243 Function: Retrieves Seq objects from Entrez 'en masse', rather than one | |
244 at a time. For large numbers of sequences, this is far superior | |
245 to get_Stream_by_[id/acc](). | |
246 Example : | |
247 Returns : a Bio::SeqIO stream object | |
248 Args : $query : An Entrez query string or a | |
249 Bio::DB::Query::GenBank object. It is suggested that you | |
250 create a Bio::DB::Query::GenBank object and get the entry | |
251 count before you fetch a potentially large stream. | |
252 | |
253 =cut | |
254 | |
255 =head2 get_Stream_by_id | |
256 | |
257 Title : get_Stream_by_id | |
258 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); | |
259 Function: Gets a series of Seq objects by unique identifiers | |
260 Returns : a Bio::SeqIO stream object | |
261 Args : $ref : a reference to an array of unique identifiers for | |
262 the desired sequence entries | |
263 | |
264 =head2 get_Stream_by_acc | |
265 | |
266 Title : get_Stream_by_acc | |
267 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]); | |
268 Function: Gets a series of Seq objects by accession numbers | |
269 Returns : a Bio::SeqIO stream object | |
270 Args : $ref : a reference to an array of accession numbers for | |
271 the desired sequence entries | |
272 Note : For GenBank, this just calls the same code for get_Stream_by_id() | |
273 | |
274 =cut | |
275 | |
276 =head2 get_Stream_by_gi | |
277 | |
278 Title : get_Stream_by_gi | |
279 Usage : $seq = $db->get_Stream_by_gi([$gi1, $gi2]); | |
280 Function: Gets a series of Seq objects by gi numbers | |
281 Returns : a Bio::SeqIO stream object | |
282 Args : $ref : a reference to an array of gi numbers for | |
283 the desired sequence entries | |
284 Note : For GenBank, this just calls the same code for get_Stream_by_id() | |
285 | |
286 =head2 get_Stream_by_batch | |
287 | |
288 Title : get_Stream_by_batch | |
289 Usage : $seq = $db->get_Stream_by_batch($ref); | |
290 Function: Retrieves Seq objects from Entrez 'en masse', rather than one | |
291 at a time. | |
292 Example : | |
293 Returns : a Bio::SeqIO stream object | |
294 Args : $ref : either an array reference, a filename, or a filehandle | |
295 from which to get the list of unique ids/accession numbers. | |
296 | |
297 NOTE: This method is redundant and deprecated. Use get_Stream_by_id() | |
298 instead. | |
299 | |
300 =head2 get_request | |
301 | |
302 Title : get_request | |
303 Usage : my $url = $self->get_request | |
304 Function: returns a HTTP::Request object | |
305 Returns : HTTP::Request | |
306 Args : %qualifiers = a hash of qualifiers (ids, format, etc) | |
307 | |
308 1; | |
309 __END__ |