0
|
1 #
|
|
2 # $Id: SwissProt.pm,v 1.19 2002/12/01 00:05:19 jason Exp $
|
|
3 #
|
|
4 # BioPerl module for Bio::DB::SwissProt
|
|
5 #
|
|
6 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
7 #
|
|
8 # Copyright Jason Stajich
|
|
9 #
|
|
10 # You may distribute this module under the same terms as perl itself
|
|
11
|
|
12 # POD documentation - main docs before the code
|
|
13 # Reworked to use Bio::DB::WebDBSeqI 2000-12-11
|
|
14
|
|
15 =head1 NAME
|
|
16
|
|
17 Bio::DB::SwissProt - Database object interface to SwissProt retrieval
|
|
18
|
|
19 =head1 SYNOPSIS
|
|
20
|
|
21 use Bio::DB::SwissProt;
|
|
22
|
|
23 $sp = new Bio::DB::SwissProt;
|
|
24
|
|
25 $seq = $sp->get_Seq_by_id('KPY1_ECOLI'); # SwissProt ID
|
|
26 # <4-letter-identifier>_<species 5-letter code>
|
|
27 # or ...
|
|
28 $seq = $sp->get_Seq_by_acc('P43780'); # SwissProt AC
|
|
29 # [OPQ]xxxxx
|
|
30
|
|
31
|
|
32 # In fact in this implementation
|
|
33 # these methods call the same webscript so you can use
|
|
34 # them interchangeably
|
|
35
|
|
36 # choose a different server to query
|
|
37 $sp = new Bio::DB::SwissProt('-servertype' => 'expasy',
|
|
38 '-hostlocation' => 'us');
|
|
39
|
|
40 $seq = $sp->get_Seq_by_id('BOLA_HAEIN'); # SwissProtID
|
|
41
|
|
42 =head1 DESCRIPTION
|
|
43
|
|
44 SwissProt is a curated database of proteins managed by the Swiss
|
|
45 Bioinformatics Institute. This is in contrast to EMBL/GenBank/DDBJ
|
|
46 which are archives of protein information. Additional tools for
|
|
47 parsing and manipulating swissprot files can be found at
|
|
48 ftp://ftp.ebi.ac.uk/pub/software/swissprot/Swissknife/.
|
|
49
|
|
50 Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the
|
|
51 SwissProt database via an expasy retrieval. Perhaps through SRS
|
|
52 later.
|
|
53
|
|
54 In order to make changes transparent we have host type (expasy or ebi,
|
|
55 defaulting to ebi) and location (expasy defaults to the us mirror) separated out. This
|
|
56 allows the user to pick the closest expasy mirror for running their
|
|
57 queries.
|
|
58
|
|
59
|
|
60 =head1 FEEDBACK
|
|
61
|
|
62 =head2 Mailing Lists
|
|
63
|
|
64 User feedback is an integral part of the evolution of this and other
|
|
65 Bioperl modules. Send your comments and suggestions preferably to one
|
|
66 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
67
|
|
68
|
|
69 bioperl-l@bioperl.org - General discussion
|
|
70 http://bio.perl.org/MailList.html - About the mailing lists
|
|
71
|
|
72 =head2 Reporting Bugs
|
|
73
|
|
74 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
75 of the bugs and their resolution. Bug reports can be submitted via email
|
|
76 or the web:
|
|
77
|
|
78 bioperl-bugs@bio.perl.org
|
|
79 http://bugzilla.bioperl.org/
|
|
80
|
|
81 =head1 AUTHOR - Jason Stajich
|
|
82
|
|
83 Email Jason Stajich E<lt>jason@bioperl.orgE<gt>
|
|
84
|
|
85 Thanks go to Alexandre Gattiker E<lt>gattiker@isb-sib.chE<gt> of Swiss
|
|
86 Institute of Bioinformatics for helping point us in the direction of
|
|
87 the correct expasy scripts and for swissknife references.
|
|
88
|
|
89 Also thanks to Heikki Lehvaslaiho E<lt>heikki@ebi.ac.ukE<gt> for help with
|
|
90 adding EBI swall server.
|
|
91
|
|
92 =head1 APPENDIX
|
|
93
|
|
94 The rest of the documentation details each of the object
|
|
95 methods. Internal methods are usually preceded with a _
|
|
96
|
|
97 =cut
|
|
98
|
|
99 # Let the code begin...
|
|
100
|
|
101 package Bio::DB::SwissProt;
|
|
102 use strict;
|
|
103 use vars qw(@ISA $MODVERSION %HOSTS $DEFAULTFORMAT $DEFAULTSERVERTYPE);
|
|
104
|
|
105 $MODVERSION = '0.8.1';
|
|
106 use HTTP::Request::Common;
|
|
107 use Bio::DB::WebDBSeqI;
|
|
108
|
|
109 @ISA = qw(Bio::DB::WebDBSeqI);
|
|
110
|
|
# Package-wide defaults used when the caller does not choose a server
# type or a retrieval format.
$DEFAULTSERVERTYPE = 'ebi';
$DEFAULTFORMAT     = 'swissprot';

# Per-server configuration, keyed by server type.  Each entry holds:
#   default  - host location used when none is requested
#   baseurl  - sprintf template; %s is replaced by the chosen hostname
#   hosts    - map of location name => hostname
#   jointype - separator used to join several ids into one CGI value
#   idvar    - name of the CGI variable that carries the id list
#   basevars - extra CGI name/value pairs sent with every request
# Additional server types can be added here.
%HOSTS = (
    expasy => {
        default => 'us',
        baseurl => 'http://%s/cgi-bin/sprot-retrieve-list.pl',
        hosts   => {
            switzerland => 'ch.expasy.org',
            canada      => 'ca.expasy.org',
            china       => 'cn.expasy.org',
            taiwan      => 'tw.expasy.org',
            australia   => 'au.expasy.org',
            korea       => 'kr.expasy.org',
            us          => 'us.expasy.org',
        },
        jointype => ' ',
        idvar    => 'list',
        basevars => [],
    },
    ebi => {
        default  => 'uk',
        baseurl  => 'http://%s/cgi-bin/dbfetch',
        hosts    => { uk => 'www.ebi.ac.uk' },
        jointype => ',',
        idvar    => 'id',
        basevars => [ db => 'swall', style => 'raw' ],
    },
);
|
|
147
|
|
148 # new modules should be a little more lightweight and
|
|
149 # should use Bio::Root::Root
|
|
# Constructor.  Recognised named arguments (in addition to whatever the
# parent constructor accepts): -format, -hostlocation and -servertype.
sub new {
    my $class  = shift;
    my @params = @_;
    my $self   = $class->SUPER::new(@params);

    my ($format, $hostlocation, $servertype) =
        $self->_rearrange([qw(FORMAT HOSTLOCATION SERVERTYPE)], @params);

    # Anything that does not look like swiss or fasta is replaced by
    # the module default, with a warning.
    unless ( !$format || $format =~ /(swiss)|(fasta)/i ) {
        $self->warn("Requested Format $format is ignored because only SwissProt and Fasta formats are currently supported");
        $format = $self->default_format;
    }

    # The server type has to be set before the host location because
    # setting it resets the location to that server's default.
    $self->servertype( lc( $servertype || $DEFAULTSERVERTYPE ) );
    $self->hostlocation( lc $hostlocation ) if $hostlocation;

    # Always re-apply the format, as it must be swiss or fasta.
    $self->request_format($format);
    return $self;
}
|
|
172
|
|
173 =head2 Routines from Bio::DB::RandomAccessI
|
|
174
|
|
175 =cut
|
|
176
|
|
177 =head2 get_Seq_by_id
|
|
178
|
|
179 Title : get_Seq_by_id
|
|
180 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
|
|
181 Function: Gets a Bio::Seq object by its name
|
|
182 Returns : a Bio::Seq object
|
|
183 Args : the id (as a string) of a sequence
|
|
184 Throws : "id does not exist" exception
|
|
185
|
|
186 =cut
|
|
187
|
|
188 =head2 get_Seq_by_acc
|
|
189
|
|
190 Title : get_Seq_by_acc
|
|
191 Usage : $seq = $db->get_Seq_by_acc('X77802');
|
|
192 Function: Gets a Bio::Seq object by accession number
|
|
193 Returns : A Bio::Seq object
|
|
194 Args : accession number (as a string)
|
|
195 Throws : "acc does not exist" exception
|
|
196
|
|
197 =cut
|
|
198
|
|
199 =head2 get_Stream_by_id
|
|
200
|
|
201 Title : get_Stream_by_id
|
|
202 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
|
|
203 Function: Gets a series of Seq objects by unique identifiers
|
|
204 Returns : a Bio::SeqIO stream object
|
|
205 Args : $ref : a reference to an array of unique identifiers for
|
|
206 the desired sequence entries
|
|
207
|
|
208 =cut
|
|
209
|
|
210 =head2 get_Stream_by_acc
|
|
211
|
|
212 Title : get_Stream_by_acc
|
|
213 Usage : $stream = $db->get_Stream_by_acc([$acc1, $acc2]);
|
|
214 Function: Gets a series of Seq objects by accession numbers
|
|
215 Returns : a Bio::SeqIO stream object
|
|
216 Args : $ref : a reference to an array of accession numbers for
|
|
217 the desired sequence entries
|
|
218 Note : For SwissProt, this just calls the same code for get_Stream_by_id()
|
|
219
|
|
220 =cut
|
|
221
|
|
222 =head2 get_Stream_by_batch
|
|
223
|
|
224 Title : get_Stream_by_batch
|
|
225 Usage : $stream = $db->get_Stream_by_batch($ref);
|
|
226 Function: Retrieves Seq objects from SwissProt 'en masse', rather than one
|
|
227 at a time. This is implemented the same way as get_Stream_by_id,
|
|
228 but is provided here in keeping with access methods of NCBI
|
|
229 modules.
|
|
230 Example :
|
|
231 Returns : a Bio::SeqIO stream object
|
|
232 Args : $ref : either an array reference, a filename, or a filehandle
|
|
233 from which to get the list of unique ids/accession numbers.
|
|
234
|
|
235 =cut
|
|
236
|
|
# Batch retrieval is simply an alias for get_Stream_by_id: the argument
# is handed over untouched and the result is returned as-is.
sub get_Stream_by_batch {
    my $self    = shift;
    my ($batch) = @_;
    return $self->get_Stream_by_id($batch);
}
|
|
241
|
|
242 =head2 Implemented Routines from Bio::DB::WebDBSeqI interface
|
|
243
|
|
244 =cut
|
|
245
|
|
246 =head2 get_request
|
|
247
|
|
248 Title : get_request
|
|
249 Usage : my $url = $self->get_request
|
|
250 Function: returns a HTTP::Request object
|
|
251 Returns : an HTTP::Request object
|
|
252 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
|
|
253
|
|
254 =cut
|
|
255
|
|
# Build the HTTP POST request for the given ids.
# Named arguments: -uids (a single id or an array reference of ids,
# required) and -format (optional retrieval format).
sub get_request {
    my $self = shift;
    my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)], @_);

    $self->throw("Must specify a value for uids to query")
        unless defined $uids;

    # Only the server-side format name is needed here; the SeqIO
    # format name (second element) is ignored.
    my ($server_format) = $self->request_format($format);

    my $type = $self->servertype;
    my %form = (
        @{ $HOSTS{$type}{'basevars'} },
        'format' => $server_format,
    );

    my $url       = $self->location_url;
    my $separator = $HOSTS{$type}{'jointype'} || ' ';
    my $id_field  = $HOSTS{$type}{'idvar'}    || 'id';

    # Several ids are sent as one joined value; a ' ' separator is
    # URL-encoded by HTTP::Request.
    $form{$id_field} = ref($uids) =~ /ARRAY/i
                     ? join($separator, @$uids)
                     : $uids;

    return POST($url, \%form);
}
|
|
287
|
|
288 =head2 postprocess_data
|
|
289
|
|
290 Title : postprocess_data
|
|
291 Usage : $self->postprocess_data ( 'type' => 'string',
|
|
292 'location' => \$datastr);
|
|
293 Function: process downloaded data before loading into a Bio::SeqIO
|
|
294 Returns : void
|
|
295 Args : hash with two keys - 'type' can be 'string' or 'file'
|
|
296 - 'location' either file location or string
|
|
297 reference containing data
|
|
298
|
|
299 =cut
|
|
300
|
|
301 # don't need to do anything
|
|
302
|
|
# Hook invoked on downloaded data before it is parsed.  Nothing needs
# to be done for this module, so it returns an empty list (undef in
# scalar context).
sub postprocess_data {
    my $self = shift;
    my %args = @_;
    return;
}
|
|
307
|
|
308 =head2 default_format
|
|
309
|
|
310 Title : default_format
|
|
311 Usage : my $format = $self->default_format
|
|
312 Function: Returns default sequence format for this module
|
|
313 Returns : string
|
|
314 Args : none
|
|
315
|
|
316 =cut
|
|
317
|
|
# Report the module-wide default sequence format.
sub default_format {
    my $format = $DEFAULTFORMAT;
    return $format;
}
|
|
321
|
|
322 =head2 Bio::DB::SwissProt specific routines
|
|
323
|
|
324 =cut
|
|
325
|
|
326 =head2 servertype
|
|
327
|
|
328 Title : servertype
|
|
329 Usage : my $servertype = $self->servertype
|
|
330 $self->servertype($servertype);
|
|
331 Function: Get/Set server type
|
|
332 Returns : string
|
|
333 Args : server type string [optional]
|
|
334
|
|
335 =cut
|
|
336
|
|
# Get/set the server type.
#
# Args    : optional server type string; must be a key of %HOSTS.
#           undef or '' means "just read the current value".
# Returns : the current server type, or $DEFAULTSERVERTYPE if none
#           has been set on this object.
# Throws  : if an unknown server type is given; the message lists the
#           available types by name.
# Side effects when setting: the host location is reset to the new
#           server's default, and the current request format is
#           re-applied so it is translated into the new server's syntax.
sub servertype {
    my ($self, $servertype) = @_;

    if ( defined $servertype && $servertype ne '' ) {
        unless ( $HOSTS{$servertype} ) {
            # join() is required here: concatenating 'keys %HOSTS'
            # directly evaluates it in scalar context and would print
            # the number of server types instead of their names.
            $self->throw("You gave an invalid server type ($servertype)".
                         " - available types are ".
                         join(", ", sort keys %HOSTS));
        }
        $self->{'_servertype'}   = $servertype;
        $self->{'_hostlocation'} = $HOSTS{$servertype}->{'default'};

        # make sure format is reset properly in that different
        # servers have different syntaxes
        my ($existingformat) = $self->request_format;
        $self->request_format($existingformat);
    }
    return $self->{'_servertype'} || $DEFAULTSERVERTYPE;
}
|
|
353
|
|
354
|
|
355 =head2 hostlocation
|
|
356
|
|
357 Title : hostlocation
|
|
358 Usage : my $location = $self->hostlocation()
|
|
359 $self->hostlocation($location)
|
|
360 Function: Set/Get Hostlocation
|
|
361 Returns : string representing hostlocation
|
|
362 Args : string specifying hostlocation [optional]
|
|
363
|
|
364 =cut
|
|
365
|
|
# Get/set the host location (mirror) for the current server type.
#
# Args    : optional location string (case-insensitive); must be one of
#           the 'hosts' keys of the current server type in %HOSTS.
#           undef or '' means "just read the current value".
# Returns : the stored host location (undef if none has been stored).
# Throws  : if the current server type is not configured in %HOSTS, or
#           if the requested location is unknown for that server type.
sub hostlocation {
    my ($self, $location) = @_;

    my $servertype = $self->servertype;
    $self->throw("Must have a valid servertype defined not ".
                 (defined $servertype ? $servertype : 'undef'))
        unless defined $servertype && $HOSTS{$servertype};

    if ( defined $location && $location ne '' ) {
        # Lower-case only when a value was actually supplied, so a
        # plain getter call never applies lc() to undef (which warns
        # under -w).
        $location = lc $location;

        my $hosts = $HOSTS{$servertype}->{'hosts'} || {};
        if ( !$hosts->{$location} ) {
            $self->throw("Must specify a known host, not $location,".
                         " possible values (".
                         join(",", sort keys %$hosts ). ")");
        }
        $self->{'_hostlocation'} = $location;
    }
    return $self->{'_hostlocation'};
}
|
|
383
|
|
384 =head2 location_url
|
|
385
|
|
386 Title : location_url
|
|
387 Usage : my $url = $self->location_url()
|
|
388 Function: Get host url
|
|
389 Returns : string representing url
|
|
390 Args : none
|
|
391
|
|
392 =cut
|
|
393
|
|
# Build the retrieval URL by substituting the hostname of the current
# host location into the current server type's 'baseurl' template.
# Throws if either the server type or the host location is undefined.
sub location_url {
    my $self  = shift;
    my $type  = $self->servertype();
    my $where = $self->hostlocation();

    $self->throw("must have a valid hostlocation and servertype set before calling location_url")
        unless defined $where && defined $type;

    my $template = $HOSTS{$type}{'baseurl'};
    my $hostname = $HOSTS{$type}{'hosts'}{$where};
    return sprintf($template, $hostname);
}
|
|
405
|
|
406 =head2 request_format
|
|
407
|
|
408 Title : request_format
|
|
409 Usage : my ($req_format, $ioformat) = $self->request_format;
|
|
410 $self->request_format("swiss");
|
|
411 $self->request_format("fasta");
|
|
412 Function: Get/Set sequence format retrieval. The get-form will normally not
|
|
413 be used outside of this and derived modules.
|
|
414 Returns : Array of two strings, the first representing the format for
|
|
415 retrieval, and the second specifying the corresponding SeqIO format.
|
|
416 Args : $format = sequence format
|
|
417
|
|
418 =cut
|
|
419
|
|
# Get/set the sequence retrieval format.
#
# Args    : optional format name.  Names containing 'sprot' or 'swiss'
#           select SwissProt; names starting with 'fa' select FASTA.
#           Matching is case-insensitive, in line with the check done
#           in new(), so 'FASTA' and 'SwissProt' are accepted.
# Returns : list of two strings: the format name the current server
#           expects, and the corresponding SeqIO format name.  An empty
#           list is returned if no format has been set yet.
# Notes   : An unrecognised name triggers a warning and falls back to
#           fasta on expasy and to swissprot on ebi.  For a server type
#           that is neither expasy nor ebi the stored format is left
#           unchanged.
sub request_format {
    my ($self, $value) = @_;

    if ( defined $value ) {
        my $servertype = $self->servertype;
        my $fasta      = [ 'fasta', 'fasta' ];

        # The two servers use different names for the SwissProt format
        # and different fallbacks for unrecognised requests.
        my ($swiss, $fallback);
        if ( $servertype =~ /expasy/ ) {
            $swiss    = [ 'sprot', 'swiss' ];
            $fallback = $fasta;
        }
        elsif ( $servertype =~ /ebi/ ) {
            $swiss    = [ 'swissprot', 'swiss' ];
            $fallback = $swiss;
        }

        if ( $swiss ) {
            if ( $value =~ /sprot/i || $value =~ /swiss/i ) {
                $self->{'_format'} = $swiss;
            }
            elsif ( $value =~ /^fa/i ) {
                $self->{'_format'} = $fasta;
            }
            else {
                $self->warn("Unrecognized format $value requested");
                $self->{'_format'} = $fallback;
            }
        }
    }

    # Guard against dereferencing undef when nothing has been set yet.
    return @{ $self->{'_format'} || [] };
}
|
|
445
|
|
446 1;
|
|
447 __END__
|