Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/SwissProt.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # | |
| 2 # $Id: SwissProt.pm,v 1.19 2002/12/01 00:05:19 jason Exp $ | |
| 3 # | |
| 4 # BioPerl module for Bio::DB::SwissProt | |
| 5 # | |
| 6 # Cared for by Jason Stajich <jason@bioperl.org> | |
| 7 # | |
| 8 # Copyright Jason Stajich | |
| 9 # | |
| 10 # You may distribute this module under the same terms as perl itself | |
| 11 | |
| 12 # POD documentation - main docs before the code | |
| 13 # Reworked to use Bio::DB::WebDBSeqI 2000-12-11 | |
| 14 | |
| 15 =head1 NAME | |
| 16 | |
| 17 Bio::DB::SwissProt - Database object interface to SwissProt retrieval | |
| 18 | |
| 19 =head1 SYNOPSIS | |
| 20 | |
| 21 use Bio::DB::SwissProt; | |
| 22 | |
| 23 $sp = new Bio::DB::SwissProt; | |
| 24 | |
| 25 $seq = $sp->get_Seq_by_id('KPY1_ECOLI'); # SwissProt ID | |
| 26 # <4-letter-identifier>_<species 5-letter code> | |
| 27 # or ... | |
| 28 $seq = $sp->get_Seq_by_acc('P43780'); # SwissProt AC | |
| 29 # [OPQ]xxxxx | |
| 30 | |
| 31 | |
| 32 # In fact in this implementation | |
| 33 # these methods call the same webscript so you can use | |
| 34 # them interchangeably | |
| 35 | |
| 36 # choose a different server to query | |
| 37 $sp = new Bio::DB::SwissProt('-servertype' => 'expasy', | |
| 38 '-hostlocation' => 'us'); | |
| 39 | |
| 40 $seq = $sp->get_Seq_by_id('BOLA_HAEIN'); # SwissProtID | |
| 41 | |
| 42 =head1 DESCRIPTION | |
| 43 | |
| 44 SwissProt is a curated database of proteins managed by the Swiss | |
| 45 Institute of Bioinformatics. This is in contrast to EMBL/GenBank/DDBJ | |
| 46 which are archives of protein information. Additional tools for | |
| 47 parsing and manipulating swissprot files can be found at | |
| 48 ftp://ftp.ebi.ac.uk/pub/software/swissprot/Swissknife/. | |
| 49 | |
| 50 Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the | |
| 51 SwissProt database via an expasy retrieval. Perhaps through SRS | |
| 52 later. | |
| 53 | |
| 54 In order to make changes transparent we have host type (expasy or | |
| 55 ebi, default ebi) and location (expasy defaults to us) separated out. This | |
| 56 allows the user to pick the closest expasy mirror for running their | |
| 57 queries. | |
| 58 | |
| 59 | |
| 60 =head1 FEEDBACK | |
| 61 | |
| 62 =head2 Mailing Lists | |
| 63 | |
| 64 User feedback is an integral part of the evolution of this and other | |
| 65 Bioperl modules. Send your comments and suggestions preferably to one | |
| 66 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 67 | |
| 68 | |
| 69 bioperl-l@bioperl.org - General discussion | |
| 70 http://bio.perl.org/MailList.html - About the mailing lists | |
| 71 | |
| 72 =head2 Reporting Bugs | |
| 73 | |
| 74 Report bugs to the Bioperl bug tracking system to help us keep track of | |
| 75 the bugs and their resolution. Bug reports can be submitted via email | |
| 76 or the web: | |
| 77 | |
| 78 bioperl-bugs@bio.perl.org | |
| 79 http://bugzilla.bioperl.org/ | |
| 80 | |
| 81 =head1 AUTHOR - Jason Stajich | |
| 82 | |
| 83 Email Jason Stajich E<lt>jason@bioperl.orgE<gt> | |
| 84 | |
| 85 Thanks go to Alexandre Gattiker E<lt>gattiker@isb-sib.chE<gt> of Swiss | |
| 86 Institute of Bioinformatics for helping point us in the direction of | |
| 87 the correct expasy scripts and for swissknife references. | |
| 88 | |
| 89 Also thanks to Heikki Lehvaslaiho E<lt>heikki@ebi.ac.ukE<gt> for help with | |
| 90 adding EBI swall server. | |
| 91 | |
| 92 =head1 APPENDIX | |
| 93 | |
| 94 The rest of the documentation details each of the object | |
| 95 methods. Internal methods are usually preceded with a _ | |
| 96 | |
| 97 =cut | |
| 98 | |
| 99 # Let the code begin... | |
| 100 | |
| 101 package Bio::DB::SwissProt; | |
| 102 use strict; | |
| 103 use vars qw(@ISA $MODVERSION %HOSTS $DEFAULTFORMAT $DEFAULTSERVERTYPE); | |
| 104 | |
| 105 $MODVERSION = '0.8.1'; | |
| 106 use HTTP::Request::Common; | |
| 107 use Bio::DB::WebDBSeqI; | |
| 108 | |
| 109 @ISA = qw(Bio::DB::WebDBSeqI); | |
| 110 | |
# Module-wide defaults.
$DEFAULTSERVERTYPE = 'ebi';          # server type used when none is requested
$DEFAULTFORMAT     = 'swissprot';    # value handed out by default_format()

# Per-server-type settings, keyed by server type name.
# Every entry carries:
#   default  - host location selected whenever this server type is set
#   baseurl  - sprintf template; %s receives the chosen host name
#   hosts    - host location => host name
#   jointype - separator put between ids when several are requested at once
#   idvar    - name of the CGI variable that carries the id list
#   basevars - fixed CGI key/value pairs sent with every request
# You can add your own here theoretically.
%HOSTS = (
    ebi => {
        default  => 'uk',
        baseurl  => 'http://%s/cgi-bin/dbfetch',
        hosts    => {
            uk => 'www.ebi.ac.uk',
        },
        jointype => ',',
        idvar    => 'id',
        basevars => [
            db    => 'swall',
            style => 'raw',
        ],
    },
    expasy => {
        default  => 'us',
        baseurl  => 'http://%s/cgi-bin/sprot-retrieve-list.pl',
        hosts    => {
            australia   => 'au.expasy.org',
            canada      => 'ca.expasy.org',
            china       => 'cn.expasy.org',
            korea       => 'kr.expasy.org',
            switzerland => 'ch.expasy.org',
            taiwan      => 'tw.expasy.org',
            us          => 'us.expasy.org',
        },
        # ick, CGI variables
        jointype => ' ',
        idvar    => 'list',
        basevars => [],
    },
);
| 147 | |
| 148 # new modules should be a little more lightweight and | |
| 149 # should use Bio::Root::Root | |
# Constructor.
#
# The complete argument list is first handed to the parent constructor; the
# following named arguments are then read here:
#   -format       : retrieval format.  Names containing "swiss", "sprot" or
#                   "fasta" (in any case) are accepted; anything else yields
#                   a warning and default_format() is used instead.
#   -servertype   : a key of %HOSTS; $DEFAULTSERVERTYPE when omitted.
#   -hostlocation : a key of the selected server type's 'hosts' table.
#
# Returns the new object.  servertype() and hostlocation() throw when handed
# a value that is not in %HOSTS.
sub new {
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);

    my ($format, $hostlocation, $servertype) =
        $self->_rearrange([qw(FORMAT HOSTLOCATION SERVERTYPE)], @args);

    # "sprot" is accepted as well because request_format() understands it.
    if ( $format && $format !~ /swiss|sprot|fasta/i ) {
        $self->warn("Requested Format $format is ignored because only SwissProt and Fasta formats are currently supported");
        $format = $self->default_format;
    }

    # The server type has to be set before the host location: servertype()
    # resets the host location to the default of the chosen server type.
    $self->servertype( lc( $servertype || $DEFAULTSERVERTYPE ) );
    $self->hostlocation( lc $hostlocation ) if $hostlocation;

    # request_format() is given a lower-cased name so that a format that
    # passed the case-insensitive check above (e.g. 'FASTA') is honoured
    # instead of being replaced by the server's fallback format.
    $self->request_format( lc $format ) if defined $format;
    return $self;
}
| 172 | |
| 173 =head2 Routines from Bio::DB::RandomAccessI | |
| 174 | |
| 175 =cut | |
| 176 | |
| 177 =head2 get_Seq_by_id | |
| 178 | |
| 179 Title : get_Seq_by_id | |
| 180 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') | |
| 181 Function: Gets a Bio::Seq object by its name | |
| 182 Returns : a Bio::Seq object | |
| 183 Args : the id (as a string) of a sequence | |
| 184 Throws : "id does not exist" exception | |
| 185 | |
| 186 =cut | |
| 187 | |
| 188 =head2 get_Seq_by_acc | |
| 189 | |
| 190 Title : get_Seq_by_acc | |
| 191 Usage : $seq = $db->get_Seq_by_acc('X77802'); | |
| 192 Function: Gets a Bio::Seq object by accession number | |
| 193 Returns : A Bio::Seq object | |
| 194 Args : accession number (as a string) | |
| 195 Throws : "acc does not exist" exception | |
| 196 | |
| 197 =cut | |
| 198 | |
| 199 =head2 get_Stream_by_id | |
| 200 | |
| 201 Title : get_Stream_by_id | |
| 202 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); | |
| 203 Function: Gets a series of Seq objects by unique identifiers | |
| 204 Returns : a Bio::SeqIO stream object | |
| 205 Args : $ref : a reference to an array of unique identifiers for | |
| 206 the desired sequence entries | |
| 207 | |
| 208 =cut | |
| 209 | |
| 210 =head2 get_Stream_by_acc | |
| 211 | |
| 212 Title : get_Stream_by_acc | |
| 213 Usage : $stream = $db->get_Stream_by_acc([$acc1, $acc2]); | |
| 214 Function: Gets a series of Seq objects by accession numbers | |
| 215 Returns : a Bio::SeqIO stream object | |
| 216 Args : $ref : a reference to an array of accession numbers for | |
| 217 the desired sequence entries | |
| 218 Note : For GenBank, this just calls the same code for get_Stream_by_id() | |
| 219 | |
| 220 =cut | |
| 221 | |
| 222 =head2 get_Stream_by_batch | |
| 223 | |
| 224 Title : get_Stream_by_batch | |
| 225 Usage : $stream = $db->get_Stream_by_batch($ref); | |
| 226 Function: Retrieves Seq objects from SwissProt 'en masse', rather than one | |
| 227 at a time. This is implemented the same way as get_Stream_by_id, | |
| 228 but is provided here in keeping with access methods of NCBI | |
| 229 modules. | |
| 230 Example : | |
| 231 Returns : a Bio::SeqIO stream object | |
| 232 Args : $ref : either an array reference, a filename, or a filehandle | |
| 233 from which to get the list of unique ids/accession numbers. | |
| 234 | |
| 235 =cut | |
| 236 | |
# Batch retrieval entry point: the argument is forwarded unchanged to
# get_Stream_by_id() and whatever that method returns is handed back.
sub get_Stream_by_batch {
    my $self = shift;
    my $ids  = shift;
    return $self->get_Stream_by_id($ids);
}
| 241 | |
| 242 =head2 Implemented Routines from Bio::DB::WebDBSeqI interface | |
| 243 | |
| 244 =cut | |
| 245 | |
| 246 =head2 get_request | |
| 247 | |
| 248 Title : get_request | |
| 249 Usage : my $url = $self->get_request | |
| 250 Function: returns a HTTP::Request object | |
| 251 Returns : | |
| 252 Args : %qualifiers = a hash of qualifiers (ids, format, etc) | |
| 253 | |
| 254 =cut | |
| 255 | |
# Builds the POST request for a set of ids.
#
# Named arguments:
#   -uids   : a single id, or an array reference of ids (required)
#   -format : optional format name, passed through request_format()
#
# Returns whatever POST() produces for the server URL and the CGI variables.
# Throws when -uids is not defined.
sub get_request {
    my ($self, @qualifiers) = @_;
    my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)], @qualifiers);

    $self->throw("Must specify a value for uids to query")
        unless defined $uids;

    # Only the retrieval name is needed here, not the SeqIO format.
    my ($retrieval_format) = $self->request_format($format);

    my $server = $HOSTS{ $self->servertype };
    my %vars   = ( @{ $server->{'basevars'} }, 'format' => $retrieval_format );

    my $url       = $self->location_url;
    my $separator = $server->{'jointype'} || ' ';
    my $id_field  = $server->{'idvar'}    || 'id';

    # HTTP::Request automagically converts the ' ' to %20
    $vars{$id_field} = ( ref($uids) =~ /ARRAY/i )
                     ? join($separator, @$uids)
                     : $uids;

    return POST($url, \%vars);
}
| 287 | |
| 288 =head2 postprocess_data | |
| 289 | |
| 290 Title : postprocess_data | |
| 291 Usage : $self->postprocess_data ( 'type' => 'string', | |
| 292 'location' => \$datastr); | |
| 293 Function: process downloaded data before loading into a Bio::SeqIO | |
| 294 Returns : void | |
| 295 Args : hash with two keys - 'type' can be 'string' or 'file' | |
| 296 - 'location' either file location or string | |
| 297 reference containing data | |
| 298 | |
| 299 =cut | |
| 300 | |
| 301 # don't need to do anything | |
| 302 | |
# Hook called on downloaded data; this module needs no post-processing, so
# the arguments are unpacked and nothing is returned.
sub postprocess_data {
    my $self = shift;
    my %args = @_;
    return;
}
| 307 | |
| 308 =head2 default_format | |
| 309 | |
| 310 Title : default_format | |
| 311 Usage : my $format = $self->default_format | |
| 312 Function: Returns default sequence format for this module | |
| 313 Returns : string | |
| 314 Args : none | |
| 315 | |
| 316 =cut | |
| 317 | |
# Returns the module-wide default format name held in $DEFAULTFORMAT.
sub default_format {
    my $format = $DEFAULTFORMAT;
    return $format;
}
| 321 | |
| 322 =head2 Bio::DB::SwissProt specific routines | |
| 323 | |
| 324 =cut | |
| 325 | |
| 326 =head2 servertype | |
| 327 | |
| 328 Title : servertype | |
| 329 Usage : my $servertype = $self->servertype | |
| 330 $self->servertype($servertype); | |
| 331 Function: Get/Set server type | |
| 332 Returns : string | |
| 333 Args : server type string [optional] | |
| 334 | |
| 335 =cut | |
| 336 | |
# Get/set the server type.
#
# Args    : optional server type; must be a key of %HOSTS
# Returns : the stored server type, or $DEFAULTSERVERTYPE when none is stored
# Throws  : when the given server type is not a key of %HOSTS; the message
#           lists the available types
#
# Setting the server type also resets the host location to that server
# type's default.
sub servertype {
    my ($self, $servertype) = @_;
    if ( defined $servertype && $servertype ne '' ) {
        unless ( $HOSTS{$servertype} ) {
            # join() is required here: a bare "keys %HOSTS" in a string
            # concatenation yields the number of keys, not their names.
            $self->throw("You gave an invalid server type ($servertype)".
                         " - available types are ".
                         join(",", sort keys %HOSTS));
        }
        $self->{'_servertype'}   = $servertype;
        $self->{'_hostlocation'} = $HOSTS{$servertype}->{'default'};

        # make sure format is reset properly in that different
        # servers have different syntaxes
        my ($existingformat) = $self->request_format;
        $self->request_format($existingformat);
    }
    return $self->{'_servertype'} || $DEFAULTSERVERTYPE;
}
| 353 | |
| 354 | |
| 355 =head2 hostlocation | |
| 356 | |
| 357 Title : hostlocation | |
| 358 Usage : my $location = $self->hostlocation() | |
| 359 $self->hostlocation($location) | |
| 360 Function: Set/Get Hostlocation | |
| 361 Returns : string representing hostlocation | |
| 362 Args : string specifying hostlocation [optional] | |
| 363 | |
| 364 =cut | |
| 365 | |
# Get/set the host location for the current server type.
#
# Args    : optional host location; compared case-insensitively against the
#           keys of the current server type's 'hosts' table in %HOSTS
# Returns : the stored host location
# Throws  : when the given location is not a known host; the message lists
#           the possible values
sub hostlocation {
    my ($self, $location) = @_;
    my $servertype = $self->servertype;
    $self->throw("Must have a valid servertype defined not $servertype")
        unless defined $servertype;

    # Lower-casing happens only in setter mode: calling lc() on the undefined
    # argument of a plain getter call triggers an "uninitialized" warning.
    if ( defined $location && $location ne '' ) {
        $location = lc $location;
        my $hosts = $HOSTS{$servertype}->{'hosts'};
        if ( !$hosts->{$location} ) {
            $self->throw("Must specify a known host, not $location,".
                         " possible values (".
                         join(",", sort keys %$hosts ). ")");
        }
        $self->{'_hostlocation'} = $location;
    }
    return $self->{'_hostlocation'};
}
| 383 | |
| 384 =head2 location_url | |
| 385 | |
| 386 Title : location_url | |
| 387 Usage : my $url = $self->location_url() | |
| 388 Function: Get host url | |
| 389 Returns : string representing url | |
| 390 Args : none | |
| 391 | |
| 392 =cut | |
| 393 | |
# Returns the URL to query: the current server type's 'baseurl' template
# from %HOSTS filled in with the host name of the current host location.
# Throws when either the server type or the host location is undefined.
sub location_url {
    my $self       = shift;
    my $servertype = $self->servertype();
    my $location   = $self->hostlocation();

    unless ( defined $location && defined $servertype ) {
        $self->throw("must have a valid hostlocation and servertype set before calling location_url");
    }

    my $server   = $HOSTS{$servertype};
    my $hostname = $server->{'hosts'}->{$location};
    return sprintf($server->{'baseurl'}, $hostname);
}
| 405 | |
| 406 =head2 request_format | |
| 407 | |
| 408 Title : request_format | |
| 409 Usage : my ($req_format, $ioformat) = $self->request_format; | |
| 410 $self->request_format("swissprot"); | |
| 411 $self->request_format("fasta"); | |
| 412 Function: Get/Set sequence format retrieval. The get-form will normally not | |
| 413 be used outside of this and derived modules. | |
| 414 Returns : Array of two strings, the first representing the format for | |
| 415 retrieval, and the second specifying the corresponding SeqIO format. | |
| 416 Args : $format = sequence format | |
| 417 | |
| 418 =cut | |
| 419 | |
# Get/set the retrieval format.
#
# Args    : optional format name.  Names containing "sprot" or "swiss"
#           select SwissProt format and names starting with "fa" select
#           fasta; matching ignores case.
# Returns : two-element list (name sent to the server, SeqIO format name),
#           or an empty list when no format has been stored yet.
#
# An unrecognised name produces a warning and selects the server type's
# fallback format (fasta for expasy, swissprot for ebi).  For a server type
# matching neither /expasy/ nor /ebi/ the stored format is left untouched.
sub request_format {
    my ($self, $value) = @_;
    if ( defined $value ) {
        my $servertype = $self->servertype;

        # The two server families differ only in the name they use for
        # SwissProt format and in the format chosen for unknown names.
        my ($sprot_name, $fallback);
        if ( $servertype =~ /expasy/ ) {
            ($sprot_name, $fallback) = ( 'sprot', [ 'fasta', 'fasta' ] );
        } elsif ( $servertype =~ /ebi/ ) {
            ($sprot_name, $fallback) = ( 'swissprot', [ 'swissprot', 'swiss' ] );
        }

        if ( defined $sprot_name ) {
            # Case is ignored so that 'FASTA' or 'SwissProt' select the
            # requested format rather than the fallback.
            my $wanted = lc $value;
            if ( $wanted =~ /sprot|swiss/ ) {
                $self->{'_format'} = [ $sprot_name, 'swiss' ];
            } elsif ( $wanted =~ /^fa/ ) {
                $self->{'_format'} = [ 'fasta', 'fasta' ];
            } else {
                $self->warn("Unrecognized format $value requested");
                $self->{'_format'} = $fallback;
            }
        }
    }
    # Guard the dereference: without a stored format there is nothing to
    # return, and dereferencing undef is fatal under strict refs.
    return @{ $self->{'_format'} || [] };
}
| 445 | |
| 446 1; | |
| 447 __END__ |
