Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/DB/SwissProt.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/SwissProt.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,447 @@ +# +# $Id: SwissProt.pm,v 1.19 2002/12/01 00:05:19 jason Exp $ +# +# BioPerl module for Bio::DB::SwissProt +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code +# Reworked to use Bio::DB::WebDBSeqI 2000-12-11 + +=head1 NAME + +Bio::DB::SwissProt - Database object interface to SwissProt retrieval + +=head1 SYNOPSIS + + use Bio::DB::SwissProt; + + $sp = new Bio::DB::SwissProt; + + $seq = $sp->get_Seq_by_id('KPY1_ECOLI'); # SwissProt ID + # <4-letter-identifier>_<species 5-letter code> + # or ... + $seq = $sp->get_Seq_by_acc('P43780'); # SwissProt AC + # [OPQ]xxxxx + + + # In fact in this implementation + # these methods call the same webscript so you can use + # then interchangeably + + # choose a different server to query + $sp = new Bio::DB::SwissProt('-servertype' => 'expasy', + '-hostlocation' => 'us'); + + $seq = $sp->get_Seq_by_id('BOLA_HAEIN'); # SwissProtID + +=head1 DESCRIPTION + +SwissProt is a curated database of proteins managed by the Swiss +Bioinformatics Institute. This is in contrast to EMBL/GenBank/DDBJ +which are archives of protein information. Additional tools for +parsing and manipulating swissprot files can be found at +ftp://ftp.ebi.ac.uk/pub/software/swissprot/Swissknife/. + +Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the +SwissProt database via an expasy retrieval. Perhaps through SRS +later. + +In order to make changes transparent we have host type (currently only +expasy) and location (default to switzerland) separated out. This +allows the user to pick the closest expasy mirror for running their +queries. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email Jason Stajich E<lt>jason@bioperl.org E<lt> + +Thanks go to Alexandre Gattiker E<lt>gattiker@isb-sib.chE<gt> of Swiss +Institute of Bioinformatics for helping point us in the direction of +the correct expasy scripts and for swissknife references. + +Also thanks to Heikki Lehvaslaiho E<lt>heikki@ebi.ac.ukE<gt> for help with +adding EBI swall server. + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::SwissProt; +use strict; +use vars qw(@ISA $MODVERSION %HOSTS $DEFAULTFORMAT $DEFAULTSERVERTYPE); + +$MODVERSION = '0.8.1'; +use HTTP::Request::Common; +use Bio::DB::WebDBSeqI; + +@ISA = qw(Bio::DB::WebDBSeqI); + +# global vars +$DEFAULTSERVERTYPE = 'ebi'; +$DEFAULTFORMAT = 'swissprot'; + +# you can add your own here theoretically. +%HOSTS = ( + 'expasy' => { + 'default' => 'us', + 'baseurl' => 'http://%s/cgi-bin/sprot-retrieve-list.pl', + 'hosts' => + { + 'switzerland' => 'ch.expasy.org', + 'canada' => 'ca.expasy.org', + 'china' => 'cn.expasy.org', + 'taiwan' => 'tw.expasy.org', + 'australia' => 'au.expasy.org', + 'korea' => 'kr.expasy.org', + 'us' => 'us.expasy.org', + }, + # ick, CGI variables + 'jointype' => ' ', + 'idvar' => 'list', + 'basevars' => [ ], + }, + 'ebi' => { + 'default' => 'uk', + 'baseurl' => 'http://%s/cgi-bin/dbfetch', + 'hosts' => { + 'uk' => 'www.ebi.ac.uk', + }, + 'jointype' => ',', + 'idvar' => 'id', + 'basevars' => [ 'db' => 'swall', + 'style' => 'raw' ], + } + ); + +# new modules should be a little more lightweight and +# should use Bio::Root::Root +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($format, $hostlocation,$servertype) = + $self->_rearrange([qw(FORMAT HOSTLOCATION SERVERTYPE)], + @args); + + if( $format && $format !~ /(swiss)|(fasta)/i ) { + $self->warn("Requested Format $format is ignored because only SwissProt and Fasta formats are currently supported"); + $format = $self->default_format; + } + $servertype = $DEFAULTSERVERTYPE unless $servertype; + $servertype = lc $servertype; + $self->servertype($servertype); + if ( $hostlocation ) { + $self->hostlocation(lc $hostlocation); + } + + $self->request_format($format); # let's always override the format, as it must be swiss or fasta + return $self; +} + +=head2 Routines from Bio::DB::RandomAccessI + +=cut + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + +=cut + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + +=cut + +=head2 get_Stream_by_id + + Title : get_Stream_by_id + Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); + Function: Gets a series of Seq objects by unique identifiers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of unique identifiers for + the desired sequence entries + +=cut + +=head2 get_Stream_by_acc + + Title : get_Stream_by_acc + Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]); + Function: Gets a series of Seq objects by accession numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=cut + +=head2 get_Stream_by_batch + + Title : get_Stream_by_batch + Usage : $seq = $db->get_Stream_by_batch($ref); + Function: Retrieves Seq objects from SwissProt 'en masse', rather than one + at a time. This is implemented the same way as get_Stream_by_id, + but is provided here in keeping with access methods of NCBI + modules. + Example : + Returns : a Bio::SeqIO stream object + Args : $ref : either an array reference, a filename, or a filehandle + from which to get the list of unique ids/accession numbers. + +=cut + +sub get_Stream_by_batch { + my ($self, $ids) = @_; + return $self->get_Stream_by_id( $ids); +} + +=head2 Implemented Routines from Bio::DB::WebDBSeqI interface + +=cut + +=head2 get_request + + Title : get_request + Usage : my $url = $self->get_request + Function: returns a HTTP::Request object + Returns : + Args : %qualifiers = a hash of qualifiers (ids, format, etc) + +=cut + +sub get_request { + my ($self, @qualifiers) = @_; + my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)], + @qualifiers); + + if( !defined $uids ) { + $self->throw("Must specify a value for uids to query"); + } + my ($f,undef) = $self->request_format($format); + + my %vars = ( + @{$HOSTS{$self->servertype}->{'basevars'}}, + ( 'format' => $f ) + ); + + my $url = $self->location_url; + + my $uid; + my $jointype = $HOSTS{$self->servertype}->{'jointype'} || ' '; + my $idvar = $HOSTS{$self->servertype}->{'idvar'} || 'id'; + + if( ref($uids) =~ /ARRAY/i ) { + # HTTP::Request automagically converts the ' ' to %20 + $uid = join($jointype, @$uids); + } else { + $uid = $uids; + } + $vars{$idvar} = $uid; + + return POST $url, \%vars; +} + +=head2 postprocess_data + + Title : postprocess_data + Usage : $self->postprocess_data ( 'type' => 'string', + 'location' => \$datastr); + Function: process downloaded data before loading into a Bio::SeqIO + Returns : void + Args : hash with two keys - 'type' can be 'string' or 'file' + - 'location' either file location or string + reference containing data + +=cut + +# don't need to do anything + +sub postprocess_data { + my ($self, %args) = @_; + return; +} + +=head2 default_format + + Title : default_format + Usage : my $format = $self->default_format + Function: Returns default sequence format for this module + Returns : string + Args : none + +=cut + +sub default_format { + return $DEFAULTFORMAT; +} + +=head2 Bio::DB::SwissProt specific routines + +=cut + +=head2 servertype + + Title : servertype + Usage : my $servertype = $self->servertype + $self->servertype($servertype); + Function: Get/Set server type + Returns : string + Args : server type string [optional] + +=cut + +sub servertype { + my ($self, $servertype) = @_; + if( defined $servertype && $servertype ne '') { + $self->throw("You gave an invalid server type ($servertype)". + " - available types are ". + keys %HOSTS) unless( $HOSTS{$servertype} ); + $self->{'_servertype'} = $servertype; + $self->{'_hostlocation'} = $HOSTS{$servertype}->{'default'}; + + # make sure format is reset properly in that different + # servers have different syntaxes + my ($existingformat,$seqioformat) = $self->request_format; + $self->request_format($existingformat); + } + return $self->{'_servertype'} || $DEFAULTSERVERTYPE; +} + + +=head2 hostlocation + + Title : hostlocation + Usage : my $location = $self->hostlocation() + $self->hostlocation($location) + Function: Set/Get Hostlocation + Returns : string representing hostlocation + Args : string specifying hostlocation [optional] + +=cut + +sub hostlocation { + my ($self, $location ) = @_; + $location = lc $location; + my $servertype = $self->servertype; + $self->throw("Must have a valid servertype defined not $servertype") + unless defined $servertype; + my %hosts = %{$HOSTS{$servertype}->{'hosts'}}; + if( defined $location && $location ne '' ) { + if( ! $hosts{$location} ) { + $self->throw("Must specify a known host, not $location,". + " possible values (". + join(",", sort keys %hosts ). ")"); + } + $self->{'_hostlocation'} = $location; + } + return $self->{'_hostlocation'}; +} + +=head2 location_url + + Title : location + Usage : my $url = $self->location_url() + Function: Get host url + Returns : string representing url + Args : none + +=cut + +sub location_url { + my ($self) = @_; + my $servertype = $self->servertype(); + my $location = $self->hostlocation(); + + if( ! defined $location || !defined $servertype ) { + $self->throw("must have a valid hostlocation and servertype set before calling location_url"); + } + return sprintf($HOSTS{$servertype}->{'baseurl'}, + $HOSTS{$servertype}->{'hosts'}->{$location}); +} + +=head2 request_format + + Title : request_format + Usage : my ($req_format, $ioformat) = $self->request_format; + $self->request_format("genbank"); + $self->request_format("fasta"); + Function: Get/Set sequence format retrieval. The get-form will normally not + be used outside of this and derived modules. + Returns : Array of two strings, the first representing the format for + retrieval, and the second specifying the corresponding SeqIO format. + Args : $format = sequence format + +=cut + +sub request_format { + my ($self, $value) = @_; + if( defined $value ) { + if( $self->servertype =~ /expasy/ ) { + if( $value =~ /sprot/ || $value =~ /swiss/ ) { + $self->{'_format'} = [ 'sprot', 'swiss']; + } elsif( $value =~ /^fa/ ) { + $self->{'_format'} = [ 'fasta', 'fasta']; + } else { + $self->warn("Unrecognized format $value requested"); + $self->{'_format'} = [ 'fasta', 'fasta']; + } + } elsif( $self->servertype =~ /ebi/ ) { + if( $value =~ /sprot/ || $value =~ /swiss/ ) { + $self->{'_format'} = [ 'swissprot', 'swiss' ]; + } elsif( $value =~ /^fa/ ) { + $self->{'_format'} = [ 'fasta', 'fasta']; + } else { + $self->warn("Unrecognized format $value requested"); + $self->{'_format'} = [ 'swissprot', 'swiss']; + } + } + } + return @{$self->{'_format'}}; +} + +1; +__END__