Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/DB/NCBIHelper.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/NCBIHelper.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,468 @@ +# $Id: NCBIHelper.pm,v 1.24.2.2 2003/06/12 09:29:38 heikki Exp $ +# +# BioPerl module for Bio::DB::NCBIHelper +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# +# Interfaces with new WebDBSeqI interface + +=head1 NAME + +Bio::DB::NCBIHelper - A collection of routines useful for queries to +NCBI databases. + +=head1 SYNOPSIS + + #Do not use this module directly. + + # get a Bio::DB::NCBIHelper object somehow + my $seqio = $db->get_Stream_by_acc(['MUSIGHBA1']); + foreach my $seq ( $seqio->next_seq ) { + # process seq + } + +=head1 DESCRIPTION + +Provides a single place to setup some common methods for querying NCBI +web databases. This module just centralizes the methods for +constructing a URL for querying NCBI GenBank and NCBI GenPept and the +common HTML stripping done in L<postprocess_data>(). + +The base NCBI query URL used is +http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the +evolution of this and other Bioperl modules. Send +your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation +is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to +help us keep track the bugs and their resolution. +Bug reports can be submitted via email or the +web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the +object methods. Internal methods are usually +preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::NCBIHelper; +use strict; +use vars qw(@ISA $HOSTBASE %CGILOCATION %FORMATMAP + $DEFAULTFORMAT $MAX_ENTRIES $VERSION); + +use Bio::DB::WebDBSeqI; +use Bio::DB::Query::GenBank; +use HTTP::Request::Common; +use URI; +use Bio::Root::IO; +use Bio::DB::RefSeq; +use Bio::Root::Root; + +@ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root); +$VERSION = '0.8'; + +BEGIN { + $MAX_ENTRIES = 19000; + $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov'; + %CGILOCATION = ( + 'batch' => ['post' => '/entrez/eutils/efetch.fcgi'], + 'query' => ['get' => '/entrez/eutils/efetch.fcgi'], + 'single' => ['get' => '/entrez/eutils/efetch.fcgi'], + 'version'=> ['get' => '/entrez/eutils/efetch.fcgi'], + 'gi' => ['get' => '/entrez/eutils/efetch.fcgi'], + ); + + %FORMATMAP = ( 'gb' => 'genbank', + 'gp' => 'genbank', + 'fasta' => 'fasta', + ); + + $DEFAULTFORMAT = 'gb'; +} + +# the new way to make modules a little more lightweight + +sub new { + my ($class, @args ) = @_; + my $self = $class->SUPER::new(@args); + return $self; +} + + +=head2 get_params + + Title : get_params + Usage : my %params = $self->get_params($mode) + Function: Returns key,value pairs to be passed to NCBI database + for either 'batch' or 'single' sequence retrieval method + Returns : a key,value pair hash + Args : 'single' or 'batch' mode for retrieval + +=cut + +sub get_params { + my ($self, $mode) = @_; + $self->throw("subclass did not implement get_params"); +} + +=head2 default_format + + Title : default_format + Usage : my $format = $self->default_format + Function: Returns default sequence format for this module + Returns : string + Args : none + +=cut + +sub default_format { + return $DEFAULTFORMAT; +} + +=head2 get_request + + Title : get_request + Usage : my $url = $self->get_request + Function: HTTP::Request + Returns : + Args : %qualifiers = a hash of qualifiers (ids, format, etc) + +=cut + +sub get_request { + my ($self, @qualifiers) = @_; + my ($mode, $uids, $format, $query) = $self->_rearrange([qw(MODE UIDS + FORMAT QUERY)], + @qualifiers); + + $mode = lc $mode; + ($format) = $self->request_format() unless ( defined $format); + if( !defined $mode || $mode eq '' ) { $mode = 'single'; } + my %params = $self->get_params($mode); + if( ! %params ) { + $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'") + } + my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]); + + unless( defined $uids or defined $query) { + $self->throw("Must specify a query or list of uids to fetch"); + } + + if ($uids) { + if( ref($uids) =~ /array/i ) { + $uids = join(",", @$uids); + } + $params{'id'} = $uids; + } + + elsif ($query && $query->can('cookie')) { + @params{'WebEnv','query_key'} = $query->cookie; + $params{'db'} = $query->db; + } + + elsif ($query) { + $params{'id'} = join ',',$query->ids; + } + + $params{'rettype'} = $format; + if ($CGILOCATION{$mode}[0] eq 'post') { + return POST $url,[%params]; + } else { + $url->query_form(%params); + $self->debug("url is $url \n"); + return GET $url; + } +} + +=head2 get_Stream_by_batch + + Title : get_Stream_by_batch + Usage : $seq = $db->get_Stream_by_batch($ref); + Function: Retrieves Seq objects from Entrez 'en masse', rather than one + at a time. For large numbers of sequences, this is far superior + than get_Stream_by_[id/acc](). + Example : + Returns : a Bio::SeqIO stream object + Args : $ref : either an array reference, a filename, or a filehandle + from which to get the list of unique ids/accession numbers. + +NOTE: deprecated API. Use get_Stream_by_id() instead. + +=cut + +*get_Stream_by_batch = sub { + my $self = shift; + $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead'); + $self->get_Stream_by_id(@_) +}; + +=head2 get_Stream_by_query + + Title : get_Stream_by_query + Usage : $seq = $db->get_Stream_by_query($query); + Function: Retrieves Seq objects from Entrez 'en masse', rather than one + at a time. For large numbers of sequences, this is far superior + than get_Stream_by_[id/acc](). + Example : + Returns : a Bio::SeqIO stream object + Args : $query : An Entrez query string or a + Bio::DB::Query::GenBank object. It is suggested that you + create a Bio::DB::Query::GenBank object and get the entry + count before you fetch a potentially large stream. + +=cut + +sub get_Stream_by_query { + my ($self, $query) = @_; + unless (ref $query && $query->can('query')) { + $query = Bio::DB::Query::GenBank->new($query); + } + return $self->get_seq_stream('-query' => $query, '-mode'=>'query'); +} + +=head2 postprocess_data + + Title : postprocess_data + Usage : $self->postprocess_data ( 'type' => 'string', + 'location' => \$datastr); + Function: process downloaded data before loading into a Bio::SeqIO + Returns : void + Args : hash with two keys - 'type' can be 'string' or 'file' + - 'location' either file location or string + reference containing data + +=cut + +# the default method, works for genbank/genpept, other classes should +# override it with their own method. + +sub postprocess_data { + my ($self, %args) = @_; + my $data; + my $type = uc $args{'type'}; + my $location = $args{'location'}; + if( !defined $type || $type eq '' || !defined $location) { + return; + } elsif( $type eq 'STRING' ) { + $data = $$location; + } elsif ( $type eq 'FILE' ) { + open(TMP, $location) or $self->throw("could not open file $location"); + my @in = <TMP>; + close TMP; + $data = join("", @in); + } + + # transform links to appropriate descriptions + if ($data =~ /\nCONTIG\s+/) { + $self->warn("CONTIG found. GenBank get_Stream_by_acc about to run."); + my(@batch,@accession,%accessions,@location,$id, + $contig,$stream,$aCount,$cCount,$gCount,$tCount); + + # process GenBank CONTIG join(...) into two arrays + $data =~ /(?:CONTIG\s+join\()((?:.+\n)+)(?:\/\/)/; + $contig = $1; + $contig =~ s/\n|\)//g; + foreach (split /\s*,\s*/,$contig){ + if (/>(.+)<.+>:(.+)/) { + ($id) = split /\./, $1; + push @accession, $id; + push @location, $2; + $accessions{$id}->{'count'}++; + } elsif( /([\w\.]+):(.+)/ ) { + ($id) = split /\./, $1; + $accessions{$id}->{'count'}++; + push @accession, $id; + push @location, $2; + } + } + + # grab multiple sequences by batch and join based location variable + my @unique_accessions = keys %accessions; + $stream = $self->get_Stream_by_acc(\@unique_accessions); + $contig = ""; + my $ct = 0; + while( my $seq = $stream->next_seq() ) { + if( $seq->accession_number !~ /$unique_accessions[$ct]/ ) { + printf STDERR "warning, %s does not match %s\n", + $seq->accession_number, $unique_accessions[$ct]; + } + $accessions{$unique_accessions[$ct]}->{'seq'} = $seq; + $ct++; + } + for (my $i = 0; $i < @accession; $i++) { + my $seq = $accessions{$accession[$i]}->{'seq'}; + unless( defined $seq ) { + # seq not cached, get next sequence + $self->warn("unable to find sequence $accession[$i]\n"); + return undef; + } + my($start,$end) = split(/\.\./, $location[$i]); + $contig .= $seq->subseq($start,$end-$start); + } + + # count number of each letter in sequence + $aCount = () = $contig =~ /a/ig; + $cCount = () = $contig =~ /c/ig; + $gCount = () = $contig =~ /g/ig; + $tCount = () = $contig =~ /t/ig; + + # remove everything after and including CONTIG + $data =~ s/(CONTIG[\s\S]+)$//i; + + # build ORIGIN part of data file using sequence and counts + $data .= "BASE COUNT $aCount a $cCount c $gCount g $tCount t\n"; + $data .= "ORIGIN \n"; + $data .= "$contig\n//"; + } + else { + $data =~ s/<a\s+href\s*=.+>\s*(\S+)\s*<\s*\/a\s*\>/$1/ig; + } + + # fix gt and lt + $data =~ s/>/>/ig; + $data =~ s/</</ig; + if( $type eq 'FILE' ) { + open(TMP, ">$location") or $self->throw("couldn't overwrite file $location"); + print TMP $data; + close TMP; + } elsif ( $type eq 'STRING' ) { + ${$args{'location'}} = $data; + } + $self->debug("format is ". join(',',$self->request_format()). + " data is\n$data\n"); +} + + +=head2 request_format + + Title : request_format + Usage : my ($req_format, $ioformat) = $self->request_format; + $self->request_format("genbank"); + $self->request_format("fasta"); + Function: Get/Set sequence format retrieval. The get-form will normally not + be used outside of this and derived modules. + Returns : Array of two strings, the first representing the format for + retrieval, and the second specifying the corresponding SeqIO format. + Args : $format = sequence format + +=cut + +sub request_format { + my ($self, $value) = @_; + if( defined $value ) { + $value = lc $value; + if( defined $FORMATMAP{$value} ) { + $self->{'_format'} = [ $value, $FORMATMAP{$value}]; + } else { + # Try to fall back to a default. Alternatively, we could throw + # an exception + $self->{'_format'} = [ $value, $value ]; + } + } + return @{$self->{'_format'}}; +} + +=head2 Bio::DB::WebDBSeqI methods + +Overriding WebDBSeqI method to help newbies to retrieve sequences + +=head2 get_Stream_by_acc + + Title : get_Stream_by_acc + Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]); + Function: Gets a series of Seq objects by accession numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=cut + +sub get_Stream_by_acc { + my ($self, $ids ) = @_; + my $newdb = $self->_check_id($ids); + if (defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq')) { + return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single'); + } else { + return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); + } +} + + +=head2 _check_id + + Title : _check_id + Usage : + Function: + Returns : A Bio::DB::RefSeq reference or throws + Args : $id(s), $string + +=cut + +sub _check_id { + my ($self, $ids) = @_; + + # NT contigs can not be retrieved + $self->throw("NT_ contigs are whole chromosome files which are not part of regular". + "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.") + if $ids =~ /NT_/; + + # Asking for a RefSeq from EMBL/GenBank + + if ($ids =~ /N._/) { + $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.". + " Redirecting the request.\n") + if $self->verbose >= 0; + return new Bio::DB::RefSeq; + } +} + +=head2 delay_policy + + Title : delay_policy + Usage : $secs = $self->delay_policy + Function: return number of seconds to delay between calls to remote db + Returns : number of seconds to delay + Args : none + +NOTE: NCBI requests a delay of 3s between requests. This method +implements that policy. + +=cut + +sub delay_policy { + my $self = shift; + return 3; +} + +1; +__END__