Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Perl.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Perl.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,697 @@ +# $Id: Perl.pm,v 1.16.2.1 2003/03/25 12:32:15 heikki Exp $ +# +# BioPerl module for Bio::Perl +# +# Cared for by Ewan Birney <bioperl-l@bio.perl.org> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Perl - Functional access to BioPerl for people who don't know objects + +=head1 SYNOPSIS + + use Bio::Perl; + + # will guess file format from extension + $seq_object = read_sequence($filename); + + # forces genbank format + $seq_object = read_sequence($filename,'genbank'); + + # reads an array of sequences + @seq_object_array = read_all_sequences($filename,'fasta'); + + # sequences are Bio::Seq objects, so the following methods work + # for more info see L<Bio::Seq>, or do 'perldoc Bio/Seq.pm' + + print "Sequence name is ",$seq_object->display_id,"\n"; + print "Sequence acc is ",$seq_object->accession_number,"\n"; + print "First 5 bases is ",$seq_object->subseq(1,5),"\n"; + + # get the whole sequence as a single string + + $sequence_as_a_string = $seq_object->seq(); + + # writing sequences + + write_sequence(">$filename",'genbank',$seq_object); + + write_sequence(">$filename",'genbank',@seq_object_array); + + # making a new sequence from just strings you have + # from something else + + $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA", + "myname","AL12232"); + + + # getting a sequence from a database (assumes internet connection) + + $seq_object = get_sequence('swissprot',"ROA1_HUMAN"); + + $seq_object = get_sequence('embl',"AI129902"); + + $seq_object = get_sequence('genbank',"AI129902"); + + # BLAST a sequence (assummes an internet connection) + + $blast_report = blast_sequence($seq_object); + + write_blast(">blast.out",$blast_report); + + +=head1 DESCRIPTION + +Easy first time access to BioPerl via functions + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bio.perl.org + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email bioperl-l@bio.perl.org + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + + +package Bio::Perl; +use vars qw(@ISA @EXPORT @EXPORT_OK $DBOKAY); +use strict; +use Carp; +use Exporter; + +use Bio::SeqIO; +use Bio::Seq; +BEGIN { + eval { + require Bio::DB::EMBL; + require Bio::DB::GenBank; + require Bio::DB::SwissProt; + require Bio::DB::RefSeq; + require Bio::DB::GenPept; + }; + if( $@ ) { + $DBOKAY = 0; + } else { + $DBOKAY = 1; + } +} + +@ISA = qw(Exporter); + +@EXPORT = qw(read_sequence read_all_sequences write_sequence + new_sequence get_sequence translate translate_as_string + reverse_complement revcom revcom_as_string + reverse_complement_as_string blast_sequence write_blast); + +@EXPORT_OK = @EXPORT; + + +=head2 read_sequence + + Title : read_sequence + Usage : $seq = read_sequence('sequences.fa') + $seq = read_sequence($filename,'genbank'); + + # pipes are fine + $seq = read_sequence("my_fetching_program $id |",'fasta'); + + Function: Reads the top sequence from the file. If no format is given, it will + try to guess the format from the filename. If a format is given, it + forces that format. The filename can be any valid perl open() string + - in particular, you can put in pipes + + Returns : A Bio::Seq object. A quick synopsis: + $seq_object->display_id - name of the sequence + $seq_object->seq - sequence as a string + + Args : Two strings, first the filename - any Perl open() string is ok + Second string is the format, which is optional + +For more information on Seq objects see L<Bio::Seq>. + +=cut + +sub read_sequence{ + my ($filename,$format) = @_; + + if( !defined $filename ) { + confess "read_sequence($filename) - usage incorrect"; + } + + my $seqio; + + if( defined $format ) { + $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format); + } else { + $seqio = Bio::SeqIO->new( '-file' => $filename); + } + + my $seq = $seqio->next_seq(); + + return $seq; +} + + +=head2 read_all_sequences + + Title : read_all_sequences + Usage : @seq_object_array = read_all_sequences($filename); + @seq_object_array = read_all_sequences($filename,'genbank'); + + Function: Just as the function above, but reads all the sequences in the + file and loads them into an array. + + For very large files, you will run out of memory. When this + happens, you've got to use the SeqIO system directly (this is + not so hard! Don't worry about it!). + + Returns : array of Bio::Seq objects + + Args : two strings, first the filename (any open() string is ok) + second the format (which is optional) + +See L<Bio::SeqIO> and L<Bio::Seq> for more information + +=cut + +sub read_all_sequences{ + my ($filename,$format) = @_; + + if( !defined $filename ) { + confess "read_all_sequences($filename) - usage incorrect"; + } + + my $seqio; + + if( defined $format ) { + $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format); + } else { + $seqio = Bio::SeqIO->new( '-file' => $filename); + } + + my @seq_array; + + while( my $seq = $seqio->next_seq() ) { + push(@seq_array,$seq); + } + + return @seq_array; +} + + +=head2 write_sequence + + Title : write_sequence + Usage : write_sequence(">new_file.gb",'genbank',$seq) + write_sequence(">new_file.gb",'genbank',@array_of_sequence_objects) + + Function: writes sequences in the specified format + + Returns : true + + Args : filename as a string, must provide an open() output file + format as a string + one or more sequence objects + + +=cut + +sub write_sequence{ + my ($filename,$format,@sequence_objects) = @_; + + if( scalar(@sequence_objects) == 0 ) { + confess("write_sequence(filename,format,sequence_object)"); + } + + my $error = 0; + my $seqname = "sequence1"; + + # catch users who haven't passed us a filename we can open + if( $filename !~ /^\>/ && $filename !~ /^|/ ) { + $filename = ">".$filename; + } + + my $seqio = Bio::SeqIO->new('-file' => $filename, '-format' => $format); + + foreach my $seq ( @sequence_objects ) { + my $seq_obj; + + if( !ref $seq ) { + if( length $seq > 50 ) { + # odds are this is a sequence as a string, and someone has not figured out + # how to make objects. Warn him/her and then make a sequence object + # from this + if( $error == 0 ) { + carp("WARNING: You have put in a long string into write_sequence.\nI suspect this means that this is the actual sequence\nIn the future try the\n new_sequence method of this module to make a new sequence object.\nDoing this for you here\n"); + $error = 1; + } + + $seq_obj = new_sequence($seq,$seqname); + $seqname++; + } else { + confess("You have a non object [$seq] passed to write_sequence. It maybe that you want to use new_sequence to make this string into a sequence object?"); + } + } else { + if( !$seq->isa("Bio::SeqI") ) { + confess("object [$seq] is not a Bio::Seq object; can't write it out"); + } + $seq_obj = $seq; + } + + # finally... we get to write out the sequence! + $seqio->write_seq($seq_obj); + } + 1; +} + +=head2 new_sequence + + Title : new_sequence + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub new_sequence{ + my ($seq,$name,$accession) = @_; + + if( !defined $seq ) { + confess("new_sequence(sequence_as_string) usage"); + } + + $name ||= "no-name-for-sequence"; + + my $seq_object = Bio::Seq->new( -seq => $seq, -id => $name); + + $accession && $seq_object->accession_number($accession); + + return $seq_object; +} + +=head2 blast_sequence + + Title : blast_sequence + Usage : $blast_result = blast_sequence($seq) + $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE'); + + Function: If the computer has Internet accessibility, blasts + the sequence using the NCBI BLAST server against nrdb. + + It choose the flavour of BLAST on the basis of the sequence. + + This function uses Bio::Tools::Run::RemoteBlast, which itself + use Bio::SearchIO - as soon as you want to more, check out + these modules + Returns : Bio::Search::Result::GenericResult.pm + + Args : Either a string of protein letters or nucleotides, or a + Bio::Seq object + +=cut + +sub blast_sequence { + my ($seq,$verbose) = shift; + + if( !defined $verbose ) { + $verbose = 1; + } + + if( !ref $seq ) { + $seq = Bio::Seq->new( -seq => $seq, -id => 'blast-sequence-temp-id'); + } elsif ( !$seq->isa('Bio::PrimarySeqI') ) { + croak("[$seq] is an object, but not a Bio::Seq object, cannot be blasted"); + } + + require Bio::Tools::Run::RemoteBlast; + + my $prog = 'blastp'; + my $e_val= '1e-10'; + + my @params = ( '-prog' => $prog, + '-expect' => $e_val, + '-readmethod' => 'SearchIO' ); + + my $factory = Bio::Tools::Run::RemoteBlast->new(@params); + + my $r = $factory->submit_blast($seq); + if( $verbose ) { + print STDERR "Submitted Blast for [".$seq->id."] "; + } + sleep 5; + + my $result; + + LOOP : + while( my @rids = $factory->each_rid) { + foreach my $rid ( @rids ) { + my $rc = $factory->retrieve_blast($rid); + if( !ref($rc) ) { + if( $rc < 0 ) { + $factory->remove_rid($rid); + } + if( $verbose ) { + print STDERR "."; + } + sleep 10; + } else { + $result = $rc->next_result(); + $factory->remove_rid($rid); + last LOOP; + } + } + } + + if( $verbose ) { + print STDERR "\n"; + } + return $result; +} + +=head2 write_blast + + Title : write_blast + Usage : write_blast($filename,$blast_report); + + Function: Writes a BLAST result object (or more formally + a SearchIO result object) out to a filename + in BLAST-like format + + Returns : none + + Args : filename as a string + Bio::SearchIO::Results object + +=cut + +sub write_blast { + my ($filename,$blast) = @_; + + if( $filename !~ /^\>/ && $filename !~ /^|/ ) { + $filename = ">".$filename; + } + + my $output = Bio::SearchIO->new( -output_format => 'blast', -file => $filename); + + $output->write_result($blast); + +} + +=head2 get_sequence + + Title : get_sequence + Usage : $seq_object = get_sequence('swiss',"ROA1_HUMAN"); + + Function: If the computer has Internet accessibility, gets + the sequence from Internet accessible databases. Currently + this supports Swissprot, EMBL, GenBank and RefSeq. + + Swissprot and EMBL are more robust than GenBank fetching. + + If the user is trying to retrieve a RefSeq entry from + GenBank/EMBL, the query is silently redirected. + + Returns : A Bio::Seq object + + Args : database type - one of swiss, embl, genbank or refseq + identifier or accession number + +=cut + +my $genbank_db = undef; +my $genpept_db = undef; +my $embl_db = undef; +my $swiss_db = undef; +my $refseq_db = undef; + +sub get_sequence{ + my ($db_type,$identifier) = @_; + if( ! $DBOKAY ) { + confess("Your system does not have IO::String installed so the DB retrieval method is not available"); + return; + } + $db_type = lc($db_type); + + my $db; + + if( $db_type =~ /genbank/ ) { + if( !defined $genbank_db ) { + $genbank_db = Bio::DB::GenBank->new(); + } + $db = $genbank_db; + } + if( $db_type =~ /genpept/ ) { + if( !defined $genpept_db ) { + $genpept_db = Bio::DB::GenPept->new(); + } + $db = $genpept_db; + } + + if( $db_type =~ /swiss/ ) { + if( !defined $swiss_db ) { + $swiss_db = Bio::DB::SwissProt->new(); + } + $db = $swiss_db; + } + + if( $db_type =~ /embl/ ) { + if( !defined $embl_db ) { + $embl_db = Bio::DB::EMBL->new(); + } + $db = $embl_db; + } + + if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and + $identifier =~ /^\s*N\S+_/)) { + if( !defined $refseq_db ) { + $refseq_db = Bio::DB::RefSeq->new(); + } + $db = $refseq_db; + } + + my $seq; + + if( $identifier =~ /^\w+\d+$/ ) { + $seq = $db->get_Seq_by_acc($identifier); + } else { + $seq = $db->get_Seq_by_id($identifier); + } + + return $seq; +} + + +=head2 translate + + Title : translate + Usage : $seqobj = translate($seq_or_string_scalar) + + Function: translates a DNA sequence object OR just a plain + string of DNA to amino acids + Returns : A Bio::Seq object + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub translate { + my ($scalar) = shift; + + my $obj; + + if( ref $scalar ) { + if( !$scalar->isa("Bio::PrimarySeqI") ) { + confess("Expecting a sequence object not a $scalar"); + } else { + $obj= $scalar; + + } + + } else { + + # check this looks vaguely like DNA + my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ ); + + if( $n < length($scalar) * 0.85 ) { + confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me"); + } + + $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar); + } + + return $obj->translate(); +} + + +=head2 translate_as_string + + Title : translate_as_string + Usage : $seqstring = translate_as_string($seq_or_string_scalar) + + Function: translates a DNA sequence object OR just a plain + string of DNA to amino acids + Returns : A stirng of just amino acids + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub translate_as_string { + my ($scalar) = shift; + + my $obj = Bio::Perl::translate($scalar); + + return $obj->seq; +} + + +=head2 reverse_complement + + Title : reverse_complement + Usage : $seqobj = reverse_complement($seq_or_string_scalar) + + Function: reverse complements a string or sequnce argument + producing a Bio::Seq - if you want a string, you + can use reverse_complement_as_string + Returns : A Bio::Seq object + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub reverse_complement { + my ($scalar) = shift; + + my $obj; + + if( ref $scalar ) { + if( !$scalar->isa("Bio::PrimarySeqI") ) { + confess("Expecting a sequence object not a $scalar"); + } else { + $obj= $scalar; + + } + + } else { + + # check this looks vaguely like DNA + my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ ); + + if( $n < length($scalar) * 0.85 ) { + confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me"); + } + + $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar); + } + + return $obj->revcom(); +} + +=head2 revcom + + Title : revcom + Usage : $seqobj = revcom($seq_or_string_scalar) + + Function: reverse complements a string or sequnce argument + producing a Bio::Seq - if you want a string, you + can use reverse_complement_as_string + + This is an alias for reverse_complement + Returns : A Bio::Seq object + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub revcom { + return &Bio::Perl::reverse_complement(@_); +} + + +=head2 reverse_complement_as_string + + Title : reverse_complement_as_string + Usage : $string = reverse_complement_as_string($seq_or_string_scalar) + + Function: reverse complements a string or sequnce argument + producing a string + Returns : A string of DNA letters + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub reverse_complement_as_string { + my ($scalar) = shift; + + my $obj = &Bio::Perl::reverse_complement($scalar); + + return $obj->seq; +} + + +=head2 revcom_as_string + + Title : revcom_as_string + Usage : $string = revcom_as_string($seq_or_string_scalar) + + Function: reverse complements a string or sequnce argument + producing a string + Returns : A string of DNA letters + + Args : Either a sequence object or a string of + just DNA sequence characters + +=cut + +sub revcom_as_string { + my ($scalar) = shift; + + my $obj = &Bio::Perl::reverse_complement($scalar); + + return $obj->seq; +} + + +1;