diff variant_effect_predictor/Bio/Tools/SeqWords.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Tools/SeqWords.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,225 @@
+# $Id: SeqWords.pm,v 1.7.2.1 2003/03/05 19:06:15 jason Exp $
+
+#---------------------------------------------------------------------------
+# PACKAGE    : SeqWords.pm
+# PURPOSE    : To count n-mers in any sequence of characters
+# AUTHOR     : Derek Gatherer (D.Gatherer@organon.nhe.akzonobel.nl)
+# SOURCE     : 
+# CREATED    : 21st March 2000
+# MODIFIED   : 
+# DISCLAIMER : I am employed in the pharmaceutical industry but my 
+#	     : employers do not endorse or sponsor this module
+#	     : in any way whatsoever.  The above email address is
+#	     : given purely for the purpose of easy communication
+#            : with the author, and does not imply any connection
+#	     : between my employers and anything written below.
+# LICENCE    : You may distribute this module under the same terms 
+#	     : as the rest of BioPerl.
+#---------------------------------------------------------------------------
+
+=head1 NAME
+
+Bio::Tools::SeqWords - Object holding n-mer statistics for one sequence
+
+=head1 SYNOPSIS
+
+Take a sequence object from eg, an inputstream, and creates an object
+for the purposes of holding n-mer word statistics about that sequence.
+The sequence can be nucleic acid or protein, but the module is
+probably most relevant for DNA.  The words are counted in a
+non-overlapping manner, ie. in the style of a codon table, but with
+any word length.  For overlapping word counts, a sequence can be
+'shifted' to remove the first character and then the count repeated.
+For counts on opposite strand (DNA/RNA), a reverse complement method
+should be performed, and then the count repeated.
+
+Creating the SeqWords object, eg:
+
+  my $inputstream = Bio::SeqIO->new( -file => "seqfile", 
+	                             -format => 'Fasta');
+  my $seqobj = $inputstream->next_seq();
+  my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
+
+or:
+
+  my $seqobj = Bio::PrimarySeq->new(-seq=>'[cut and paste a sequence here]', 
+                                    -alphabet => 'dna', 
+                                    -id => 'test');
+  my $seq_word  =  Bio::Tools::SeqWords->new(-seq => $seqobj);
+
+obtain a hash of word counts, eg:
+
+  my $hash_ref = $seq_stats->count_words($word_length);
+
+display hash table, eg:
+
+  my %hash = %$hash_ref;
+  foreach my $key(sort keys %hash)
+  {
+    print "\n$key\t$hash{$key}";
+  }
+
+or	
+
+  my $hash_ref = Bio::SeqWords->count_words($seqobj,$word_length);
+
+
+=head1 DESCRIPTION
+
+Bio:SeqWords is a featherweight object for the calculation of n-mer
+word occurrences in a single sequence.  It is envisaged that the
+object will be useful for construction of scripts which use n-mer word
+tables as the raw material for statistical calculations; for instance,
+hexamer frequency for the calculation of coding protential, or the
+calculation of periodicity in repetitive DNA.  Triplet frequency is
+already handled by Bio::SeqStats.pm (author: Peter Schattner).  There
+are a few possible applications for protein, eg: hypothesised amino
+acid 7-mers in heat shock proteins, or proteins with multiple simple
+motifs.  Sometimes these protein periodicities are best seen when the
+amino acid alphabet is truncated, eg Shulman alphabet.  Since there
+are quite a few of these shortened alphabets, this module does not
+specify any particular alphabet.
+
+See Synopsis above for object creation code.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this
+and other Bioperl modules. Send your comments and suggestions preferably
+to one of the Bioperl mailing lists.
+Your participation is much appreciated.
+
+  bioperl-l@bioperl.org                 - General discussion
+  http://bio.perl.org/MailList.html     - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.
+Bug reports can be submitted via the web:
+
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR
+
+Derek Gatherer, in the loosest sense of the word 'author'.  The
+general shape of the module is lifted directly from Peter Schattner's
+SeqStats.pm module.  The central subroutine to count the words is
+adapted from original code provided by Dave Shivak, in response to a
+query on the bioperl mailing list.  At least 2 other people provided
+alternative means (equally good but not used in the end) of performing
+the same calculation.  Thanks to all for your assistance.
+
+=head1 CONTRUBITORS
+
+Jason Stajich, jason-at-bioperl.org
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods. 
+Internal methods are usually preceded with a _
+
+=cut
+
+package Bio::Tools::SeqWords;
+use vars qw(@ISA);
+use strict;
+
+use Bio::Root::Root;
+
+@ISA = qw(Bio::Root::Root);
+
+sub new {
+    my($class,@args) = @_;
+    # our new standard way of instantiation
+    my $self = $class->SUPER::new(@args);
+
+    my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
+    if((! defined($seqobj)) && @args && ref($args[0])) {
+	# parameter not passed as named parameter?
+	$seqobj = $args[0];
+    }
+    
+    if(! $seqobj->isa("Bio::PrimarySeqI")) { 
+	$self->throw(ref($self) . " works only on PrimarySeqI objects\n");
+    }
+	
+    $self->{'_seqref'} = $seqobj;
+    return $self; 
+}
+
+
+=head2 count_words
+
+ Title   : count_words
+ Usage   : $word_count = $seq_stats->count_words($word_length); 
+ or 	 : $word_count = $seq_stats->Bio::SeqWords->($seqobj,$word_length);
+ Function: Counts non-overlapping words within a string
+	 : any alphabet is used
+ Example : a sequence ACCGTCCGT, counted at word length 4,
+	 : will give the hash
+	 : ACCG 1, TCCG 1
+ Returns : Reference to a hash in which keys are words (any length) of the
+         : alphabet used and values are number of occurrences of the word 
+         : in the sequence.
+ Args    : Word length as scalar and, reference to sequence object if
+         : required
+
+  Throws an exception word length is not a positive integer
+  or if word length is longer than the sequence.
+
+=cut
+
+sub count_words
+{
+    my ($self,$seqobj,$word_length) = @_;
+
+    # check how we were called, and if necessary rearrange arguments
+    if(ref($seqobj)) {
+	# call as SeqWords->count_words($seq, $wordlen)
+	if(! $seqobj->isa("Bio::PrimarySeqI")) { 
+	    $self->throw("SeqWords works only on PrimarySeqI objects\n");
+	}
+    } else {
+	# call as $obj->count_words($wordlen)
+	$word_length = $seqobj;
+	$seqobj = undef;
+    }
+
+    if($word_length eq "" || $word_length =~ /[a-z]/i)
+    {
+	$self->throw("SeqWords cannot accept non-numeric characters".
+		     " or a null value in the \$word_length variable\n");
+    }
+    elsif ($word_length <1 || ($word_length - int($word_length)) >0)
+    {
+	$self->throw("SeqWords requires the word length to be a ".
+		     "positive integer\n");
+    }
+
+    if(! defined($seqobj)) {
+	$seqobj =  $self->{'_seqref'};
+    }
+    my $seqstring = uc $seqobj->seq();
+
+    if($word_length > length($seqstring))
+    {
+	$self->throw("die in count words, \$word_length is bigger ".
+		     "than sequence length\n");
+    }
+
+    my %codon = ();
+
+    # now the real business
+    # JS - remove DNA assumption
+    while($seqstring =~ /((\w){$word_length})/gim) {
+	$codon{uc($1)}++;
+    }
+    return \%codon;
+
+# and that's it
+}
+
+1;