Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Tools/Alignment/Trim.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Alignment/Trim.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,656 @@ +# Bio::Tools::Alignment::Trim.pm +# +# Cared for by Chad Matsalla +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Alignment::Trim - A kludge to do specialized trimming of + sequence based on quality. + +=head1 SYNOPSIS + + use Bio::Tools::Alignment::Trim; + $o_trim = new Bio::Tools::Alignment::Trim; + $o_trim->set_reverse_designator("R"); + $o_trim->set_forward_designator("F"); + + +=head1 DESCRIPTION + +This is a specialized module designed by Chad for Chad to trim sequences +based on a highly specialized list of requirements. In other words, write +something that will trim sequences 'just like the people in the lab would +do manually'. + +I settled on a sliding-window-average style of search which is ugly and +slow but does _exactly_ what I want it to do. + +Mental note: rewrite this. + +It is very important to keep in mind the context in which this module was +written: strictly to support the projects for which Consed.pm was +designed. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing +lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Chad Matsalla + +Email bioinformatics@dieselwurks.com + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +package Bio::Tools::Alignment::Trim; + +use Bio::Root::Root; +use strict; +use Dumpvalue; + + + +use vars qw($VERSION @ISA %DEFAULTS); + +$VERSION = '0.01'; + +@ISA = qw(Bio::Root::Root); + +BEGIN { + %DEFAULTS = ( 'f_designator' => 'f', + 'r_designator' => 'r', + 'windowsize' => '10', + 'phreds' => '20'); +} + +=head2 new() + + Title : new() + Usage : $o_trim = Bio::Tools::Alignment::Trim->new(); + Function: Construct the Bio::Tools::Alignment::Trim object. No parameters + are required to create this object. It is strictly a bundle of + functions, as far as I am concerned. + Returns : A reference to a Bio::Tools::Alignment::Trim object. + Args : (optional) + -windowsize (default 10) + -phreds (default 20) + + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my($windowsize,$phreds) = + $self->_rearrange([qw( + WINDOWSIZE + PHREDS + )], + @args); + $self->{windowsize} = $windowsize || $DEFAULTS{'windowsize'}; + $self->{phreds} = $phreds || $DEFAULTS{'phreds'}; + # print("Constructor set phreds to ".$self->{phreds}."\n") if $self->verbose > 0; + $self->set_designators($DEFAULTS{'f_designator'}, + $DEFAULTS{'r_designator'}); + return $self; +} + +=head2 set_designators($forward_designator,$reverse_designator) + + Title : set_designators(<forward>,<reverse>) + Usage : $o_trim->set_designators("F","R") + Function: Set the string by which the system determines whether a given + sequence represents a forward or a reverse read. + Returns : Nothing. + Args : two scalars: one representing the forward designator and one + representing the reverse designator + +=cut + +sub set_designators { + my $self = shift; + ($self->{'f_designator'},$self->{'r_designator'}) = @_; +} + +=head2 set_forward_designator($designator) + + Title : set_forward_designator($designator) + Usage : $o_trim->set_forward_designator("F") + Function: Set the string by which the system determines if a given + sequence is a forward read. + Returns : Nothing. + Args : A string representing the forward designator of this project. + +=cut + +sub set_forward_designator { + my ($self,$desig) = @_; + $self->{'f_designator'} = $desig; +} + +=head2 set_reverse_designator($reverse_designator) + + Title : set_reverse_designator($reverse_designator) + Function: Set the string by which the system determines if a given + sequence is a reverse read. + Usage : $o_trim->set_reverse_designator("R") + Returns : Nothing. + Args : A string representing the forward designator of this project. + +=cut + +sub set_reverse_designator { + my ($self,$desig) = @_; + $self->{'r_designator'} = $desig; +} + +=head2 get_designators() + + Title : get_designators() + Usage : $o_trim->get_designators() + Returns : A string describing the current designators. + Args : None + Notes : Really for informational purposes only. Duh. + +=cut + +sub get_designators { + my $self = shift; + return("forward: ".$self->{'f_designator'}." reverse: ".$self->{'r_designator'}); +} + +=head2 trim_leading_polys() + + Title : trim_leading_polys() + Usage : $o_trim->trim_leading_polys() + Function: Not implemented. Does nothing. + Returns : Nothing. + Args : None. + Notes : This function is not implemented. Part of something I wanted to + do but never got around to doing. + +=cut + +sub trim_leading_polys { + my ($self, $sequence) = @_; +} + +=head2 dump_hash() + + Title : dump_hash() + Usage : $o_trim->dump_hash() + Function: Unimplemented. + Returns : Nothing. + Args : None. + Notes : Does nothing. + +=cut + +sub dump_hash { + my $self = shift; + my %hash = %{$self->{'qualities'}}; +} # end dump_hash + +=head2 trim_singlet($sequence,$quality,$name,$class) + + Title : trim_singlet($sequence,$quality,$name,$class) + Usage : ($r_trim_points,$trimmed_sequence) = + @{$o_trim->trim_singlet($sequence,$quality,$name,$class)}; + Function: Trim a singlet based on its quality. + Returns : a reference to an array containing the forward and reverse + trim points and the trimmed sequence. + Args : $sequence : A sequence (SCALAR, please) + $quality : A _scalar_ of space-delimited quality values. + $name : the name of the sequence + $class : The class of the sequence. One of qw(singlet + singleton doublet pair multiplet) + Notes : At the time this was written the bioperl objects SeqWithQuality + and PrimaryQual did not exist. This is what is with the clumsy + passing of references and so on. I will rewrite this next time I + have to work with it. I also wasn't sure whether this function + should return just the trim points or the points and the sequence. + I decided that I always wanted both so that's how I implemented + it. + - Note that the size of the sliding windows is set during construction of + the Bio::Tools::Alignment::Trim object. + +=cut + +sub trim_singlet { + my ($self,$sequence,$quality,$name,$class) = @_; + # this split is done because I normally store quality values in a + # space-delimited scalar rather then in an array. + # I do this because serialization of the arrays is tough. + my @qual = split(' ',$quality); + my @points; + my $sequence_length = length($sequence); + my ($returnstring,$processed_sequence); + # smooth out the qualities + my $r_windows = &_sliding_window(\@qual,$self->{windowsize}); + # find out the leading and trailing trimpoints + my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds}); + my (@new_points,$trimmed_sequence); + # do you think that any sequence shorter then 100 should be + # discarded? I don't think that this should be the decision of this + # module. + # removed, 020926 + $points[0] = $start_base; + # whew! now for the end base + # required parameters: reference_to_windows,windowsize,$phredvalue,start_base + my $end_base = &_get_end($r_windows,$self->{windowsize}, + $self->{phreds},$start_base); + $points[1] = $end_base; + # now do the actual trimming + # CHAD : I don't think that it is a good idea to call chop_sequence here + # because chop_sequence also removes X's and N's and things + # and that is not always what is wanted + return \@points; +} + +=head2 trim_doublet($sequence,$quality,$name,$class) + + Title : trim_doublet($sequence,$quality,$name,$class) + Usage : ($r_trim_points,$trimmed_sequence) = + @{$o_trim->trim_singlet($sequence,$quality,$name,$class)}; + Function: Trim a singlet based on its quality. + Returns : a reference to an array containing the forward and reverse + Args : $sequence : A sequence + $quality : A _scalar_ of space-delimited quality values. + $name : the name of the sequence + $class : The class of the sequence. One of qw(singlet + singleton doublet pair multiplet) + Notes : At the time this was written the bioperl objects SeqWithQuality + and PrimaryQual did not exist. This is what is with the clumsy + passing of references and so on. I will rewrite this next time I + have to work with it. I also wasn't sure whether this function + should return just the trim points or the points and the sequence. + I decided that I always wanted both so that's how I implemented + it. + +=cut + +#' +sub trim_doublet { + my ($self,$sequence,$quality,$name,$class) = @_; + my @qual = split(' ',$quality); + my @points; + my $sequence_length = length($sequence); + my ($returnstring,$processed_sequence); + # smooth out the qualities + my $r_windows = &_sliding_window(\@qual,$self->{windowsize}); + # determine where the consensus sequence starts + my $offset = 0; + for (my $current = 0; $current<$sequence_length;$current++) { + if ($qual[$current] != 0) { + $offset = $current; + last; + } + } + # start_base required: r_quality,$windowsize,$phredvalue + my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds},$offset); + if ($start_base > ($sequence_length - 100)) { + $points[0] = ("FAILED"); + $points[1] = ("FAILED"); + return @points; + } + $points[0] = $start_base; + # + # whew! now for the end base + # + # required parameters: reference_to_windows,windowsize,$phredvalue,start_base + # | + # 010420 NOTE: We will no longer get the end base to avoid the Q/--\___/-- syndrome + my $end_base = $sequence_length; + my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual); + $points[1] = $end_base; + # CHAD : I don't think that it is a good idea to call chop_sequence here + # because chop_sequence also removes X's and N's and things + # and that is not always what is wanted + return @points; +} # end trim_doublet + +=head2 chop_sequence($name,$class,$sequence,@points) + + Title : chop_sequence($name,$class,$sequence,@points) + Usage : ($start_point,$end_point,$chopped_sequence) = + $o_trim->chop_sequence($name,$class,$sequence,@points); + Function: Chop a sequence based on its name, class, and sequence. + Returns : an array containing three scalars: + 1- the start trim point + 2- the end trim point + 3- the chopped sequence + Args : + $name : the name of the sequence + $class : The class of the sequence. One of qw(singlet + singleton doublet pair multiplet) + $sequence : A sequence + @points : An array containing two elements- the first contains + the start trim point and the second conatines the end trim + point. + +=cut + +sub chop_sequence { + my ($self,$name,$class,$sequence,@points) = @_; + print("Coming into chop_sequence, \@points are @points\n"); + my $fdesig = $self->{'f_designator'}; + my $rdesig = $self->{'r_designator'}; + if (!$points[0] && !$points[1]) { + $sequence = "junk"; + return $sequence; + } + if ($class eq "singlet" && $name =~ /$fdesig$/) { + $sequence = substr($sequence,$points[0],$points[1]-$points[0]); + } + elsif ($class eq "singlet" && $name =~ /$rdesig$/) { + $sequence = substr($sequence,$points[0],$points[1]-$points[0]); + } + elsif ($class eq "singleton" && $name =~ /$fdesig$/) { + $sequence = substr($sequence,$points[0],$points[1]-$points[0]); + } + elsif ($class eq "singleton" && $name =~ /$rdesig$/) { + $sequence = substr($sequence,$points[0],$points[1]-$points[0]); + } + elsif ($class eq "doublet") { + $sequence = substr($sequence,$points[0],$points[1]-$points[0]); + } + # this is a _terrible_ to do this! i couldn't seem to find a better way + # i thought something like s/(^.*[Xx]{5,})//g; might work, but no go + # no time to find a fix! + my $length_before_trimming = length($sequence); + my $subs_Xs = $sequence =~ s/^.*[Xx]{5,}//g; + if ($subs_Xs) { + my $length_after_trimming = length($sequence); + my $number_Xs_trimmed = $length_before_trimming - $length_after_trimming; + $points[0] += $number_Xs_trimmed; + } + $length_before_trimming = length($sequence); + my $subs_Ns = $sequence =~ s/[Nn]{1,}$//g; + if ($subs_Ns) { + my $length_after_trimming = length($sequence); + my $number_Ns_trimmed = $length_before_trimming - $length_after_trimming; + $points[1] -= $number_Ns_trimmed; + $points[1] -= 1; + } + push @points,$sequence; + print("chop_sequence \@points are @points\n"); + return @points; +} + +=head2 _get_start($r_quals,$windowsize,$phreds,$offset) + + Title : _get_start($r_quals,$windowsize,$phreds,$offset) + Usage : $start_base = $self->_get_start($r_windows,5,20); + Function: Provide the start trim point for this sequence. + Returns : a scalar representing the start of the sequence + Args : + $r_quals : A reference to an array containing quality values. In + context, this array of values has been smoothed by then + sliding window-look ahead algorithm. + $windowsize : The size of the window used when the sliding window + look-ahead average was calculated. + $phreds : <fill in what this does here> + $offset : <fill in what this does here> + +=cut + +sub _get_start { + my ($self,$r_quals,$windowsize,$phreds,$offset) = @_; + print("Using $phreds phreds\n") if $self->verbose > 0; + # this is to help determine whether the sequence is good at all + my @quals = @$r_quals; + my ($count,$count2,$qualsum); + if ($offset) { $count = $offset; } else { $count = 0; } + # search along the length of the sequence + for (; ($count+$windowsize) <= scalar(@quals); $count++) { + # sum all of the quality values in this window. + my $cumulative=0; + for($count2 = $count; $count2 < $count+$windowsize; $count2++) { + if (!$quals[$count2]) { + # print("Quals don't exist here!\n"); + } + else { + $qualsum += $quals[$count2]; + # print("Incremented qualsum to ($qualsum)\n"); + } + $cumulative++; + } + # print("The sum of this window (starting at $count) is $qualsum. I counted $cumulative bases.\n"); + # if the total of windowsize * phreds is + if ($qualsum && $qualsum >= $windowsize*$phreds) { return $count; } + $qualsum = 0; + } + # if ($count > scalar(@quals)-$windowsize) { return; } + return $count; +} + +=head2 _get_end($r_qual,$windowsize,$phreds,$count) + + Title : _get_end($r_qual,$windowsize,$phreds,$count) + Usage : my $end_base = &_get_end($r_windows,20,20,$start_base); + Function: Get the end trim point for this sequence. + Returns : A scalar representing the end trim point for this sequence. + Args : + $r_qual : A reference to an array containing quality values. In + context, this array of values has been smoothed by then + sliding window-look ahead algorithm. + $windowsize : The size of the window used when the sliding window + look-ahead average was calculated. + $phreds : <fill in what this does here> + $count : Start looking for the end of the sequence here. + +=cut + +sub _get_end { + my ($r_qual,$windowsize,$phreds,$count) = @_; + my @quals = @$r_qual; + my $total_bases = scalar(@quals); + my ($count2,$qualsum,$end_of_quals,$bases_counted); + if (!$count) { $count=0; } + BASE: for (; $count < $total_bases; $count++) { + $bases_counted = 0; + $qualsum = 0; + POSITION: for($count2 = $count; $count2 < $total_bases; $count2++) { + $bases_counted++; + + if ($count2 == $total_bases-1) { + $qualsum += $quals[$count2]; + $bases_counted++; + last BASE; + } + elsif ($bases_counted == $windowsize) { + $qualsum += $quals[$count2]; + if ($qualsum < $bases_counted*$phreds) { + return $count+$bases_counted+$windowsize; + } + next BASE; + } + else { + $qualsum += $quals[$count2]; + } + } + if ($qualsum < $bases_counted*$phreds) { + return $count+$bases_counted+$windowsize; + } + else { } + $qualsum = 0; + } # end for + if ($end_of_quals) { + my $bases_for_average = $total_bases-$count2; + return $count2; + } + else { } + if ($qualsum) { } # print ("$qualsum\n"); + return $total_bases; +} # end get_end + +=head2 count_doublet_trailing_zeros($r_qual) + + Title : count_doublet_trailing_zeros($r_qual) + Usage : my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual); + Function: Find out when the trailing zero qualities start. + Returns : A scalar representing where the zeros start. + Args : A reference to an array of quality values. + Notes : Again, this should be rewritten to use PrimaryQual objects. + A more detailed explanation of why phrap puts these zeros here should + be written and placed here. Please email and hassle the author. + + +=cut + +sub count_doublet_trailing_zeros { + my ($r_qual) = shift; + my $number_of_trailing_zeros = 0; + my @qualities = @$r_qual; + for (my $current=scalar(@qualities);$current>0;$current--) { + if ($qualities[$current] && $qualities[$current] != 0) { + $number_of_trailing_zeros = scalar(@qualities)-$current; + return $current+1; + } + } + return scalar(@qualities); +} # end count_doublet_trailing_zeros + +=head2 _sliding_window($r_quals,$windowsize) + + Title : _sliding_window($r_quals,$windowsize) + Usage : my $r_windows = &_sliding_window(\@qual,$windowsize); + Function: Create a sliding window, look-forward-average on an array + of quality values. Used to smooth out differences in qualities. + Returns : A reference to an array containing the smoothed values. + Args : $r_quals: A reference to an array containing quality values. + $windowsize : The size of the sliding window. + Notes : This was written before PrimaryQual objects existed. They + should use that object but I haven't rewritten this yet. + +=cut + +#' +sub _sliding_window { + my ($r_quals,$windowsize) = @_; + my (@window,@quals,$qualsum,$count,$count2,$average,@averages,$bases_counted); + @quals = @$r_quals; + my $size_of_quality = scalar(@quals); + # do this loop for all of the qualities + for ($count=0; $count <= $size_of_quality; $count++) { + $bases_counted = 0; + BASE: for($count2 = $count; $count2 < $size_of_quality; $count2++) { + $bases_counted++; + # if the search hits the end of the averages, stop + # this is for the case near the end where bases remaining < windowsize + if ($count2 == $size_of_quality) { + $qualsum += $quals[$count2]; + last BASE; + } + # if the search hits the size of the window + elsif ($bases_counted == $windowsize) { + $qualsum += $quals[$count2]; + last BASE; + } + # otherwise add the quality value + unless (!$quals[$count2]) { + $qualsum += $quals[$count2]; + } + } + unless (!$qualsum || !$windowsize) { + $average = $qualsum / $bases_counted; + if (!$average) { $average = "0"; } + push @averages,$average; + } + $qualsum = 0; + } + # 02101 Yes, I repaired the mismatching numbers between averages and windows. + # print("There are ".scalar(@$r_quals)." quality values. They are @$r_quals\n"); + # print("There are ".scalar(@averages)." average values. They are @averages\n"); + return \@averages; + +} + +=head2 _print_formatted_qualities + + Title : _print_formatted_qualities(\@quals) + Usage : &_print_formatted_qualities(\@quals); + Returns : Nothing. Prints. + Args : A reference to an array containing quality values. + Notes : An internal procedure used in debugging. Prints out an array nicely. + +=cut + +sub _print_formatted_qualities { + my $rquals = shift; + my @qual = @$rquals; + for (my $count=0; $count<scalar(@qual) ; $count++) { + if (($count%10)==0) { print("\n$count\t"); } + if ($qual[$count]) { print ("$qual[$count]\t");} + else { print("0\t"); } + } + print("\n"); +} + +=head2 _get_end_old($r_qual,$windowsize,$phreds,$count) + + Title : _get_end_old($r_qual,$windowsize,$phreds,$count) + Usage : Deprecated. Don't use this! + Returns : Deprecated. Don't use this! + Args : Deprecated. Don't use this! + +=cut + +#' +sub _get_end_old { + my ($r_qual,$windowsize,$phreds,$count) = @_; + warn("Do Not Use this function (_get_end_old)"); + my $target = $windowsize*$phreds; + my @quals = @$r_qual; + my $total_bases = scalar(@quals); + my ($count2,$qualsum,$end_of_quals); + if (!$count) { $count=0; } + BASE: for (; $count < $total_bases; $count++) { + for($count2 = $count; $count2 < $count+$windowsize; $count2++) { + if ($count2 == scalar(@quals)-1) { + $qualsum += $quals[$count2]; + $end_of_quals = 1; + last BASE; + + } + $qualsum += $quals[$count2]; + } + if ($qualsum < $windowsize*$phreds) { + return $count+$windowsize; + } + $qualsum = 0; + } # end for +} # end get_end_old + + +# Autoload methods go after =cut, and are processed by the autosplit program. + +1; +__END__