0
|
1 # Bio::Tools::Alignment::Trim.pm
|
|
2 #
|
|
3 # Cared for by Chad Matsalla
|
|
4 #
|
|
5 # Copyright Chad Matsalla
|
|
6 #
|
|
7 # You may distribute this module under the same terms as perl itself
|
|
8
|
|
9 # POD documentation - main docs before the code
|
|
10
|
|
11 =head1 NAME
|
|
12
|
|
13 Bio::Tools::Alignment::Trim - A kludge to do specialized trimming of
|
|
14 sequence based on quality.
|
|
15
|
|
16 =head1 SYNOPSIS
|
|
17
|
|
18 use Bio::Tools::Alignment::Trim;
|
|
19 $o_trim = new Bio::Tools::Alignment::Trim;
|
|
20 $o_trim->set_reverse_designator("R");
|
|
21 $o_trim->set_forward_designator("F");
|
|
22
|
|
23
|
|
24 =head1 DESCRIPTION
|
|
25
|
|
26 This is a specialized module designed by Chad for Chad to trim sequences
|
|
27 based on a highly specialized list of requirements. In other words, write
|
|
28 something that will trim sequences 'just like the people in the lab would
|
|
29 do manually'.
|
|
30
|
|
31 I settled on a sliding-window-average style of search which is ugly and
|
|
32 slow but does _exactly_ what I want it to do.
|
|
33
|
|
34 Mental note: rewrite this.
|
|
35
|
|
36 It is very important to keep in mind the context in which this module was
|
|
37 written: strictly to support the projects for which Consed.pm was
|
|
38 designed.
|
|
39
|
|
40 =head1 FEEDBACK
|
|
41
|
|
42 =head2 Mailing Lists
|
|
43
|
|
44 User feedback is an integral part of the evolution of this and other
|
|
45 Bioperl modules. Send your comments and suggestions preferably to one
|
|
46 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
47
|
|
48 bioperl-l@bioperl.org - General discussion
|
|
49 http://bio.perl.org/MailList.html - About the mailing
|
|
50 lists
|
|
51
|
|
52 =head2 Reporting Bugs
|
|
53
|
|
54 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
55 the bugs and their resolution. Bug reports can be submitted via
|
|
56 email or the web:
|
|
57
|
|
58 bioperl-bugs@bio.perl.org
|
|
59 http://bugzilla.bioperl.org/
|
|
60
|
|
61 =head1 AUTHOR - Chad Matsalla
|
|
62
|
|
63 Email bioinformatics@dieselwurks.com
|
|
64
|
|
65 =head1 APPENDIX
|
|
66
|
|
67 The rest of the documentation details each of the object methods.
|
|
68 Internal methods are usually preceded with a _
|
|
69
|
|
70 =cut
|
|
71
|
|
72 package Bio::Tools::Alignment::Trim;
|
|
73
|
|
74 use Bio::Root::Root;
|
|
75 use strict;
|
|
76 use Dumpvalue;
|
|
77
|
|
78
|
|
79
|
|
80 use vars qw($VERSION @ISA %DEFAULTS);
|
|
81
|
|
82 $VERSION = '0.01';
|
|
83
|
|
84 @ISA = qw(Bio::Root::Root);
|
|
85
|
|
86 BEGIN {
|
|
87 %DEFAULTS = ( 'f_designator' => 'f',
|
|
88 'r_designator' => 'r',
|
|
89 'windowsize' => '10',
|
|
90 'phreds' => '20');
|
|
91 }
|
|
92
|
|
93 =head2 new()
|
|
94
|
|
95 Title : new()
|
|
96 Usage : $o_trim = Bio::Tools::Alignment::Trim->new();
|
|
97 Function: Construct the Bio::Tools::Alignment::Trim object. No parameters
|
|
98 are required to create this object. It is strictly a bundle of
|
|
99 functions, as far as I am concerned.
|
|
100 Returns : A reference to a Bio::Tools::Alignment::Trim object.
|
|
101 Args : (optional)
|
|
102 -windowsize (default 10)
|
|
103 -phreds (default 20)
|
|
104
|
|
105
|
|
106 =cut
|
|
107
|
|
108 sub new {
|
|
109 my ($class,@args) = @_;
|
|
110 my $self = $class->SUPER::new(@args);
|
|
111 my($windowsize,$phreds) =
|
|
112 $self->_rearrange([qw(
|
|
113 WINDOWSIZE
|
|
114 PHREDS
|
|
115 )],
|
|
116 @args);
|
|
117 $self->{windowsize} = $windowsize || $DEFAULTS{'windowsize'};
|
|
118 $self->{phreds} = $phreds || $DEFAULTS{'phreds'};
|
|
119 # print("Constructor set phreds to ".$self->{phreds}."\n") if $self->verbose > 0;
|
|
120 $self->set_designators($DEFAULTS{'f_designator'},
|
|
121 $DEFAULTS{'r_designator'});
|
|
122 return $self;
|
|
123 }
|
|
124
|
|
125 =head2 set_designators($forward_designator,$reverse_designator)
|
|
126
|
|
127 Title : set_designators(<forward>,<reverse>)
|
|
128 Usage : $o_trim->set_designators("F","R")
|
|
129 Function: Set the string by which the system determines whether a given
|
|
130 sequence represents a forward or a reverse read.
|
|
131 Returns : Nothing.
|
|
132 Args : two scalars: one representing the forward designator and one
|
|
133 representing the reverse designator
|
|
134
|
|
135 =cut
|
|
136
|
|
137 sub set_designators {
|
|
138 my $self = shift;
|
|
139 ($self->{'f_designator'},$self->{'r_designator'}) = @_;
|
|
140 }
|
|
141
|
|
142 =head2 set_forward_designator($designator)
|
|
143
|
|
144 Title : set_forward_designator($designator)
|
|
145 Usage : $o_trim->set_forward_designator("F")
|
|
146 Function: Set the string by which the system determines if a given
|
|
147 sequence is a forward read.
|
|
148 Returns : Nothing.
|
|
149 Args : A string representing the forward designator of this project.
|
|
150
|
|
151 =cut
|
|
152
|
|
153 sub set_forward_designator {
|
|
154 my ($self,$desig) = @_;
|
|
155 $self->{'f_designator'} = $desig;
|
|
156 }
|
|
157
|
|
158 =head2 set_reverse_designator($reverse_designator)
|
|
159
|
|
160 Title : set_reverse_designator($reverse_designator)
|
|
161 Function: Set the string by which the system determines if a given
|
|
162 sequence is a reverse read.
|
|
163 Usage : $o_trim->set_reverse_designator("R")
|
|
164 Returns : Nothing.
|
|
165 Args : A string representing the forward designator of this project.
|
|
166
|
|
167 =cut
|
|
168
|
|
169 sub set_reverse_designator {
|
|
170 my ($self,$desig) = @_;
|
|
171 $self->{'r_designator'} = $desig;
|
|
172 }
|
|
173
|
|
174 =head2 get_designators()
|
|
175
|
|
176 Title : get_designators()
|
|
177 Usage : $o_trim->get_designators()
|
|
178 Returns : A string describing the current designators.
|
|
179 Args : None
|
|
180 Notes : Really for informational purposes only. Duh.
|
|
181
|
|
182 =cut
|
|
183
|
|
184 sub get_designators {
|
|
185 my $self = shift;
|
|
186 return("forward: ".$self->{'f_designator'}." reverse: ".$self->{'r_designator'});
|
|
187 }
|
|
188
|
|
189 =head2 trim_leading_polys()
|
|
190
|
|
191 Title : trim_leading_polys()
|
|
192 Usage : $o_trim->trim_leading_polys()
|
|
193 Function: Not implemented. Does nothing.
|
|
194 Returns : Nothing.
|
|
195 Args : None.
|
|
196 Notes : This function is not implemented. Part of something I wanted to
|
|
197 do but never got around to doing.
|
|
198
|
|
199 =cut
|
|
200
|
|
201 sub trim_leading_polys {
|
|
202 my ($self, $sequence) = @_;
|
|
203 }
|
|
204
|
|
205 =head2 dump_hash()
|
|
206
|
|
207 Title : dump_hash()
|
|
208 Usage : $o_trim->dump_hash()
|
|
209 Function: Unimplemented.
|
|
210 Returns : Nothing.
|
|
211 Args : None.
|
|
212 Notes : Does nothing.
|
|
213
|
|
214 =cut
|
|
215
|
|
216 sub dump_hash {
|
|
217 my $self = shift;
|
|
218 my %hash = %{$self->{'qualities'}};
|
|
219 } # end dump_hash
|
|
220
|
|
221 =head2 trim_singlet($sequence,$quality,$name,$class)
|
|
222
|
|
223 Title : trim_singlet($sequence,$quality,$name,$class)
|
|
224 Usage : ($r_trim_points,$trimmed_sequence) =
|
|
225 @{$o_trim->trim_singlet($sequence,$quality,$name,$class)};
|
|
226 Function: Trim a singlet based on its quality.
|
|
227 Returns : a reference to an array containing the forward and reverse
|
|
228 trim points and the trimmed sequence.
|
|
229 Args : $sequence : A sequence (SCALAR, please)
|
|
230 $quality : A _scalar_ of space-delimited quality values.
|
|
231 $name : the name of the sequence
|
|
232 $class : The class of the sequence. One of qw(singlet
|
|
233 singleton doublet pair multiplet)
|
|
234 Notes : At the time this was written the bioperl objects SeqWithQuality
|
|
235 and PrimaryQual did not exist. This is what is with the clumsy
|
|
236 passing of references and so on. I will rewrite this next time I
|
|
237 have to work with it. I also wasn't sure whether this function
|
|
238 should return just the trim points or the points and the sequence.
|
|
239 I decided that I always wanted both so that's how I implemented
|
|
240 it.
|
|
241 - Note that the size of the sliding windows is set during construction of
|
|
242 the Bio::Tools::Alignment::Trim object.
|
|
243
|
|
244 =cut
|
|
245
|
|
246 sub trim_singlet {
|
|
247 my ($self,$sequence,$quality,$name,$class) = @_;
|
|
248 # this split is done because I normally store quality values in a
|
|
249 # space-delimited scalar rather then in an array.
|
|
250 # I do this because serialization of the arrays is tough.
|
|
251 my @qual = split(' ',$quality);
|
|
252 my @points;
|
|
253 my $sequence_length = length($sequence);
|
|
254 my ($returnstring,$processed_sequence);
|
|
255 # smooth out the qualities
|
|
256 my $r_windows = &_sliding_window(\@qual,$self->{windowsize});
|
|
257 # find out the leading and trailing trimpoints
|
|
258 my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds});
|
|
259 my (@new_points,$trimmed_sequence);
|
|
260 # do you think that any sequence shorter then 100 should be
|
|
261 # discarded? I don't think that this should be the decision of this
|
|
262 # module.
|
|
263 # removed, 020926
|
|
264 $points[0] = $start_base;
|
|
265 # whew! now for the end base
|
|
266 # required parameters: reference_to_windows,windowsize,$phredvalue,start_base
|
|
267 my $end_base = &_get_end($r_windows,$self->{windowsize},
|
|
268 $self->{phreds},$start_base);
|
|
269 $points[1] = $end_base;
|
|
270 # now do the actual trimming
|
|
271 # CHAD : I don't think that it is a good idea to call chop_sequence here
|
|
272 # because chop_sequence also removes X's and N's and things
|
|
273 # and that is not always what is wanted
|
|
274 return \@points;
|
|
275 }
|
|
276
|
|
277 =head2 trim_doublet($sequence,$quality,$name,$class)
|
|
278
|
|
279 Title : trim_doublet($sequence,$quality,$name,$class)
|
|
280 Usage : ($r_trim_points,$trimmed_sequence) =
|
|
281 @{$o_trim->trim_singlet($sequence,$quality,$name,$class)};
|
|
282 Function: Trim a singlet based on its quality.
|
|
283 Returns : a reference to an array containing the forward and reverse
|
|
284 Args : $sequence : A sequence
|
|
285 $quality : A _scalar_ of space-delimited quality values.
|
|
286 $name : the name of the sequence
|
|
287 $class : The class of the sequence. One of qw(singlet
|
|
288 singleton doublet pair multiplet)
|
|
289 Notes : At the time this was written the bioperl objects SeqWithQuality
|
|
290 and PrimaryQual did not exist. This is what is with the clumsy
|
|
291 passing of references and so on. I will rewrite this next time I
|
|
292 have to work with it. I also wasn't sure whether this function
|
|
293 should return just the trim points or the points and the sequence.
|
|
294 I decided that I always wanted both so that's how I implemented
|
|
295 it.
|
|
296
|
|
297 =cut
|
|
298
|
|
299 #'
|
|
300 sub trim_doublet {
|
|
301 my ($self,$sequence,$quality,$name,$class) = @_;
|
|
302 my @qual = split(' ',$quality);
|
|
303 my @points;
|
|
304 my $sequence_length = length($sequence);
|
|
305 my ($returnstring,$processed_sequence);
|
|
306 # smooth out the qualities
|
|
307 my $r_windows = &_sliding_window(\@qual,$self->{windowsize});
|
|
308 # determine where the consensus sequence starts
|
|
309 my $offset = 0;
|
|
310 for (my $current = 0; $current<$sequence_length;$current++) {
|
|
311 if ($qual[$current] != 0) {
|
|
312 $offset = $current;
|
|
313 last;
|
|
314 }
|
|
315 }
|
|
316 # start_base required: r_quality,$windowsize,$phredvalue
|
|
317 my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds},$offset);
|
|
318 if ($start_base > ($sequence_length - 100)) {
|
|
319 $points[0] = ("FAILED");
|
|
320 $points[1] = ("FAILED");
|
|
321 return @points;
|
|
322 }
|
|
323 $points[0] = $start_base;
|
|
324 #
|
|
325 # whew! now for the end base
|
|
326 #
|
|
327 # required parameters: reference_to_windows,windowsize,$phredvalue,start_base
|
|
328 # |
|
|
329 # 010420 NOTE: We will no longer get the end base to avoid the Q/--\___/-- syndrome
|
|
330 my $end_base = $sequence_length;
|
|
331 my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual);
|
|
332 $points[1] = $end_base;
|
|
333 # CHAD : I don't think that it is a good idea to call chop_sequence here
|
|
334 # because chop_sequence also removes X's and N's and things
|
|
335 # and that is not always what is wanted
|
|
336 return @points;
|
|
337 } # end trim_doublet
|
|
338
|
|
339 =head2 chop_sequence($name,$class,$sequence,@points)
|
|
340
|
|
341 Title : chop_sequence($name,$class,$sequence,@points)
|
|
342 Usage : ($start_point,$end_point,$chopped_sequence) =
|
|
343 $o_trim->chop_sequence($name,$class,$sequence,@points);
|
|
344 Function: Chop a sequence based on its name, class, and sequence.
|
|
345 Returns : an array containing three scalars:
|
|
346 1- the start trim point
|
|
347 2- the end trim point
|
|
348 3- the chopped sequence
|
|
349 Args :
|
|
350 $name : the name of the sequence
|
|
351 $class : The class of the sequence. One of qw(singlet
|
|
352 singleton doublet pair multiplet)
|
|
353 $sequence : A sequence
|
|
354 @points : An array containing two elements- the first contains
|
|
355 the start trim point and the second conatines the end trim
|
|
356 point.
|
|
357
|
|
358 =cut
|
|
359
|
|
360 sub chop_sequence {
|
|
361 my ($self,$name,$class,$sequence,@points) = @_;
|
|
362 print("Coming into chop_sequence, \@points are @points\n");
|
|
363 my $fdesig = $self->{'f_designator'};
|
|
364 my $rdesig = $self->{'r_designator'};
|
|
365 if (!$points[0] && !$points[1]) {
|
|
366 $sequence = "junk";
|
|
367 return $sequence;
|
|
368 }
|
|
369 if ($class eq "singlet" && $name =~ /$fdesig$/) {
|
|
370 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
|
|
371 }
|
|
372 elsif ($class eq "singlet" && $name =~ /$rdesig$/) {
|
|
373 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
|
|
374 }
|
|
375 elsif ($class eq "singleton" && $name =~ /$fdesig$/) {
|
|
376 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
|
|
377 }
|
|
378 elsif ($class eq "singleton" && $name =~ /$rdesig$/) {
|
|
379 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
|
|
380 }
|
|
381 elsif ($class eq "doublet") {
|
|
382 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
|
|
383 }
|
|
384 # this is a _terrible_ to do this! i couldn't seem to find a better way
|
|
385 # i thought something like s/(^.*[Xx]{5,})//g; might work, but no go
|
|
386 # no time to find a fix!
|
|
387 my $length_before_trimming = length($sequence);
|
|
388 my $subs_Xs = $sequence =~ s/^.*[Xx]{5,}//g;
|
|
389 if ($subs_Xs) {
|
|
390 my $length_after_trimming = length($sequence);
|
|
391 my $number_Xs_trimmed = $length_before_trimming - $length_after_trimming;
|
|
392 $points[0] += $number_Xs_trimmed;
|
|
393 }
|
|
394 $length_before_trimming = length($sequence);
|
|
395 my $subs_Ns = $sequence =~ s/[Nn]{1,}$//g;
|
|
396 if ($subs_Ns) {
|
|
397 my $length_after_trimming = length($sequence);
|
|
398 my $number_Ns_trimmed = $length_before_trimming - $length_after_trimming;
|
|
399 $points[1] -= $number_Ns_trimmed;
|
|
400 $points[1] -= 1;
|
|
401 }
|
|
402 push @points,$sequence;
|
|
403 print("chop_sequence \@points are @points\n");
|
|
404 return @points;
|
|
405 }
|
|
406
|
|
407 =head2 _get_start($r_quals,$windowsize,$phreds,$offset)
|
|
408
|
|
409 Title : _get_start($r_quals,$windowsize,$phreds,$offset)
|
|
410 Usage : $start_base = $self->_get_start($r_windows,5,20);
|
|
411 Function: Provide the start trim point for this sequence.
|
|
412 Returns : a scalar representing the start of the sequence
|
|
413 Args :
|
|
414 $r_quals : A reference to an array containing quality values. In
|
|
415 context, this array of values has been smoothed by then
|
|
416 sliding window-look ahead algorithm.
|
|
417 $windowsize : The size of the window used when the sliding window
|
|
418 look-ahead average was calculated.
|
|
419 $phreds : <fill in what this does here>
|
|
420 $offset : <fill in what this does here>
|
|
421
|
|
422 =cut
|
|
423
|
|
424 sub _get_start {
|
|
425 my ($self,$r_quals,$windowsize,$phreds,$offset) = @_;
|
|
426 print("Using $phreds phreds\n") if $self->verbose > 0;
|
|
427 # this is to help determine whether the sequence is good at all
|
|
428 my @quals = @$r_quals;
|
|
429 my ($count,$count2,$qualsum);
|
|
430 if ($offset) { $count = $offset; } else { $count = 0; }
|
|
431 # search along the length of the sequence
|
|
432 for (; ($count+$windowsize) <= scalar(@quals); $count++) {
|
|
433 # sum all of the quality values in this window.
|
|
434 my $cumulative=0;
|
|
435 for($count2 = $count; $count2 < $count+$windowsize; $count2++) {
|
|
436 if (!$quals[$count2]) {
|
|
437 # print("Quals don't exist here!\n");
|
|
438 }
|
|
439 else {
|
|
440 $qualsum += $quals[$count2];
|
|
441 # print("Incremented qualsum to ($qualsum)\n");
|
|
442 }
|
|
443 $cumulative++;
|
|
444 }
|
|
445 # print("The sum of this window (starting at $count) is $qualsum. I counted $cumulative bases.\n");
|
|
446 # if the total of windowsize * phreds is
|
|
447 if ($qualsum && $qualsum >= $windowsize*$phreds) { return $count; }
|
|
448 $qualsum = 0;
|
|
449 }
|
|
450 # if ($count > scalar(@quals)-$windowsize) { return; }
|
|
451 return $count;
|
|
452 }
|
|
453
|
|
454 =head2 _get_end($r_qual,$windowsize,$phreds,$count)
|
|
455
|
|
456 Title : _get_end($r_qual,$windowsize,$phreds,$count)
|
|
457 Usage : my $end_base = &_get_end($r_windows,20,20,$start_base);
|
|
458 Function: Get the end trim point for this sequence.
|
|
459 Returns : A scalar representing the end trim point for this sequence.
|
|
460 Args :
|
|
461 $r_qual : A reference to an array containing quality values. In
|
|
462 context, this array of values has been smoothed by then
|
|
463 sliding window-look ahead algorithm.
|
|
464 $windowsize : The size of the window used when the sliding window
|
|
465 look-ahead average was calculated.
|
|
466 $phreds : <fill in what this does here>
|
|
467 $count : Start looking for the end of the sequence here.
|
|
468
|
|
469 =cut
|
|
470
|
|
471 sub _get_end {
|
|
472 my ($r_qual,$windowsize,$phreds,$count) = @_;
|
|
473 my @quals = @$r_qual;
|
|
474 my $total_bases = scalar(@quals);
|
|
475 my ($count2,$qualsum,$end_of_quals,$bases_counted);
|
|
476 if (!$count) { $count=0; }
|
|
477 BASE: for (; $count < $total_bases; $count++) {
|
|
478 $bases_counted = 0;
|
|
479 $qualsum = 0;
|
|
480 POSITION: for($count2 = $count; $count2 < $total_bases; $count2++) {
|
|
481 $bases_counted++;
|
|
482
|
|
483 if ($count2 == $total_bases-1) {
|
|
484 $qualsum += $quals[$count2];
|
|
485 $bases_counted++;
|
|
486 last BASE;
|
|
487 }
|
|
488 elsif ($bases_counted == $windowsize) {
|
|
489 $qualsum += $quals[$count2];
|
|
490 if ($qualsum < $bases_counted*$phreds) {
|
|
491 return $count+$bases_counted+$windowsize;
|
|
492 }
|
|
493 next BASE;
|
|
494 }
|
|
495 else {
|
|
496 $qualsum += $quals[$count2];
|
|
497 }
|
|
498 }
|
|
499 if ($qualsum < $bases_counted*$phreds) {
|
|
500 return $count+$bases_counted+$windowsize;
|
|
501 }
|
|
502 else { }
|
|
503 $qualsum = 0;
|
|
504 } # end for
|
|
505 if ($end_of_quals) {
|
|
506 my $bases_for_average = $total_bases-$count2;
|
|
507 return $count2;
|
|
508 }
|
|
509 else { }
|
|
510 if ($qualsum) { } # print ("$qualsum\n");
|
|
511 return $total_bases;
|
|
512 } # end get_end
|
|
513
|
|
514 =head2 count_doublet_trailing_zeros($r_qual)
|
|
515
|
|
516 Title : count_doublet_trailing_zeros($r_qual)
|
|
517 Usage : my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual);
|
|
518 Function: Find out when the trailing zero qualities start.
|
|
519 Returns : A scalar representing where the zeros start.
|
|
520 Args : A reference to an array of quality values.
|
|
521 Notes : Again, this should be rewritten to use PrimaryQual objects.
|
|
522 A more detailed explanation of why phrap puts these zeros here should
|
|
523 be written and placed here. Please email and hassle the author.
|
|
524
|
|
525
|
|
526 =cut
|
|
527
|
|
528 sub count_doublet_trailing_zeros {
|
|
529 my ($r_qual) = shift;
|
|
530 my $number_of_trailing_zeros = 0;
|
|
531 my @qualities = @$r_qual;
|
|
532 for (my $current=scalar(@qualities);$current>0;$current--) {
|
|
533 if ($qualities[$current] && $qualities[$current] != 0) {
|
|
534 $number_of_trailing_zeros = scalar(@qualities)-$current;
|
|
535 return $current+1;
|
|
536 }
|
|
537 }
|
|
538 return scalar(@qualities);
|
|
539 } # end count_doublet_trailing_zeros
|
|
540
|
|
541 =head2 _sliding_window($r_quals,$windowsize)
|
|
542
|
|
543 Title : _sliding_window($r_quals,$windowsize)
|
|
544 Usage : my $r_windows = &_sliding_window(\@qual,$windowsize);
|
|
545 Function: Create a sliding window, look-forward-average on an array
|
|
546 of quality values. Used to smooth out differences in qualities.
|
|
547 Returns : A reference to an array containing the smoothed values.
|
|
548 Args : $r_quals: A reference to an array containing quality values.
|
|
549 $windowsize : The size of the sliding window.
|
|
550 Notes : This was written before PrimaryQual objects existed. They
|
|
551 should use that object but I haven't rewritten this yet.
|
|
552
|
|
553 =cut
|
|
554
|
|
555 #'
|
|
556 sub _sliding_window {
|
|
557 my ($r_quals,$windowsize) = @_;
|
|
558 my (@window,@quals,$qualsum,$count,$count2,$average,@averages,$bases_counted);
|
|
559 @quals = @$r_quals;
|
|
560 my $size_of_quality = scalar(@quals);
|
|
561 # do this loop for all of the qualities
|
|
562 for ($count=0; $count <= $size_of_quality; $count++) {
|
|
563 $bases_counted = 0;
|
|
564 BASE: for($count2 = $count; $count2 < $size_of_quality; $count2++) {
|
|
565 $bases_counted++;
|
|
566 # if the search hits the end of the averages, stop
|
|
567 # this is for the case near the end where bases remaining < windowsize
|
|
568 if ($count2 == $size_of_quality) {
|
|
569 $qualsum += $quals[$count2];
|
|
570 last BASE;
|
|
571 }
|
|
572 # if the search hits the size of the window
|
|
573 elsif ($bases_counted == $windowsize) {
|
|
574 $qualsum += $quals[$count2];
|
|
575 last BASE;
|
|
576 }
|
|
577 # otherwise add the quality value
|
|
578 unless (!$quals[$count2]) {
|
|
579 $qualsum += $quals[$count2];
|
|
580 }
|
|
581 }
|
|
582 unless (!$qualsum || !$windowsize) {
|
|
583 $average = $qualsum / $bases_counted;
|
|
584 if (!$average) { $average = "0"; }
|
|
585 push @averages,$average;
|
|
586 }
|
|
587 $qualsum = 0;
|
|
588 }
|
|
589 # 02101 Yes, I repaired the mismatching numbers between averages and windows.
|
|
590 # print("There are ".scalar(@$r_quals)." quality values. They are @$r_quals\n");
|
|
591 # print("There are ".scalar(@averages)." average values. They are @averages\n");
|
|
592 return \@averages;
|
|
593
|
|
594 }
|
|
595
|
|
596 =head2 _print_formatted_qualities
|
|
597
|
|
598 Title : _print_formatted_qualities(\@quals)
|
|
599 Usage : &_print_formatted_qualities(\@quals);
|
|
600 Returns : Nothing. Prints.
|
|
601 Args : A reference to an array containing quality values.
|
|
602 Notes : An internal procedure used in debugging. Prints out an array nicely.
|
|
603
|
|
604 =cut
|
|
605
|
|
606 sub _print_formatted_qualities {
|
|
607 my $rquals = shift;
|
|
608 my @qual = @$rquals;
|
|
609 for (my $count=0; $count<scalar(@qual) ; $count++) {
|
|
610 if (($count%10)==0) { print("\n$count\t"); }
|
|
611 if ($qual[$count]) { print ("$qual[$count]\t");}
|
|
612 else { print("0\t"); }
|
|
613 }
|
|
614 print("\n");
|
|
615 }
|
|
616
|
|
617 =head2 _get_end_old($r_qual,$windowsize,$phreds,$count)
|
|
618
|
|
619 Title : _get_end_old($r_qual,$windowsize,$phreds,$count)
|
|
620 Usage : Deprecated. Don't use this!
|
|
621 Returns : Deprecated. Don't use this!
|
|
622 Args : Deprecated. Don't use this!
|
|
623
|
|
624 =cut
|
|
625
|
|
626 #'
|
|
627 sub _get_end_old {
|
|
628 my ($r_qual,$windowsize,$phreds,$count) = @_;
|
|
629 warn("Do Not Use this function (_get_end_old)");
|
|
630 my $target = $windowsize*$phreds;
|
|
631 my @quals = @$r_qual;
|
|
632 my $total_bases = scalar(@quals);
|
|
633 my ($count2,$qualsum,$end_of_quals);
|
|
634 if (!$count) { $count=0; }
|
|
635 BASE: for (; $count < $total_bases; $count++) {
|
|
636 for($count2 = $count; $count2 < $count+$windowsize; $count2++) {
|
|
637 if ($count2 == scalar(@quals)-1) {
|
|
638 $qualsum += $quals[$count2];
|
|
639 $end_of_quals = 1;
|
|
640 last BASE;
|
|
641
|
|
642 }
|
|
643 $qualsum += $quals[$count2];
|
|
644 }
|
|
645 if ($qualsum < $windowsize*$phreds) {
|
|
646 return $count+$windowsize;
|
|
647 }
|
|
648 $qualsum = 0;
|
|
649 } # end for
|
|
650 } # end get_end_old
|
|
651
|
|
652
|
|
653 # Autoload methods go after =cut, and are processed by the autosplit program.
|
|
654
|
|
655 1;
|
|
656 __END__
|