comparison variant_effect_predictor/Bio/Tools/Alignment/Trim.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # Bio::Tools::Alignment::Trim.pm
2 #
3 # Cared for by Chad Matsalla
4 #
5 # Copyright Chad Matsalla
6 #
7 # You may distribute this module under the same terms as perl itself
8
9 # POD documentation - main docs before the code
10
11 =head1 NAME
12
13 Bio::Tools::Alignment::Trim - A kludge to do specialized trimming of
14 sequence based on quality.
15
16 =head1 SYNOPSIS
17
18 use Bio::Tools::Alignment::Trim;
19 $o_trim = new Bio::Tools::Alignment::Trim;
20 $o_trim->set_reverse_designator("R");
21 $o_trim->set_forward_designator("F");
22
23
24 =head1 DESCRIPTION
25
26 This is a specialized module designed by Chad for Chad to trim sequences
27 based on a highly specialized list of requirements. In other words, write
28 something that will trim sequences 'just like the people in the lab would
29 do manually'.
30
31 I settled on a sliding-window-average style of search which is ugly and
32 slow but does _exactly_ what I want it to do.
33
34 Mental note: rewrite this.
35
36 It is very important to keep in mind the context in which this module was
37 written: strictly to support the projects for which Consed.pm was
38 designed.
39
40 =head1 FEEDBACK
41
42 =head2 Mailing Lists
43
44 User feedback is an integral part of the evolution of this and other
45 Bioperl modules. Send your comments and suggestions preferably to one
46 of the Bioperl mailing lists. Your participation is much appreciated.
47
48 bioperl-l@bioperl.org - General discussion
49 http://bio.perl.org/MailList.html - About the mailing
50 lists
51
52 =head2 Reporting Bugs
53
54 Report bugs to the Bioperl bug tracking system to help us keep track
55 the bugs and their resolution. Bug reports can be submitted via
56 email or the web:
57
58 bioperl-bugs@bio.perl.org
59 http://bugzilla.bioperl.org/
60
61 =head1 AUTHOR - Chad Matsalla
62
63 Email bioinformatics@dieselwurks.com
64
65 =head1 APPENDIX
66
67 The rest of the documentation details each of the object methods.
68 Internal methods are usually preceded with a _
69
70 =cut
71
72 package Bio::Tools::Alignment::Trim;
73
74 use Bio::Root::Root;
75 use strict;
76 use Dumpvalue;
77
78
79
80 use vars qw($VERSION @ISA %DEFAULTS);
81
82 $VERSION = '0.01';
83
84 @ISA = qw(Bio::Root::Root);
85
86 BEGIN {
87 %DEFAULTS = ( 'f_designator' => 'f',
88 'r_designator' => 'r',
89 'windowsize' => '10',
90 'phreds' => '20');
91 }
92
93 =head2 new()
94
95 Title : new()
96 Usage : $o_trim = Bio::Tools::Alignment::Trim->new();
97 Function: Construct the Bio::Tools::Alignment::Trim object. No parameters
98 are required to create this object. It is strictly a bundle of
99 functions, as far as I am concerned.
100 Returns : A reference to a Bio::Tools::Alignment::Trim object.
101 Args : (optional)
102 -windowsize (default 10)
103 -phreds (default 20)
104
105
106 =cut
107
108 sub new {
109 my ($class,@args) = @_;
110 my $self = $class->SUPER::new(@args);
111 my($windowsize,$phreds) =
112 $self->_rearrange([qw(
113 WINDOWSIZE
114 PHREDS
115 )],
116 @args);
117 $self->{windowsize} = $windowsize || $DEFAULTS{'windowsize'};
118 $self->{phreds} = $phreds || $DEFAULTS{'phreds'};
119 # print("Constructor set phreds to ".$self->{phreds}."\n") if $self->verbose > 0;
120 $self->set_designators($DEFAULTS{'f_designator'},
121 $DEFAULTS{'r_designator'});
122 return $self;
123 }
124
125 =head2 set_designators($forward_designator,$reverse_designator)
126
127 Title : set_designators(<forward>,<reverse>)
128 Usage : $o_trim->set_designators("F","R")
129 Function: Set the string by which the system determines whether a given
130 sequence represents a forward or a reverse read.
131 Returns : Nothing.
132 Args : two scalars: one representing the forward designator and one
133 representing the reverse designator
134
135 =cut
136
137 sub set_designators {
138 my $self = shift;
139 ($self->{'f_designator'},$self->{'r_designator'}) = @_;
140 }
141
142 =head2 set_forward_designator($designator)
143
144 Title : set_forward_designator($designator)
145 Usage : $o_trim->set_forward_designator("F")
146 Function: Set the string by which the system determines if a given
147 sequence is a forward read.
148 Returns : Nothing.
149 Args : A string representing the forward designator of this project.
150
151 =cut
152
153 sub set_forward_designator {
154 my ($self,$desig) = @_;
155 $self->{'f_designator'} = $desig;
156 }
157
158 =head2 set_reverse_designator($reverse_designator)
159
160 Title : set_reverse_designator($reverse_designator)
161 Function: Set the string by which the system determines if a given
162 sequence is a reverse read.
163 Usage : $o_trim->set_reverse_designator("R")
164 Returns : Nothing.
165 Args : A string representing the forward designator of this project.
166
167 =cut
168
169 sub set_reverse_designator {
170 my ($self,$desig) = @_;
171 $self->{'r_designator'} = $desig;
172 }
173
174 =head2 get_designators()
175
176 Title : get_designators()
177 Usage : $o_trim->get_designators()
178 Returns : A string describing the current designators.
179 Args : None
180 Notes : Really for informational purposes only. Duh.
181
182 =cut
183
184 sub get_designators {
185 my $self = shift;
186 return("forward: ".$self->{'f_designator'}." reverse: ".$self->{'r_designator'});
187 }
188
189 =head2 trim_leading_polys()
190
191 Title : trim_leading_polys()
192 Usage : $o_trim->trim_leading_polys()
193 Function: Not implemented. Does nothing.
194 Returns : Nothing.
195 Args : None.
196 Notes : This function is not implemented. Part of something I wanted to
197 do but never got around to doing.
198
199 =cut
200
201 sub trim_leading_polys {
202 my ($self, $sequence) = @_;
203 }
204
205 =head2 dump_hash()
206
207 Title : dump_hash()
208 Usage : $o_trim->dump_hash()
209 Function: Unimplemented.
210 Returns : Nothing.
211 Args : None.
212 Notes : Does nothing.
213
214 =cut
215
216 sub dump_hash {
217 my $self = shift;
218 my %hash = %{$self->{'qualities'}};
219 } # end dump_hash
220
221 =head2 trim_singlet($sequence,$quality,$name,$class)
222
223 Title : trim_singlet($sequence,$quality,$name,$class)
224 Usage : ($r_trim_points,$trimmed_sequence) =
225 @{$o_trim->trim_singlet($sequence,$quality,$name,$class)};
226 Function: Trim a singlet based on its quality.
227 Returns : a reference to an array containing the forward and reverse
228 trim points and the trimmed sequence.
229 Args : $sequence : A sequence (SCALAR, please)
230 $quality : A _scalar_ of space-delimited quality values.
231 $name : the name of the sequence
232 $class : The class of the sequence. One of qw(singlet
233 singleton doublet pair multiplet)
234 Notes : At the time this was written the bioperl objects SeqWithQuality
235 and PrimaryQual did not exist. This is what is with the clumsy
236 passing of references and so on. I will rewrite this next time I
237 have to work with it. I also wasn't sure whether this function
238 should return just the trim points or the points and the sequence.
239 I decided that I always wanted both so that's how I implemented
240 it.
241 - Note that the size of the sliding windows is set during construction of
242 the Bio::Tools::Alignment::Trim object.
243
244 =cut
245
246 sub trim_singlet {
247 my ($self,$sequence,$quality,$name,$class) = @_;
248 # this split is done because I normally store quality values in a
249 # space-delimited scalar rather then in an array.
250 # I do this because serialization of the arrays is tough.
251 my @qual = split(' ',$quality);
252 my @points;
253 my $sequence_length = length($sequence);
254 my ($returnstring,$processed_sequence);
255 # smooth out the qualities
256 my $r_windows = &_sliding_window(\@qual,$self->{windowsize});
257 # find out the leading and trailing trimpoints
258 my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds});
259 my (@new_points,$trimmed_sequence);
260 # do you think that any sequence shorter then 100 should be
261 # discarded? I don't think that this should be the decision of this
262 # module.
263 # removed, 020926
264 $points[0] = $start_base;
265 # whew! now for the end base
266 # required parameters: reference_to_windows,windowsize,$phredvalue,start_base
267 my $end_base = &_get_end($r_windows,$self->{windowsize},
268 $self->{phreds},$start_base);
269 $points[1] = $end_base;
270 # now do the actual trimming
271 # CHAD : I don't think that it is a good idea to call chop_sequence here
272 # because chop_sequence also removes X's and N's and things
273 # and that is not always what is wanted
274 return \@points;
275 }
276
277 =head2 trim_doublet($sequence,$quality,$name,$class)
278
279 Title : trim_doublet($sequence,$quality,$name,$class)
280 Usage : ($r_trim_points,$trimmed_sequence) =
281 @{$o_trim->trim_singlet($sequence,$quality,$name,$class)};
282 Function: Trim a singlet based on its quality.
283 Returns : a reference to an array containing the forward and reverse
284 Args : $sequence : A sequence
285 $quality : A _scalar_ of space-delimited quality values.
286 $name : the name of the sequence
287 $class : The class of the sequence. One of qw(singlet
288 singleton doublet pair multiplet)
289 Notes : At the time this was written the bioperl objects SeqWithQuality
290 and PrimaryQual did not exist. This is what is with the clumsy
291 passing of references and so on. I will rewrite this next time I
292 have to work with it. I also wasn't sure whether this function
293 should return just the trim points or the points and the sequence.
294 I decided that I always wanted both so that's how I implemented
295 it.
296
297 =cut
298
299 #'
300 sub trim_doublet {
301 my ($self,$sequence,$quality,$name,$class) = @_;
302 my @qual = split(' ',$quality);
303 my @points;
304 my $sequence_length = length($sequence);
305 my ($returnstring,$processed_sequence);
306 # smooth out the qualities
307 my $r_windows = &_sliding_window(\@qual,$self->{windowsize});
308 # determine where the consensus sequence starts
309 my $offset = 0;
310 for (my $current = 0; $current<$sequence_length;$current++) {
311 if ($qual[$current] != 0) {
312 $offset = $current;
313 last;
314 }
315 }
316 # start_base required: r_quality,$windowsize,$phredvalue
317 my $start_base = $self->_get_start($r_windows,$self->{windowsize},$self->{phreds},$offset);
318 if ($start_base > ($sequence_length - 100)) {
319 $points[0] = ("FAILED");
320 $points[1] = ("FAILED");
321 return @points;
322 }
323 $points[0] = $start_base;
324 #
325 # whew! now for the end base
326 #
327 # required parameters: reference_to_windows,windowsize,$phredvalue,start_base
328 # |
329 # 010420 NOTE: We will no longer get the end base to avoid the Q/--\___/-- syndrome
330 my $end_base = $sequence_length;
331 my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual);
332 $points[1] = $end_base;
333 # CHAD : I don't think that it is a good idea to call chop_sequence here
334 # because chop_sequence also removes X's and N's and things
335 # and that is not always what is wanted
336 return @points;
337 } # end trim_doublet
338
339 =head2 chop_sequence($name,$class,$sequence,@points)
340
341 Title : chop_sequence($name,$class,$sequence,@points)
342 Usage : ($start_point,$end_point,$chopped_sequence) =
343 $o_trim->chop_sequence($name,$class,$sequence,@points);
344 Function: Chop a sequence based on its name, class, and sequence.
345 Returns : an array containing three scalars:
346 1- the start trim point
347 2- the end trim point
348 3- the chopped sequence
349 Args :
350 $name : the name of the sequence
351 $class : The class of the sequence. One of qw(singlet
352 singleton doublet pair multiplet)
353 $sequence : A sequence
354 @points : An array containing two elements- the first contains
355 the start trim point and the second conatines the end trim
356 point.
357
358 =cut
359
360 sub chop_sequence {
361 my ($self,$name,$class,$sequence,@points) = @_;
362 print("Coming into chop_sequence, \@points are @points\n");
363 my $fdesig = $self->{'f_designator'};
364 my $rdesig = $self->{'r_designator'};
365 if (!$points[0] && !$points[1]) {
366 $sequence = "junk";
367 return $sequence;
368 }
369 if ($class eq "singlet" && $name =~ /$fdesig$/) {
370 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
371 }
372 elsif ($class eq "singlet" && $name =~ /$rdesig$/) {
373 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
374 }
375 elsif ($class eq "singleton" && $name =~ /$fdesig$/) {
376 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
377 }
378 elsif ($class eq "singleton" && $name =~ /$rdesig$/) {
379 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
380 }
381 elsif ($class eq "doublet") {
382 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
383 }
384 # this is a _terrible_ to do this! i couldn't seem to find a better way
385 # i thought something like s/(^.*[Xx]{5,})//g; might work, but no go
386 # no time to find a fix!
387 my $length_before_trimming = length($sequence);
388 my $subs_Xs = $sequence =~ s/^.*[Xx]{5,}//g;
389 if ($subs_Xs) {
390 my $length_after_trimming = length($sequence);
391 my $number_Xs_trimmed = $length_before_trimming - $length_after_trimming;
392 $points[0] += $number_Xs_trimmed;
393 }
394 $length_before_trimming = length($sequence);
395 my $subs_Ns = $sequence =~ s/[Nn]{1,}$//g;
396 if ($subs_Ns) {
397 my $length_after_trimming = length($sequence);
398 my $number_Ns_trimmed = $length_before_trimming - $length_after_trimming;
399 $points[1] -= $number_Ns_trimmed;
400 $points[1] -= 1;
401 }
402 push @points,$sequence;
403 print("chop_sequence \@points are @points\n");
404 return @points;
405 }
406
407 =head2 _get_start($r_quals,$windowsize,$phreds,$offset)
408
409 Title : _get_start($r_quals,$windowsize,$phreds,$offset)
410 Usage : $start_base = $self->_get_start($r_windows,5,20);
411 Function: Provide the start trim point for this sequence.
412 Returns : a scalar representing the start of the sequence
413 Args :
414 $r_quals : A reference to an array containing quality values. In
415 context, this array of values has been smoothed by then
416 sliding window-look ahead algorithm.
417 $windowsize : The size of the window used when the sliding window
418 look-ahead average was calculated.
419 $phreds : <fill in what this does here>
420 $offset : <fill in what this does here>
421
422 =cut
423
424 sub _get_start {
425 my ($self,$r_quals,$windowsize,$phreds,$offset) = @_;
426 print("Using $phreds phreds\n") if $self->verbose > 0;
427 # this is to help determine whether the sequence is good at all
428 my @quals = @$r_quals;
429 my ($count,$count2,$qualsum);
430 if ($offset) { $count = $offset; } else { $count = 0; }
431 # search along the length of the sequence
432 for (; ($count+$windowsize) <= scalar(@quals); $count++) {
433 # sum all of the quality values in this window.
434 my $cumulative=0;
435 for($count2 = $count; $count2 < $count+$windowsize; $count2++) {
436 if (!$quals[$count2]) {
437 # print("Quals don't exist here!\n");
438 }
439 else {
440 $qualsum += $quals[$count2];
441 # print("Incremented qualsum to ($qualsum)\n");
442 }
443 $cumulative++;
444 }
445 # print("The sum of this window (starting at $count) is $qualsum. I counted $cumulative bases.\n");
446 # if the total of windowsize * phreds is
447 if ($qualsum && $qualsum >= $windowsize*$phreds) { return $count; }
448 $qualsum = 0;
449 }
450 # if ($count > scalar(@quals)-$windowsize) { return; }
451 return $count;
452 }
453
454 =head2 _get_end($r_qual,$windowsize,$phreds,$count)
455
456 Title : _get_end($r_qual,$windowsize,$phreds,$count)
457 Usage : my $end_base = &_get_end($r_windows,20,20,$start_base);
458 Function: Get the end trim point for this sequence.
459 Returns : A scalar representing the end trim point for this sequence.
460 Args :
461 $r_qual : A reference to an array containing quality values. In
462 context, this array of values has been smoothed by then
463 sliding window-look ahead algorithm.
464 $windowsize : The size of the window used when the sliding window
465 look-ahead average was calculated.
466 $phreds : <fill in what this does here>
467 $count : Start looking for the end of the sequence here.
468
469 =cut
470
471 sub _get_end {
472 my ($r_qual,$windowsize,$phreds,$count) = @_;
473 my @quals = @$r_qual;
474 my $total_bases = scalar(@quals);
475 my ($count2,$qualsum,$end_of_quals,$bases_counted);
476 if (!$count) { $count=0; }
477 BASE: for (; $count < $total_bases; $count++) {
478 $bases_counted = 0;
479 $qualsum = 0;
480 POSITION: for($count2 = $count; $count2 < $total_bases; $count2++) {
481 $bases_counted++;
482
483 if ($count2 == $total_bases-1) {
484 $qualsum += $quals[$count2];
485 $bases_counted++;
486 last BASE;
487 }
488 elsif ($bases_counted == $windowsize) {
489 $qualsum += $quals[$count2];
490 if ($qualsum < $bases_counted*$phreds) {
491 return $count+$bases_counted+$windowsize;
492 }
493 next BASE;
494 }
495 else {
496 $qualsum += $quals[$count2];
497 }
498 }
499 if ($qualsum < $bases_counted*$phreds) {
500 return $count+$bases_counted+$windowsize;
501 }
502 else { }
503 $qualsum = 0;
504 } # end for
505 if ($end_of_quals) {
506 my $bases_for_average = $total_bases-$count2;
507 return $count2;
508 }
509 else { }
510 if ($qualsum) { } # print ("$qualsum\n");
511 return $total_bases;
512 } # end get_end
513
514 =head2 count_doublet_trailing_zeros($r_qual)
515
516 Title : count_doublet_trailing_zeros($r_qual)
517 Usage : my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual);
518 Function: Find out when the trailing zero qualities start.
519 Returns : A scalar representing where the zeros start.
520 Args : A reference to an array of quality values.
521 Notes : Again, this should be rewritten to use PrimaryQual objects.
522 A more detailed explanation of why phrap puts these zeros here should
523 be written and placed here. Please email and hassle the author.
524
525
526 =cut
527
528 sub count_doublet_trailing_zeros {
529 my ($r_qual) = shift;
530 my $number_of_trailing_zeros = 0;
531 my @qualities = @$r_qual;
532 for (my $current=scalar(@qualities);$current>0;$current--) {
533 if ($qualities[$current] && $qualities[$current] != 0) {
534 $number_of_trailing_zeros = scalar(@qualities)-$current;
535 return $current+1;
536 }
537 }
538 return scalar(@qualities);
539 } # end count_doublet_trailing_zeros
540
541 =head2 _sliding_window($r_quals,$windowsize)
542
543 Title : _sliding_window($r_quals,$windowsize)
544 Usage : my $r_windows = &_sliding_window(\@qual,$windowsize);
545 Function: Create a sliding window, look-forward-average on an array
546 of quality values. Used to smooth out differences in qualities.
547 Returns : A reference to an array containing the smoothed values.
548 Args : $r_quals: A reference to an array containing quality values.
549 $windowsize : The size of the sliding window.
550 Notes : This was written before PrimaryQual objects existed. They
551 should use that object but I haven't rewritten this yet.
552
553 =cut
554
555 #'
556 sub _sliding_window {
557 my ($r_quals,$windowsize) = @_;
558 my (@window,@quals,$qualsum,$count,$count2,$average,@averages,$bases_counted);
559 @quals = @$r_quals;
560 my $size_of_quality = scalar(@quals);
561 # do this loop for all of the qualities
562 for ($count=0; $count <= $size_of_quality; $count++) {
563 $bases_counted = 0;
564 BASE: for($count2 = $count; $count2 < $size_of_quality; $count2++) {
565 $bases_counted++;
566 # if the search hits the end of the averages, stop
567 # this is for the case near the end where bases remaining < windowsize
568 if ($count2 == $size_of_quality) {
569 $qualsum += $quals[$count2];
570 last BASE;
571 }
572 # if the search hits the size of the window
573 elsif ($bases_counted == $windowsize) {
574 $qualsum += $quals[$count2];
575 last BASE;
576 }
577 # otherwise add the quality value
578 unless (!$quals[$count2]) {
579 $qualsum += $quals[$count2];
580 }
581 }
582 unless (!$qualsum || !$windowsize) {
583 $average = $qualsum / $bases_counted;
584 if (!$average) { $average = "0"; }
585 push @averages,$average;
586 }
587 $qualsum = 0;
588 }
589 # 02101 Yes, I repaired the mismatching numbers between averages and windows.
590 # print("There are ".scalar(@$r_quals)." quality values. They are @$r_quals\n");
591 # print("There are ".scalar(@averages)." average values. They are @averages\n");
592 return \@averages;
593
594 }
595
596 =head2 _print_formatted_qualities
597
598 Title : _print_formatted_qualities(\@quals)
599 Usage : &_print_formatted_qualities(\@quals);
600 Returns : Nothing. Prints.
601 Args : A reference to an array containing quality values.
602 Notes : An internal procedure used in debugging. Prints out an array nicely.
603
604 =cut
605
606 sub _print_formatted_qualities {
607 my $rquals = shift;
608 my @qual = @$rquals;
609 for (my $count=0; $count<scalar(@qual) ; $count++) {
610 if (($count%10)==0) { print("\n$count\t"); }
611 if ($qual[$count]) { print ("$qual[$count]\t");}
612 else { print("0\t"); }
613 }
614 print("\n");
615 }
616
617 =head2 _get_end_old($r_qual,$windowsize,$phreds,$count)
618
619 Title : _get_end_old($r_qual,$windowsize,$phreds,$count)
620 Usage : Deprecated. Don't use this!
621 Returns : Deprecated. Don't use this!
622 Args : Deprecated. Don't use this!
623
624 =cut
625
626 #'
627 sub _get_end_old {
628 my ($r_qual,$windowsize,$phreds,$count) = @_;
629 warn("Do Not Use this function (_get_end_old)");
630 my $target = $windowsize*$phreds;
631 my @quals = @$r_qual;
632 my $total_bases = scalar(@quals);
633 my ($count2,$qualsum,$end_of_quals);
634 if (!$count) { $count=0; }
635 BASE: for (; $count < $total_bases; $count++) {
636 for($count2 = $count; $count2 < $count+$windowsize; $count2++) {
637 if ($count2 == scalar(@quals)-1) {
638 $qualsum += $quals[$count2];
639 $end_of_quals = 1;
640 last BASE;
641
642 }
643 $qualsum += $quals[$count2];
644 }
645 if ($qualsum < $windowsize*$phreds) {
646 return $count+$windowsize;
647 }
648 $qualsum = 0;
649 } # end for
650 } # end get_end_old
651
652
653 # Autoload methods go after =cut, and are processed by the autosplit program.
654
655 1;
656 __END__