Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/Alignment/Consed.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # Bio::Tools::Alignment::Consed.pm | |
2 # | |
3 # Cared for by Chad Matsalla | |
4 # | |
5 # Copyright Chad Matsalla | |
6 # | |
7 # You may distribute this module under the same terms as perl itself | |
8 | |
9 # POD documentation - main docs before the code | |
10 | |
11 =head1 NAME | |
12 | |
13 Bio::Tools::Alignment::Consed - A module to work with objects from consed .ace files | |
14 | |
15 =head1 SYNOPSIS | |
16 | |
17 # a report for sequencing stuff | |
18 my $o_consed = new Bio::Tools::Alignment::Consed( | |
19 -acefile => "/path/to/an/acefile.ace.1", | |
20 -verbose => 1); | |
21 my $foo = $o_consed->set_reverse_designator("r"); | |
22 my $bar = $o_consed->set_forward_designator("f"); | |
23 | |
24 # get the contig numbers | |
25 my @keys = $o_consed->get_contigs(); | |
26 | |
27 # construct the doublets | |
28 my $setter_doublets = $o_consed->choose_doublets(); | |
29 | |
30 # get the doublets | |
31 my @doublets = $o_consed->get_doublets(); | |
32 | |
33 =head1 DESCRIPTION | |
34 | |
35 Bio::Tools::Alignment::Consed provides methods and objects to deal | |
36 with the output from the Consed package of programs. Specifically, | |
37 Bio::Tools::Alignment::Consed takes in the name of in .ace file and | |
38 provides objects for the results. | |
39 | |
40 A word about doublets: This module was written to accomodate a large | |
41 EST sequencing operation. In this case, EST's were sequenced from the | |
42 3' and from the 5' end of the EST. The objective was to find a | |
43 consensus sequence for these two reads. Thus, a contig of two is what | |
44 we wanted, and this contig should consist of the forward and reverse | |
45 reads of a getn clone. For example, for a forward designator of "F" | |
46 and a reverse designator of "R", if the two reads chad1F and chad1R | |
47 were in a single contig (for example Contig 5) it will be determined | |
48 that the consensus sequence for Contig 5 will be the sequence for | |
49 clone chad1. | |
50 | |
51 Doublets are good! | |
52 | |
53 This module parses .ace and related files. A detailed list of methods | |
54 can be found at the end of this document. | |
55 | |
56 I wrote a detailed rationale for design that may explain the reasons | |
57 why some things were done the way they were done. That document is | |
58 beyond the scope of this pod and can probably be found in the | |
59 directory from which this module was 'made' or at | |
60 http://www.dieselwurks.com/bioinformatics/consedpm_documentation.pdf | |
61 | |
62 Note that the pod in that document might be old but the original | |
63 rationale still stands. | |
64 | |
65 =head1 FEEDBACK | |
66 | |
67 =head2 Mailing Lists | |
68 | |
69 User feedback is an integral part of the evolution of this and other | |
70 Bioperl modules. Send your comments and suggestions preferably to one | |
71 of the Bioperl mailing lists. Your participation is much appreciated. | |
72 | |
73 bioperl-l@bioperl.org - General discussion | |
74 http://bio.perl.org/MailList.html - About the mailing lists | |
75 | |
76 =head2 Reporting Bugs | |
77 | |
78 Report bugs to the Bioperl bug tracking system to help us keep track | |
79 the bugs and their resolution. Bug reports can be submitted via email | |
80 or the web: | |
81 | |
82 bioperl-bugs@bio.perl.org | |
83 http://bugzilla.bioperl.org/ | |
84 | |
85 | |
86 =head1 AUTHOR - Chad Matsalla | |
87 | |
88 chad@dieselwurks.com | |
89 | |
90 =head1 APPENDIX | |
91 | |
92 The rest of the documentation details each of the object | |
93 methods. Internal methods are usually preceded with a _ | |
94 | |
95 =cut | |
96 | |
97 #' | |
98 | |
99 package Bio::Tools::Alignment::Consed; | |
100 | |
101 use strict; | |
102 use vars qw($VERSION @ISA $Contigs %DEFAULTS); | |
103 use FileHandle; | |
104 use Dumpvalue qw(dumpValue); | |
105 use Bio::Tools::Alignment::Trim; | |
106 use Bio::Root::Root; | |
107 use Bio::Root::IO; | |
108 | |
109 @ISA = qw(Bio::Root::Root Bio::Root::IO); | |
110 | |
111 $VERSION = '0.60'; | |
112 | |
113 BEGIN { | |
114 %DEFAULTS = ( 'f_designator' => 'f', | |
115 'r_designator' => 'r'); | |
116 } | |
117 | |
118 =head2 new() | |
119 | |
120 Title : new(-acefile => $path_to_some_acefile, -verbose => "1") | |
121 Usage : $o_consed = Bio::Tools::Alignment::Consed-> | |
122 new(-acefile => $path_to_some_acefile, -verbose => "1"); | |
123 Function: Construct the Bio::Tools::Alignment::Consed object. Sets | |
124 verbosity for the following procedures, if necessary: | |
125 1. Construct a new Bio::Tools::Alignment::Trim object, to | |
126 handle quality trimming 2. Read in the acefile and parse it | |
127 | |
128 Returns : A reference to a Bio::Tools::Alignment::Consed object. | |
129 Args : A hash. (-acefile) is the filename of an acefile. If a full path | |
130 is not specified "./" is prepended to the filename and used from | |
131 instantiation until destruction. If you want | |
132 Bio::Tools::Alignment::Consed to be noisy during parsing of | |
133 the acefile, specify some value for (-verbose). | |
134 | |
135 =cut | |
136 | |
137 sub new { | |
138 my ($class,%args) = @_; | |
139 my $self = $class->SUPER::new(%args); | |
140 | |
141 $self->{'filename'} = $args{'-acefile'}; | |
142 # this is special to UNIX and should probably use catfile | |
143 if (!($self->{'filename'} =~ /\//)) { | |
144 $self->{'filename'} = "./".$self->{'filename'}; | |
145 } | |
146 $self->{'filename'} =~ m/(.*\/)(.*)ace.*$/; | |
147 $self->{'path'} = $1; | |
148 $self->_initialize_io('-file'=>$self->{'filename'}); | |
149 $self->{'o_trim'} = new Bio::Tools::Alignment::Trim(-verbose => $self->verbose()); | |
150 $self->set_forward_designator($DEFAULTS{'f_designator'}); | |
151 $self->set_reverse_designator($DEFAULTS{'r_designator'}); | |
152 | |
153 $self->_read_file(); | |
154 return $self; | |
155 } | |
156 | |
157 =head2 verbose() | |
158 | |
159 Title : verbose() | |
160 Usage : $o_consed->verbose(1); | |
161 Function: Set the verbosity level for debugging messages. On instantiation | |
162 of the Bio::Tools::Alignment::Consed object the verbosity level | |
163 is set to 0 (quiet). | |
164 Returns : 1 or 0. | |
165 Args : The verbosity levels are: | |
166 0 - quiet | |
167 1 - noisy | |
168 2 - noisier | |
169 3 - annoyingly noisy | |
170 Notes : This method for setting verbosity has largely been superseeded by | |
171 a sub-by-sub way, where for every sub you can provide a (-verbose) | |
172 switch. I am doing converting this bit-by-bit so do not be surprised | |
173 if some subs do not honour this. | |
174 | |
175 =cut | |
176 | |
177 # from RootI | |
178 | |
179 # backwards compat | |
180 sub set_verbose { (shift)->verbose(@_) } | |
181 | |
182 =head2 get_filename() | |
183 | |
184 Title : get_filename() | |
185 Usage : $o_consed->get_filename(); | |
186 Function: Returns the name of the acefile being used by the | |
187 Bio::Tools::Alignment::Consed object. | |
188 Returns : A scalar containing the name of a file. | |
189 Args : None. | |
190 | |
191 =cut | |
192 | |
193 | |
194 sub get_filename { | |
195 my $self = shift; | |
196 return $self->{'filename'}; | |
197 } | |
198 | |
199 =head2 count_sequences_with_grep() | |
200 | |
201 Title : count_sequences_with_grep() | |
202 Usage : $o_consed->count_sequences_with_grep(); | |
203 Function: Use /bin/grep to scan through the files in the ace project dir | |
204 and count sequences in those files. I used this method in the | |
205 development of this module to verify that I was getting all of the | |
206 sequences. It works, but it is (I think) unix-like platform | |
207 dependent. | |
208 Returns : A scalar containing the number of sequences in the ace project | |
209 directory. | |
210 Args : None. | |
211 Notes : If you are on a non-UNIX platform, you really do not have to use | |
212 this. It is more of a debugging routine designed to address very | |
213 specific problems. | |
214 | |
215 This method was reimplemented to be platform independent with a | |
216 pure perl implementation. The above note can be ignored. | |
217 | |
218 =cut | |
219 | |
220 sub count_sequences_with_grep { | |
221 my $self = shift; | |
222 my ($working_dir,$grep_cli,@total_grep_sequences); | |
223 # this should be migrated to a pure perl implementation ala | |
224 # Tom Christiansen's 'tcgrep' | |
225 # http://www.perl.com/language/ppt/src/grep/tcgrep | |
226 | |
227 open(FILE, $self->{'filename'}) or do { $self->warn("cannot open file ".$self->{'filename'}. " for grepping"); return}; | |
228 my $counter =0; | |
229 while(<FILE>) { $counter++ if(/^AF/); } | |
230 | |
231 close FILE; | |
232 opendir(SINGLETS,$self->{'path'}); | |
233 foreach my $f ( readdir(SINGLETS) ) { | |
234 next unless ($f =~ /\.singlets$/); | |
235 open(FILE, $self->catfile($self->{'path'},$f)) or do{ $self->warn("cannot open file ".$self->catfile($self->{'path'},$f)); next }; | |
236 while(<FILE>) { $counter++ if(/^>/) } | |
237 close FILE; | |
238 } | |
239 return $counter; | |
240 # Chad's platform implementation which required grep on the system | |
241 | |
242 # I tried to cause graceful exiting if not on *ix here | |
243 # then i took platforms from Bioperl*/PLATFORMS here. Is that good? | |
244 # print("\$^O is $^O\n"); | |
245 # if (!($^O =~ /dec_osf|linux|unix|bsd|solaris|darwin/i)) { | |
246 # $self->warn("Bio::Tools::Alignment::Consed::count_sequences_with_grep: This sub uses grep which is doesn't run on this operating system, AFAIK. Sorry .".$^O); | |
247 # return 1; | |
248 # } | |
249 # $grep_cli = `which grep`; | |
250 # if (!$grep_cli) { | |
251 # $self->warn("I couldn't see to find grep on this system, or the which command is broken. Bio::Tools::Alignment::Consed::count_sequences_with_grep requires grep and which to find grep."); | |
252 # return 1; | |
253 # } | |
254 # chomp $grep_cli; | |
255 # push(@total_grep_sequences, my @foo = `$grep_cli ^AF $self->{filename}`); | |
256 # my $cli = "$grep_cli \\> $self->{'path'}*.singlets"; | |
257 # push(@total_grep_sequences, @foo = `$cli`); | |
258 # return scalar(@total_grep_sequences); | |
259 } | |
260 | |
261 =head2 get_path() | |
262 | |
263 Title : get_path() | |
264 Usage : $o_consed->get_path(); | |
265 Function: Returns the path to the acefile this object is working with. | |
266 Returns : Scalar. The path to the working acefile. | |
267 Args : None. | |
268 | |
269 =cut | |
270 | |
271 sub get_path { | |
272 my $self = shift; | |
273 return $self->{'path'}; | |
274 } | |
275 | |
276 =head2 get_contigs() | |
277 | |
278 Title : get_contigs() | |
279 Usage : $o_consed->get_contigs(); | |
280 Function: Return the keys to the Bio::Tools::Alignment::Consed object. | |
281 Returns : An array containing the keynames in the | |
282 Bio::Tools::Alignment::Consed object. | |
283 Args : None. | |
284 Notes : This would normally be used to get the keynames for some sort of | |
285 iterator. These keys are worthless in general day-to-day use because | |
286 in the Consed acefile they are simply Contig1, Contig2, ... | |
287 | |
288 =cut | |
289 | |
290 sub get_contigs { | |
291 my ($self,$contig) = @_; | |
292 my @keys = (sort keys %{$self->{'contigs'}}); | |
293 return @keys; | |
294 } | |
295 | |
296 =head2 get_class($contig_keyname) | |
297 | |
298 Title : get_class($contig_keyname) | |
299 Usage : $o_consed->get_class($contig_keyname); | |
300 Function: Return the class name for this contig | |
301 Returns : A scalar representing the class of this contig. | |
302 Args : None. | |
303 Notes : | |
304 | |
305 =cut | |
306 | |
307 sub get_class { | |
308 my ($self,$contig) = @_; | |
309 return $self->{contigs}->{$contig}->{class}; | |
310 } | |
311 | |
312 =head2 get_quality_array($contig_keyname) | |
313 | |
314 Title : get_quality_array($contig_keyname) | |
315 Usage : $o_consed->get_quality_array($contig_keyname); | |
316 Function: Returns the quality for the consensus sequence for the given | |
317 contig as an array. See get_quality_scalar to get this as a scalar. | |
318 Returns : An array containing the quality for the consensus sequence with | |
319 the given keyname. | |
320 Args : The keyname of a contig. Note: This is a keyname. The key would | |
321 normally come from get_contigs. | |
322 Notes : Returns an array, not a reference. Is this a bug? <thinking> No. | |
323 Well, maybe. | |
324 Why was this developed like this? I was using FreezeThaw for object | |
325 persistence, and when it froze out these arrays it took a long time | |
326 to thaw it. Much better as a scalar. | |
327 | |
328 See L<get_quality_scalar()|get_quality_scalar> | |
329 | |
330 =cut | |
331 | |
332 sub get_quality_array { | |
333 my ($self,$contig) = @_; | |
334 my $quality = $self->{contigs}->{$contig}->{quality}; | |
335 # chad, what is with the s/// ? | |
336 # my @qualities = split | |
337 # (' ',($self->{contigs}->{$contig}->{quality} =~ s/\s+//)); | |
338 my @qualities = split | |
339 (' ',$self->{contigs}->{$contig}->{quality}); | |
340 return @qualities; | |
341 } | |
342 | |
343 =head2 get_quality_scalar($contig_keyname)) | |
344 | |
345 Title : get_quality_scalar($contig_keyname) | |
346 Usage : $o_consed->get_quality_scalar($contig_keyname); | |
347 Function: Returns the quality for the consensus sequence for the given | |
348 contig as a scalar. See get_quality_array to get this as an array. | |
349 Returns : An scalar containing the quality for the consensus sequence with | |
350 the given keyname. | |
351 Args : The keyname of a contig. Note this is a _keyname_. The key would | |
352 normally come from get_contigs. | |
353 Notes : Why was this developed like this? I was using FreezeThaw for object | |
354 persistence, and when it froze out these arrays it took a coon's age | |
355 to thaw it. Much better as a scalar. | |
356 | |
357 See L<get_quality_array()|get_quality_array> | |
358 | |
359 =cut | |
360 | |
361 #' | |
362 sub get_quality_scalar { | |
363 my ($self,$contig) = @_; | |
364 return $self->{'contigs'}->{$contig}->{'quality'}; | |
365 } | |
366 | |
367 =head2 freeze_hash() | |
368 | |
369 Title : freeze_hash() | |
370 Usage : $o_consed->freeze_hash(); | |
371 | |
372 Function: Use Ilya's FreezeThaw module to create a persistent data | |
373 object for this Bio::Tools::Alignment::Consed data | |
374 structure. In the case of AAFC, we use | |
375 Bio::Tools::Alignment::Consed to pre-process bunches of | |
376 sequences, freeze the structures, and send in a harvesting | |
377 robot later to do database stuff. | |
378 Returns : 0 or 1; | |
379 Args : None. | |
380 Notes : This procedure was removed so Consed.pm won't require | |
381 FreezeThaw. | |
382 | |
383 | |
384 =cut | |
385 | |
386 #' | |
387 sub freeze_hash { | |
388 my $self = shift; | |
389 $self->warn("This method (freeze_hash) was removed from the bioperl consed.pm. Sorry.\n"); | |
390 if (1==2) { | |
391 $self->debug("Bio::Tools::Alignment::Consed::freeze_hash: \$self->{path} is $self->{path}\n"); | |
392 my $filename = $self->{'path'}."frozen"; | |
393 my %contigs = %{$self->{'contigs'}}; | |
394 my $frozen = freeze(%contigs); | |
395 umask 0001; | |
396 open (FREEZE,">$filename") or do { | |
397 $self->warn( "Bio::Tools::Alignment::Consed could not freeze the contig hash because the file ($filename) could not be opened: $!\n"); | |
398 return 1; | |
399 }; | |
400 print FREEZE $frozen; | |
401 close FREEZE; | |
402 return 0; | |
403 } | |
404 } | |
405 | |
406 =head2 get_members($contig_keyname) | |
407 | |
408 Title : get_members($contig_keyname) | |
409 Usage : $o_consed->get_members($contig_keyname); | |
410 Function: Return the _names_ of the reads in this contig. | |
411 Returns : An array containing the names of the reads in this contig. | |
412 Args : The keyname of a contig. Note this is a keyname. The keyname | |
413 would normally come from get_contigs. | |
414 | |
415 See L<get_contigs()|get_contigs> | |
416 | |
417 =cut | |
418 | |
419 sub get_members { | |
420 my ($self,$contig) = @_; | |
421 if (!$contig) { | |
422 $self->warn("You need to provide the name of a contig to use Bio::Tools::Alignment::Consed::get_members!\n"); | |
423 return; | |
424 } | |
425 return @{$self->{'contigs'}->{$contig}->{'member_array'}}; | |
426 } | |
427 | |
428 =head2 get_members_by_name($some_arbitrary_name) | |
429 | |
430 Title : get_members_by_name($some_arbitrary_name) | |
431 Usage : $o_consed->get_members_by_name($some_arbitrary_name); | |
432 Function: Return the names of the reads in a contig. This is the name given | |
433 to $contig{key} based on what is in the contig. This is different | |
434 from the keys retrieved through get_contigs(). | |
435 Returns : An array containing the names of the reads in the contig with this | |
436 name. | |
437 Args : The name of a contig. Not a key, but a name. | |
438 Notes : Highly inefficient. use some other method if possible. | |
439 | |
440 See L<get_contigs()|get_contigs> | |
441 | |
442 =cut | |
443 | |
444 sub get_members_by_name { | |
445 my ($self,$name) = @_; | |
446 # build a list to try to screen for redundancy | |
447 my @contigs_with_that_name; | |
448 foreach my $currkey ( sort keys %{$self->{'contigs'}} ) { | |
449 next if (!$self->{'contigs'}->{$currkey}->{'name'}); | |
450 if ($self->{'contigs'}->{$currkey}->{'name'} eq "$name") { | |
451 push @contigs_with_that_name,$currkey; | |
452 } | |
453 } | |
454 my $count = @contigs_with_that_name; | |
455 if ($count == 1) { | |
456 my $contig_num = $contigs_with_that_name[0]; | |
457 return @{$self->{'contigs'}->{$contig_num}->{'member_array'}}; | |
458 } | |
459 } | |
460 | |
461 =head2 get_contig_number_by_name($some_arbitrary_name) | |
462 | |
463 Title : get_contig_number_by_name($some_arbitrary_name) | |
464 Usage : $o_consed->get_contig_number_by_name($some_arbitrary_name); | |
465 Function: Return the names of the reads in a contig. This is the name given | |
466 to $contig{key} based on what is in the contig. This is different | |
467 from the keys retrieved through get_contigs(). | |
468 Returns : An array containing the names of the reads in the contig with this | |
469 name. | |
470 Args : The name of a contig. Not a key, but a name. | |
471 | |
472 See L<get_contigs()|get_contigs> | |
473 | |
474 =cut | |
475 | |
476 sub get_contig_number_by_name { | |
477 my ($self,$name) = @_; | |
478 foreach my $currkey (sort keys %{$self->{'contigs'}}) { | |
479 if ($self->{'contigs'}->{$currkey}->{'name'} && | |
480 $self->{'contigs'}->{$currkey}->{'name'} eq "$name") { | |
481 return $currkey; | |
482 } | |
483 } | |
484 } | |
485 | |
486 =head2 get_sequence($contig_keyname) | |
487 | |
488 Title : get_sequence($contig_keyname) | |
489 Usage : $o_consed->get_sequence($contig_keyname); | |
490 Function: Returns the consensus sequence for a given contig. | |
491 Returns : A scalar containing a sequence. | |
492 Args : The keyname of a contig. Note this is a key. The key would | |
493 normally come from get_contigs. | |
494 | |
495 See L<get_contigs()|get_contigs> | |
496 | |
497 =cut | |
498 | |
499 sub get_sequence { | |
500 my ($self,$contig) = @_; | |
501 return $self->{'contigs'}->{$contig}->{'consensus'}; | |
502 } | |
503 | |
504 =head2 set_final_sequence($some_sequence) | |
505 | |
506 Title : set_final_sequence($name,$some_sequence) | |
507 Usage : $o_consed->set_final_sequence($name,$some_sequence); | |
508 Function: Provides a manual way to set the sequence for a given key in the | |
509 contig hash. Rarely used. | |
510 Returns : 0 or 1; | |
511 Args : The name (not the keyname) of a contig and an arbitrary string. | |
512 Notes : A method with a questionable and somewhat mysterious origin. May | |
513 raise the dead or something like that. | |
514 | |
515 =cut | |
516 | |
517 sub set_final_sequence { | |
518 my ($self,$name,$sequence) = @_; | |
519 if (!$self->{'contigs'}->{$name}) { | |
520 $self->warn("You cannot set the final sequence for $name because it doesn't exist!\n"); | |
521 return 1; | |
522 } | |
523 else { | |
524 $self->{'contigs'}->{$name}->{'final_sequence'} = $sequence; | |
525 } | |
526 return 0; | |
527 } | |
528 | |
529 =head2 _read_file() | |
530 | |
531 Title : _read_file(); | |
532 Usage : _read_file(); | |
533 Function: An internal subroutine used to read in an acefile and parse it | |
534 into a Bio::Tools::Alignment::Consed object. | |
535 Returns : 0 or 1. | |
536 Args : Nothing. | |
537 Notes : This routine creates and saves the filhandle for reading the | |
538 files in {fh} | |
539 | |
540 =cut | |
541 | |
542 sub _read_file { | |
543 my ($self) = @_; | |
544 my ($line,$in_contig,$in_quality,$contig_number,$top); | |
545 # make it easier to type $fhl | |
546 while (defined($line=$self->_readline()) ) { | |
547 chomp $line; | |
548 # check if there is anything on this line | |
549 # if not, you can stop gathering consensus sequence | |
550 if (!$line) { | |
551 # if the line is blank you are no longer to gather consensus | |
552 # sequence or quality values | |
553 $in_contig = 0; | |
554 $in_quality = 0; | |
555 } | |
556 # you are currently gathering consensus sequence | |
557 elsif ($in_contig) { | |
558 if ($in_contig == 1) { | |
559 $self->debug("Adding $line to consensus of contig number $contig_number.\n"); | |
560 $self->{'contigs'}->{$contig_number}->{'consensus'} .= $line; | |
561 } | |
562 } | |
563 elsif ($in_quality) { | |
564 if (!$line) { | |
565 $in_quality = undef; | |
566 } | |
567 else { | |
568 # I wrote this in here because acefiles produced by cap3 do not have a leading space | |
569 # like the acefiles produced by phrap and there is the potential to have concatenated | |
570 # quality values like this: 2020 rather then 20 20 whre lines collide. Thanks Andrew for noticing. | |
571 if ($self->{'contigs'}->{$contig_number}->{'quality'} && !($self->{'contigs'}->{$contig_number}->{'quality'} =~ m/\ $/)) { | |
572 $self->{'contigs'}->{$contig_number}->{'quality'} .= " "; | |
573 } | |
574 $self->{'contigs'}->{$contig_number}->{'quality'} .= $line; | |
575 } | |
576 } | |
577 elsif ($line =~ /^BQ/) { | |
578 $in_quality = 1; | |
579 } | |
580 # the line /^CO/ like this: | |
581 # CO Contig1 796 1 1 U | |
582 # can be broken down as follows: | |
583 # CO - Contig! | |
584 # Contig1 - the name of this contig | |
585 # 796 - Number of bases in this contig | |
586 # 1 - Number of reads in this contig | |
587 # 1 - number of base segments in this contig | |
588 # U - Uncomplemented | |
589 elsif ($line =~ /^CO/) { | |
590 $line =~ m/^CO\ Contig(\d+)\ \d+\ \d+\ \d+\ (\w)/; | |
591 $contig_number = $1; | |
592 if ($2 eq "C") { | |
593 $self->debug("Contig $contig_number is complemented!\n"); | |
594 } | |
595 $self->{'contigs'}->{$contig_number}->{'member_array'} = []; | |
596 $self->{'contigs'}->{$contig_number}->{'contig_direction'} = "$2"; | |
597 $in_contig = 1; | |
598 } | |
599 # 000713 | |
600 # this BS is deprecated, I think. | |
601 # haha, I am really witty. <ew> | |
602 elsif ($line =~ /^BSDEPRECATED/) { | |
603 $line =~ m/^BS\s+\d+\s+\d+\s+(.+)/; | |
604 my $member = $1; | |
605 $self->{'contigs'}->{$contig_number}->{$member}++; | |
606 } | |
607 # the members of the contigs are determined by the AF line in the ace file | |
608 elsif ($line =~ /^AF/) { | |
609 $self->debug("I see an AF line here.\n"); | |
610 $line =~ /^AF\ (\S+)\ (\w)\ (\S+)/; | |
611 # push the name of the current read onto the member array for this contig | |
612 push @{$self->{'contigs'}->{$contig_number}->{'member_array'}},$1; | |
613 # the first read in the contig will be named the "top" read | |
614 if (!$top) { | |
615 $self->debug("\$top is not set.\n"); | |
616 if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") { | |
617 $self->debug("Reversing the order of the reads. The bottom will be $1\n"); | |
618 # if the contig sequence is marked as the complement, the top becomes the bottom and$ | |
619 $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1; | |
620 $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2; | |
621 $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3; | |
622 } | |
623 else { | |
624 $self->debug("NOT reversing the order of the reads. The top_name will be $1\n"); | |
625 # if the contig sequence is marked as the complement, the top becomes the bottom and$ | |
626 $self->{'contigs'}->{$contig_number}->{'top_name'} = $1; | |
627 $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2; | |
628 $self->{'contigs'}->{$contig_number}->{'top_start'} = $3; | |
629 } | |
630 $top = 1; | |
631 } | |
632 else { | |
633 # if the contig sequence is marked as the complement, the top becomes the bottom and the bottom becomes the top | |
634 if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") { | |
635 $self->debug("Reversing the order of the reads. The top will be $1\n"); | |
636 $self->{'contigs'}->{$contig_number}->{'top_name'} = $1; | |
637 $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2; | |
638 $self->{'contigs'}->{$contig_number}->{'top_start'} = $3; | |
639 } | |
640 else { | |
641 $self->debug("NOT reversing the order of the reads. The bottom will be $1\n"); | |
642 $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1; | |
643 $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2; | |
644 $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3; | |
645 } | |
646 $top = undef; | |
647 } | |
648 } | |
649 } | |
650 return 0; | |
651 } | |
652 | |
653 =head2 set_reverse_designator($some_string) | |
654 | |
655 Title : set_reverse_designator($some_string) | |
656 Usage : $o_consed->set_reverse_designator($some_string); | |
657 Function: Set the designator for the reverse read of contigs in this | |
658 Bio::Tools::Alignment::Consed object. Used to determine if | |
659 contigs containing two reads can be named. | |
660 Returns : The value of $o_consed->{reverse_designator} so you can check | |
661 to see that it was set properly. | |
662 Args : An arbitrary string. | |
663 Notes : May be useful only to me. <shrug> | |
664 | |
665 =cut | |
666 | |
667 sub set_reverse_designator { | |
668 my ($self,$reverse_designator) = @_; | |
669 $self->{'reverse_designator'} = $reverse_designator; | |
670 $self->{'o_trim'}->set_reverse_designator($reverse_designator); | |
671 return $self->{'reverse_designator'}; | |
672 } # end set_reverse_designator | |
673 | |
674 =head2 set_forward_designator($some_string) | |
675 | |
676 Title : set_forward_designator($some_string) | |
677 Usage : $o_consed->set_forward_designator($some_string); | |
678 Function: Set the designator for the forward read of contigs in this | |
679 Bio::Tools::Alignment::Consed object. Used to determine if | |
680 contigs containing two reads can be named. | |
681 Returns : The value of $o_consed->{forward_designator} so you can check | |
682 to see that it was set properly. | |
683 Args : An arbitrary string. | |
684 Notes : May be useful only to me. <shrug> | |
685 | |
686 =cut | |
687 | |
688 sub set_forward_designator { | |
689 my ($self,$forward_designator) = @_; | |
690 $self->{'forward_designator'} = $forward_designator; | |
691 $self->{'o_trim'}->set_forward_designator($forward_designator); | |
692 return $self->{'forward_designator'}; | |
693 } # end set_forward_designator | |
694 | |
695 =head2 set_designator_ignore_case("yes") | |
696 | |
697 Title : set_designator_ignore_case("yes") | |
698 Usage : $o_consed->set_designator_ignore_case("yes"); | |
699 Function: Deprecated. | |
700 Returns : Deprecated. | |
701 Args : Deprecated. | |
702 Notes : Deprecated. Really. Trust me. | |
703 | |
704 =cut | |
705 | |
706 sub set_designator_ignore_case { | |
707 my ($self,$ignore_case) = @_; | |
708 if ($ignore_case eq "yes") { | |
709 $self->{'designator_ignore_case'} = 1; | |
710 } | |
711 return $self->{'designator_ignore_case'}; | |
712 } # end set_designator_ignore_case | |
713 | |
714 =head2 set_trim_points_singlets_and_singletons() | |
715 | |
716 Title : set_trim_points_singlets_and_singletons() | |
717 Usage : $o_consed->set_trim_points_singlets_and_singletons(); | |
718 Function: Set the trim points for singlets and singletons based on | |
719 quality. Uses the Bio::Tools::Alignment::Trim object. Use | |
720 at your own risk because the Bio::Tools::Alignment::Trim | |
721 object was designed specifically for me and is mysterious | |
722 in its ways. Every time somebody other then me uses it a | |
723 swarm of locusts decends on a small Central American | |
724 village so do not say you weren't warned. | |
725 Returns : Nothing. | |
726 Args : None. | |
727 Notes : Working on exceptions and warnings here. | |
728 | |
729 See L<Bio::Tools::Alignment::Trim> for more information | |
730 | |
731 =cut | |
732 | |
733 #' to make my emacs happy | |
734 | |
735 sub set_trim_points_singlets_and_singletons { | |
736 my ($self) = @_; | |
737 $self->debug("Consed.pm : \$self is $self\n"); | |
738 my (@points,$trimmed_sequence); | |
739 if (!$self->{'doublets_set'}) { | |
740 $self->debug("You need to set the doublets before you use set_trim_points_singlets_and_doublets. Doing that now."); | |
741 $self->set_doublets(); | |
742 } | |
743 foreach (sort keys %{$self->{'contigs'}}) { | |
744 if ($self->{'contigs'}->{$_}->{'class'} eq "singlet") { | |
745 $self->debug("Singlet $_\n"); | |
746 # this is what Warehouse wants | |
747 # my ($self,$sequence,$quality,$name) = @_; | |
748 # this is what Bio::Tools::Alignment::Trim::trim_singlet wants: | |
749 # my ($self,$sequence,$quality,$name,$class) = @_; | |
750 # the following several lines are to make the parameter passing legible. | |
751 my ($sequence,$quality,$name,$class); | |
752 $sequence = $self->{'contigs'}->{$_}->{'consensus'}; | |
753 if (!$self->{'contigs'}->{$_}->{'quality'}) { $quality = "unset"; } | |
754 else { $quality = $self->{'contigs'}->{$_}->{'quality'}; } | |
755 $name = $self->{'contigs'}->{$_}->{'name'}; | |
756 $class = $self->{'contigs'}->{$_}->{'class'}; | |
757 (@points) = @{$self->{'o_trim'}->trim_singlet($sequence,$quality,$name,$class)}; | |
758 $self->{'contigs'}->{$_}->{'start_point'} = $points[0]; | |
759 $self->{'contigs'}->{$_}->{'end_point'} = $points[1]; | |
760 $self->{'contigs'}->{$_}->{'sequence_trimmed'} = substr($self->{contigs}->{$_}->{'consensus'},$points[0],$points[1]-$points[0]); | |
761 } | |
762 } | |
763 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_singlets_and_singletons: Done setting the quality trimpoints.\n"); | |
764 return; | |
765 } # end set_trim_points_singlet | |
766 | |
767 =head2 set_trim_points_doublets() | |
768 | |
769 Title : set_trim_points_doublets() | |
770 Usage : $o_consed->set_trim_points_doublets(); | |
771 Function: Set the trim points for doublets based on quality. Uses the | |
772 Bio::Tools::Alignment::Trim object. Use at your own risk because | |
773 the Bio::Tools::Alignment::Trim object was designed specifically | |
774 for me and is mysterious in its ways. Every time somebody other | |
775 then me uses it you risk a biblical plague being loosed on your | |
776 city. | |
777 Returns : Nothing. | |
778 Args : None. | |
779 Notes : Working on exceptions here. | |
780 | |
781 See L<Bio::Tools::Alignment::Trim> for more information | |
782 | |
783 =cut | |
784 | |
785 sub set_trim_points_doublets { | |
786 my $self = shift; | |
787 my @points; | |
788 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Restoring zeros for doublets.\n"); | |
789 # &show_missing_sequence($self); | |
790 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Setting doublet trim points.\n"); | |
791 foreach (sort keys %{$self->{'contigs'}}) { | |
792 if ($self->{'contigs'}->{$_}->{'class'} eq "doublet") { | |
793 # my ($self,$sequence,$quality,$name,$class) = @_; | |
794 my @quals = split(' ',$self->{'contigs'}->{$_}->{'quality'}); | |
795 | |
796 (@points) = $self->{o_trim}->trim_doublet($self->{'contigs'}->{$_}->{'consensus'},$self->{'contigs'}->{$_}->{'quality'},$self->{'contigs'}->{$_}->{name},$self->{'contigs'}->{$_}->{'class'}); | |
797 $self->{'contigs'}->{$_}->{'start_point'} = $points[0]; | |
798 $self->{'contigs'}->{$_}->{'end_point'} = $points[1]; | |
799 # now set this | |
800 $self->{'contigs'}->{$_}->{'sequence_trimmed'} = substr($self->{contigs}->{$_}->{'consensus'},$points[0],$points[1]-$points[0]); | |
801 # 010102 the deprecated way to do things: | |
802 } | |
803 } | |
804 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Done setting doublet trim points.\n"); | |
805 return; | |
806 } # end set_trim_points_doublets | |
807 | |
808 =head2 get_trimmed_sequence_by_name($name) | |
809 | |
810 Title : get_trimmed_sequence_by_name($name) | |
811 Usage : $o_consed->get_trimmed_sequence_by_name($name); | |
812 Function: Returns the trimmed_sequence of a contig with {name} eq $name. | |
813 Returns : A scalar- the trimmed sequence. | |
814 Args : The {name} of a contig. | |
815 Notes : | |
816 | |
817 =cut | |
818 | |
819 sub get_trimmed_sequence_by_name { | |
820 my ($self,$name) = @_; | |
821 my $trimmed_sequence; | |
822 my $contigname = &get_contig_number_by_name($self,$name); | |
823 my $class = $self->{'contigs'}->{$contigname}->{'class'}; | |
824 # what is this business and who was smoking crack while writing this? | |
825 # if ($class eq "singlet") { | |
826 # send the sequence, the quality, and the name | |
827 # $trimmed_sequence = $self->{o_trim}->trim_singlet($self->{'contigs'}->{$contigname}->{consensus},$self->{'contigs'}->{$contigname}->{'quality'},$name); | |
828 # } | |
829 return $self->{'contigs'}->{$contigname}->{'sequence_trimmed'}; | |
830 } | |
831 | |
832 =head2 set_dash_present_in_sequence_name("yes") | |
833 | |
834 Title : set_dash_present_in_sequence_name("yes") | |
835 Usage : $o_consed->set_dash_present_in_sequence_name("yes"); | |
836 Function: Deprecated. Part of an uncompleted thought. ("Oooh! Shiny!") | |
837 Returns : Nothing. | |
838 Args : "yes" to set {dash_present_in_sequence_name} to 1 | |
839 Notes : | |
840 | |
841 =cut | |
842 | |
843 sub set_dash_present_in_sequence_name { | |
844 my ($self,$dash_present) = @_; | |
845 if ($dash_present eq "yes") { | |
846 $self->{'dash_present_in_sequence_name'} = 1; | |
847 } | |
848 else { | |
849 $self->{'dash_present_in_sequence_name'} = 0; | |
850 } | |
851 return $self->{'dash_present_in_sequence_name'}; | |
852 } # end set_dash_present_in_sequence_name | |
853 | |
854 =head2 set_doublets() | |
855 | |
856 Title : set_doublets() | |
857 Usage : $o_consed->set_doublets(); | |
858 Function: Find pairs that have similar names and mark them as doublets | |
859 and set the {name}. | |
860 Returns : 0 or 1. | |
861 Args : None. | |
862 Notes : A complicated subroutine that iterates over the | |
863 Bio::Tools::Alignment::Consed looking for contigs of 2. If the | |
864 forward and reverse designator are removed from each of the reads | |
865 in {'member_array'} and the remaining reads are the same, {name} | |
866 is set to that name and the contig's class is set as "doublet". | |
867 If any of those cases fail the contig is marked as a "pair". | |
868 | |
869 =cut | |
870 | |
871 #' make my emacs happy | |
872 | |
873 sub set_doublets { | |
874 my ($self) = @_; | |
875 # set the designators in the Bio::Tools::Alignment::Trim object | |
876 | |
877 $self->{'o_trim'}->set_designators($self->{'reverse_designator'}, | |
878 $self->{'forward_designator'}); | |
879 # | |
880 foreach my $key_contig (sort keys %{$self->{'contigs'}}) { | |
881 | |
882 # if there is a member array (why would there not be? This should be a die()able offence | |
883 # but for now I will leave it | |
884 if ($self->{'contigs'}->{$key_contig}->{'member_array'}) { | |
885 # if there are two reads in this contig | |
886 # i am pretty sure that this is wrong but i am keeping it for reference | |
887 # if (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2 || !$self->{'contigs'}->{$key_contig}->{'class'}) { | |
888 # <seconds later> | |
889 # <nod> WRONG. Was I on crack? | |
890 if (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2) { | |
891 $self->{'contigs'}->{$key_contig}->{'num_members'} = 2; | |
892 $self->debug("\tThere are 2 members! Looking for the contig name...\n"); | |
893 my $name = _get_contig_name($self,$self->{'contigs'}->{$key_contig}->{'member_array'}); | |
894 $self->debug("The name is $name\n") if defined $name; | |
895 if ($name) { | |
896 $self->{'contigs'}->{$key_contig}->{'name'} = $name; | |
897 $self->{'contigs'}->{$key_contig}->{'class'} = "doublet"; | |
898 } | |
899 else { | |
900 $self->debug("$key_contig is a pair.\n"); | |
901 $self->{'contigs'}->{$key_contig}->{'class'} = "pair"; | |
902 } | |
903 } | |
904 # this is all fair and good but what about singlets? | |
905 # they have one reads in the member_array but certainly are not singletons | |
906 elsif (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 1) { | |
907 # set the name to be the name of the read | |
908 $self->{'contigs'}->{$key_contig}->{name} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}}[0]; | |
909 # set the number of members to be one | |
910 $self->{'contigs'}->{$key_contig}->{num_members} = 1; | |
911 # if this was a singlet, it would already belong to the class "singlet" | |
912 # so leave it alone | |
913 # if it is not a singlet, it is a singleton! lablel it appropriately | |
914 unless ($self->{'contigs'}->{$key_contig}->{'class'}) { | |
915 $self->{'contigs'}->{$key_contig}->{'class'} = "singleton"; | |
916 } | |
917 } | |
918 # set the multiplet characteristics | |
919 elsif (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} >= 3) { | |
920 $self->{'contigs'}->{$key_contig}->{'num_members'} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}}; | |
921 $self->{'contigs'}->{$key_contig}->{'class'} = "multiplet"; | |
922 } | |
923 $self->{'contigs'}->{$key_contig}->{'num_members'} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}}; | |
924 | |
925 } | |
926 } | |
927 $self->{'doublets_set'} = "done"; | |
928 return 0; | |
929 } # end set_doublets | |
930 | |
931 =head2 set_singlets | |
932 | |
933 Title : set_singlets | |
934 Usage : $o_consed->set_singlets(); | |
935 Function: Read in a singlets file and place them into the | |
936 Bio::Tools::Alignment::Consed object. | |
937 Returns : Nothing. | |
938 Args : A scalar to turn on verbose parsing of the singlets file. | |
939 Notes : | |
940 | |
941 =cut | |
942 | |
943 sub set_singlets { | |
944 # parse out the contents of the singlets file | |
945 my ($self) = @_; | |
946 $self->debug("Bio::Tools::Alignment::Consed Adding singlets to the contig hash...\n"); | |
947 my $full_filename = $self->{'filename'}; | |
948 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: \$full_filename is $full_filename\n"); | |
949 $full_filename =~ m/(.*\/)(.*ace.*)$/; | |
950 my ($base_path,$filename) = ($1,$2); | |
951 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: singlets filename is $filename and \$base_path is $base_path\n"); | |
952 $filename =~ m/(.*)ace.*$/; | |
953 my $singletsfile = $base_path.$1."singlets"; | |
954 $self->debug("\$singletsfile is $singletsfile\n"); | |
955 if (-f $singletsfile) { | |
956 $self->debug("$singletsfile is indeed a file. Trying to open it...\n"); | |
957 } | |
958 my $singlets_fh = Bio::Root::IO->new(-file => $singletsfile); | |
959 my ($sequence,$name,$count); | |
960 while ($_ = $singlets_fh->_readline()) { | |
961 chomp $_; | |
962 if (/\>/) { | |
963 if ($name && $sequence) { | |
964 $self->debug("Adding $name with sequence $sequence to hash...\n"); | |
965 push @{$self->{'contigs'}->{$name}->{'member_array'}},$name; | |
966 $self->{'contigs'}->{$name}->{'consensus'} = $sequence; | |
967 $self->{'contigs'}->{$name}->{'name'} = $name; | |
968 $self->{'contigs'}->{$name}->{"singlet"} = 1; | |
969 $self->{'contigs'}->{$name}->{'class'} = "singlet"; | |
970 } | |
971 $sequence = $name = undef; | |
972 $count++; | |
973 m/^\>(.*)\s\sCHROMAT/; | |
974 $name = $1; | |
975 if (!$name) { | |
976 m/\>(\S+)\s/; | |
977 $name = $1; | |
978 } | |
979 } | |
980 else { $sequence .= $_; } | |
981 } | |
982 if ($name && $sequence) { | |
983 $self->debug("Pushing the last of the singlets ($name)\n"); | |
984 @{$self->{'contigs'}->{$name}->{'member_array'}} = $name; | |
985 $self->{'contigs'}->{$name}->{'consensus'} = $sequence; | |
986 $self->{'contigs'}->{$name}->{'name'} = $name; | |
987 $self->{'contigs'}->{$name}->{"singlet"} = 1; | |
988 $self->{'contigs'}->{$name}->{'class'} = "singlet"; | |
989 } | |
990 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: Done adding singlets to the singlets hash.\n"); | |
991 $self->{'singlets_set'} = "done"; | |
992 return 0; | |
993 } # end sub set_singlets | |
994 | |
995 =head2 get_singlets() | |
996 | |
997 Title : get_singlets() | |
998 Usage : $o_consed->get_singlets(); | |
999 Function: Return the keynames of the singlets. | |
1000 Returns : An array containing the keynames of all | |
1001 Bio::Tools::Alignment::Consed sequences in the class "singlet". | |
1002 Args : None. | |
1003 Notes : | |
1004 | |
1005 =cut | |
1006 | |
1007 sub get_singlets { | |
1008 # returns an array of singlet names | |
1009 # singlets have "singlet"=1 in the hash | |
1010 my $self = shift; | |
1011 if (!$self->{singlets_set}) { | |
1012 $self->debug("You need to set the singlets before you get them. Doing that now."); | |
1013 $self->set_singlets(); | |
1014 } | |
1015 | |
1016 my (@singlets,@array); | |
1017 foreach my $key (sort keys %{$self->{'contigs'}}) { | |
1018 # @array = @{$Consed::contigs{$key}->{'member_array'}}; | |
1019 # somethimes a user will try to get a list of singlets before the classes for the rest of the | |
1020 # contigs has been set (see t/test.t for how I figured this out. <bah> | |
1021 # so either way, just return class=singlets | |
1022 if (!$self->{'contigs'}->{$key}->{'class'}) { | |
1023 # print("$key has no class. why?\n"); | |
1024 } | |
1025 elsif ($self->{'contigs'}->{$key}->{'class'} eq "singlet") { | |
1026 push @singlets,$key; | |
1027 } | |
1028 } | |
1029 return @singlets; | |
1030 } | |
1031 | |
1032 =head2 set_quality_by_name($name,$quality) | |
1033 | |
1034 Title : set_quality_by_name($name,$quality) | |
1035 Usage : $o_consed->set_quality_by_name($name,$quality); | |
1036 Function: Deprecated. Make the contig with {name} have {'quality'} $quality. | |
1037 Probably used for testing. | |
1038 Returns : Nothing. | |
1039 Args : The name of a contig and a scalar for its quality. | |
1040 Notes : Deprecated. | |
1041 | |
1042 =cut | |
1043 | |
1044 sub set_quality_by_name { | |
1045 # this is likely deprecated | |
1046 my ($self,$name,$quality) = shift; | |
1047 my $return; | |
1048 foreach (sort keys %{$self->{'contigs'}}) { | |
1049 if ($self->{'contigs'} eq "$name" || $self->{'contigs'}->{'name'} eq "$name") { | |
1050 $self->{'contigs'}->{'quality'} = $quality; | |
1051 $return=1; | |
1052 } | |
1053 } | |
1054 if ($return) { return "0"; } else { return "1"; } | |
1055 } # end set quality by name | |
1056 | |
1057 =head2 set_singlet_quality() | |
1058 | |
1059 Title : set_singlet_quality() | |
1060 Usage : $o_consed->set_singlet_quality(); | |
1061 Function: For each singlet, go to the appropriate file in phd_dir and read | |
1062 in the phred quality for that read and place it into {'quality'} | |
1063 Returns : 0 or 1. | |
1064 Args : None. | |
1065 Notes : This is the next subroutine that will receive substantial revision | |
1066 in the next little while. It really should eval the creation of | |
1067 Bio::Tools::Alignment::Phred objects and go from there. | |
1068 | |
1069 =cut | |
1070 | |
1071 sub set_singlet_quality { | |
1072 my $self = shift; | |
1073 my $full_filename = $self->{'filename'}; | |
1074 $full_filename =~ m/(.*\/)(.*)ace.*$/; | |
1075 my ($base_path,$filename) = ($1,"$2"."qual"); | |
1076 my $singletsfile = $base_path.$filename; | |
1077 if (-f $singletsfile) { | |
1078 # print("$singletsfile is indeed a file. Trying to open it...\n"); | |
1079 } | |
1080 else { | |
1081 $self->warn("$singletsfile is not a file. Sorry.\n"); | |
1082 return; | |
1083 } | |
1084 my $singlets_fh = Bio::Root::IO->new(-file => $singletsfile); | |
1085 my ($sequence,$name,$count); | |
1086 my ($identity,$line,$quality,@qline); | |
1087 while ($line = $singlets_fh->_readline()) { | |
1088 chomp $line; | |
1089 if ($line =~ /^\>/) { | |
1090 $quality = undef; | |
1091 $line =~ m/\>(\S*)\s/; | |
1092 $identity = $1; | |
1093 } | |
1094 else { | |
1095 if ($self->{'contigs'}->{$identity}) { | |
1096 $self->{'contigs'}->{$identity}->{'quality'} .= "$line "; | |
1097 } | |
1098 } | |
1099 | |
1100 } | |
1101 return 0; | |
1102 } | |
1103 | |
1104 =head2 set_contig_quality() | |
1105 | |
1106 Title : set_contig_quality() | |
1107 Usage : $o_consed->set_contig_quality(); | |
1108 Function: Deprecated. | |
1109 Returns : Deprecated. | |
1110 Args : Deprecated. | |
1111 Notes : Deprecated. Really. Trust me. | |
1112 | |
1113 =cut | |
1114 | |
1115 sub set_contig_quality { | |
1116 # note: contigs _include_ singletons but _not_ singlets | |
1117 my ($self) = shift; | |
1118 # the unexpected results I am referring to here are a doubling of quality values. | |
1119 # the profanity I uttered on discovering this reminded me of the simpsons: | |
1120 # Ned Flanders: "That is the loudest profanity I have ever heard!" | |
1121 $self->warn("set_contig_quality is deprecated and will likely produce unexpected results"); | |
1122 my $full_filename = $self->{'filename'}; | |
1123 # Run_SRC3700_2000-08-01_73+74.fasta.screen.contigs.qual | |
1124 # from Consed.pm | |
1125 $full_filename =~ m/(.*\/)(.*)ace.*$/; | |
1126 my ($base_path,$filename) = ($1,"$2"."contigs.qual"); | |
1127 my $singletsfile = $base_path.$filename; | |
1128 if (-f $singletsfile) { | |
1129 # print("$singletsfile is indeed a file. Trying to open it...\n"); | |
1130 } | |
1131 else { | |
1132 $self->warn("Bio::Tools::Alignment::Consed::set_contig_quality $singletsfile is not a file. Sorry.\n"); | |
1133 return; | |
1134 } | |
1135 my $contig_quality_fh = Bio::Root::IO->new(-file => $singletsfile); | |
1136 | |
1137 my ($sequence,$name,$count,$identity,$line,$quality); | |
1138 while ($line = $contig_quality_fh->_readline()) { | |
1139 chomp $line; | |
1140 if ($line =~ /^\>/) { | |
1141 $quality = undef; | |
1142 $line =~ m/\>.*Contig(\d+)\s/; | |
1143 $identity = $1; | |
1144 } | |
1145 else { | |
1146 if ($self->{'contigs'}->{$identity} ) { | |
1147 $self->{'contigs'}->{$identity}->{'quality'} .= " $line"; | |
1148 } | |
1149 } | |
1150 } | |
1151 } # end set_contig_quality | |
1152 | |
1153 =head2 get_multiplets() | |
1154 | |
1155 Title : get_multiplets() | |
1156 Usage : $o_consed->get_multiplets(); | |
1157 Function: Return the keynames of the multiplets. | |
1158 Returns : Returns an array containing the keynames of all | |
1159 Bio::Tools::Alignment::Consed sequences in the class "multiplet". | |
1160 Args : None. | |
1161 Notes : | |
1162 | |
1163 =cut | |
1164 | |
1165 sub get_multiplets { | |
1166 # returns an array of multiplet names | |
1167 # multiplets have # members > 2 | |
1168 my $self = shift; | |
1169 my (@multiplets,@array); | |
1170 foreach my $key (sort keys %{$self->{'contigs'}}) { | |
1171 if ($self->{'contigs'}->{$key}->{'class'}) { | |
1172 if ($self->{'contigs'}->{$key}->{'class'} eq "multiplet") { | |
1173 push @multiplets,$key; | |
1174 } | |
1175 } | |
1176 } | |
1177 return @multiplets; | |
1178 } | |
1179 | |
1180 =head2 get_all_members() | |
1181 | |
1182 Title : get_all_members() | |
1183 Usage : @all_members = $o_consed->get_all_members(); | |
1184 Function: Return a list of all of the read names in the | |
1185 Bio::Tools::Alignment::Consed object. | |
1186 Returns : An array containing all of the elements in all of the | |
1187 {'member_array'}s. | |
1188 Args : None. | |
1189 Notes : | |
1190 | |
1191 =cut | |
1192 | |
1193 sub get_all_members { | |
1194 my $self = shift; | |
1195 my @members; | |
1196 foreach my $key (sort keys %{$self->{'contigs'}}) { | |
1197 if ($key =~ /^singlet/) { | |
1198 push @members,$self->{'contigs'}->{$key}->{'member_array'}[0]; | |
1199 } | |
1200 elsif ($self->{'contigs'}->{$key}->{'member_array'}) { | |
1201 push @members,@{$self->{'contigs'}->{$key}->{'member_array'}}; | |
1202 } | |
1203 # else { | |
1204 # print("Bio::Tools::Alignment::Consed: $key is _not_ an array. Pushing $self->{'contigs'}->{$key}->{'member_array'} onto \@members\n"); | |
1205 # push @members,$self->{'contigs'}->{$key}->{'member_array'}; | |
1206 # } | |
1207 } | |
1208 return @members; | |
1209 } | |
1210 | |
1211 =head2 sum_lets($total_only) | |
1212 | |
1213 Title : sum_lets($total_only) | |
1214 Usage : $statistics = $o_consed->sum_lets($total_only); | |
1215 Function: Provide numbers for how many sequences were accounted for in the | |
1216 Bio::Tools::Alignment::Consed object. | |
1217 Returns : If a scalar is present, returns the total number of | |
1218 sequences accounted for in all classes. If no scalar passed | |
1219 then returns a string that looks like this: | |
1220 Singt/singn/doub/pair/mult/total : 2,0,1(2),0(0),0(0),4 | |
1221 This example means the following: There were 1 singlets. | |
1222 There were 0 singletons. There were 1 doublets for a total | |
1223 of 2 sequences in this class. There were 0 pairs for a | |
1224 total of 0 sequences in this class. There were 0 | |
1225 multiplets for a total of 0 sequences in this class. There | |
1226 were a total of 4 sequences accounted for in the | |
1227 Bio::Tools::Alignment::Consed object. | |
1228 Args : A scalar is optional to change the way the numbers are returned. | |
1229 Notes: | |
1230 | |
1231 =cut | |
1232 | |
1233 sub sum_lets { | |
1234 my ($self,$total_only) = @_; | |
1235 my ($count,$count_multiplets,$multiplet_count); | |
1236 my $singlets = &get_singlets($self); $count += $singlets; | |
1237 my $doublets = &get_doublets($self); $count += ($doublets * 2); | |
1238 my $pairs = &get_pairs($self); $count += ($pairs * 2); | |
1239 my $singletons = &get_singletons($self); $count += $singletons; | |
1240 my @multiplets = &get_multiplets($self); | |
1241 $count_multiplets = @multiplets; | |
1242 my $return_string; | |
1243 foreach (@multiplets) { | |
1244 my $number_members = $self->{'contigs'}->{$_}->{num_members}; | |
1245 $multiplet_count += $number_members; | |
1246 } | |
1247 if ($multiplet_count) { | |
1248 $count += $multiplet_count; | |
1249 } | |
1250 foreach (qw(multiplet_count singlets doublets pairs singletons multiplets count_multiplets)) { | |
1251 no strict 'refs'; # renege for the block | |
1252 if (!${$_}) { | |
1253 ${$_} = 0; | |
1254 } | |
1255 } | |
1256 if (!$multiplet_count) { $multiplet_count = 0; } | |
1257 if ($total_only) { | |
1258 return $count; | |
1259 } | |
1260 $return_string = "Singt/singn/doub/pair/mult/total : $singlets,$singletons,$doublets(". | |
1261 ($doublets*2)."),$pairs(".($pairs*2)."),$count_multiplets($multiplet_count),$count"; | |
1262 return $return_string; | |
1263 } | |
1264 | |
1265 =head2 write_stats() | |
1266 | |
1267 Title : write_stats() | |
1268 Usage : $o_consed->write_stats(); | |
1269 Function: Write a file called "statistics" containing numbers similar to | |
1270 those provided in sum_lets(). | |
1271 Returns : Nothing. Write a file in $o_consed->{path} containing something | |
1272 like this: | |
1273 | |
1274 0,0,50(100),0(0),0(0),100 | |
1275 | |
1276 Where the numbers provided are in the format described in the | |
1277 documentation for sum_lets(). | |
1278 Args : None. | |
1279 Notes : This might break platform independence, I do not know. | |
1280 | |
1281 See L<sum_lets()|sum_lets> | |
1282 | |
1283 =cut | |
1284 | |
1285 sub write_stats { | |
1286 # worry about platform dependence here? | |
1287 # oh shucksdarn. | |
1288 my $self = shift; | |
1289 my $stats_filename = $self->{'path'}."statistics"; | |
1290 my $statistics_raw = $self->sum_lets; | |
1291 (my $statsfilecontents = $statistics_raw) =~ s/.*\ \:\ //g; | |
1292 umask 0001; | |
1293 my $fh = new Bio::Root::IO(-file=>"$stats_filename"); | |
1294 # open(STATSFILE,">$stats_filename") or print("Could not open the statsfile: $!\n"); | |
1295 $fh->_print("$statsfilecontents"); | |
1296 # close STATSFILE; | |
1297 $fh->close(); | |
1298 } | |
1299 | |
1300 =head2 get_singletons() | |
1301 | |
1302 Title : get_singletons() | |
1303 Usage : @singletons = $o_consed->get_singletons(); | |
1304 Function: Return the keynames of the singletons. | |
1305 Returns : Returns an array containing the keynames of all | |
1306 Bio::Tools::Alignment::Consed sequences in the class "singleton". | |
1307 Args : None. | |
1308 Notes : | |
1309 | |
1310 =cut | |
1311 | |
1312 sub get_singletons { | |
1313 # returns an array of singleton names | |
1314 # singletons are contigs with one member (see consed documentation) | |
1315 my $self = shift; | |
1316 my (@singletons,@array); | |
1317 foreach my $key (sort keys %{$self->{'contigs'}}) { | |
1318 if ($self->{'contigs'}->{$key}->{'class'}) { | |
1319 # print ("$key class: $self->{'contigs'}->{$key}->{'class'}\n"); | |
1320 } | |
1321 else { | |
1322 # print("$key belongs to no class. why?\n"); | |
1323 } | |
1324 if ($self->{'contigs'}->{$key}->{'member_array'}) { | |
1325 @array = @{$self->{'contigs'}->{$key}->{'member_array'}}; | |
1326 } | |
1327 my $num_array_elem = @array; | |
1328 if ($num_array_elem == 1 && $self->{'contigs'}->{$key}->{'class'} && $self->{'contigs'}->{$key}->{'class'} eq "singleton") { push @singletons,$key; } | |
1329 } | |
1330 return @singletons; | |
1331 } | |
1332 | |
1333 =head2 get_pairs() | |
1334 | |
1335 Title : get_pairs() | |
1336 Usage : @pairs = $o_consed->get_pairs(); | |
1337 Function: Return the keynames of the pairs. | |
1338 Returns : Returns an array containing the keynames of all | |
1339 Bio::Tools::Alignment::Consed sequences in the class "pair". | |
1340 Args : None. | |
1341 Notes : | |
1342 | |
1343 =cut | |
1344 | |
1345 sub get_pairs { | |
1346 # returns an array of pair contig names | |
1347 # a pair is a contig of two where the names do not match | |
1348 my $self = shift; | |
1349 my (@pairs,@array); | |
1350 foreach my $key (sort keys %{$self->{'contigs'}}) { | |
1351 if ($self->{'contigs'}->{$key}->{'member_array'}) { | |
1352 if (@{$self->{'contigs'}->{$key}->{'member_array'}} == 2 && | |
1353 $self->{'contigs'}->{$key}->{'class'} eq "pair") { | |
1354 push @pairs,$key; | |
1355 } | |
1356 } | |
1357 } | |
1358 return @pairs; | |
1359 } | |
1360 | |
1361 =head2 get_name($contig_keyname) | |
1362 | |
1363 Title : get_name($contig_keyname) | |
1364 Usage : $name = $o_consed->get_name($contig_keyname); | |
1365 Function: Return the {name} for $contig_keyname. | |
1366 Returns : A string. ({name}) | |
1367 Args : A contig keyname. | |
1368 Notes : | |
1369 | |
1370 =cut | |
1371 | |
1372 sub get_name { | |
1373 my ($self,$contig) = @_; | |
1374 return $self->{'contigs'}->{$contig}->{'name'}; | |
1375 } | |
1376 | |
1377 =head2 _get_contig_name(\@array_containing_reads) | |
1378 | |
1379 Title : _get_contig_name(\@array_containing_reads) | |
1380 Usage : $o_consed->_get_contig_name(\@array_containing_reads); | |
1381 Function: The logic for the set_doublets subroutine. | |
1382 Returns : The name for this contig. | |
1383 Args : A reference to an array containing read names. | |
1384 Notes : Depends on reverse_designator. Be sure this is set the way you | |
1385 intend. | |
1386 | |
1387 =cut | |
1388 | |
1389 sub _get_contig_name { | |
1390 my ($self,$r_array) = @_; | |
1391 my @contig_members = @$r_array; | |
1392 my @name_nodir; | |
1393 foreach (@contig_members) { | |
1394 # how can I distinguish the clone name from the direction label? | |
1395 # look for $Consed::reverse_designator and $Consed::forward_designator | |
1396 # what if you do not find _any_ of those? | |
1397 my $forward_designator = $self->{'forward_designator'} || "f"; | |
1398 my $reverse_designator = $self->{'reverse_designator'} || "r"; | |
1399 my $any_hits = /(.+)($forward_designator.*)/ || /(.+)($reverse_designator.*)/||/(.+)(_.+)/; | |
1400 my $name = $1; | |
1401 my $suffix = $2; | |
1402 if ($name) { | |
1403 # print("\t\$name is $name "); | |
1404 } | |
1405 if ($suffix) { | |
1406 # print("and \$suffix is $suffix.\n"); | |
1407 } | |
1408 # Jee, I hope we get a naming convention soon | |
1409 if ($suffix) { | |
1410 if ($suffix =~ /^$forward_designator/ || $suffix =~ /^$reverse_designator/) { | |
1411 push @name_nodir,$name; | |
1412 } | |
1413 # bugwatch here! should this be unnested? | |
1414 else { | |
1415 push @name_nodir,"$name$suffix"; | |
1416 } | |
1417 } | |
1418 } | |
1419 # print("\@name_nodir: @name_nodir\n"); | |
1420 my $mismatch = 0; | |
1421 for (my $counter=0; $counter<@name_nodir;$counter++) { | |
1422 next if ($name_nodir[0] eq $name_nodir[$counter]); | |
1423 $mismatch = 1; | |
1424 } | |
1425 if ($mismatch == 0) { | |
1426 # print("\tYou have a cohesive contig named $name_nodir[0].\n\n"); | |
1427 return $name_nodir[0]; | |
1428 } | |
1429 else { | |
1430 # print("\tYou have mixed names in this contig.\n\n"); | |
1431 } | |
1432 } # end _get_contig_name | |
1433 | |
1434 =head2 get_doublets() | |
1435 | |
1436 Title : get_doublets() | |
1437 Usage : @doublets = $o_consed->get_doublets(); | |
1438 Function: Return the keynames of the doublets. | |
1439 Returns : Returns an array containing the keynames of all | |
1440 Bio::Tools::Alignment::Consed sequences in the class "doublet". | |
1441 Args : None. | |
1442 Notes : | |
1443 | |
1444 =cut | |
1445 | |
1446 sub get_doublets { | |
1447 my $self = shift; | |
1448 if (!$self->{doublets_set}) { | |
1449 $self->warn("You need to set the doublets before you can get them. Doing that now."); | |
1450 $self->set_doublets(); | |
1451 } | |
1452 my @doublets; | |
1453 foreach (sort keys %{$self->{'contigs'}}) { | |
1454 if ($self->{'contigs'}->{$_}->{name} && $self->{'contigs'}->{$_}->{'class'} eq "doublet") { | |
1455 push @doublets,$_; | |
1456 } | |
1457 } | |
1458 return @doublets; | |
1459 } # end get_doublets | |
1460 | |
1461 =head2 dump_hash() | |
1462 | |
1463 Title : dump_hash() | |
1464 Usage : $o_consed->dump_hash(); | |
1465 Function: Use dumpvar.pl to dump out the Bio::Tools::Alignment::Consed | |
1466 object to STDOUT. | |
1467 Returns : Nothing. | |
1468 Args : None. | |
1469 Notes : I used this a lot in debugging. | |
1470 | |
1471 =cut | |
1472 | |
1473 sub dump_hash { | |
1474 my $self = shift; | |
1475 my $dumper = new Dumpvalue; | |
1476 $self->debug( "Bio::Tools::Alignment::Consed::dump_hash - ". | |
1477 "The following is the contents of the contig hash...\n"); | |
1478 $dumper->dumpValue($self->{'contigs'}); | |
1479 } | |
1480 | |
1481 =head2 dump_hash_compact() | |
1482 | |
1483 Title : dump_hash_compact() | |
1484 Usage : $o_consed->dump_hash_compact(); | |
1485 Function: Dump out the Bio::Tools::Alignment::Consed object in a compact way. | |
1486 Returns : Nothing. | |
1487 Args : Nothing. | |
1488 Notes : Cleaner then dumpValue(), dumpHash(). I used this a lot in | |
1489 debugging. | |
1490 | |
1491 =cut | |
1492 | |
1493 sub dump_hash_compact { | |
1494 no strict 'refs'; # renege for the block | |
1495 my ($self,$sequence) = @_; | |
1496 # get the classes | |
1497 my @singlets = $self->get_singlets(); | |
1498 my @singletons = $self->get_singletons(); | |
1499 my @doublets = $self->get_doublets(); | |
1500 my @pairs = $self->get_pairs(); | |
1501 my @multiplets = $self->get_multiplets(); | |
1502 print("Name\tClass\tMembers\tQuality?\n"); | |
1503 foreach (@singlets) { | |
1504 my @members = $self->get_members($_); | |
1505 print($self->get_name($_)."\tsinglets\t".(join',',@members)."\t"); | |
1506 if ($self->{'contigs'}->{$_}->{'quality'}) { print("qualities found here\n"); } | |
1507 else { print("no qualities found here\n"); } | |
1508 | |
1509 } | |
1510 foreach (@singletons) { | |
1511 my @members = $self->get_members($_); | |
1512 print($self->get_name($_)."\tsingletons\t".(join',',@members)."\t"); | |
1513 if ($self->{'contigs'}->{$_}->{'quality'}) { print("qualities found here\n"); } | |
1514 else { print("no qualities found here\n"); } | |
1515 } | |
1516 foreach my $pair (@pairs) { | |
1517 my @members = $self->get_members($pair); | |
1518 my $name; | |
1519 if (!$self->get_name($pair)) { | |
1520 $name = "BLANK"; | |
1521 } | |
1522 else { $name = $self->get_name($pair); } | |
1523 print("$name\tpairs\t".(join',',@members)."\n"); | |
1524 } | |
1525 foreach (@doublets) { | |
1526 my @members = $self->get_members_by_name($_); | |
1527 print("$_\tdoublets\t".(join',',@members)."\t"); | |
1528 my $contig_number = &get_contig_number_by_name($self,$_); | |
1529 if ($self->{'contigs'}->{$contig_number}->{'quality'}) { print("qualities found here\n"); } | |
1530 else { print("no qualities found here\n"); } | |
1531 # print($_."\tdoublets\t".(join',',@members)."\n"); | |
1532 } | |
1533 foreach (@multiplets) { | |
1534 my @members = $self->get_members($_); | |
1535 print("Contig $_"."\tmultiplets\t".(join',',@members)."\n"); | |
1536 } | |
1537 } # end dump_hash_compact | |
1538 | |
1539 =head2 get_phreds() | |
1540 | |
1541 Title : get_phreds() | |
1542 Usage : @phreds = $o_consed->get_phreds(); | |
1543 Function: For each doublet in the Bio::Tools::Alignment::Consed hash, go | |
1544 and get the phreds for the top and bottom reads. Place them into | |
1545 {top_phreds} and {bottom_phreds}. | |
1546 Returns : Nothing. | |
1547 Args : Nothing. | |
1548 Notes : Requires parse_phd() and reverse_and_complement(). I realize that | |
1549 it would be much more elegant to pull qualities as required but there | |
1550 were certain "features" in the acefile that required a bit more | |
1551 detailed work be done to get the qualities for certain parts of the | |
1552 consensus sequence. In order to make _sure_ that this was done | |
1553 properly I wrote things to do all steps and then I used dump_hash() | |
1554 and checked each one to ensure expected bahavior. I have never changed | |
1555 this, so there you are. | |
1556 | |
1557 =cut | |
1558 | |
1559 sub get_phreds { | |
1560 # this subroutine is the target of a rewrite to use the Bio::Tools::Alignment::Phred object. | |
1561 my $self = shift; | |
1562 my $current_contig; | |
1563 foreach $current_contig (sort keys %{$self->{'contigs'}}) { | |
1564 if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") { | |
1565 $self->debug("$current_contig is a doublet. Going to parse_phd for top($self->{'contigs'}->{$current_contig}->{'top_name'}) and bottom($self->{'contigs'}->{$current_contig}->{'bottom_name'})\n"); | |
1566 my $r_phreds_top = &parse_phd($self,$self->{'contigs'}->{$current_contig}->{'top_name'}); | |
1567 my $r_phreds_bottom = &parse_phd($self,$self->{'contigs'}->{$current_contig}->{'bottom_name'}); | |
1568 if ($self->{'contigs'}->{$current_contig}->{'top_complement'} eq "C") { | |
1569 # print("Reversing and complementing...\n"); | |
1570 $r_phreds_top = &reverse_and_complement($r_phreds_top); | |
1571 } | |
1572 if ($self->{'contigs'}->{$current_contig}->{'bottom_complement'} eq "C") { | |
1573 $r_phreds_bottom = &reverse_and_complement($r_phreds_bottom); | |
1574 } | |
1575 $self->{'contigs'}->{$current_contig}->{'top_phreds'} = $r_phreds_top; | |
1576 $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = $r_phreds_bottom; | |
1577 } | |
1578 } | |
1579 } | |
1580 | |
1581 =head2 parse_phd($read_name) | |
1582 | |
1583 Title : parse_phd($read_name) | |
1584 Usage : $o_consed->parse_phd($read_name); | |
1585 Function: Suck in the contents of a .phd file. | |
1586 Returns : A reference to an array containing the quality values for the read. | |
1587 Args : The name of a read. | |
1588 Notes : This is a significantly weak subroutine because it was always | |
1589 intended that these functions, along with the functions provided by | |
1590 get_phreds() be put into the Bio::SeqIO:phd module. This is done | |
1591 now but the Bio::Tools::Alignment::Consed module has not be | |
1592 rewritten to reflect this change. | |
1593 | |
1594 See L<Bio::SeqIO::phd> for more information. | |
1595 | |
1596 =cut | |
1597 | |
1598 sub parse_phd { | |
1599 my ($self,$sequence_name) = @_; | |
1600 $self->debug("Parsing phd for $sequence_name\n"); | |
1601 my $in_dna = 0; | |
1602 my $base_number = 0; | |
1603 my (@bases,@current_line); | |
1604 # print("parse_phd: $sequence_name\n"); | |
1605 my $fh = new Bio::Root::IO(-file=>"$self->{path}/../phd_dir/$sequence_name.phd.1"); | |
1606 # open(PHD,"<$self->{path}/../phd_dir/$sequence_name.phd.1") or | |
1607 # die "Couldn't open the phred for $sequence_name\n"; | |
1608 while ($fh->_readline()) { | |
1609 # print("Reading a line from a phredfile!\n"); | |
1610 chomp; | |
1611 if (/^BEGIN_DNA/) { $in_dna = 1; next} | |
1612 if (/^END_DNA/) { last; } | |
1613 if (!$in_dna) { next; } | |
1614 push(@bases,$_); | |
1615 } | |
1616 return \@bases; | |
1617 } | |
1618 | |
1619 =head2 reverse_and_complement(\@source) | |
1620 | |
1621 Title : reverse_and_complement(\@source) | |
1622 Usage : $reference_to_array = $o_consed->reverse_and_complement(\@source); | |
1623 Function: A stub for the recursive routine reverse_recurse(). | |
1624 Returns : A reference to a reversed and complemented array of phred data. | |
1625 Args : A reference to an array of phred data. | |
1626 Notes : | |
1627 | |
1628 =cut | |
1629 | |
1630 sub reverse_and_complement { | |
1631 my $r_source = shift; | |
1632 my $r_destination; | |
1633 $r_destination = &reverse_recurse($r_source,$r_destination); | |
1634 return $r_destination; | |
1635 } | |
1636 | |
1637 =head2 reverse_recurse($r_source,$r_destination) | |
1638 | |
1639 Title : reverse_recurse(\@source,\@destination) | |
1640 Usage : $o_consed->reverse_recurse(\@source,\@destination); | |
1641 Function: A recursive routine to reverse and complement an array of phred | |
1642 data. | |
1643 Returns : A reference to an array containing reversed phred data. | |
1644 Args : A reference to a source array and a reverence to a destination | |
1645 array. | |
1646 Notes : Recursion is kewl, but this sub should likely be _reverse_recurse. | |
1647 | |
1648 =cut | |
1649 | |
1650 my $current_base; | |
1651 sub reverse_recurse($$) { | |
1652 (my $r_source,my $r_destination) = @_; | |
1653 if (!@$r_source) { return $r_destination; } | |
1654 $_=pop(@$r_source); | |
1655 (s/c/g/||s/g/c/||s/a/t/||s/t/a/); | |
1656 push(@$r_destination,$_); | |
1657 &reverse_recurse($r_source,$r_destination); | |
1658 } | |
1659 | |
1660 =head2 show_missing_sequence() | |
1661 | |
1662 Title : show_missing_sequence(); | |
1663 Usage : $o_consed->show_missing_sequence(); | |
1664 Function: Used by set_trim_points_doublets() to fill in quality values where | |
1665 consed (phrap?) set them to 0 at the beginning and/or end of the | |
1666 consensus sequences. | |
1667 Returns : Nothing. | |
1668 Args : None. | |
1669 Notes : Acts on doublets only. Really very somewhat quite ugly. A | |
1670 disgusting kludge. <insert pride here> It was written stepwise with | |
1671 no real plan because it was not really evident why consed (phrap?) | |
1672 was doing this. | |
1673 | |
1674 =cut | |
1675 | |
1676 sub show_missing_sequence() { | |
1677 # decide which sequence should not have been clipped at consensus position = 0 | |
1678 my $self = shift; | |
1679 &get_phreds($self); | |
1680 my ($current_contig,@qualities); | |
1681 foreach $current_contig (sort keys %{$self->{'contigs'}}) { | |
1682 if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") { | |
1683 my $number_leading_xs = 0; | |
1684 my $number_trailing_xs = 0; | |
1685 my $measurer = $self->{'contigs'}->{$current_contig}->{'quality'}; | |
1686 while ($measurer =~ s/^\ 0\ /\ /) { | |
1687 $number_leading_xs++; | |
1688 } | |
1689 while ($measurer =~ s/\ 0(\s*)$/$1/) { | |
1690 $number_trailing_xs++; | |
1691 } | |
1692 @qualities = split(' ',$self->{'contigs'}->{$current_contig}->{'quality'}); | |
1693 my $in_initial_zeros = 0; | |
1694 for (my $count=0;$count<scalar(@qualities); $count++) { | |
1695 if ($qualities[$count] == 0) { | |
1696 my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data); | |
1697 # print("The quality of the consensus at ".($count+1)." is zero. Retrieving the real quality value.\n"); | |
1698 # how do I know which strand to get these quality values from???? | |
1699 # boggle | |
1700 my $top_quality_here = $self->{'contigs'}->{$current_contig}->{'top_phreds'}->[0-$self->{'contigs'}->{$current_contig}->{'top_start'}+$count+1]; | |
1701 my $bottom_quality_here = $self->{'contigs'}->{$current_contig}->{'bottom_phreds'}->[1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count]; | |
1702 if (!$bottom_quality_here || (1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count)<0) { | |
1703 $bottom_quality_here = "not found"; | |
1704 } | |
1705 if (!$top_quality_here) { | |
1706 $top_quality_here = "not found"; | |
1707 } | |
1708 # print("Looking for quals at position $count of $current_contig: top position ".(0-$self->{'contigs'}->{$current_contig}->{top_start}+$count)." ($self->{'contigs'}->{$current_contig}->{top_name}) $top_quality_here , bottom position ".(1-$self->{'contigs'}->{$current_contig}->{bottom_start}+$count)." ($self->{'contigs'}->{$current_contig}->{bottom_name}) $bottom_quality_here\n"); | |
1709 if ($count<$number_leading_xs) { | |
1710 # print("$count is less then $number_leading_xs so I will get the quality from the top strand\n"); | |
1711 # print("retrieved quality is ".$self->{'contigs'}->{$current_contig}->{top_phreds}[0-$self->{'contigs'}->{$current_contig}->{top_start}+$count+1]."\n"); | |
1712 my $quality = $top_quality_here; | |
1713 $quality =~ /\S+\s(\d+)\s+/; | |
1714 $quality = $1; | |
1715 # print("retrieved quality for leading zero $count is $quality\n"); | |
1716 # t 9 9226 | |
1717 $qualities[$count] = $quality; | |
1718 } | |
1719 else { | |
1720 # this part is tricky | |
1721 # if the contig is like this | |
1722 # cccccccccccccccc | |
1723 # ffffffffffffffffff | |
1724 # rrrrrrrrrrrrrrrrr | |
1725 # then take the quality value for the trailing zeros in the cons. seq from the r | |
1726 # | |
1727 # but if the contig is like this | |
1728 # cccccccccccccccccc | |
1729 # ffffffffffffffffffffffffffffffff | |
1730 # rrrrrrrrrrrrrrrrrrrrrrrxxxxxxxxr | |
1731 # ^^^ | |
1732 # then any zeros that fall in the positions (^) must be decided whether the quality | |
1733 # is the qual from the f or r strand. I will use the greater number | |
1734 # does a similar situation exist for the leading zeros? i dunno | |
1735 # | |
1736 # print("$count is greater then $number_leading_xs so I will get the quality from the bottom strand\n"); | |
1737 # print("retrieved quality is ".$contigs->{$current_contig}->{top_phreds}[0-$contigs->{$current_contig}->{top_start}+$count+1]."\n"); | |
1738 # my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data); | |
1739 if ($bottom_quality_here eq "not found") { | |
1740 # $top_phred_position = 1-$contigs->{$current_contig}->{bottom_start}+$count; | |
1741 # print("Going to get quality from here: $top_phred_position of the top.\n"); | |
1742 # my $temp_quality - $contigs->{$current_contig}->{top_phreds} | |
1743 # $quality = $contigs->{$current_contig}->{top_phreds}[$top_phred_position]; | |
1744 $top_quality_here =~ /\w+\s(\d+)\s/; | |
1745 $quality = $1; | |
1746 } | |
1747 elsif ($top_quality_here eq "not found") { | |
1748 # $bottom_phred_position = 1+$contigs->{$current_contig}->{bottom_start}+$count; | |
1749 # print("Going to get quality from here: $bottom_phred_position of the bottom.\n"); | |
1750 # $quality = $contigs->{$current_contig}->{bottom_phreds}[$bottom_phred_position]; | |
1751 # print("Additional: no top quality but bottom is $quality\n"); | |
1752 $bottom_quality_here =~ /\w+\s(\d+)\s/; | |
1753 $quality = $1; | |
1754 } | |
1755 else { | |
1756 # print("Oh jeepers, there are 2 qualities to choose from at this position.\n"); | |
1757 # print("Going to compare these phred qualities: top: #$top_quality_here# bottom: #$bottom_quality_here#\n"); | |
1758 # now you have to compare them | |
1759 # my $top_quality_phred = $contigs->{$current_contig}->{top_phreds}[$top_phred_position]; | |
1760 # #t 40 875# | |
1761 # print("regexing #$top_quality_here#... "); | |
1762 $top_quality_here =~ /\w\ (\d+)\s/; | |
1763 my $top_quality = $1; | |
1764 # print("$top_quality\nregexing #$bottom_quality_here#... "); | |
1765 $bottom_quality_here =~ /\w\ (\d+)\s/; | |
1766 my $bottom_quality = $1; | |
1767 # print("$bottom_quality\n"); | |
1768 # print("top_quality: $top_quality bottom quality: $bottom_quality\n"); | |
1769 if ($bottom_quality > $top_quality) { | |
1770 # print("Chose to take the bottom quality: $bottom_quality\n"); | |
1771 $quality = $bottom_quality; | |
1772 } else { | |
1773 # print("Chose to take the top quality: $top_quality\n"); | |
1774 $quality = $top_quality; | |
1775 } | |
1776 } | |
1777 if (!$quality) { | |
1778 # print("Warning: no quality value for $current_contig, position $count!\n"); | |
1779 # print("Additional data: top quality phred: $top_quality_here\n"); | |
1780 # print("Additional data: bottom quality phred: $bottom_quality_here\n"); | |
1781 } else { | |
1782 $qualities[$count] = $quality; | |
1783 } | |
1784 } | |
1785 } | |
1786 | |
1787 } | |
1788 unless (!@qualities) { | |
1789 $self->{'contigs'}->{$current_contig}->{'quality'} = join(" ",@qualities); | |
1790 } | |
1791 $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = undef; | |
1792 $self->{'contigs'}->{$current_contig}->{'top_phreds'} = undef; | |
1793 my $count = 1; | |
1794 } # end foreach key | |
1795 } | |
1796 } | |
1797 | |
1798 | |
1799 # Autoload methods go after =cut, and are processed by the autosplit program. | |
1800 | |
1801 1; | |
1802 __END__ | |
1803 | |
1804 =head1 SEE ALSO | |
1805 | |
1806 perl(1). | |
1807 | |
1808 =cut |