0
|
1 # Bio::Tools::Alignment::Consed.pm
|
|
2 #
|
|
3 # Cared for by Chad Matsalla
|
|
4 #
|
|
5 # Copyright Chad Matsalla
|
|
6 #
|
|
7 # You may distribute this module under the same terms as perl itself
|
|
8
|
|
9 # POD documentation - main docs before the code
|
|
10
|
|
11 =head1 NAME
|
|
12
|
|
13 Bio::Tools::Alignment::Consed - A module to work with objects from consed .ace files
|
|
14
|
|
15 =head1 SYNOPSIS
|
|
16
|
|
17 # a report for sequencing stuff
|
|
18 my $o_consed = new Bio::Tools::Alignment::Consed(
|
|
19 -acefile => "/path/to/an/acefile.ace.1",
|
|
20 -verbose => 1);
|
|
21 my $foo = $o_consed->set_reverse_designator("r");
|
|
22 my $bar = $o_consed->set_forward_designator("f");
|
|
23
|
|
24 # get the contig numbers
|
|
25 my @keys = $o_consed->get_contigs();
|
|
26
|
|
27 # construct the doublets
|
|
28 my $setter_doublets = $o_consed->choose_doublets();
|
|
29
|
|
30 # get the doublets
|
|
31 my @doublets = $o_consed->get_doublets();
|
|
32
|
|
33 =head1 DESCRIPTION
|
|
34
|
|
35 Bio::Tools::Alignment::Consed provides methods and objects to deal
|
|
36 with the output from the Consed package of programs. Specifically,
|
|
37 Bio::Tools::Alignment::Consed takes in the name of in .ace file and
|
|
38 provides objects for the results.
|
|
39
|
|
40 A word about doublets: This module was written to accomodate a large
|
|
41 EST sequencing operation. In this case, EST's were sequenced from the
|
|
42 3' and from the 5' end of the EST. The objective was to find a
|
|
43 consensus sequence for these two reads. Thus, a contig of two is what
|
|
44 we wanted, and this contig should consist of the forward and reverse
|
|
45 reads of a getn clone. For example, for a forward designator of "F"
|
|
46 and a reverse designator of "R", if the two reads chad1F and chad1R
|
|
47 were in a single contig (for example Contig 5) it will be determined
|
|
48 that the consensus sequence for Contig 5 will be the sequence for
|
|
49 clone chad1.
|
|
50
|
|
51 Doublets are good!
|
|
52
|
|
53 This module parses .ace and related files. A detailed list of methods
|
|
54 can be found at the end of this document.
|
|
55
|
|
56 I wrote a detailed rationale for design that may explain the reasons
|
|
57 why some things were done the way they were done. That document is
|
|
58 beyond the scope of this pod and can probably be found in the
|
|
59 directory from which this module was 'made' or at
|
|
60 http://www.dieselwurks.com/bioinformatics/consedpm_documentation.pdf
|
|
61
|
|
62 Note that the pod in that document might be old but the original
|
|
63 rationale still stands.
|
|
64
|
|
65 =head1 FEEDBACK
|
|
66
|
|
67 =head2 Mailing Lists
|
|
68
|
|
69 User feedback is an integral part of the evolution of this and other
|
|
70 Bioperl modules. Send your comments and suggestions preferably to one
|
|
71 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
72
|
|
73 bioperl-l@bioperl.org - General discussion
|
|
74 http://bio.perl.org/MailList.html - About the mailing lists
|
|
75
|
|
76 =head2 Reporting Bugs
|
|
77
|
|
78 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
79 the bugs and their resolution. Bug reports can be submitted via email
|
|
80 or the web:
|
|
81
|
|
82 bioperl-bugs@bio.perl.org
|
|
83 http://bugzilla.bioperl.org/
|
|
84
|
|
85
|
|
86 =head1 AUTHOR - Chad Matsalla
|
|
87
|
|
88 chad@dieselwurks.com
|
|
89
|
|
90 =head1 APPENDIX
|
|
91
|
|
92 The rest of the documentation details each of the object
|
|
93 methods. Internal methods are usually preceded with a _
|
|
94
|
|
95 =cut
|
|
96
|
|
97 #'
|
|
98
|
|
99 package Bio::Tools::Alignment::Consed;
|
|
100
|
|
101 use strict;
|
|
102 use vars qw($VERSION @ISA $Contigs %DEFAULTS);
|
|
103 use FileHandle;
|
|
104 use Dumpvalue qw(dumpValue);
|
|
105 use Bio::Tools::Alignment::Trim;
|
|
106 use Bio::Root::Root;
|
|
107 use Bio::Root::IO;
|
|
108
|
|
109 @ISA = qw(Bio::Root::Root Bio::Root::IO);
|
|
110
|
|
111 $VERSION = '0.60';
|
|
112
|
|
113 BEGIN {
|
|
114 %DEFAULTS = ( 'f_designator' => 'f',
|
|
115 'r_designator' => 'r');
|
|
116 }
|
|
117
|
|
118 =head2 new()
|
|
119
|
|
120 Title : new(-acefile => $path_to_some_acefile, -verbose => "1")
|
|
121 Usage : $o_consed = Bio::Tools::Alignment::Consed->
|
|
122 new(-acefile => $path_to_some_acefile, -verbose => "1");
|
|
123 Function: Construct the Bio::Tools::Alignment::Consed object. Sets
|
|
124 verbosity for the following procedures, if necessary:
|
|
125 1. Construct a new Bio::Tools::Alignment::Trim object, to
|
|
126 handle quality trimming 2. Read in the acefile and parse it
|
|
127
|
|
128 Returns : A reference to a Bio::Tools::Alignment::Consed object.
|
|
129 Args : A hash. (-acefile) is the filename of an acefile. If a full path
|
|
130 is not specified "./" is prepended to the filename and used from
|
|
131 instantiation until destruction. If you want
|
|
132 Bio::Tools::Alignment::Consed to be noisy during parsing of
|
|
133 the acefile, specify some value for (-verbose).
|
|
134
|
|
135 =cut
|
|
136
|
|
137 sub new {
|
|
138 my ($class,%args) = @_;
|
|
139 my $self = $class->SUPER::new(%args);
|
|
140
|
|
141 $self->{'filename'} = $args{'-acefile'};
|
|
142 # this is special to UNIX and should probably use catfile
|
|
143 if (!($self->{'filename'} =~ /\//)) {
|
|
144 $self->{'filename'} = "./".$self->{'filename'};
|
|
145 }
|
|
146 $self->{'filename'} =~ m/(.*\/)(.*)ace.*$/;
|
|
147 $self->{'path'} = $1;
|
|
148 $self->_initialize_io('-file'=>$self->{'filename'});
|
|
149 $self->{'o_trim'} = new Bio::Tools::Alignment::Trim(-verbose => $self->verbose());
|
|
150 $self->set_forward_designator($DEFAULTS{'f_designator'});
|
|
151 $self->set_reverse_designator($DEFAULTS{'r_designator'});
|
|
152
|
|
153 $self->_read_file();
|
|
154 return $self;
|
|
155 }
|
|
156
|
|
157 =head2 verbose()
|
|
158
|
|
159 Title : verbose()
|
|
160 Usage : $o_consed->verbose(1);
|
|
161 Function: Set the verbosity level for debugging messages. On instantiation
|
|
162 of the Bio::Tools::Alignment::Consed object the verbosity level
|
|
163 is set to 0 (quiet).
|
|
164 Returns : 1 or 0.
|
|
165 Args : The verbosity levels are:
|
|
166 0 - quiet
|
|
167 1 - noisy
|
|
168 2 - noisier
|
|
169 3 - annoyingly noisy
|
|
170 Notes : This method for setting verbosity has largely been superseeded by
|
|
171 a sub-by-sub way, where for every sub you can provide a (-verbose)
|
|
172 switch. I am doing converting this bit-by-bit so do not be surprised
|
|
173 if some subs do not honour this.
|
|
174
|
|
175 =cut
|
|
176
|
|
177 # from RootI
|
|
178
|
|
179 # backwards compat
|
|
180 sub set_verbose { (shift)->verbose(@_) }
|
|
181
|
|
182 =head2 get_filename()
|
|
183
|
|
184 Title : get_filename()
|
|
185 Usage : $o_consed->get_filename();
|
|
186 Function: Returns the name of the acefile being used by the
|
|
187 Bio::Tools::Alignment::Consed object.
|
|
188 Returns : A scalar containing the name of a file.
|
|
189 Args : None.
|
|
190
|
|
191 =cut
|
|
192
|
|
193
|
|
194 sub get_filename {
|
|
195 my $self = shift;
|
|
196 return $self->{'filename'};
|
|
197 }
|
|
198
|
|
199 =head2 count_sequences_with_grep()
|
|
200
|
|
201 Title : count_sequences_with_grep()
|
|
202 Usage : $o_consed->count_sequences_with_grep();
|
|
203 Function: Use /bin/grep to scan through the files in the ace project dir
|
|
204 and count sequences in those files. I used this method in the
|
|
205 development of this module to verify that I was getting all of the
|
|
206 sequences. It works, but it is (I think) unix-like platform
|
|
207 dependent.
|
|
208 Returns : A scalar containing the number of sequences in the ace project
|
|
209 directory.
|
|
210 Args : None.
|
|
211 Notes : If you are on a non-UNIX platform, you really do not have to use
|
|
212 this. It is more of a debugging routine designed to address very
|
|
213 specific problems.
|
|
214
|
|
215 This method was reimplemented to be platform independent with a
|
|
216 pure perl implementation. The above note can be ignored.
|
|
217
|
|
218 =cut
|
|
219
|
|
220 sub count_sequences_with_grep {
|
|
221 my $self = shift;
|
|
222 my ($working_dir,$grep_cli,@total_grep_sequences);
|
|
223 # this should be migrated to a pure perl implementation ala
|
|
224 # Tom Christiansen's 'tcgrep'
|
|
225 # http://www.perl.com/language/ppt/src/grep/tcgrep
|
|
226
|
|
227 open(FILE, $self->{'filename'}) or do { $self->warn("cannot open file ".$self->{'filename'}. " for grepping"); return};
|
|
228 my $counter =0;
|
|
229 while(<FILE>) { $counter++ if(/^AF/); }
|
|
230
|
|
231 close FILE;
|
|
232 opendir(SINGLETS,$self->{'path'});
|
|
233 foreach my $f ( readdir(SINGLETS) ) {
|
|
234 next unless ($f =~ /\.singlets$/);
|
|
235 open(FILE, $self->catfile($self->{'path'},$f)) or do{ $self->warn("cannot open file ".$self->catfile($self->{'path'},$f)); next };
|
|
236 while(<FILE>) { $counter++ if(/^>/) }
|
|
237 close FILE;
|
|
238 }
|
|
239 return $counter;
|
|
240 # Chad's platform implementation which required grep on the system
|
|
241
|
|
242 # I tried to cause graceful exiting if not on *ix here
|
|
243 # then i took platforms from Bioperl*/PLATFORMS here. Is that good?
|
|
244 # print("\$^O is $^O\n");
|
|
245 # if (!($^O =~ /dec_osf|linux|unix|bsd|solaris|darwin/i)) {
|
|
246 # $self->warn("Bio::Tools::Alignment::Consed::count_sequences_with_grep: This sub uses grep which is doesn't run on this operating system, AFAIK. Sorry .".$^O);
|
|
247 # return 1;
|
|
248 # }
|
|
249 # $grep_cli = `which grep`;
|
|
250 # if (!$grep_cli) {
|
|
251 # $self->warn("I couldn't see to find grep on this system, or the which command is broken. Bio::Tools::Alignment::Consed::count_sequences_with_grep requires grep and which to find grep.");
|
|
252 # return 1;
|
|
253 # }
|
|
254 # chomp $grep_cli;
|
|
255 # push(@total_grep_sequences, my @foo = `$grep_cli ^AF $self->{filename}`);
|
|
256 # my $cli = "$grep_cli \\> $self->{'path'}*.singlets";
|
|
257 # push(@total_grep_sequences, @foo = `$cli`);
|
|
258 # return scalar(@total_grep_sequences);
|
|
259 }
|
|
260
|
|
261 =head2 get_path()
|
|
262
|
|
263 Title : get_path()
|
|
264 Usage : $o_consed->get_path();
|
|
265 Function: Returns the path to the acefile this object is working with.
|
|
266 Returns : Scalar. The path to the working acefile.
|
|
267 Args : None.
|
|
268
|
|
269 =cut
|
|
270
|
|
271 sub get_path {
|
|
272 my $self = shift;
|
|
273 return $self->{'path'};
|
|
274 }
|
|
275
|
|
276 =head2 get_contigs()
|
|
277
|
|
278 Title : get_contigs()
|
|
279 Usage : $o_consed->get_contigs();
|
|
280 Function: Return the keys to the Bio::Tools::Alignment::Consed object.
|
|
281 Returns : An array containing the keynames in the
|
|
282 Bio::Tools::Alignment::Consed object.
|
|
283 Args : None.
|
|
284 Notes : This would normally be used to get the keynames for some sort of
|
|
285 iterator. These keys are worthless in general day-to-day use because
|
|
286 in the Consed acefile they are simply Contig1, Contig2, ...
|
|
287
|
|
288 =cut
|
|
289
|
|
290 sub get_contigs {
|
|
291 my ($self,$contig) = @_;
|
|
292 my @keys = (sort keys %{$self->{'contigs'}});
|
|
293 return @keys;
|
|
294 }
|
|
295
|
|
296 =head2 get_class($contig_keyname)
|
|
297
|
|
298 Title : get_class($contig_keyname)
|
|
299 Usage : $o_consed->get_class($contig_keyname);
|
|
300 Function: Return the class name for this contig
|
|
301 Returns : A scalar representing the class of this contig.
|
|
302 Args : None.
|
|
303 Notes :
|
|
304
|
|
305 =cut
|
|
306
|
|
307 sub get_class {
|
|
308 my ($self,$contig) = @_;
|
|
309 return $self->{contigs}->{$contig}->{class};
|
|
310 }
|
|
311
|
|
312 =head2 get_quality_array($contig_keyname)
|
|
313
|
|
314 Title : get_quality_array($contig_keyname)
|
|
315 Usage : $o_consed->get_quality_array($contig_keyname);
|
|
316 Function: Returns the quality for the consensus sequence for the given
|
|
317 contig as an array. See get_quality_scalar to get this as a scalar.
|
|
318 Returns : An array containing the quality for the consensus sequence with
|
|
319 the given keyname.
|
|
320 Args : The keyname of a contig. Note: This is a keyname. The key would
|
|
321 normally come from get_contigs.
|
|
322 Notes : Returns an array, not a reference. Is this a bug? <thinking> No.
|
|
323 Well, maybe.
|
|
324 Why was this developed like this? I was using FreezeThaw for object
|
|
325 persistence, and when it froze out these arrays it took a long time
|
|
326 to thaw it. Much better as a scalar.
|
|
327
|
|
328 See L<get_quality_scalar()|get_quality_scalar>
|
|
329
|
|
330 =cut
|
|
331
|
|
332 sub get_quality_array {
|
|
333 my ($self,$contig) = @_;
|
|
334 my $quality = $self->{contigs}->{$contig}->{quality};
|
|
335 # chad, what is with the s/// ?
|
|
336 # my @qualities = split
|
|
337 # (' ',($self->{contigs}->{$contig}->{quality} =~ s/\s+//));
|
|
338 my @qualities = split
|
|
339 (' ',$self->{contigs}->{$contig}->{quality});
|
|
340 return @qualities;
|
|
341 }
|
|
342
|
|
343 =head2 get_quality_scalar($contig_keyname))
|
|
344
|
|
345 Title : get_quality_scalar($contig_keyname)
|
|
346 Usage : $o_consed->get_quality_scalar($contig_keyname);
|
|
347 Function: Returns the quality for the consensus sequence for the given
|
|
348 contig as a scalar. See get_quality_array to get this as an array.
|
|
349 Returns : An scalar containing the quality for the consensus sequence with
|
|
350 the given keyname.
|
|
351 Args : The keyname of a contig. Note this is a _keyname_. The key would
|
|
352 normally come from get_contigs.
|
|
353 Notes : Why was this developed like this? I was using FreezeThaw for object
|
|
354 persistence, and when it froze out these arrays it took a coon's age
|
|
355 to thaw it. Much better as a scalar.
|
|
356
|
|
357 See L<get_quality_array()|get_quality_array>
|
|
358
|
|
359 =cut
|
|
360
|
|
361 #'
|
|
362 sub get_quality_scalar {
|
|
363 my ($self,$contig) = @_;
|
|
364 return $self->{'contigs'}->{$contig}->{'quality'};
|
|
365 }
|
|
366
|
|
367 =head2 freeze_hash()
|
|
368
|
|
369 Title : freeze_hash()
|
|
370 Usage : $o_consed->freeze_hash();
|
|
371
|
|
372 Function: Use Ilya's FreezeThaw module to create a persistent data
|
|
373 object for this Bio::Tools::Alignment::Consed data
|
|
374 structure. In the case of AAFC, we use
|
|
375 Bio::Tools::Alignment::Consed to pre-process bunches of
|
|
376 sequences, freeze the structures, and send in a harvesting
|
|
377 robot later to do database stuff.
|
|
378 Returns : 0 or 1;
|
|
379 Args : None.
|
|
380 Notes : This procedure was removed so Consed.pm won't require
|
|
381 FreezeThaw.
|
|
382
|
|
383
|
|
384 =cut
|
|
385
|
|
386 #'
|
|
387 sub freeze_hash {
|
|
388 my $self = shift;
|
|
389 $self->warn("This method (freeze_hash) was removed from the bioperl consed.pm. Sorry.\n");
|
|
390 if (1==2) {
|
|
391 $self->debug("Bio::Tools::Alignment::Consed::freeze_hash: \$self->{path} is $self->{path}\n");
|
|
392 my $filename = $self->{'path'}."frozen";
|
|
393 my %contigs = %{$self->{'contigs'}};
|
|
394 my $frozen = freeze(%contigs);
|
|
395 umask 0001;
|
|
396 open (FREEZE,">$filename") or do {
|
|
397 $self->warn( "Bio::Tools::Alignment::Consed could not freeze the contig hash because the file ($filename) could not be opened: $!\n");
|
|
398 return 1;
|
|
399 };
|
|
400 print FREEZE $frozen;
|
|
401 close FREEZE;
|
|
402 return 0;
|
|
403 }
|
|
404 }
|
|
405
|
|
406 =head2 get_members($contig_keyname)
|
|
407
|
|
408 Title : get_members($contig_keyname)
|
|
409 Usage : $o_consed->get_members($contig_keyname);
|
|
410 Function: Return the _names_ of the reads in this contig.
|
|
411 Returns : An array containing the names of the reads in this contig.
|
|
412 Args : The keyname of a contig. Note this is a keyname. The keyname
|
|
413 would normally come from get_contigs.
|
|
414
|
|
415 See L<get_contigs()|get_contigs>
|
|
416
|
|
417 =cut
|
|
418
|
|
419 sub get_members {
|
|
420 my ($self,$contig) = @_;
|
|
421 if (!$contig) {
|
|
422 $self->warn("You need to provide the name of a contig to use Bio::Tools::Alignment::Consed::get_members!\n");
|
|
423 return;
|
|
424 }
|
|
425 return @{$self->{'contigs'}->{$contig}->{'member_array'}};
|
|
426 }
|
|
427
|
|
428 =head2 get_members_by_name($some_arbitrary_name)
|
|
429
|
|
430 Title : get_members_by_name($some_arbitrary_name)
|
|
431 Usage : $o_consed->get_members_by_name($some_arbitrary_name);
|
|
432 Function: Return the names of the reads in a contig. This is the name given
|
|
433 to $contig{key} based on what is in the contig. This is different
|
|
434 from the keys retrieved through get_contigs().
|
|
435 Returns : An array containing the names of the reads in the contig with this
|
|
436 name.
|
|
437 Args : The name of a contig. Not a key, but a name.
|
|
438 Notes : Highly inefficient. use some other method if possible.
|
|
439
|
|
440 See L<get_contigs()|get_contigs>
|
|
441
|
|
442 =cut
|
|
443
|
|
444 sub get_members_by_name {
|
|
445 my ($self,$name) = @_;
|
|
446 # build a list to try to screen for redundancy
|
|
447 my @contigs_with_that_name;
|
|
448 foreach my $currkey ( sort keys %{$self->{'contigs'}} ) {
|
|
449 next if (!$self->{'contigs'}->{$currkey}->{'name'});
|
|
450 if ($self->{'contigs'}->{$currkey}->{'name'} eq "$name") {
|
|
451 push @contigs_with_that_name,$currkey;
|
|
452 }
|
|
453 }
|
|
454 my $count = @contigs_with_that_name;
|
|
455 if ($count == 1) {
|
|
456 my $contig_num = $contigs_with_that_name[0];
|
|
457 return @{$self->{'contigs'}->{$contig_num}->{'member_array'}};
|
|
458 }
|
|
459 }
|
|
460
|
|
461 =head2 get_contig_number_by_name($some_arbitrary_name)
|
|
462
|
|
463 Title : get_contig_number_by_name($some_arbitrary_name)
|
|
464 Usage : $o_consed->get_contig_number_by_name($some_arbitrary_name);
|
|
465 Function: Return the names of the reads in a contig. This is the name given
|
|
466 to $contig{key} based on what is in the contig. This is different
|
|
467 from the keys retrieved through get_contigs().
|
|
468 Returns : An array containing the names of the reads in the contig with this
|
|
469 name.
|
|
470 Args : The name of a contig. Not a key, but a name.
|
|
471
|
|
472 See L<get_contigs()|get_contigs>
|
|
473
|
|
474 =cut
|
|
475
|
|
476 sub get_contig_number_by_name {
|
|
477 my ($self,$name) = @_;
|
|
478 foreach my $currkey (sort keys %{$self->{'contigs'}}) {
|
|
479 if ($self->{'contigs'}->{$currkey}->{'name'} &&
|
|
480 $self->{'contigs'}->{$currkey}->{'name'} eq "$name") {
|
|
481 return $currkey;
|
|
482 }
|
|
483 }
|
|
484 }
|
|
485
|
|
486 =head2 get_sequence($contig_keyname)
|
|
487
|
|
488 Title : get_sequence($contig_keyname)
|
|
489 Usage : $o_consed->get_sequence($contig_keyname);
|
|
490 Function: Returns the consensus sequence for a given contig.
|
|
491 Returns : A scalar containing a sequence.
|
|
492 Args : The keyname of a contig. Note this is a key. The key would
|
|
493 normally come from get_contigs.
|
|
494
|
|
495 See L<get_contigs()|get_contigs>
|
|
496
|
|
497 =cut
|
|
498
|
|
499 sub get_sequence {
|
|
500 my ($self,$contig) = @_;
|
|
501 return $self->{'contigs'}->{$contig}->{'consensus'};
|
|
502 }
|
|
503
|
|
504 =head2 set_final_sequence($some_sequence)
|
|
505
|
|
506 Title : set_final_sequence($name,$some_sequence)
|
|
507 Usage : $o_consed->set_final_sequence($name,$some_sequence);
|
|
508 Function: Provides a manual way to set the sequence for a given key in the
|
|
509 contig hash. Rarely used.
|
|
510 Returns : 0 or 1;
|
|
511 Args : The name (not the keyname) of a contig and an arbitrary string.
|
|
512 Notes : A method with a questionable and somewhat mysterious origin. May
|
|
513 raise the dead or something like that.
|
|
514
|
|
515 =cut
|
|
516
|
|
517 sub set_final_sequence {
|
|
518 my ($self,$name,$sequence) = @_;
|
|
519 if (!$self->{'contigs'}->{$name}) {
|
|
520 $self->warn("You cannot set the final sequence for $name because it doesn't exist!\n");
|
|
521 return 1;
|
|
522 }
|
|
523 else {
|
|
524 $self->{'contigs'}->{$name}->{'final_sequence'} = $sequence;
|
|
525 }
|
|
526 return 0;
|
|
527 }
|
|
528
|
|
529 =head2 _read_file()
|
|
530
|
|
531 Title : _read_file();
|
|
532 Usage : _read_file();
|
|
533 Function: An internal subroutine used to read in an acefile and parse it
|
|
534 into a Bio::Tools::Alignment::Consed object.
|
|
535 Returns : 0 or 1.
|
|
536 Args : Nothing.
|
|
537 Notes : This routine creates and saves the filhandle for reading the
|
|
538 files in {fh}
|
|
539
|
|
540 =cut
|
|
541
|
|
542 sub _read_file {
|
|
543 my ($self) = @_;
|
|
544 my ($line,$in_contig,$in_quality,$contig_number,$top);
|
|
545 # make it easier to type $fhl
|
|
546 while (defined($line=$self->_readline()) ) {
|
|
547 chomp $line;
|
|
548 # check if there is anything on this line
|
|
549 # if not, you can stop gathering consensus sequence
|
|
550 if (!$line) {
|
|
551 # if the line is blank you are no longer to gather consensus
|
|
552 # sequence or quality values
|
|
553 $in_contig = 0;
|
|
554 $in_quality = 0;
|
|
555 }
|
|
556 # you are currently gathering consensus sequence
|
|
557 elsif ($in_contig) {
|
|
558 if ($in_contig == 1) {
|
|
559 $self->debug("Adding $line to consensus of contig number $contig_number.\n");
|
|
560 $self->{'contigs'}->{$contig_number}->{'consensus'} .= $line;
|
|
561 }
|
|
562 }
|
|
563 elsif ($in_quality) {
|
|
564 if (!$line) {
|
|
565 $in_quality = undef;
|
|
566 }
|
|
567 else {
|
|
568 # I wrote this in here because acefiles produced by cap3 do not have a leading space
|
|
569 # like the acefiles produced by phrap and there is the potential to have concatenated
|
|
570 # quality values like this: 2020 rather then 20 20 whre lines collide. Thanks Andrew for noticing.
|
|
571 if ($self->{'contigs'}->{$contig_number}->{'quality'} && !($self->{'contigs'}->{$contig_number}->{'quality'} =~ m/\ $/)) {
|
|
572 $self->{'contigs'}->{$contig_number}->{'quality'} .= " ";
|
|
573 }
|
|
574 $self->{'contigs'}->{$contig_number}->{'quality'} .= $line;
|
|
575 }
|
|
576 }
|
|
577 elsif ($line =~ /^BQ/) {
|
|
578 $in_quality = 1;
|
|
579 }
|
|
580 # the line /^CO/ like this:
|
|
581 # CO Contig1 796 1 1 U
|
|
582 # can be broken down as follows:
|
|
583 # CO - Contig!
|
|
584 # Contig1 - the name of this contig
|
|
585 # 796 - Number of bases in this contig
|
|
586 # 1 - Number of reads in this contig
|
|
587 # 1 - number of base segments in this contig
|
|
588 # U - Uncomplemented
|
|
589 elsif ($line =~ /^CO/) {
|
|
590 $line =~ m/^CO\ Contig(\d+)\ \d+\ \d+\ \d+\ (\w)/;
|
|
591 $contig_number = $1;
|
|
592 if ($2 eq "C") {
|
|
593 $self->debug("Contig $contig_number is complemented!\n");
|
|
594 }
|
|
595 $self->{'contigs'}->{$contig_number}->{'member_array'} = [];
|
|
596 $self->{'contigs'}->{$contig_number}->{'contig_direction'} = "$2";
|
|
597 $in_contig = 1;
|
|
598 }
|
|
599 # 000713
|
|
600 # this BS is deprecated, I think.
|
|
601 # haha, I am really witty. <ew>
|
|
602 elsif ($line =~ /^BSDEPRECATED/) {
|
|
603 $line =~ m/^BS\s+\d+\s+\d+\s+(.+)/;
|
|
604 my $member = $1;
|
|
605 $self->{'contigs'}->{$contig_number}->{$member}++;
|
|
606 }
|
|
607 # the members of the contigs are determined by the AF line in the ace file
|
|
608 elsif ($line =~ /^AF/) {
|
|
609 $self->debug("I see an AF line here.\n");
|
|
610 $line =~ /^AF\ (\S+)\ (\w)\ (\S+)/;
|
|
611 # push the name of the current read onto the member array for this contig
|
|
612 push @{$self->{'contigs'}->{$contig_number}->{'member_array'}},$1;
|
|
613 # the first read in the contig will be named the "top" read
|
|
614 if (!$top) {
|
|
615 $self->debug("\$top is not set.\n");
|
|
616 if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") {
|
|
617 $self->debug("Reversing the order of the reads. The bottom will be $1\n");
|
|
618 # if the contig sequence is marked as the complement, the top becomes the bottom and$
|
|
619 $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1;
|
|
620 $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2;
|
|
621 $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3;
|
|
622 }
|
|
623 else {
|
|
624 $self->debug("NOT reversing the order of the reads. The top_name will be $1\n");
|
|
625 # if the contig sequence is marked as the complement, the top becomes the bottom and$
|
|
626 $self->{'contigs'}->{$contig_number}->{'top_name'} = $1;
|
|
627 $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2;
|
|
628 $self->{'contigs'}->{$contig_number}->{'top_start'} = $3;
|
|
629 }
|
|
630 $top = 1;
|
|
631 }
|
|
632 else {
|
|
633 # if the contig sequence is marked as the complement, the top becomes the bottom and the bottom becomes the top
|
|
634 if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") {
|
|
635 $self->debug("Reversing the order of the reads. The top will be $1\n");
|
|
636 $self->{'contigs'}->{$contig_number}->{'top_name'} = $1;
|
|
637 $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2;
|
|
638 $self->{'contigs'}->{$contig_number}->{'top_start'} = $3;
|
|
639 }
|
|
640 else {
|
|
641 $self->debug("NOT reversing the order of the reads. The bottom will be $1\n");
|
|
642 $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1;
|
|
643 $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2;
|
|
644 $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3;
|
|
645 }
|
|
646 $top = undef;
|
|
647 }
|
|
648 }
|
|
649 }
|
|
650 return 0;
|
|
651 }
|
|
652
|
|
653 =head2 set_reverse_designator($some_string)
|
|
654
|
|
655 Title : set_reverse_designator($some_string)
|
|
656 Usage : $o_consed->set_reverse_designator($some_string);
|
|
657 Function: Set the designator for the reverse read of contigs in this
|
|
658 Bio::Tools::Alignment::Consed object. Used to determine if
|
|
659 contigs containing two reads can be named.
|
|
660 Returns : The value of $o_consed->{reverse_designator} so you can check
|
|
661 to see that it was set properly.
|
|
662 Args : An arbitrary string.
|
|
663 Notes : May be useful only to me. <shrug>
|
|
664
|
|
665 =cut
|
|
666
|
|
667 sub set_reverse_designator {
|
|
668 my ($self,$reverse_designator) = @_;
|
|
669 $self->{'reverse_designator'} = $reverse_designator;
|
|
670 $self->{'o_trim'}->set_reverse_designator($reverse_designator);
|
|
671 return $self->{'reverse_designator'};
|
|
672 } # end set_reverse_designator
|
|
673
|
|
674 =head2 set_forward_designator($some_string)
|
|
675
|
|
676 Title : set_forward_designator($some_string)
|
|
677 Usage : $o_consed->set_forward_designator($some_string);
|
|
678 Function: Set the designator for the forward read of contigs in this
|
|
679 Bio::Tools::Alignment::Consed object. Used to determine if
|
|
680 contigs containing two reads can be named.
|
|
681 Returns : The value of $o_consed->{forward_designator} so you can check
|
|
682 to see that it was set properly.
|
|
683 Args : An arbitrary string.
|
|
684 Notes : May be useful only to me. <shrug>
|
|
685
|
|
686 =cut
|
|
687
|
|
688 sub set_forward_designator {
|
|
689 my ($self,$forward_designator) = @_;
|
|
690 $self->{'forward_designator'} = $forward_designator;
|
|
691 $self->{'o_trim'}->set_forward_designator($forward_designator);
|
|
692 return $self->{'forward_designator'};
|
|
693 } # end set_forward_designator
|
|
694
|
|
695 =head2 set_designator_ignore_case("yes")
|
|
696
|
|
697 Title : set_designator_ignore_case("yes")
|
|
698 Usage : $o_consed->set_designator_ignore_case("yes");
|
|
699 Function: Deprecated.
|
|
700 Returns : Deprecated.
|
|
701 Args : Deprecated.
|
|
702 Notes : Deprecated. Really. Trust me.
|
|
703
|
|
704 =cut
|
|
705
|
|
706 sub set_designator_ignore_case {
|
|
707 my ($self,$ignore_case) = @_;
|
|
708 if ($ignore_case eq "yes") {
|
|
709 $self->{'designator_ignore_case'} = 1;
|
|
710 }
|
|
711 return $self->{'designator_ignore_case'};
|
|
712 } # end set_designator_ignore_case
|
|
713
|
|
714 =head2 set_trim_points_singlets_and_singletons()
|
|
715
|
|
716 Title : set_trim_points_singlets_and_singletons()
|
|
717 Usage : $o_consed->set_trim_points_singlets_and_singletons();
|
|
718 Function: Set the trim points for singlets and singletons based on
|
|
719 quality. Uses the Bio::Tools::Alignment::Trim object. Use
|
|
720 at your own risk because the Bio::Tools::Alignment::Trim
|
|
721 object was designed specifically for me and is mysterious
|
|
722 in its ways. Every time somebody other then me uses it a
|
|
723 swarm of locusts decends on a small Central American
|
|
724 village so do not say you weren't warned.
|
|
725 Returns : Nothing.
|
|
726 Args : None.
|
|
727 Notes : Working on exceptions and warnings here.
|
|
728
|
|
729 See L<Bio::Tools::Alignment::Trim> for more information
|
|
730
|
|
731 =cut
|
|
732
|
|
733 #' to make my emacs happy
|
|
734
|
|
735 sub set_trim_points_singlets_and_singletons {
|
|
736 my ($self) = @_;
|
|
737 $self->debug("Consed.pm : \$self is $self\n");
|
|
738 my (@points,$trimmed_sequence);
|
|
739 if (!$self->{'doublets_set'}) {
|
|
740 $self->debug("You need to set the doublets before you use set_trim_points_singlets_and_doublets. Doing that now.");
|
|
741 $self->set_doublets();
|
|
742 }
|
|
743 foreach (sort keys %{$self->{'contigs'}}) {
|
|
744 if ($self->{'contigs'}->{$_}->{'class'} eq "singlet") {
|
|
745 $self->debug("Singlet $_\n");
|
|
746 # this is what Warehouse wants
|
|
747 # my ($self,$sequence,$quality,$name) = @_;
|
|
748 # this is what Bio::Tools::Alignment::Trim::trim_singlet wants:
|
|
749 # my ($self,$sequence,$quality,$name,$class) = @_;
|
|
750 # the following several lines are to make the parameter passing legible.
|
|
751 my ($sequence,$quality,$name,$class);
|
|
752 $sequence = $self->{'contigs'}->{$_}->{'consensus'};
|
|
753 if (!$self->{'contigs'}->{$_}->{'quality'}) { $quality = "unset"; }
|
|
754 else { $quality = $self->{'contigs'}->{$_}->{'quality'}; }
|
|
755 $name = $self->{'contigs'}->{$_}->{'name'};
|
|
756 $class = $self->{'contigs'}->{$_}->{'class'};
|
|
757 (@points) = @{$self->{'o_trim'}->trim_singlet($sequence,$quality,$name,$class)};
|
|
758 $self->{'contigs'}->{$_}->{'start_point'} = $points[0];
|
|
759 $self->{'contigs'}->{$_}->{'end_point'} = $points[1];
|
|
760 $self->{'contigs'}->{$_}->{'sequence_trimmed'} = substr($self->{contigs}->{$_}->{'consensus'},$points[0],$points[1]-$points[0]);
|
|
761 }
|
|
762 }
|
|
763 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_singlets_and_singletons: Done setting the quality trimpoints.\n");
|
|
764 return;
|
|
765 } # end set_trim_points_singlet
|
|
766
|
|
767 =head2 set_trim_points_doublets()
|
|
768
|
|
769 Title : set_trim_points_doublets()
|
|
770 Usage : $o_consed->set_trim_points_doublets();
|
|
771 Function: Set the trim points for doublets based on quality. Uses the
|
|
772 Bio::Tools::Alignment::Trim object. Use at your own risk because
|
|
773 the Bio::Tools::Alignment::Trim object was designed specifically
|
|
774 for me and is mysterious in its ways. Every time somebody other
|
|
775 then me uses it you risk a biblical plague being loosed on your
|
|
776 city.
|
|
777 Returns : Nothing.
|
|
778 Args : None.
|
|
779 Notes : Working on exceptions here.
|
|
780
|
|
781 See L<Bio::Tools::Alignment::Trim> for more information
|
|
782
|
|
783 =cut
|
|
784
|
|
785 sub set_trim_points_doublets {
|
|
786 my $self = shift;
|
|
787 my @points;
|
|
788 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Restoring zeros for doublets.\n");
|
|
789 # &show_missing_sequence($self);
|
|
790 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Setting doublet trim points.\n");
|
|
791 foreach (sort keys %{$self->{'contigs'}}) {
|
|
792 if ($self->{'contigs'}->{$_}->{'class'} eq "doublet") {
|
|
793 # my ($self,$sequence,$quality,$name,$class) = @_;
|
|
794 my @quals = split(' ',$self->{'contigs'}->{$_}->{'quality'});
|
|
795
|
|
796 (@points) = $self->{o_trim}->trim_doublet($self->{'contigs'}->{$_}->{'consensus'},$self->{'contigs'}->{$_}->{'quality'},$self->{'contigs'}->{$_}->{name},$self->{'contigs'}->{$_}->{'class'});
|
|
797 $self->{'contigs'}->{$_}->{'start_point'} = $points[0];
|
|
798 $self->{'contigs'}->{$_}->{'end_point'} = $points[1];
|
|
799 # now set this
|
|
800 $self->{'contigs'}->{$_}->{'sequence_trimmed'} = substr($self->{contigs}->{$_}->{'consensus'},$points[0],$points[1]-$points[0]);
|
|
801 # 010102 the deprecated way to do things:
|
|
802 }
|
|
803 }
|
|
804 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Done setting doublet trim points.\n");
|
|
805 return;
|
|
806 } # end set_trim_points_doublets
|
|
807
|
|
808 =head2 get_trimmed_sequence_by_name($name)
|
|
809
|
|
810 Title : get_trimmed_sequence_by_name($name)
|
|
811 Usage : $o_consed->get_trimmed_sequence_by_name($name);
|
|
812 Function: Returns the trimmed_sequence of a contig with {name} eq $name.
|
|
813 Returns : A scalar- the trimmed sequence.
|
|
814 Args : The {name} of a contig.
|
|
815 Notes :
|
|
816
|
|
817 =cut
|
|
818
|
|
819 sub get_trimmed_sequence_by_name {
|
|
820 my ($self,$name) = @_;
|
|
821 my $trimmed_sequence;
|
|
822 my $contigname = &get_contig_number_by_name($self,$name);
|
|
823 my $class = $self->{'contigs'}->{$contigname}->{'class'};
|
|
824 # what is this business and who was smoking crack while writing this?
|
|
825 # if ($class eq "singlet") {
|
|
826 # send the sequence, the quality, and the name
|
|
827 # $trimmed_sequence = $self->{o_trim}->trim_singlet($self->{'contigs'}->{$contigname}->{consensus},$self->{'contigs'}->{$contigname}->{'quality'},$name);
|
|
828 # }
|
|
829 return $self->{'contigs'}->{$contigname}->{'sequence_trimmed'};
|
|
830 }
|
|
831
|
|
832 =head2 set_dash_present_in_sequence_name("yes")
|
|
833
|
|
834 Title : set_dash_present_in_sequence_name("yes")
|
|
835 Usage : $o_consed->set_dash_present_in_sequence_name("yes");
|
|
836 Function: Deprecated. Part of an uncompleted thought. ("Oooh! Shiny!")
|
|
837 Returns : Nothing.
|
|
838 Args : "yes" to set {dash_present_in_sequence_name} to 1
|
|
839 Notes :
|
|
840
|
|
841 =cut
|
|
842
|
|
843 sub set_dash_present_in_sequence_name {
|
|
844 my ($self,$dash_present) = @_;
|
|
845 if ($dash_present eq "yes") {
|
|
846 $self->{'dash_present_in_sequence_name'} = 1;
|
|
847 }
|
|
848 else {
|
|
849 $self->{'dash_present_in_sequence_name'} = 0;
|
|
850 }
|
|
851 return $self->{'dash_present_in_sequence_name'};
|
|
852 } # end set_dash_present_in_sequence_name
|
|
853
|
|
854 =head2 set_doublets()
|
|
855
|
|
856 Title : set_doublets()
|
|
857 Usage : $o_consed->set_doublets();
|
|
858 Function: Find pairs that have similar names and mark them as doublets
|
|
859 and set the {name}.
|
|
860 Returns : 0 or 1.
|
|
861 Args : None.
|
|
862 Notes : A complicated subroutine that iterates over the
|
|
863 Bio::Tools::Alignment::Consed looking for contigs of 2. If the
|
|
864 forward and reverse designator are removed from each of the reads
|
|
865 in {'member_array'} and the remaining reads are the same, {name}
|
|
866 is set to that name and the contig's class is set as "doublet".
|
|
867 If any of those cases fail the contig is marked as a "pair".
|
|
868
|
|
869 =cut
|
|
870
|
|
871 #' make my emacs happy
|
|
872
|
|
873 sub set_doublets {
|
|
874 my ($self) = @_;
|
|
875 # set the designators in the Bio::Tools::Alignment::Trim object
|
|
876
|
|
877 $self->{'o_trim'}->set_designators($self->{'reverse_designator'},
|
|
878 $self->{'forward_designator'});
|
|
879 #
|
|
880 foreach my $key_contig (sort keys %{$self->{'contigs'}}) {
|
|
881
|
|
882 # if there is a member array (why would there not be? This should be a die()able offence
|
|
883 # but for now I will leave it
|
|
884 if ($self->{'contigs'}->{$key_contig}->{'member_array'}) {
|
|
885 # if there are two reads in this contig
|
|
886 # i am pretty sure that this is wrong but i am keeping it for reference
|
|
887 # if (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2 || !$self->{'contigs'}->{$key_contig}->{'class'}) {
|
|
888 # <seconds later>
|
|
889 # <nod> WRONG. Was I on crack?
|
|
890 if (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2) {
|
|
891 $self->{'contigs'}->{$key_contig}->{'num_members'} = 2;
|
|
892 $self->debug("\tThere are 2 members! Looking for the contig name...\n");
|
|
893 my $name = _get_contig_name($self,$self->{'contigs'}->{$key_contig}->{'member_array'});
|
|
894 $self->debug("The name is $name\n") if defined $name;
|
|
895 if ($name) {
|
|
896 $self->{'contigs'}->{$key_contig}->{'name'} = $name;
|
|
897 $self->{'contigs'}->{$key_contig}->{'class'} = "doublet";
|
|
898 }
|
|
899 else {
|
|
900 $self->debug("$key_contig is a pair.\n");
|
|
901 $self->{'contigs'}->{$key_contig}->{'class'} = "pair";
|
|
902 }
|
|
903 }
|
|
904 # this is all fair and good but what about singlets?
|
|
905 # they have one reads in the member_array but certainly are not singletons
|
|
906 elsif (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 1) {
|
|
907 # set the name to be the name of the read
|
|
908 $self->{'contigs'}->{$key_contig}->{name} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}}[0];
|
|
909 # set the number of members to be one
|
|
910 $self->{'contigs'}->{$key_contig}->{num_members} = 1;
|
|
911 # if this was a singlet, it would already belong to the class "singlet"
|
|
912 # so leave it alone
|
|
913 # if it is not a singlet, it is a singleton! lablel it appropriately
|
|
914 unless ($self->{'contigs'}->{$key_contig}->{'class'}) {
|
|
915 $self->{'contigs'}->{$key_contig}->{'class'} = "singleton";
|
|
916 }
|
|
917 }
|
|
918 # set the multiplet characteristics
|
|
919 elsif (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} >= 3) {
|
|
920 $self->{'contigs'}->{$key_contig}->{'num_members'} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}};
|
|
921 $self->{'contigs'}->{$key_contig}->{'class'} = "multiplet";
|
|
922 }
|
|
923 $self->{'contigs'}->{$key_contig}->{'num_members'} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}};
|
|
924
|
|
925 }
|
|
926 }
|
|
927 $self->{'doublets_set'} = "done";
|
|
928 return 0;
|
|
929 } # end set_doublets
|
|
930
|
|
931 =head2 set_singlets
|
|
932
|
|
933 Title : set_singlets
|
|
934 Usage : $o_consed->set_singlets();
|
|
935 Function: Read in a singlets file and place them into the
|
|
936 Bio::Tools::Alignment::Consed object.
|
|
937 Returns : Nothing.
|
|
938 Args : A scalar to turn on verbose parsing of the singlets file.
|
|
939 Notes :
|
|
940
|
|
941 =cut
|
|
942
|
|
943 sub set_singlets {
|
|
944 # parse out the contents of the singlets file
|
|
945 my ($self) = @_;
|
|
946 $self->debug("Bio::Tools::Alignment::Consed Adding singlets to the contig hash...\n");
|
|
947 my $full_filename = $self->{'filename'};
|
|
948 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: \$full_filename is $full_filename\n");
|
|
949 $full_filename =~ m/(.*\/)(.*ace.*)$/;
|
|
950 my ($base_path,$filename) = ($1,$2);
|
|
951 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: singlets filename is $filename and \$base_path is $base_path\n");
|
|
952 $filename =~ m/(.*)ace.*$/;
|
|
953 my $singletsfile = $base_path.$1."singlets";
|
|
954 $self->debug("\$singletsfile is $singletsfile\n");
|
|
955 if (-f $singletsfile) {
|
|
956 $self->debug("$singletsfile is indeed a file. Trying to open it...\n");
|
|
957 }
|
|
958 my $singlets_fh = Bio::Root::IO->new(-file => $singletsfile);
|
|
959 my ($sequence,$name,$count);
|
|
960 while ($_ = $singlets_fh->_readline()) {
|
|
961 chomp $_;
|
|
962 if (/\>/) {
|
|
963 if ($name && $sequence) {
|
|
964 $self->debug("Adding $name with sequence $sequence to hash...\n");
|
|
965 push @{$self->{'contigs'}->{$name}->{'member_array'}},$name;
|
|
966 $self->{'contigs'}->{$name}->{'consensus'} = $sequence;
|
|
967 $self->{'contigs'}->{$name}->{'name'} = $name;
|
|
968 $self->{'contigs'}->{$name}->{"singlet"} = 1;
|
|
969 $self->{'contigs'}->{$name}->{'class'} = "singlet";
|
|
970 }
|
|
971 $sequence = $name = undef;
|
|
972 $count++;
|
|
973 m/^\>(.*)\s\sCHROMAT/;
|
|
974 $name = $1;
|
|
975 if (!$name) {
|
|
976 m/\>(\S+)\s/;
|
|
977 $name = $1;
|
|
978 }
|
|
979 }
|
|
980 else { $sequence .= $_; }
|
|
981 }
|
|
982 if ($name && $sequence) {
|
|
983 $self->debug("Pushing the last of the singlets ($name)\n");
|
|
984 @{$self->{'contigs'}->{$name}->{'member_array'}} = $name;
|
|
985 $self->{'contigs'}->{$name}->{'consensus'} = $sequence;
|
|
986 $self->{'contigs'}->{$name}->{'name'} = $name;
|
|
987 $self->{'contigs'}->{$name}->{"singlet"} = 1;
|
|
988 $self->{'contigs'}->{$name}->{'class'} = "singlet";
|
|
989 }
|
|
990 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: Done adding singlets to the singlets hash.\n");
|
|
991 $self->{'singlets_set'} = "done";
|
|
992 return 0;
|
|
993 } # end sub set_singlets
|
|
994
|
|
995 =head2 get_singlets()
|
|
996
|
|
997 Title : get_singlets()
|
|
998 Usage : $o_consed->get_singlets();
|
|
999 Function: Return the keynames of the singlets.
|
|
1000 Returns : An array containing the keynames of all
|
|
1001 Bio::Tools::Alignment::Consed sequences in the class "singlet".
|
|
1002 Args : None.
|
|
1003 Notes :
|
|
1004
|
|
1005 =cut
|
|
1006
|
|
1007 sub get_singlets {
|
|
1008 # returns an array of singlet names
|
|
1009 # singlets have "singlet"=1 in the hash
|
|
1010 my $self = shift;
|
|
1011 if (!$self->{singlets_set}) {
|
|
1012 $self->debug("You need to set the singlets before you get them. Doing that now.");
|
|
1013 $self->set_singlets();
|
|
1014 }
|
|
1015
|
|
1016 my (@singlets,@array);
|
|
1017 foreach my $key (sort keys %{$self->{'contigs'}}) {
|
|
1018 # @array = @{$Consed::contigs{$key}->{'member_array'}};
|
|
1019 # somethimes a user will try to get a list of singlets before the classes for the rest of the
|
|
1020 # contigs has been set (see t/test.t for how I figured this out. <bah>
|
|
1021 # so either way, just return class=singlets
|
|
1022 if (!$self->{'contigs'}->{$key}->{'class'}) {
|
|
1023 # print("$key has no class. why?\n");
|
|
1024 }
|
|
1025 elsif ($self->{'contigs'}->{$key}->{'class'} eq "singlet") {
|
|
1026 push @singlets,$key;
|
|
1027 }
|
|
1028 }
|
|
1029 return @singlets;
|
|
1030 }
|
|
1031
|
|
1032 =head2 set_quality_by_name($name,$quality)
|
|
1033
|
|
1034 Title : set_quality_by_name($name,$quality)
|
|
1035 Usage : $o_consed->set_quality_by_name($name,$quality);
|
|
1036 Function: Deprecated. Make the contig with {name} have {'quality'} $quality.
|
|
1037 Probably used for testing.
|
|
1038 Returns : Nothing.
|
|
1039 Args : The name of a contig and a scalar for its quality.
|
|
1040 Notes : Deprecated.
|
|
1041
|
|
1042 =cut
|
|
1043
|
|
1044 sub set_quality_by_name {
|
|
1045 # this is likely deprecated
|
|
1046 my ($self,$name,$quality) = shift;
|
|
1047 my $return;
|
|
1048 foreach (sort keys %{$self->{'contigs'}}) {
|
|
1049 if ($self->{'contigs'} eq "$name" || $self->{'contigs'}->{'name'} eq "$name") {
|
|
1050 $self->{'contigs'}->{'quality'} = $quality;
|
|
1051 $return=1;
|
|
1052 }
|
|
1053 }
|
|
1054 if ($return) { return "0"; } else { return "1"; }
|
|
1055 } # end set quality by name
|
|
1056
|
|
1057 =head2 set_singlet_quality()
|
|
1058
|
|
1059 Title : set_singlet_quality()
|
|
1060 Usage : $o_consed->set_singlet_quality();
|
|
1061 Function: For each singlet, go to the appropriate file in phd_dir and read
|
|
1062 in the phred quality for that read and place it into {'quality'}
|
|
1063 Returns : 0 or 1.
|
|
1064 Args : None.
|
|
1065 Notes : This is the next subroutine that will receive substantial revision
|
|
1066 in the next little while. It really should eval the creation of
|
|
1067 Bio::Tools::Alignment::Phred objects and go from there.
|
|
1068
|
|
1069 =cut
|
|
1070
|
|
1071 sub set_singlet_quality {
|
|
1072 my $self = shift;
|
|
1073 my $full_filename = $self->{'filename'};
|
|
1074 $full_filename =~ m/(.*\/)(.*)ace.*$/;
|
|
1075 my ($base_path,$filename) = ($1,"$2"."qual");
|
|
1076 my $singletsfile = $base_path.$filename;
|
|
1077 if (-f $singletsfile) {
|
|
1078 # print("$singletsfile is indeed a file. Trying to open it...\n");
|
|
1079 }
|
|
1080 else {
|
|
1081 $self->warn("$singletsfile is not a file. Sorry.\n");
|
|
1082 return;
|
|
1083 }
|
|
1084 my $singlets_fh = Bio::Root::IO->new(-file => $singletsfile);
|
|
1085 my ($sequence,$name,$count);
|
|
1086 my ($identity,$line,$quality,@qline);
|
|
1087 while ($line = $singlets_fh->_readline()) {
|
|
1088 chomp $line;
|
|
1089 if ($line =~ /^\>/) {
|
|
1090 $quality = undef;
|
|
1091 $line =~ m/\>(\S*)\s/;
|
|
1092 $identity = $1;
|
|
1093 }
|
|
1094 else {
|
|
1095 if ($self->{'contigs'}->{$identity}) {
|
|
1096 $self->{'contigs'}->{$identity}->{'quality'} .= "$line ";
|
|
1097 }
|
|
1098 }
|
|
1099
|
|
1100 }
|
|
1101 return 0;
|
|
1102 }
|
|
1103
|
|
1104 =head2 set_contig_quality()
|
|
1105
|
|
1106 Title : set_contig_quality()
|
|
1107 Usage : $o_consed->set_contig_quality();
|
|
1108 Function: Deprecated.
|
|
1109 Returns : Deprecated.
|
|
1110 Args : Deprecated.
|
|
1111 Notes : Deprecated. Really. Trust me.
|
|
1112
|
|
1113 =cut
|
|
1114
|
|
1115 sub set_contig_quality {
|
|
1116 # note: contigs _include_ singletons but _not_ singlets
|
|
1117 my ($self) = shift;
|
|
1118 # the unexpected results I am referring to here are a doubling of quality values.
|
|
1119 # the profanity I uttered on discovering this reminded me of the simpsons:
|
|
1120 # Ned Flanders: "That is the loudest profanity I have ever heard!"
|
|
1121 $self->warn("set_contig_quality is deprecated and will likely produce unexpected results");
|
|
1122 my $full_filename = $self->{'filename'};
|
|
1123 # Run_SRC3700_2000-08-01_73+74.fasta.screen.contigs.qual
|
|
1124 # from Consed.pm
|
|
1125 $full_filename =~ m/(.*\/)(.*)ace.*$/;
|
|
1126 my ($base_path,$filename) = ($1,"$2"."contigs.qual");
|
|
1127 my $singletsfile = $base_path.$filename;
|
|
1128 if (-f $singletsfile) {
|
|
1129 # print("$singletsfile is indeed a file. Trying to open it...\n");
|
|
1130 }
|
|
1131 else {
|
|
1132 $self->warn("Bio::Tools::Alignment::Consed::set_contig_quality $singletsfile is not a file. Sorry.\n");
|
|
1133 return;
|
|
1134 }
|
|
1135 my $contig_quality_fh = Bio::Root::IO->new(-file => $singletsfile);
|
|
1136
|
|
1137 my ($sequence,$name,$count,$identity,$line,$quality);
|
|
1138 while ($line = $contig_quality_fh->_readline()) {
|
|
1139 chomp $line;
|
|
1140 if ($line =~ /^\>/) {
|
|
1141 $quality = undef;
|
|
1142 $line =~ m/\>.*Contig(\d+)\s/;
|
|
1143 $identity = $1;
|
|
1144 }
|
|
1145 else {
|
|
1146 if ($self->{'contigs'}->{$identity} ) {
|
|
1147 $self->{'contigs'}->{$identity}->{'quality'} .= " $line";
|
|
1148 }
|
|
1149 }
|
|
1150 }
|
|
1151 } # end set_contig_quality
|
|
1152
|
|
1153 =head2 get_multiplets()
|
|
1154
|
|
1155 Title : get_multiplets()
|
|
1156 Usage : $o_consed->get_multiplets();
|
|
1157 Function: Return the keynames of the multiplets.
|
|
1158 Returns : Returns an array containing the keynames of all
|
|
1159 Bio::Tools::Alignment::Consed sequences in the class "multiplet".
|
|
1160 Args : None.
|
|
1161 Notes :
|
|
1162
|
|
1163 =cut
|
|
1164
|
|
1165 sub get_multiplets {
|
|
1166 # returns an array of multiplet names
|
|
1167 # multiplets have # members > 2
|
|
1168 my $self = shift;
|
|
1169 my (@multiplets,@array);
|
|
1170 foreach my $key (sort keys %{$self->{'contigs'}}) {
|
|
1171 if ($self->{'contigs'}->{$key}->{'class'}) {
|
|
1172 if ($self->{'contigs'}->{$key}->{'class'} eq "multiplet") {
|
|
1173 push @multiplets,$key;
|
|
1174 }
|
|
1175 }
|
|
1176 }
|
|
1177 return @multiplets;
|
|
1178 }
|
|
1179
|
|
1180 =head2 get_all_members()
|
|
1181
|
|
1182 Title : get_all_members()
|
|
1183 Usage : @all_members = $o_consed->get_all_members();
|
|
1184 Function: Return a list of all of the read names in the
|
|
1185 Bio::Tools::Alignment::Consed object.
|
|
1186 Returns : An array containing all of the elements in all of the
|
|
1187 {'member_array'}s.
|
|
1188 Args : None.
|
|
1189 Notes :
|
|
1190
|
|
1191 =cut
|
|
1192
|
|
1193 sub get_all_members {
|
|
1194 my $self = shift;
|
|
1195 my @members;
|
|
1196 foreach my $key (sort keys %{$self->{'contigs'}}) {
|
|
1197 if ($key =~ /^singlet/) {
|
|
1198 push @members,$self->{'contigs'}->{$key}->{'member_array'}[0];
|
|
1199 }
|
|
1200 elsif ($self->{'contigs'}->{$key}->{'member_array'}) {
|
|
1201 push @members,@{$self->{'contigs'}->{$key}->{'member_array'}};
|
|
1202 }
|
|
1203 # else {
|
|
1204 # print("Bio::Tools::Alignment::Consed: $key is _not_ an array. Pushing $self->{'contigs'}->{$key}->{'member_array'} onto \@members\n");
|
|
1205 # push @members,$self->{'contigs'}->{$key}->{'member_array'};
|
|
1206 # }
|
|
1207 }
|
|
1208 return @members;
|
|
1209 }
|
|
1210
|
|
1211 =head2 sum_lets($total_only)
|
|
1212
|
|
1213 Title : sum_lets($total_only)
|
|
1214 Usage : $statistics = $o_consed->sum_lets($total_only);
|
|
1215 Function: Provide numbers for how many sequences were accounted for in the
|
|
1216 Bio::Tools::Alignment::Consed object.
|
|
1217 Returns : If a scalar is present, returns the total number of
|
|
1218 sequences accounted for in all classes. If no scalar passed
|
|
1219 then returns a string that looks like this:
|
|
1220 Singt/singn/doub/pair/mult/total : 2,0,1(2),0(0),0(0),4
|
|
1221 This example means the following: There were 1 singlets.
|
|
1222 There were 0 singletons. There were 1 doublets for a total
|
|
1223 of 2 sequences in this class. There were 0 pairs for a
|
|
1224 total of 0 sequences in this class. There were 0
|
|
1225 multiplets for a total of 0 sequences in this class. There
|
|
1226 were a total of 4 sequences accounted for in the
|
|
1227 Bio::Tools::Alignment::Consed object.
|
|
1228 Args : A scalar is optional to change the way the numbers are returned.
|
|
1229 Notes:
|
|
1230
|
|
1231 =cut
|
|
1232
|
|
1233 sub sum_lets {
|
|
1234 my ($self,$total_only) = @_;
|
|
1235 my ($count,$count_multiplets,$multiplet_count);
|
|
1236 my $singlets = &get_singlets($self); $count += $singlets;
|
|
1237 my $doublets = &get_doublets($self); $count += ($doublets * 2);
|
|
1238 my $pairs = &get_pairs($self); $count += ($pairs * 2);
|
|
1239 my $singletons = &get_singletons($self); $count += $singletons;
|
|
1240 my @multiplets = &get_multiplets($self);
|
|
1241 $count_multiplets = @multiplets;
|
|
1242 my $return_string;
|
|
1243 foreach (@multiplets) {
|
|
1244 my $number_members = $self->{'contigs'}->{$_}->{num_members};
|
|
1245 $multiplet_count += $number_members;
|
|
1246 }
|
|
1247 if ($multiplet_count) {
|
|
1248 $count += $multiplet_count;
|
|
1249 }
|
|
1250 foreach (qw(multiplet_count singlets doublets pairs singletons multiplets count_multiplets)) {
|
|
1251 no strict 'refs'; # renege for the block
|
|
1252 if (!${$_}) {
|
|
1253 ${$_} = 0;
|
|
1254 }
|
|
1255 }
|
|
1256 if (!$multiplet_count) { $multiplet_count = 0; }
|
|
1257 if ($total_only) {
|
|
1258 return $count;
|
|
1259 }
|
|
1260 $return_string = "Singt/singn/doub/pair/mult/total : $singlets,$singletons,$doublets(".
|
|
1261 ($doublets*2)."),$pairs(".($pairs*2)."),$count_multiplets($multiplet_count),$count";
|
|
1262 return $return_string;
|
|
1263 }
|
|
1264
|
|
1265 =head2 write_stats()
|
|
1266
|
|
1267 Title : write_stats()
|
|
1268 Usage : $o_consed->write_stats();
|
|
1269 Function: Write a file called "statistics" containing numbers similar to
|
|
1270 those provided in sum_lets().
|
|
1271 Returns : Nothing. Write a file in $o_consed->{path} containing something
|
|
1272 like this:
|
|
1273
|
|
1274 0,0,50(100),0(0),0(0),100
|
|
1275
|
|
1276 Where the numbers provided are in the format described in the
|
|
1277 documentation for sum_lets().
|
|
1278 Args : None.
|
|
1279 Notes : This might break platform independence, I do not know.
|
|
1280
|
|
1281 See L<sum_lets()|sum_lets>
|
|
1282
|
|
1283 =cut
|
|
1284
|
|
1285 sub write_stats {
|
|
1286 # worry about platform dependence here?
|
|
1287 # oh shucksdarn.
|
|
1288 my $self = shift;
|
|
1289 my $stats_filename = $self->{'path'}."statistics";
|
|
1290 my $statistics_raw = $self->sum_lets;
|
|
1291 (my $statsfilecontents = $statistics_raw) =~ s/.*\ \:\ //g;
|
|
1292 umask 0001;
|
|
1293 my $fh = new Bio::Root::IO(-file=>"$stats_filename");
|
|
1294 # open(STATSFILE,">$stats_filename") or print("Could not open the statsfile: $!\n");
|
|
1295 $fh->_print("$statsfilecontents");
|
|
1296 # close STATSFILE;
|
|
1297 $fh->close();
|
|
1298 }
|
|
1299
|
|
1300 =head2 get_singletons()
|
|
1301
|
|
1302 Title : get_singletons()
|
|
1303 Usage : @singletons = $o_consed->get_singletons();
|
|
1304 Function: Return the keynames of the singletons.
|
|
1305 Returns : Returns an array containing the keynames of all
|
|
1306 Bio::Tools::Alignment::Consed sequences in the class "singleton".
|
|
1307 Args : None.
|
|
1308 Notes :
|
|
1309
|
|
1310 =cut
|
|
1311
|
|
1312 sub get_singletons {
|
|
1313 # returns an array of singleton names
|
|
1314 # singletons are contigs with one member (see consed documentation)
|
|
1315 my $self = shift;
|
|
1316 my (@singletons,@array);
|
|
1317 foreach my $key (sort keys %{$self->{'contigs'}}) {
|
|
1318 if ($self->{'contigs'}->{$key}->{'class'}) {
|
|
1319 # print ("$key class: $self->{'contigs'}->{$key}->{'class'}\n");
|
|
1320 }
|
|
1321 else {
|
|
1322 # print("$key belongs to no class. why?\n");
|
|
1323 }
|
|
1324 if ($self->{'contigs'}->{$key}->{'member_array'}) {
|
|
1325 @array = @{$self->{'contigs'}->{$key}->{'member_array'}};
|
|
1326 }
|
|
1327 my $num_array_elem = @array;
|
|
1328 if ($num_array_elem == 1 && $self->{'contigs'}->{$key}->{'class'} && $self->{'contigs'}->{$key}->{'class'} eq "singleton") { push @singletons,$key; }
|
|
1329 }
|
|
1330 return @singletons;
|
|
1331 }
|
|
1332
|
|
1333 =head2 get_pairs()
|
|
1334
|
|
1335 Title : get_pairs()
|
|
1336 Usage : @pairs = $o_consed->get_pairs();
|
|
1337 Function: Return the keynames of the pairs.
|
|
1338 Returns : Returns an array containing the keynames of all
|
|
1339 Bio::Tools::Alignment::Consed sequences in the class "pair".
|
|
1340 Args : None.
|
|
1341 Notes :
|
|
1342
|
|
1343 =cut
|
|
1344
|
|
1345 sub get_pairs {
|
|
1346 # returns an array of pair contig names
|
|
1347 # a pair is a contig of two where the names do not match
|
|
1348 my $self = shift;
|
|
1349 my (@pairs,@array);
|
|
1350 foreach my $key (sort keys %{$self->{'contigs'}}) {
|
|
1351 if ($self->{'contigs'}->{$key}->{'member_array'}) {
|
|
1352 if (@{$self->{'contigs'}->{$key}->{'member_array'}} == 2 &&
|
|
1353 $self->{'contigs'}->{$key}->{'class'} eq "pair") {
|
|
1354 push @pairs,$key;
|
|
1355 }
|
|
1356 }
|
|
1357 }
|
|
1358 return @pairs;
|
|
1359 }
|
|
1360
|
|
1361 =head2 get_name($contig_keyname)
|
|
1362
|
|
1363 Title : get_name($contig_keyname)
|
|
1364 Usage : $name = $o_consed->get_name($contig_keyname);
|
|
1365 Function: Return the {name} for $contig_keyname.
|
|
1366 Returns : A string. ({name})
|
|
1367 Args : A contig keyname.
|
|
1368 Notes :
|
|
1369
|
|
1370 =cut
|
|
1371
|
|
1372 sub get_name {
|
|
1373 my ($self,$contig) = @_;
|
|
1374 return $self->{'contigs'}->{$contig}->{'name'};
|
|
1375 }
|
|
1376
|
|
1377 =head2 _get_contig_name(\@array_containing_reads)
|
|
1378
|
|
1379 Title : _get_contig_name(\@array_containing_reads)
|
|
1380 Usage : $o_consed->_get_contig_name(\@array_containing_reads);
|
|
1381 Function: The logic for the set_doublets subroutine.
|
|
1382 Returns : The name for this contig.
|
|
1383 Args : A reference to an array containing read names.
|
|
1384 Notes : Depends on reverse_designator. Be sure this is set the way you
|
|
1385 intend.
|
|
1386
|
|
1387 =cut
|
|
1388
|
|
1389 sub _get_contig_name {
|
|
1390 my ($self,$r_array) = @_;
|
|
1391 my @contig_members = @$r_array;
|
|
1392 my @name_nodir;
|
|
1393 foreach (@contig_members) {
|
|
1394 # how can I distinguish the clone name from the direction label?
|
|
1395 # look for $Consed::reverse_designator and $Consed::forward_designator
|
|
1396 # what if you do not find _any_ of those?
|
|
1397 my $forward_designator = $self->{'forward_designator'} || "f";
|
|
1398 my $reverse_designator = $self->{'reverse_designator'} || "r";
|
|
1399 my $any_hits = /(.+)($forward_designator.*)/ || /(.+)($reverse_designator.*)/||/(.+)(_.+)/;
|
|
1400 my $name = $1;
|
|
1401 my $suffix = $2;
|
|
1402 if ($name) {
|
|
1403 # print("\t\$name is $name ");
|
|
1404 }
|
|
1405 if ($suffix) {
|
|
1406 # print("and \$suffix is $suffix.\n");
|
|
1407 }
|
|
1408 # Jee, I hope we get a naming convention soon
|
|
1409 if ($suffix) {
|
|
1410 if ($suffix =~ /^$forward_designator/ || $suffix =~ /^$reverse_designator/) {
|
|
1411 push @name_nodir,$name;
|
|
1412 }
|
|
1413 # bugwatch here! should this be unnested?
|
|
1414 else {
|
|
1415 push @name_nodir,"$name$suffix";
|
|
1416 }
|
|
1417 }
|
|
1418 }
|
|
1419 # print("\@name_nodir: @name_nodir\n");
|
|
1420 my $mismatch = 0;
|
|
1421 for (my $counter=0; $counter<@name_nodir;$counter++) {
|
|
1422 next if ($name_nodir[0] eq $name_nodir[$counter]);
|
|
1423 $mismatch = 1;
|
|
1424 }
|
|
1425 if ($mismatch == 0) {
|
|
1426 # print("\tYou have a cohesive contig named $name_nodir[0].\n\n");
|
|
1427 return $name_nodir[0];
|
|
1428 }
|
|
1429 else {
|
|
1430 # print("\tYou have mixed names in this contig.\n\n");
|
|
1431 }
|
|
1432 } # end _get_contig_name
|
|
1433
|
|
1434 =head2 get_doublets()
|
|
1435
|
|
1436 Title : get_doublets()
|
|
1437 Usage : @doublets = $o_consed->get_doublets();
|
|
1438 Function: Return the keynames of the doublets.
|
|
1439 Returns : Returns an array containing the keynames of all
|
|
1440 Bio::Tools::Alignment::Consed sequences in the class "doublet".
|
|
1441 Args : None.
|
|
1442 Notes :
|
|
1443
|
|
1444 =cut
|
|
1445
|
|
1446 sub get_doublets {
|
|
1447 my $self = shift;
|
|
1448 if (!$self->{doublets_set}) {
|
|
1449 $self->warn("You need to set the doublets before you can get them. Doing that now.");
|
|
1450 $self->set_doublets();
|
|
1451 }
|
|
1452 my @doublets;
|
|
1453 foreach (sort keys %{$self->{'contigs'}}) {
|
|
1454 if ($self->{'contigs'}->{$_}->{name} && $self->{'contigs'}->{$_}->{'class'} eq "doublet") {
|
|
1455 push @doublets,$_;
|
|
1456 }
|
|
1457 }
|
|
1458 return @doublets;
|
|
1459 } # end get_doublets
|
|
1460
|
|
1461 =head2 dump_hash()
|
|
1462
|
|
1463 Title : dump_hash()
|
|
1464 Usage : $o_consed->dump_hash();
|
|
1465 Function: Use dumpvar.pl to dump out the Bio::Tools::Alignment::Consed
|
|
1466 object to STDOUT.
|
|
1467 Returns : Nothing.
|
|
1468 Args : None.
|
|
1469 Notes : I used this a lot in debugging.
|
|
1470
|
|
1471 =cut
|
|
1472
|
|
1473 sub dump_hash {
|
|
1474 my $self = shift;
|
|
1475 my $dumper = new Dumpvalue;
|
|
1476 $self->debug( "Bio::Tools::Alignment::Consed::dump_hash - ".
|
|
1477 "The following is the contents of the contig hash...\n");
|
|
1478 $dumper->dumpValue($self->{'contigs'});
|
|
1479 }
|
|
1480
|
|
1481 =head2 dump_hash_compact()
|
|
1482
|
|
1483 Title : dump_hash_compact()
|
|
1484 Usage : $o_consed->dump_hash_compact();
|
|
1485 Function: Dump out the Bio::Tools::Alignment::Consed object in a compact way.
|
|
1486 Returns : Nothing.
|
|
1487 Args : Nothing.
|
|
1488 Notes : Cleaner then dumpValue(), dumpHash(). I used this a lot in
|
|
1489 debugging.
|
|
1490
|
|
1491 =cut
|
|
1492
|
|
1493 sub dump_hash_compact {
|
|
1494 no strict 'refs'; # renege for the block
|
|
1495 my ($self,$sequence) = @_;
|
|
1496 # get the classes
|
|
1497 my @singlets = $self->get_singlets();
|
|
1498 my @singletons = $self->get_singletons();
|
|
1499 my @doublets = $self->get_doublets();
|
|
1500 my @pairs = $self->get_pairs();
|
|
1501 my @multiplets = $self->get_multiplets();
|
|
1502 print("Name\tClass\tMembers\tQuality?\n");
|
|
1503 foreach (@singlets) {
|
|
1504 my @members = $self->get_members($_);
|
|
1505 print($self->get_name($_)."\tsinglets\t".(join',',@members)."\t");
|
|
1506 if ($self->{'contigs'}->{$_}->{'quality'}) { print("qualities found here\n"); }
|
|
1507 else { print("no qualities found here\n"); }
|
|
1508
|
|
1509 }
|
|
1510 foreach (@singletons) {
|
|
1511 my @members = $self->get_members($_);
|
|
1512 print($self->get_name($_)."\tsingletons\t".(join',',@members)."\t");
|
|
1513 if ($self->{'contigs'}->{$_}->{'quality'}) { print("qualities found here\n"); }
|
|
1514 else { print("no qualities found here\n"); }
|
|
1515 }
|
|
1516 foreach my $pair (@pairs) {
|
|
1517 my @members = $self->get_members($pair);
|
|
1518 my $name;
|
|
1519 if (!$self->get_name($pair)) {
|
|
1520 $name = "BLANK";
|
|
1521 }
|
|
1522 else { $name = $self->get_name($pair); }
|
|
1523 print("$name\tpairs\t".(join',',@members)."\n");
|
|
1524 }
|
|
1525 foreach (@doublets) {
|
|
1526 my @members = $self->get_members_by_name($_);
|
|
1527 print("$_\tdoublets\t".(join',',@members)."\t");
|
|
1528 my $contig_number = &get_contig_number_by_name($self,$_);
|
|
1529 if ($self->{'contigs'}->{$contig_number}->{'quality'}) { print("qualities found here\n"); }
|
|
1530 else { print("no qualities found here\n"); }
|
|
1531 # print($_."\tdoublets\t".(join',',@members)."\n");
|
|
1532 }
|
|
1533 foreach (@multiplets) {
|
|
1534 my @members = $self->get_members($_);
|
|
1535 print("Contig $_"."\tmultiplets\t".(join',',@members)."\n");
|
|
1536 }
|
|
1537 } # end dump_hash_compact
|
|
1538
|
|
1539 =head2 get_phreds()
|
|
1540
|
|
1541 Title : get_phreds()
|
|
1542 Usage : @phreds = $o_consed->get_phreds();
|
|
1543 Function: For each doublet in the Bio::Tools::Alignment::Consed hash, go
|
|
1544 and get the phreds for the top and bottom reads. Place them into
|
|
1545 {top_phreds} and {bottom_phreds}.
|
|
1546 Returns : Nothing.
|
|
1547 Args : Nothing.
|
|
1548 Notes : Requires parse_phd() and reverse_and_complement(). I realize that
|
|
1549 it would be much more elegant to pull qualities as required but there
|
|
1550 were certain "features" in the acefile that required a bit more
|
|
1551 detailed work be done to get the qualities for certain parts of the
|
|
1552 consensus sequence. In order to make _sure_ that this was done
|
|
1553 properly I wrote things to do all steps and then I used dump_hash()
|
|
1554 and checked each one to ensure expected bahavior. I have never changed
|
|
1555 this, so there you are.
|
|
1556
|
|
1557 =cut
|
|
1558
|
|
1559 sub get_phreds {
|
|
1560 # this subroutine is the target of a rewrite to use the Bio::Tools::Alignment::Phred object.
|
|
1561 my $self = shift;
|
|
1562 my $current_contig;
|
|
1563 foreach $current_contig (sort keys %{$self->{'contigs'}}) {
|
|
1564 if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") {
|
|
1565 $self->debug("$current_contig is a doublet. Going to parse_phd for top($self->{'contigs'}->{$current_contig}->{'top_name'}) and bottom($self->{'contigs'}->{$current_contig}->{'bottom_name'})\n");
|
|
1566 my $r_phreds_top = &parse_phd($self,$self->{'contigs'}->{$current_contig}->{'top_name'});
|
|
1567 my $r_phreds_bottom = &parse_phd($self,$self->{'contigs'}->{$current_contig}->{'bottom_name'});
|
|
1568 if ($self->{'contigs'}->{$current_contig}->{'top_complement'} eq "C") {
|
|
1569 # print("Reversing and complementing...\n");
|
|
1570 $r_phreds_top = &reverse_and_complement($r_phreds_top);
|
|
1571 }
|
|
1572 if ($self->{'contigs'}->{$current_contig}->{'bottom_complement'} eq "C") {
|
|
1573 $r_phreds_bottom = &reverse_and_complement($r_phreds_bottom);
|
|
1574 }
|
|
1575 $self->{'contigs'}->{$current_contig}->{'top_phreds'} = $r_phreds_top;
|
|
1576 $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = $r_phreds_bottom;
|
|
1577 }
|
|
1578 }
|
|
1579 }
|
|
1580
|
|
1581 =head2 parse_phd($read_name)
|
|
1582
|
|
1583 Title : parse_phd($read_name)
|
|
1584 Usage : $o_consed->parse_phd($read_name);
|
|
1585 Function: Suck in the contents of a .phd file.
|
|
1586 Returns : A reference to an array containing the quality values for the read.
|
|
1587 Args : The name of a read.
|
|
1588 Notes : This is a significantly weak subroutine because it was always
|
|
1589 intended that these functions, along with the functions provided by
|
|
1590 get_phreds() be put into the Bio::SeqIO:phd module. This is done
|
|
1591 now but the Bio::Tools::Alignment::Consed module has not be
|
|
1592 rewritten to reflect this change.
|
|
1593
|
|
1594 See L<Bio::SeqIO::phd> for more information.
|
|
1595
|
|
1596 =cut
|
|
1597
|
|
1598 sub parse_phd {
|
|
1599 my ($self,$sequence_name) = @_;
|
|
1600 $self->debug("Parsing phd for $sequence_name\n");
|
|
1601 my $in_dna = 0;
|
|
1602 my $base_number = 0;
|
|
1603 my (@bases,@current_line);
|
|
1604 # print("parse_phd: $sequence_name\n");
|
|
1605 my $fh = new Bio::Root::IO(-file=>"$self->{path}/../phd_dir/$sequence_name.phd.1");
|
|
1606 # open(PHD,"<$self->{path}/../phd_dir/$sequence_name.phd.1") or
|
|
1607 # die "Couldn't open the phred for $sequence_name\n";
|
|
1608 while ($fh->_readline()) {
|
|
1609 # print("Reading a line from a phredfile!\n");
|
|
1610 chomp;
|
|
1611 if (/^BEGIN_DNA/) { $in_dna = 1; next}
|
|
1612 if (/^END_DNA/) { last; }
|
|
1613 if (!$in_dna) { next; }
|
|
1614 push(@bases,$_);
|
|
1615 }
|
|
1616 return \@bases;
|
|
1617 }
|
|
1618
|
|
1619 =head2 reverse_and_complement(\@source)
|
|
1620
|
|
1621 Title : reverse_and_complement(\@source)
|
|
1622 Usage : $reference_to_array = $o_consed->reverse_and_complement(\@source);
|
|
1623 Function: A stub for the recursive routine reverse_recurse().
|
|
1624 Returns : A reference to a reversed and complemented array of phred data.
|
|
1625 Args : A reference to an array of phred data.
|
|
1626 Notes :
|
|
1627
|
|
1628 =cut
|
|
1629
|
|
1630 sub reverse_and_complement {
|
|
1631 my $r_source = shift;
|
|
1632 my $r_destination;
|
|
1633 $r_destination = &reverse_recurse($r_source,$r_destination);
|
|
1634 return $r_destination;
|
|
1635 }
|
|
1636
|
|
1637 =head2 reverse_recurse($r_source,$r_destination)
|
|
1638
|
|
1639 Title : reverse_recurse(\@source,\@destination)
|
|
1640 Usage : $o_consed->reverse_recurse(\@source,\@destination);
|
|
1641 Function: A recursive routine to reverse and complement an array of phred
|
|
1642 data.
|
|
1643 Returns : A reference to an array containing reversed phred data.
|
|
1644 Args : A reference to a source array and a reverence to a destination
|
|
1645 array.
|
|
1646 Notes : Recursion is kewl, but this sub should likely be _reverse_recurse.
|
|
1647
|
|
1648 =cut
|
|
1649
|
|
1650 my $current_base;
|
|
1651 sub reverse_recurse($$) {
|
|
1652 (my $r_source,my $r_destination) = @_;
|
|
1653 if (!@$r_source) { return $r_destination; }
|
|
1654 $_=pop(@$r_source);
|
|
1655 (s/c/g/||s/g/c/||s/a/t/||s/t/a/);
|
|
1656 push(@$r_destination,$_);
|
|
1657 &reverse_recurse($r_source,$r_destination);
|
|
1658 }
|
|
1659
|
|
1660 =head2 show_missing_sequence()
|
|
1661
|
|
1662 Title : show_missing_sequence();
|
|
1663 Usage : $o_consed->show_missing_sequence();
|
|
1664 Function: Used by set_trim_points_doublets() to fill in quality values where
|
|
1665 consed (phrap?) set them to 0 at the beginning and/or end of the
|
|
1666 consensus sequences.
|
|
1667 Returns : Nothing.
|
|
1668 Args : None.
|
|
1669 Notes : Acts on doublets only. Really very somewhat quite ugly. A
|
|
1670 disgusting kludge. <insert pride here> It was written stepwise with
|
|
1671 no real plan because it was not really evident why consed (phrap?)
|
|
1672 was doing this.
|
|
1673
|
|
1674 =cut
|
|
1675
|
|
1676 sub show_missing_sequence() {
|
|
1677 # decide which sequence should not have been clipped at consensus position = 0
|
|
1678 my $self = shift;
|
|
1679 &get_phreds($self);
|
|
1680 my ($current_contig,@qualities);
|
|
1681 foreach $current_contig (sort keys %{$self->{'contigs'}}) {
|
|
1682 if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") {
|
|
1683 my $number_leading_xs = 0;
|
|
1684 my $number_trailing_xs = 0;
|
|
1685 my $measurer = $self->{'contigs'}->{$current_contig}->{'quality'};
|
|
1686 while ($measurer =~ s/^\ 0\ /\ /) {
|
|
1687 $number_leading_xs++;
|
|
1688 }
|
|
1689 while ($measurer =~ s/\ 0(\s*)$/$1/) {
|
|
1690 $number_trailing_xs++;
|
|
1691 }
|
|
1692 @qualities = split(' ',$self->{'contigs'}->{$current_contig}->{'quality'});
|
|
1693 my $in_initial_zeros = 0;
|
|
1694 for (my $count=0;$count<scalar(@qualities); $count++) {
|
|
1695 if ($qualities[$count] == 0) {
|
|
1696 my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data);
|
|
1697 # print("The quality of the consensus at ".($count+1)." is zero. Retrieving the real quality value.\n");
|
|
1698 # how do I know which strand to get these quality values from????
|
|
1699 # boggle
|
|
1700 my $top_quality_here = $self->{'contigs'}->{$current_contig}->{'top_phreds'}->[0-$self->{'contigs'}->{$current_contig}->{'top_start'}+$count+1];
|
|
1701 my $bottom_quality_here = $self->{'contigs'}->{$current_contig}->{'bottom_phreds'}->[1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count];
|
|
1702 if (!$bottom_quality_here || (1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count)<0) {
|
|
1703 $bottom_quality_here = "not found";
|
|
1704 }
|
|
1705 if (!$top_quality_here) {
|
|
1706 $top_quality_here = "not found";
|
|
1707 }
|
|
1708 # print("Looking for quals at position $count of $current_contig: top position ".(0-$self->{'contigs'}->{$current_contig}->{top_start}+$count)." ($self->{'contigs'}->{$current_contig}->{top_name}) $top_quality_here , bottom position ".(1-$self->{'contigs'}->{$current_contig}->{bottom_start}+$count)." ($self->{'contigs'}->{$current_contig}->{bottom_name}) $bottom_quality_here\n");
|
|
1709 if ($count<$number_leading_xs) {
|
|
1710 # print("$count is less then $number_leading_xs so I will get the quality from the top strand\n");
|
|
1711 # print("retrieved quality is ".$self->{'contigs'}->{$current_contig}->{top_phreds}[0-$self->{'contigs'}->{$current_contig}->{top_start}+$count+1]."\n");
|
|
1712 my $quality = $top_quality_here;
|
|
1713 $quality =~ /\S+\s(\d+)\s+/;
|
|
1714 $quality = $1;
|
|
1715 # print("retrieved quality for leading zero $count is $quality\n");
|
|
1716 # t 9 9226
|
|
1717 $qualities[$count] = $quality;
|
|
1718 }
|
|
1719 else {
|
|
1720 # this part is tricky
|
|
1721 # if the contig is like this
|
|
1722 # cccccccccccccccc
|
|
1723 # ffffffffffffffffff
|
|
1724 # rrrrrrrrrrrrrrrrr
|
|
1725 # then take the quality value for the trailing zeros in the cons. seq from the r
|
|
1726 #
|
|
1727 # but if the contig is like this
|
|
1728 # cccccccccccccccccc
|
|
1729 # ffffffffffffffffffffffffffffffff
|
|
1730 # rrrrrrrrrrrrrrrrrrrrrrrxxxxxxxxr
|
|
1731 # ^^^
|
|
1732 # then any zeros that fall in the positions (^) must be decided whether the quality
|
|
1733 # is the qual from the f or r strand. I will use the greater number
|
|
1734 # does a similar situation exist for the leading zeros? i dunno
|
|
1735 #
|
|
1736 # print("$count is greater then $number_leading_xs so I will get the quality from the bottom strand\n");
|
|
1737 # print("retrieved quality is ".$contigs->{$current_contig}->{top_phreds}[0-$contigs->{$current_contig}->{top_start}+$count+1]."\n");
|
|
1738 # my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data);
|
|
1739 if ($bottom_quality_here eq "not found") {
|
|
1740 # $top_phred_position = 1-$contigs->{$current_contig}->{bottom_start}+$count;
|
|
1741 # print("Going to get quality from here: $top_phred_position of the top.\n");
|
|
1742 # my $temp_quality - $contigs->{$current_contig}->{top_phreds}
|
|
1743 # $quality = $contigs->{$current_contig}->{top_phreds}[$top_phred_position];
|
|
1744 $top_quality_here =~ /\w+\s(\d+)\s/;
|
|
1745 $quality = $1;
|
|
1746 }
|
|
1747 elsif ($top_quality_here eq "not found") {
|
|
1748 # $bottom_phred_position = 1+$contigs->{$current_contig}->{bottom_start}+$count;
|
|
1749 # print("Going to get quality from here: $bottom_phred_position of the bottom.\n");
|
|
1750 # $quality = $contigs->{$current_contig}->{bottom_phreds}[$bottom_phred_position];
|
|
1751 # print("Additional: no top quality but bottom is $quality\n");
|
|
1752 $bottom_quality_here =~ /\w+\s(\d+)\s/;
|
|
1753 $quality = $1;
|
|
1754 }
|
|
1755 else {
|
|
1756 # print("Oh jeepers, there are 2 qualities to choose from at this position.\n");
|
|
1757 # print("Going to compare these phred qualities: top: #$top_quality_here# bottom: #$bottom_quality_here#\n");
|
|
1758 # now you have to compare them
|
|
1759 # my $top_quality_phred = $contigs->{$current_contig}->{top_phreds}[$top_phred_position];
|
|
1760 # #t 40 875#
|
|
1761 # print("regexing #$top_quality_here#... ");
|
|
1762 $top_quality_here =~ /\w\ (\d+)\s/;
|
|
1763 my $top_quality = $1;
|
|
1764 # print("$top_quality\nregexing #$bottom_quality_here#... ");
|
|
1765 $bottom_quality_here =~ /\w\ (\d+)\s/;
|
|
1766 my $bottom_quality = $1;
|
|
1767 # print("$bottom_quality\n");
|
|
1768 # print("top_quality: $top_quality bottom quality: $bottom_quality\n");
|
|
1769 if ($bottom_quality > $top_quality) {
|
|
1770 # print("Chose to take the bottom quality: $bottom_quality\n");
|
|
1771 $quality = $bottom_quality;
|
|
1772 } else {
|
|
1773 # print("Chose to take the top quality: $top_quality\n");
|
|
1774 $quality = $top_quality;
|
|
1775 }
|
|
1776 }
|
|
1777 if (!$quality) {
|
|
1778 # print("Warning: no quality value for $current_contig, position $count!\n");
|
|
1779 # print("Additional data: top quality phred: $top_quality_here\n");
|
|
1780 # print("Additional data: bottom quality phred: $bottom_quality_here\n");
|
|
1781 } else {
|
|
1782 $qualities[$count] = $quality;
|
|
1783 }
|
|
1784 }
|
|
1785 }
|
|
1786
|
|
1787 }
|
|
1788 unless (!@qualities) {
|
|
1789 $self->{'contigs'}->{$current_contig}->{'quality'} = join(" ",@qualities);
|
|
1790 }
|
|
1791 $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = undef;
|
|
1792 $self->{'contigs'}->{$current_contig}->{'top_phreds'} = undef;
|
|
1793 my $count = 1;
|
|
1794 } # end foreach key
|
|
1795 }
|
|
1796 }
|
|
1797
|
|
1798
|
|
1799 # Autoload methods go after =cut, and are processed by the autosplit program.
|
|
1800
|
|
1801 1;
|
|
1802 __END__
|
|
1803
|
|
1804 =head1 SEE ALSO
|
|
1805
|
|
1806 perl(1).
|
|
1807
|
|
1808 =cut
|