comparison VelvetOptimiser-2.1.7_modified/VelvetOpt/Assembly.pm @ 0:7363fee7f20c default tip

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author konradpaszkiewicz
date Tue, 07 Jun 2011 17:42:26 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7363fee7f20c
1 # VelvetOpt::Assembly.pm
2 #
3 # Copyright 2008,2009 Simon Gladman <simon.gladman@csiro.au>
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18 # MA 02110-1301, USA.
19
20 # Version 2.1.2
21 #
22 # Changes for 2.0.1
23 # *Bug fix in CalcAssemblyScore. Now returns 0 if there is no calculable score instead of crashing.
24 #
25 # Changes for 2.1.0
26 # *Added 2 stage optimisation functions for optimising kmer size and cov_cutoff differently if required.
27 #
28 # Changes for 2.1.1
29 # *Allowed for non-word characters in prefix names. (. - etc.) Still no spaces allowed in prefix name or any filenames.
30 #
31 # Changes for 2.1.2
32 # *Now warns nicely of optimisation function returning undef or 0. Suggests you choose and alternative.
33 #
34 # Changes for 2.1.3
35 # *Now prints the velvet calculated insert sizes and standard deviations in the Assembly summaries (both log files and screen).
36
37 package VelvetOpt::Assembly;
38
39 =head1 NAME
40
41 VelvetOpt::Assembly.pm - Velvet assembly container class.
42
43 =head1 AUTHOR
44
45 Simon Gladman, CSIRO, 2007, 2008.
46
47 =head1 LICENSE
48
49 Copyright 2008, 2009 Simon Gladman <simon.gladman@csiro.au>
50
51 This program is free software; you can redistribute it and/or modify
52 it under the terms of the GNU General Public License as published by
53 the Free Software Foundation; either version 2 of the License, or
54 (at your option) any later version.
55
56 This program is distributed in the hope that it will be useful,
57 but WITHOUT ANY WARRANTY; without even the implied warranty of
58 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
59 GNU General Public License for more details.
60
61 You should have received a copy of the GNU General Public License
62 along with this program; if not, write to the Free Software
63 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
64 MA 02110-1301, USA.
65
66 =head1 SYNOPSIS
67
68 use VelvetOpt::Assembly;
69 my $object = VelvetOpt::Assembly->new(
70 timestamph => "23 November 2008 15:00:00",
71 ass_id => "1",
72 versionh => "0.7.04",
73 ass_dir => "/home/gla048/Desktop/newVelvetOptimiser/data_1"
74 );
75 print $object->toString();
76
77 =head1 DESCRIPTION
78
79 A container class to hold the results of a Velvet assembly. Includes timestamps,
80 version information, parameter strings and assembly output metrics.
81
82 Version 1.1
83
84 =head2 Uses
85
86 =over 8
87
88 =item strict
89
90 =item warnings
91
92 =item Carp
93
94 =back
95
96 =head2 Fields
97
98 =over 8
99
100 =item assmscore
101
102 The assembly score metric for this object
103
104 =item timstamph
105
106 The timestamp of the start of the velveth run for this assembly
107
108 =item timestampg
109
110 The date and time of the end of the velvetg run.
111
112 =item ass_id
113
114 The assembly id number. Sequential for all the runs for this optimisation.
115
116 =item versionh
117
118 The version number of velveth used in this assembly
119
120 =item versiong
121
122 The version number of velvetg used in this assembly
123
124 =item readfilename
125
126 The name of the file containing all the reads (or a qw of them if more than one...)
127
128 =item pstringh
129
130 The velveth parameter string used in this assembly
131
132 =item pstringg
133
134 The velvetg parameter string used in this assembly
135
136 =item ass_dir
137
138 The assembly directory path (full)
139
140 =item hashval
141
142 The hash value used for this assembly
143
144 =item rmapfs
145
146 The roadmap file size
147
148 =item sequences
149
150 The total number of sequences in the input files
151
152 =item nconts
153
154 The number of contigs in the final assembly
155
156 =item totalbp
157
158 The total number of bases in the contigs
159
160 =item n50
161
162 The n50 of the assembly
163
164 =item maxlength
165
166 The length of the longest contig in the assembly
167
168 =item maxcont
169
170 The size of the largest contig in the assembly
171
172 =item nconts1k
173
174 The number of contigs greater than 1k in size
175
176 =item totalbp1k
177
178 the sum of the length of contigs > 1k in size
179
180 =item velvethout
181
182 The velveth output
183
184 =item velvetgout
185
186 The velvetg output
187
188 =back
189
190 =head2 Methods
191
192 =over 8
193
194 =item new
195
196 Returns a new VelvetAssembly object.
197
198 =item accessor methods
199
200 Accessor methods for all fields.
201
202 =item calcAssemblyScore
203
204 Calculates the assembly score of the object (after velvetg has been run.) and stores it in self.
205
206 =item getHashingDetails
207
208 Gets the details of the outputs from the velveth run and stores it in self.
209
210 =item getAssemblyDetails
211
212 Gets the details of the outputs from the velvetg run and stores it in self.
213
214 =item toString
215
216 Returns a string representation of the object's contents.
217
218 =item toStringNoV
219
220 Returns a string representation of the object's contents without the velvet outputs which are large.
221
222 =item opt_func_toString
223
224 Returns the usage of the optimisation function.
225
226 =back
227
228 =cut
229
230 use strict;
231 use lib "/usr/local/lib/perl5/site_perl/5.8.8";
232 use Carp;
233 use warnings;
234 #use base "Storable";
235 use Cwd;
236 use Bio::SeqIO;
237
238 my $interested = 0;
239
240
241
242 #constructor
243 sub new {
244 my $class = shift;
245 my $self = {@_};
246 bless ($self, $class);
247 return $self;
248 }
249
250 #optimisation function options...
251 my %f_opts;
252 $f_opts{'ncon'}->{'intname'} = 'nconts';
253 $f_opts{'ncon'}->{'desc'} = "The total number of contigs";
254 $f_opts{'n50'}->{'intname'} = 'n50';
255 $f_opts{'n50'}->{'desc'} = "The n50";
256 $f_opts{'max'}->{'intname'} = 'maxlength';
257 $f_opts{'max'}->{'desc'} = "The length of the longest contig";
258 $f_opts{'Lcon'}->{'intname'} = 'nconts1k';
259 $f_opts{'Lcon'}->{'desc'} = "The number of large contigs";
260 $f_opts{'tbp'}->{'intname'} = 'totalbp';
261 $f_opts{'tbp'}->{'desc'} = "The total number of basepairs in contigs";
262 $f_opts{'Lbp'}->{'intname'} = 'totalbp1k';
263 $f_opts{'Lbp'}->{'desc'} = "The total number of base pairs in large contigs";
264
265 #accessor methods
266 sub assmscore{ $_[0]->{assmscore}=$_[1] if defined $_[1]; $_[0]->{assmscore}}
267 sub timestamph{ $_[0]->{timestamph}=$_[1] if defined $_[1]; $_[0]->{timestamph}}
268 sub timestampg{ $_[0]->{timestampg}=$_[1] if defined $_[1]; $_[0]->{timestampg}}
269 sub ass_id{ $_[0]->{ass_id}=$_[1] if defined $_[1]; $_[0]->{ass_id}}
270 sub versionh{ $_[0]->{versionh}=$_[1] if defined $_[1]; $_[0]->{versionh}}
271 sub versiong{ $_[0]->{versiong}=$_[1] if defined $_[1]; $_[0]->{versiong}}
272 sub readfilename{ $_[0]->{readfilename}=$_[1] if defined $_[1]; $_[0]->{readfilename}}
273 sub pstringh{ $_[0]->{pstringh}=$_[1] if defined $_[1]; $_[0]->{pstringh}}
274 sub pstringg{ $_[0]->{pstringg}=$_[1] if defined $_[1]; $_[0]->{pstringg}}
275 sub ass_dir{ $_[0]->{ass_dir}=$_[1] if defined $_[1]; $_[0]->{ass_dir}}
276 sub hashval{ $_[0]->{hashval}=$_[1] if defined $_[1]; $_[0]->{hashval}}
277 sub rmapfs{ $_[0]->{rmapfs}=$_[1] if defined $_[1]; $_[0]->{rmapfs}}
278 sub nconts{ $_[0]->{nconts}=$_[1] if defined $_[1]; $_[0]->{nconts}}
279 sub n50{ $_[0]->{n50}=$_[1] if defined $_[1]; $_[0]->{n50}}
280 sub maxlength{ $_[0]->{maxlength}=$_[1] if defined $_[1]; $_[0]->{maxlength}}
281 sub nconts1k{ $_[0]->{nconts1k}=$_[1] if defined $_[1]; $_[0]->{nconts1k}}
282 sub totalbp{ $_[0]->{totalbp}=$_[1] if defined $_[1]; $_[0]->{totalbp}}
283 sub totalbp1k{ $_[0]->{totalbp1k}=$_[1] if defined $_[1]; $_[0]->{totalbp1k}}
284 sub velvethout{ $_[0]->{velvethout}=$_[1] if defined $_[1]; $_[0]->{velvethout}}
285 sub velvetgout{ $_[0]->{velvetgout}=$_[1] if defined $_[1]; $_[0]->{velvetgout}}
286 sub sequences{ $_[0]->{sequences}=$_[1] if defined $_[1]; $_[0]->{sequences}}
287 sub assmfunc{ $_[0]->{assmfunc}=$_[1] if defined $_[1]; $_[0]->{assmfunc}}
288 sub assmfunc2{ $_[0]->{assmfunc2}=$_[1] if defined $_[1]; $_[0]->{assmfunc2}}
289
290 #assemblyScoreCalculator
291 sub calcAssemblyScore {
292 use Safe;
293
294 my $self = shift;
295 my $func = shift;
296
297 my $cpt = new Safe;
298
299 #Basic variable IO and traversal
300 $cpt->permit(qw(null scalar const padany lineseq leaveeval rv2sv rv2hv helem hslice each values keys exists delete rv2cv));
301 #Comparators
302 $cpt->permit(qw(lt i_lt gt i_gt le i_le ge i_ge eq i_eq ne i_ne ncmp i_ncmp slt sgt sle sge seq sne scmp));
303 #Base math
304 $cpt->permit(qw(preinc i_preinc predec i_predec postinc i_postinc postdec i_postdec int hex oct abs pow multiply i_multiply divide i_divide modulo i_modulo add i_add subtract i_subtract));
305 #Binary math
306 $cpt->permit(qw(left_shift right_shift bit_and bit_xor bit_or negate i_negate not complement));
307 #Regex
308 $cpt->permit(qw(match split qr));
309 #Conditionals
310 $cpt->permit(qw(cond_expr flip flop andassign orassign and or xor));
311 #Advanced math
312 $cpt->permit(qw(atan2 sin cos exp log sqrt rand srand));
313
314 foreach my $key (keys %f_opts){
315 print "\nkey: $key\tintname: ", $f_opts{$key}->{'intname'}, "\n" if $interested;
316
317 $func =~ s/\b$key\b/$self->{$f_opts{$key}->{'intname'}}/g;
318 }
319
320 my $r = $cpt->reval($func);
321 warn $@ if $@;
322 $self->{assmscore} = $r;
323 unless($r =~ /^\d+/){
324 warn "Optimisation function did not return a single float.\nOptimisation function was not evaluatable.\nOptfunc: $func";
325 warn "Setting assembly score to 0\n";
326 $self->{assmscore} = 0;
327 }
328 if($r == 0){
329 print STDERR "**********\n";
330 print STDERR "Warning: Assembly score for assembly_id " . $self->{ass_id} . " is 0\n";
331 print STDERR "You may want to consider choosing a different optimisation variable or function.\n";
332 print STDERR "Current optimisation functions are ", $self->{assmfunc}, " for k value and ", $self->{assmfunc2}, " for cov_cutoff\n";
333 print STDERR "**********\n";
334 }
335 return 1;
336 }
337
338 #getHashingDetails
339 sub getHashingDetails {
340 my $self = shift;
341 unless(!$self->timestamph || !$self->pstringh){
342 my $programPath = cwd;
343 $self->pstringh =~ /^(\S+)\s+(\d+)\s+(.*)$/;
344 $self->{ass_dir} = $programPath . "/" . $1;
345 $self->{rmapfs} = -s $self->ass_dir . "/Roadmaps";
346 $self->{hashval} = $2;
347 $self->{readfilename} = $3;
348 my @t = split /\n/, $self->velvethout;
349 foreach(@t){
350 if(/^(\d+).*total\.$/){
351 $self->{sequences} = $1;
352 last;
353 }
354 }
355 return 1;
356 }
357 return 0;
358 }
359
360 #getAssemblyDetails
361 sub getAssemblyDetails {
362 my $self = shift;
363 my $file = $self->ass_dir . "/contigs.fa";
364 unless(!(-e $file)){
365
366 my $all = &contigStats($file,1);
367 my $large = &contigStats($file,1000);
368
369 $self->{nconts} = defined $all->{numSeqs} ? $all->{numSeqs} : 0;
370 $self->{n50} = defined $all->{n50} ? $all->{n50} : 0;
371 $self->{maxlength} = defined $all->{maxLen} ? $all->{maxLen} : 0;
372 $self->{nconts1k} = defined $large->{numSeqs} ? $large->{numSeqs} : 0;
373 $self->{totalbp} = defined $all->{numBases} ? $all->{numBases} : 0;
374 $self->{totalbp1k} = defined $large->{numBases} ? $large->{numBases} : 0;
375
376 if($self->pstringg =~ m/cov_cutoff/){
377 $self->calcAssemblyScore($self->{assmfunc2});
378 }
379 else {
380 $self->calcAssemblyScore($self->{assmfunc});
381 }
382
383 return 1;
384 }
385 return 0;
386 }
387
388 #contigStats
389 #Original script fa-show.pl by Torsten Seemann (Monash University, Melbourne, Australia)
390 #Modified by Simon Gladman to suit.
391 sub contigStats {
392
393 my $file = shift;
394 my $minsize = shift;
395
396 print "In contigStats with $file, $minsize\n" if $interested;
397
398 my $numseq=0;
399 my $avglen=0;
400 my $minlen=1E9;
401 my $maxlen=0;
402 my @len;
403 my $toosmall=0;
404 my $nn=0;
405
406 my $in = Bio::SeqIO->new(-file => $file, -format => 'Fasta');
407 while(my $seq = $in->next_seq()){
408 my $L = $seq->length;
409 #check > minsize
410 if($L < $minsize){
411 $toosmall ++;
412 next;
413 }
414 #count Ns
415 my $s = $seq->seq;
416 my $n = $s =~ s/N/N/gi;
417 $n ||= 0;
418 $nn += $n;
419 #count seqs and other stats
420 $numseq ++;
421 $avglen += $L;
422 $maxlen = $L if $L > $maxlen;
423 $minlen = $L if $L < $minlen;
424 push @len, $L;
425 }
426 @len = sort { $a <=> $b } @len;
427 my $cum = 0;
428 my $n50 = 0;
429 for my $i (0 .. $#len){
430 $cum += $len[$i];
431 if($cum >= $avglen/2) {
432 $n50 = $len[$i];
433 last;
434 }
435 }
436
437 my %out;
438 if($numseq > 0){
439 $out{numSeqs} = $numseq;
440 $out{numBases} = $avglen;
441 $out{numOK} = ($avglen - $nn);
442 $out{numNs} = $nn;
443 $out{minLen} = $minlen;
444 $out{avgLen} = $avglen/$numseq;
445 $out{maxLen} = $maxlen;
446 $out{n50} = $n50;
447 $out{minsize} = $minsize;
448 $out{numTooSmall} = $toosmall;
449 }
450 else {
451 $out{$numseq} = 0;
452 }
453
454 print "Leaving contigstats!\n" if $interested;
455 return (\%out);
456 }
457
458
459 #toString method
460 sub toString {
461 my $self = shift;
462 my $tmp = $self->toStringNoV();
463 if(defined $self->velvethout){
464 $tmp .= "Velveth Output:\n" . $self->velvethout() . "\n";
465 }
466 if(defined $self->velvetgout){
467 $tmp .= "Velvetg Output:\n" . $self->velvetgout() . "\n";
468 }
469 $tmp .= "**********************************************************\n";
470 return $tmp;
471 }
472
473
474 #toStringNoV method
475 sub toStringNoV {
476 my $self = shift;
477 my $tmp = "********************************************************\n";
478 if($self->ass_id()){
479 $tmp .= "Assembly id: " . $self->ass_id(). "\n";
480 }
481 if($self->assmscore()){
482 $tmp .= "Assembly score: " .$self->assmscore(). "\n";
483 }
484 if($self->timestamph()){
485 $tmp .= "Velveth timestamp: " . $self->timestamph(). "\n";
486 }
487 if($self->timestampg()){
488 $tmp .= "Velvetg timestamp: " . $self->timestampg(). "\n";
489 }
490 if(defined $self->versionh){
491 $tmp .= "Velveth version: " . $self->versionh(). "\n";
492 }
493 if(defined $self->versiong){
494 $tmp .= "Velvetg version: " . $self->versiong(). "\n";
495 }
496 if(defined $self->readfilename){
497 $tmp .= "Readfile(s): " . $self->readfilename(). "\n";
498 }
499 if(defined $self->pstringh){
500 $tmp .= "Velveth parameter string: " . $self->pstringh(). "\n";
501 }
502 if(defined $self->pstringg){
503 $tmp .= "Velvetg parameter string: " . $self->pstringg(). "\n";
504 }
505 if(defined $self->ass_dir){
506 $tmp .= "Assembly directory: " . $self->ass_dir(). "\n";
507 }
508 if(defined $self->hashval){
509 $tmp .= "Velvet hash value: " . $self->hashval(). "\n";
510 }
511 if(defined $self->rmapfs){
512 $tmp .= "Roadmap file size: " . $self->rmapfs(). "\n";
513 }
514 if(defined $self->sequences){
515 $tmp .= "Total number of sequences: " . $self->sequences(). "\n";
516 }
517 if(defined $self->nconts){
518 $tmp .= "Total number of contigs: " . $self->nconts(). "\n";
519 }
520 if(defined $self->n50){
521 $tmp .= "n50: " . $self->n50(). "\n";
522 }
523 if(defined $self->maxlength){
524 $tmp .= "length of longest contig: " . $self->maxlength(). "\n";
525 }
526 if(defined $self->totalbp){
527 $tmp .= "Total bases in contigs: " . $self->totalbp(). "\n";
528 }
529 if(defined $self->nconts1k){
530 $tmp .= "Number of contigs > 1k: " . $self->nconts1k(). "\n";
531 }
532 if(defined $self->totalbp1k){
533 $tmp .= "Total bases in contigs > 1k: " . $self->totalbp1k(). "\n";
534 }
535 if($self->pstringh =~ /Pair/ && defined $self->pstringg && $self->pstringg =~ /-exp_cov/){
536 $tmp .= "Paired Library insert stats:\n";
537 my @x = split /\n/, $self->velvetgout;
538 foreach(@x){
539 chomp;
540 if(/^Paired-end library \d+ has/){
541 $tmp .= "$_\n";
542 }
543 }
544 }
545 $tmp .= "**********************************************************\n";
546 return $tmp;
547 }
548
549 sub opt_func_toString {
550 my $out = "\nVelvet optimiser assembly optimisation function can be built from the following variables.\n";
551 foreach my $key (sort keys %f_opts){
552 $out .= "\t$key = " . $f_opts{$key}->{'desc'} . "\n";
553 }
554 $out .= "Examples are:\n\t'Lbp' = Just the total basepairs in contigs longer than 1kb\n";
555 $out .= "\t'n50*Lcon' = The n50 times the number of long contigs.\n";
556 $out .= "\t'n50*Lcon/tbp+log(Lbp)' = The n50 times the number of long contigs divided\n\t\tby the total bases in all contigs plus the log of the number of bases\n\t\tin long contigs.\n";
557 return $out
558 }
559
560 1;