Mercurial > repos > konradpaszkiewicz > velvetoptimiser
comparison VelvetOptimiser-2.1.7_modified/VelvetOpt/Assembly.pm @ 0:7363fee7f20c default tip
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | konradpaszkiewicz |
---|---|
date | Tue, 07 Jun 2011 17:42:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7363fee7f20c |
---|---|
1 # VelvetOpt::Assembly.pm | |
2 # | |
3 # Copyright 2008,2009 Simon Gladman <simon.gladman@csiro.au> | |
4 # | |
5 # This program is free software; you can redistribute it and/or modify | |
6 # it under the terms of the GNU General Public License as published by | |
7 # the Free Software Foundation; either version 2 of the License, or | |
8 # (at your option) any later version. | |
9 # | |
10 # This program is distributed in the hope that it will be useful, | |
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 # GNU General Public License for more details. | |
14 # | |
15 # You should have received a copy of the GNU General Public License | |
16 # along with this program; if not, write to the Free Software | |
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
18 # MA 02110-1301, USA. | |
19 | |
20 # Version 2.1.2 | |
21 # | |
22 # Changes for 2.0.1 | |
23 # *Bug fix in CalcAssemblyScore. Now returns 0 if there is no calculable score instead of crashing. | |
24 # | |
25 # Changes for 2.1.0 | |
26 # *Added 2 stage optimisation functions for optimising kmer size and cov_cutoff differently if required. | |
27 # | |
28 # Changes for 2.1.1 | |
29 # *Allowed for non-word characters in prefix names. (. - etc.) Still no spaces allowed in prefix name or any filenames. | |
30 # | |
31 # Changes for 2.1.2 | |
32 # *Now warns nicely of optimisation function returning undef or 0. Suggests you choose and alternative. | |
33 # | |
34 # Changes for 2.1.3 | |
35 # *Now prints the velvet calculated insert sizes and standard deviations in the Assembly summaries (both log files and screen). | |
36 | |
37 package VelvetOpt::Assembly; | |
38 | |
39 =head1 NAME | |
40 | |
41 VelvetOpt::Assembly.pm - Velvet assembly container class. | |
42 | |
43 =head1 AUTHOR | |
44 | |
45 Simon Gladman, CSIRO, 2007, 2008. | |
46 | |
47 =head1 LICENSE | |
48 | |
49 Copyright 2008, 2009 Simon Gladman <simon.gladman@csiro.au> | |
50 | |
51 This program is free software; you can redistribute it and/or modify | |
52 it under the terms of the GNU General Public License as published by | |
53 the Free Software Foundation; either version 2 of the License, or | |
54 (at your option) any later version. | |
55 | |
56 This program is distributed in the hope that it will be useful, | |
57 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
58 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
59 GNU General Public License for more details. | |
60 | |
61 You should have received a copy of the GNU General Public License | |
62 along with this program; if not, write to the Free Software | |
63 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
64 MA 02110-1301, USA. | |
65 | |
66 =head1 SYNOPSIS | |
67 | |
68 use VelvetOpt::Assembly; | |
69 my $object = VelvetOpt::Assembly->new( | |
70 timestamph => "23 November 2008 15:00:00", | |
71 ass_id => "1", | |
72 versionh => "0.7.04", | |
73 ass_dir => "/home/gla048/Desktop/newVelvetOptimiser/data_1" | |
74 ); | |
75 print $object->toString(); | |
76 | |
77 =head1 DESCRIPTION | |
78 | |
79 A container class to hold the results of a Velvet assembly. Includes timestamps, | |
80 version information, parameter strings and assembly output metrics. | |
81 | |
82 Version 1.1 | |
83 | |
84 =head2 Uses | |
85 | |
86 =over 8 | |
87 | |
88 =item strict | |
89 | |
90 =item warnings | |
91 | |
92 =item Carp | |
93 | |
94 =back | |
95 | |
96 =head2 Fields | |
97 | |
98 =over 8 | |
99 | |
100 =item assmscore | |
101 | |
102 The assembly score metric for this object | |
103 | |
104 =item timstamph | |
105 | |
106 The timestamp of the start of the velveth run for this assembly | |
107 | |
108 =item timestampg | |
109 | |
110 The date and time of the end of the velvetg run. | |
111 | |
112 =item ass_id | |
113 | |
114 The assembly id number. Sequential for all the runs for this optimisation. | |
115 | |
116 =item versionh | |
117 | |
118 The version number of velveth used in this assembly | |
119 | |
120 =item versiong | |
121 | |
122 The version number of velvetg used in this assembly | |
123 | |
124 =item readfilename | |
125 | |
126 The name of the file containing all the reads (or a qw of them if more than one...) | |
127 | |
128 =item pstringh | |
129 | |
130 The velveth parameter string used in this assembly | |
131 | |
132 =item pstringg | |
133 | |
134 The velvetg parameter string used in this assembly | |
135 | |
136 =item ass_dir | |
137 | |
138 The assembly directory path (full) | |
139 | |
140 =item hashval | |
141 | |
142 The hash value used for this assembly | |
143 | |
144 =item rmapfs | |
145 | |
146 The roadmap file size | |
147 | |
148 =item sequences | |
149 | |
150 The total number of sequences in the input files | |
151 | |
152 =item nconts | |
153 | |
154 The number of contigs in the final assembly | |
155 | |
156 =item totalbp | |
157 | |
158 The total number of bases in the contigs | |
159 | |
160 =item n50 | |
161 | |
162 The n50 of the assembly | |
163 | |
164 =item maxlength | |
165 | |
166 The length of the longest contig in the assembly | |
167 | |
168 =item maxcont | |
169 | |
170 The size of the largest contig in the assembly | |
171 | |
172 =item nconts1k | |
173 | |
174 The number of contigs greater than 1k in size | |
175 | |
176 =item totalbp1k | |
177 | |
178 the sum of the length of contigs > 1k in size | |
179 | |
180 =item velvethout | |
181 | |
182 The velveth output | |
183 | |
184 =item velvetgout | |
185 | |
186 The velvetg output | |
187 | |
188 =back | |
189 | |
190 =head2 Methods | |
191 | |
192 =over 8 | |
193 | |
194 =item new | |
195 | |
196 Returns a new VelvetAssembly object. | |
197 | |
198 =item accessor methods | |
199 | |
200 Accessor methods for all fields. | |
201 | |
202 =item calcAssemblyScore | |
203 | |
204 Calculates the assembly score of the object (after velvetg has been run.) and stores it in self. | |
205 | |
206 =item getHashingDetails | |
207 | |
208 Gets the details of the outputs from the velveth run and stores it in self. | |
209 | |
210 =item getAssemblyDetails | |
211 | |
212 Gets the details of the outputs from the velvetg run and stores it in self. | |
213 | |
214 =item toString | |
215 | |
216 Returns a string representation of the object's contents. | |
217 | |
218 =item toStringNoV | |
219 | |
220 Returns a string representation of the object's contents without the velvet outputs which are large. | |
221 | |
222 =item opt_func_toString | |
223 | |
224 Returns the usage of the optimisation function. | |
225 | |
226 =back | |
227 | |
228 =cut | |
229 | |
230 use strict; | |
231 use lib "/usr/local/lib/perl5/site_perl/5.8.8"; | |
232 use Carp; | |
233 use warnings; | |
234 #use base "Storable"; | |
235 use Cwd; | |
236 use Bio::SeqIO; | |
237 | |
238 my $interested = 0; | |
239 | |
240 | |
241 | |
242 #constructor | |
243 sub new { | |
244 my $class = shift; | |
245 my $self = {@_}; | |
246 bless ($self, $class); | |
247 return $self; | |
248 } | |
249 | |
250 #optimisation function options... | |
251 my %f_opts; | |
252 $f_opts{'ncon'}->{'intname'} = 'nconts'; | |
253 $f_opts{'ncon'}->{'desc'} = "The total number of contigs"; | |
254 $f_opts{'n50'}->{'intname'} = 'n50'; | |
255 $f_opts{'n50'}->{'desc'} = "The n50"; | |
256 $f_opts{'max'}->{'intname'} = 'maxlength'; | |
257 $f_opts{'max'}->{'desc'} = "The length of the longest contig"; | |
258 $f_opts{'Lcon'}->{'intname'} = 'nconts1k'; | |
259 $f_opts{'Lcon'}->{'desc'} = "The number of large contigs"; | |
260 $f_opts{'tbp'}->{'intname'} = 'totalbp'; | |
261 $f_opts{'tbp'}->{'desc'} = "The total number of basepairs in contigs"; | |
262 $f_opts{'Lbp'}->{'intname'} = 'totalbp1k'; | |
263 $f_opts{'Lbp'}->{'desc'} = "The total number of base pairs in large contigs"; | |
264 | |
265 #accessor methods | |
266 sub assmscore{ $_[0]->{assmscore}=$_[1] if defined $_[1]; $_[0]->{assmscore}} | |
267 sub timestamph{ $_[0]->{timestamph}=$_[1] if defined $_[1]; $_[0]->{timestamph}} | |
268 sub timestampg{ $_[0]->{timestampg}=$_[1] if defined $_[1]; $_[0]->{timestampg}} | |
269 sub ass_id{ $_[0]->{ass_id}=$_[1] if defined $_[1]; $_[0]->{ass_id}} | |
270 sub versionh{ $_[0]->{versionh}=$_[1] if defined $_[1]; $_[0]->{versionh}} | |
271 sub versiong{ $_[0]->{versiong}=$_[1] if defined $_[1]; $_[0]->{versiong}} | |
272 sub readfilename{ $_[0]->{readfilename}=$_[1] if defined $_[1]; $_[0]->{readfilename}} | |
273 sub pstringh{ $_[0]->{pstringh}=$_[1] if defined $_[1]; $_[0]->{pstringh}} | |
274 sub pstringg{ $_[0]->{pstringg}=$_[1] if defined $_[1]; $_[0]->{pstringg}} | |
275 sub ass_dir{ $_[0]->{ass_dir}=$_[1] if defined $_[1]; $_[0]->{ass_dir}} | |
276 sub hashval{ $_[0]->{hashval}=$_[1] if defined $_[1]; $_[0]->{hashval}} | |
277 sub rmapfs{ $_[0]->{rmapfs}=$_[1] if defined $_[1]; $_[0]->{rmapfs}} | |
278 sub nconts{ $_[0]->{nconts}=$_[1] if defined $_[1]; $_[0]->{nconts}} | |
279 sub n50{ $_[0]->{n50}=$_[1] if defined $_[1]; $_[0]->{n50}} | |
280 sub maxlength{ $_[0]->{maxlength}=$_[1] if defined $_[1]; $_[0]->{maxlength}} | |
281 sub nconts1k{ $_[0]->{nconts1k}=$_[1] if defined $_[1]; $_[0]->{nconts1k}} | |
282 sub totalbp{ $_[0]->{totalbp}=$_[1] if defined $_[1]; $_[0]->{totalbp}} | |
283 sub totalbp1k{ $_[0]->{totalbp1k}=$_[1] if defined $_[1]; $_[0]->{totalbp1k}} | |
284 sub velvethout{ $_[0]->{velvethout}=$_[1] if defined $_[1]; $_[0]->{velvethout}} | |
285 sub velvetgout{ $_[0]->{velvetgout}=$_[1] if defined $_[1]; $_[0]->{velvetgout}} | |
286 sub sequences{ $_[0]->{sequences}=$_[1] if defined $_[1]; $_[0]->{sequences}} | |
287 sub assmfunc{ $_[0]->{assmfunc}=$_[1] if defined $_[1]; $_[0]->{assmfunc}} | |
288 sub assmfunc2{ $_[0]->{assmfunc2}=$_[1] if defined $_[1]; $_[0]->{assmfunc2}} | |
289 | |
290 #assemblyScoreCalculator | |
291 sub calcAssemblyScore { | |
292 use Safe; | |
293 | |
294 my $self = shift; | |
295 my $func = shift; | |
296 | |
297 my $cpt = new Safe; | |
298 | |
299 #Basic variable IO and traversal | |
300 $cpt->permit(qw(null scalar const padany lineseq leaveeval rv2sv rv2hv helem hslice each values keys exists delete rv2cv)); | |
301 #Comparators | |
302 $cpt->permit(qw(lt i_lt gt i_gt le i_le ge i_ge eq i_eq ne i_ne ncmp i_ncmp slt sgt sle sge seq sne scmp)); | |
303 #Base math | |
304 $cpt->permit(qw(preinc i_preinc predec i_predec postinc i_postinc postdec i_postdec int hex oct abs pow multiply i_multiply divide i_divide modulo i_modulo add i_add subtract i_subtract)); | |
305 #Binary math | |
306 $cpt->permit(qw(left_shift right_shift bit_and bit_xor bit_or negate i_negate not complement)); | |
307 #Regex | |
308 $cpt->permit(qw(match split qr)); | |
309 #Conditionals | |
310 $cpt->permit(qw(cond_expr flip flop andassign orassign and or xor)); | |
311 #Advanced math | |
312 $cpt->permit(qw(atan2 sin cos exp log sqrt rand srand)); | |
313 | |
314 foreach my $key (keys %f_opts){ | |
315 print "\nkey: $key\tintname: ", $f_opts{$key}->{'intname'}, "\n" if $interested; | |
316 | |
317 $func =~ s/\b$key\b/$self->{$f_opts{$key}->{'intname'}}/g; | |
318 } | |
319 | |
320 my $r = $cpt->reval($func); | |
321 warn $@ if $@; | |
322 $self->{assmscore} = $r; | |
323 unless($r =~ /^\d+/){ | |
324 warn "Optimisation function did not return a single float.\nOptimisation function was not evaluatable.\nOptfunc: $func"; | |
325 warn "Setting assembly score to 0\n"; | |
326 $self->{assmscore} = 0; | |
327 } | |
328 if($r == 0){ | |
329 print STDERR "**********\n"; | |
330 print STDERR "Warning: Assembly score for assembly_id " . $self->{ass_id} . " is 0\n"; | |
331 print STDERR "You may want to consider choosing a different optimisation variable or function.\n"; | |
332 print STDERR "Current optimisation functions are ", $self->{assmfunc}, " for k value and ", $self->{assmfunc2}, " for cov_cutoff\n"; | |
333 print STDERR "**********\n"; | |
334 } | |
335 return 1; | |
336 } | |
337 | |
338 #getHashingDetails | |
339 sub getHashingDetails { | |
340 my $self = shift; | |
341 unless(!$self->timestamph || !$self->pstringh){ | |
342 my $programPath = cwd; | |
343 $self->pstringh =~ /^(\S+)\s+(\d+)\s+(.*)$/; | |
344 $self->{ass_dir} = $programPath . "/" . $1; | |
345 $self->{rmapfs} = -s $self->ass_dir . "/Roadmaps"; | |
346 $self->{hashval} = $2; | |
347 $self->{readfilename} = $3; | |
348 my @t = split /\n/, $self->velvethout; | |
349 foreach(@t){ | |
350 if(/^(\d+).*total\.$/){ | |
351 $self->{sequences} = $1; | |
352 last; | |
353 } | |
354 } | |
355 return 1; | |
356 } | |
357 return 0; | |
358 } | |
359 | |
360 #getAssemblyDetails | |
361 sub getAssemblyDetails { | |
362 my $self = shift; | |
363 my $file = $self->ass_dir . "/contigs.fa"; | |
364 unless(!(-e $file)){ | |
365 | |
366 my $all = &contigStats($file,1); | |
367 my $large = &contigStats($file,1000); | |
368 | |
369 $self->{nconts} = defined $all->{numSeqs} ? $all->{numSeqs} : 0; | |
370 $self->{n50} = defined $all->{n50} ? $all->{n50} : 0; | |
371 $self->{maxlength} = defined $all->{maxLen} ? $all->{maxLen} : 0; | |
372 $self->{nconts1k} = defined $large->{numSeqs} ? $large->{numSeqs} : 0; | |
373 $self->{totalbp} = defined $all->{numBases} ? $all->{numBases} : 0; | |
374 $self->{totalbp1k} = defined $large->{numBases} ? $large->{numBases} : 0; | |
375 | |
376 if($self->pstringg =~ m/cov_cutoff/){ | |
377 $self->calcAssemblyScore($self->{assmfunc2}); | |
378 } | |
379 else { | |
380 $self->calcAssemblyScore($self->{assmfunc}); | |
381 } | |
382 | |
383 return 1; | |
384 } | |
385 return 0; | |
386 } | |
387 | |
388 #contigStats | |
389 #Original script fa-show.pl by Torsten Seemann (Monash University, Melbourne, Australia) | |
390 #Modified by Simon Gladman to suit. | |
391 sub contigStats { | |
392 | |
393 my $file = shift; | |
394 my $minsize = shift; | |
395 | |
396 print "In contigStats with $file, $minsize\n" if $interested; | |
397 | |
398 my $numseq=0; | |
399 my $avglen=0; | |
400 my $minlen=1E9; | |
401 my $maxlen=0; | |
402 my @len; | |
403 my $toosmall=0; | |
404 my $nn=0; | |
405 | |
406 my $in = Bio::SeqIO->new(-file => $file, -format => 'Fasta'); | |
407 while(my $seq = $in->next_seq()){ | |
408 my $L = $seq->length; | |
409 #check > minsize | |
410 if($L < $minsize){ | |
411 $toosmall ++; | |
412 next; | |
413 } | |
414 #count Ns | |
415 my $s = $seq->seq; | |
416 my $n = $s =~ s/N/N/gi; | |
417 $n ||= 0; | |
418 $nn += $n; | |
419 #count seqs and other stats | |
420 $numseq ++; | |
421 $avglen += $L; | |
422 $maxlen = $L if $L > $maxlen; | |
423 $minlen = $L if $L < $minlen; | |
424 push @len, $L; | |
425 } | |
426 @len = sort { $a <=> $b } @len; | |
427 my $cum = 0; | |
428 my $n50 = 0; | |
429 for my $i (0 .. $#len){ | |
430 $cum += $len[$i]; | |
431 if($cum >= $avglen/2) { | |
432 $n50 = $len[$i]; | |
433 last; | |
434 } | |
435 } | |
436 | |
437 my %out; | |
438 if($numseq > 0){ | |
439 $out{numSeqs} = $numseq; | |
440 $out{numBases} = $avglen; | |
441 $out{numOK} = ($avglen - $nn); | |
442 $out{numNs} = $nn; | |
443 $out{minLen} = $minlen; | |
444 $out{avgLen} = $avglen/$numseq; | |
445 $out{maxLen} = $maxlen; | |
446 $out{n50} = $n50; | |
447 $out{minsize} = $minsize; | |
448 $out{numTooSmall} = $toosmall; | |
449 } | |
450 else { | |
451 $out{$numseq} = 0; | |
452 } | |
453 | |
454 print "Leaving contigstats!\n" if $interested; | |
455 return (\%out); | |
456 } | |
457 | |
458 | |
459 #toString method | |
460 sub toString { | |
461 my $self = shift; | |
462 my $tmp = $self->toStringNoV(); | |
463 if(defined $self->velvethout){ | |
464 $tmp .= "Velveth Output:\n" . $self->velvethout() . "\n"; | |
465 } | |
466 if(defined $self->velvetgout){ | |
467 $tmp .= "Velvetg Output:\n" . $self->velvetgout() . "\n"; | |
468 } | |
469 $tmp .= "**********************************************************\n"; | |
470 return $tmp; | |
471 } | |
472 | |
473 | |
474 #toStringNoV method | |
475 sub toStringNoV { | |
476 my $self = shift; | |
477 my $tmp = "********************************************************\n"; | |
478 if($self->ass_id()){ | |
479 $tmp .= "Assembly id: " . $self->ass_id(). "\n"; | |
480 } | |
481 if($self->assmscore()){ | |
482 $tmp .= "Assembly score: " .$self->assmscore(). "\n"; | |
483 } | |
484 if($self->timestamph()){ | |
485 $tmp .= "Velveth timestamp: " . $self->timestamph(). "\n"; | |
486 } | |
487 if($self->timestampg()){ | |
488 $tmp .= "Velvetg timestamp: " . $self->timestampg(). "\n"; | |
489 } | |
490 if(defined $self->versionh){ | |
491 $tmp .= "Velveth version: " . $self->versionh(). "\n"; | |
492 } | |
493 if(defined $self->versiong){ | |
494 $tmp .= "Velvetg version: " . $self->versiong(). "\n"; | |
495 } | |
496 if(defined $self->readfilename){ | |
497 $tmp .= "Readfile(s): " . $self->readfilename(). "\n"; | |
498 } | |
499 if(defined $self->pstringh){ | |
500 $tmp .= "Velveth parameter string: " . $self->pstringh(). "\n"; | |
501 } | |
502 if(defined $self->pstringg){ | |
503 $tmp .= "Velvetg parameter string: " . $self->pstringg(). "\n"; | |
504 } | |
505 if(defined $self->ass_dir){ | |
506 $tmp .= "Assembly directory: " . $self->ass_dir(). "\n"; | |
507 } | |
508 if(defined $self->hashval){ | |
509 $tmp .= "Velvet hash value: " . $self->hashval(). "\n"; | |
510 } | |
511 if(defined $self->rmapfs){ | |
512 $tmp .= "Roadmap file size: " . $self->rmapfs(). "\n"; | |
513 } | |
514 if(defined $self->sequences){ | |
515 $tmp .= "Total number of sequences: " . $self->sequences(). "\n"; | |
516 } | |
517 if(defined $self->nconts){ | |
518 $tmp .= "Total number of contigs: " . $self->nconts(). "\n"; | |
519 } | |
520 if(defined $self->n50){ | |
521 $tmp .= "n50: " . $self->n50(). "\n"; | |
522 } | |
523 if(defined $self->maxlength){ | |
524 $tmp .= "length of longest contig: " . $self->maxlength(). "\n"; | |
525 } | |
526 if(defined $self->totalbp){ | |
527 $tmp .= "Total bases in contigs: " . $self->totalbp(). "\n"; | |
528 } | |
529 if(defined $self->nconts1k){ | |
530 $tmp .= "Number of contigs > 1k: " . $self->nconts1k(). "\n"; | |
531 } | |
532 if(defined $self->totalbp1k){ | |
533 $tmp .= "Total bases in contigs > 1k: " . $self->totalbp1k(). "\n"; | |
534 } | |
535 if($self->pstringh =~ /Pair/ && defined $self->pstringg && $self->pstringg =~ /-exp_cov/){ | |
536 $tmp .= "Paired Library insert stats:\n"; | |
537 my @x = split /\n/, $self->velvetgout; | |
538 foreach(@x){ | |
539 chomp; | |
540 if(/^Paired-end library \d+ has/){ | |
541 $tmp .= "$_\n"; | |
542 } | |
543 } | |
544 } | |
545 $tmp .= "**********************************************************\n"; | |
546 return $tmp; | |
547 } | |
548 | |
549 sub opt_func_toString { | |
550 my $out = "\nVelvet optimiser assembly optimisation function can be built from the following variables.\n"; | |
551 foreach my $key (sort keys %f_opts){ | |
552 $out .= "\t$key = " . $f_opts{$key}->{'desc'} . "\n"; | |
553 } | |
554 $out .= "Examples are:\n\t'Lbp' = Just the total basepairs in contigs longer than 1kb\n"; | |
555 $out .= "\t'n50*Lcon' = The n50 times the number of long contigs.\n"; | |
556 $out .= "\t'n50*Lcon/tbp+log(Lbp)' = The n50 times the number of long contigs divided\n\t\tby the total bases in all contigs plus the log of the number of bases\n\t\tin long contigs.\n"; | |
557 return $out | |
558 } | |
559 | |
560 1; |