comparison bam2consensus @ 1:2367d00c5182 draft default tip

"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author jdv
date Tue, 28 Sep 2021 06:12:40 +0000
parents
children
comparison
equal deleted inserted replaced
0:d32139014ec3 1:2367d00c5182
1 #!/usr/bin/env perl
2
3 use strict;
4 use warnings;
5 use 5.012;
6
7 use BioX::Seq;
8 use BioX::Seq::Stream;
9 use BioX::Seq::Fetch;
10 use File::Temp qw/tempfile/;
11 use IPC::Cmd qw/can_run/;
12 use Getopt::Long;
13 use List::Util qw/sum max first/;
14 use Pod::Usage;
15 use POSIX qw/floor ceil/;
16
17 #-inputs---------------------------------------------------------------------#
18 my $fn_bam;
19 my $fn_ref;
20 #-outputs--------------------------------------------------------------------#
21 my $fn_table;
22 my $fn_consensus;
23 my $fn_bedgraph;
24 #-knobs----------------------------------------------------------------------#
25 my $min_qual = 10;
26 my $min_depth = 3;
27 my $trim_fraction = 0.2;
28 my $sliding_window = 30;
29 my $bg_bin_figs = 0;
30 my $verbose = 0;
31
32 my $PROGRAM = 'bam2consensus';
33 my $VERSION = 0.004;
34
35 GetOptions(
36
37 #-inputs-----------------------------------------------------------------#
38 'bam=s' => \$fn_bam,
39 'ref=s' => \$fn_ref,
40 #-outputs----------------------------------------------------------------#
41 'table=s' => \$fn_table,
42 'consensus=s' => \$fn_consensus,
43 'bedgraph=s' => \$fn_bedgraph,
44 #-knobs------------------------------------------------------------------#
45 'min_qual=i' => \$min_qual,
46 'min_depth=i' => \$min_depth,
47 'trim=f' => \$trim_fraction,
48 'window=i' => \$sliding_window,
49 'bg_bin_figs=i' => \$bg_bin_figs,
50 'verbose' => \$verbose,
51 'help' => sub{ pod2usage(-verbose => 2); },
52 'version' => sub{ print "This is $PROGRAM v$VERSION\n";exit; },
53
54 ) or pod2usage( -verbose => 1);
55
56 # check for recent version of samtools
57 my $SAMTOOLS = can_run('samtools')
58 // die "Samtools is required but not found\n";
59 my $v_string = `$SAMTOOLS --version`;
60 if ($v_string =~ /^samtools (\d+)\.(\d+)/m) {
61 die "Requires samtools >= 1.3.0\n" if ($1 < 1 || $2 < 3);
62 } else {
63 die "Error parsing samtools version string\n";
64 }
65
66 # check for mafft
67 my $MAFFT = can_run('mafft')
68 // die "MAFFT is required but not found\n";
69
70
71 # misc param checking
72 die "Error: must specify at least one output target" if (! (
73 defined $fn_table
74 || defined $fn_consensus
75 || defined $fn_bedgraph
76 ));
77 die "Error: missing reference parameter"
78 if (! defined $fn_ref);
79 die "Error reading reference"
80 if (! -r $fn_ref);
81
82
83 # globals
84 my @errors;
85 my @lines = () if (defined $fn_table);
86
87 my %iupac = (
88 A => 'A',
89 C => 'C',
90 G => 'G',
91 T => 'T',
92 AG => 'R',
93 CT => 'Y',
94 CG => 'S',
95 AT => 'W',
96 GT => 'K',
97 AC => 'M',
98 CGT => 'B',
99 AGT => 'D',
100 ACT => 'H',
101 ACG => 'V',
102 ACGT => 'N',
103 );
104
105 my @consensi;
106 my $bg = '';
107
108 my @curr_lines;
109 my $last_chr;
110
111 my $last_depth = undef;
112 my $last_loc = 0;
113 my $bg_start = 0;
114 my $bg_loc = 0;
115
116
117 # initialize random-access sequence collection
118 my $seqs = BioX::Seq::Fetch->new($fn_ref) or die "Error loading reference";
119
120
121 # pipe from samtools mpileup command
122 # (note: this is much faster in testing than using Perl bindings, e.g.
123 # Bio::DB::HTS or the like)
124
125 $fn_bam //= '-'; # use stdin if BAM file not given
126
127 open my $fh, '-|', $SAMTOOLS,
128 'mpileup',
129 '-d' => 1000000,
130 '-B',
131 '-f' => $fn_ref,
132 $fn_bam ;
133
134
135
136 LINE:
137 while (my $line = <$fh>) {
138
139 chomp $line;
140 my ($chr, @other) = split "\t", $line;
141 $last_chr //= $chr;
142
143 if ($chr ne $last_chr) {
144 process(\@curr_lines);
145 @curr_lines = ();
146 $last_chr = $chr;
147 }
148
149 push @curr_lines, $line;
150 }
151
152 process(\@curr_lines); # don't forget last call
153
154 # output bedgraph if asked
155 if (defined $fn_bedgraph) {
156 open my $fh_bedgraph, '>', $fn_bedgraph;
157 print {$fh_bedgraph}
158 "track type=bedGraph name=read_coverage maxHeightPixels=1000:80:20\n";
159 print {$fh_bedgraph} $bg;
160 close $fh_bedgraph;
161
162 }
163
164 # output fasta if asked
165 if (defined $fn_consensus) {
166
167 open my $out, '>', $fn_consensus;
168 print {$out} $_->as_fasta for (@consensi);
169 close $out;
170
171 }
172
173 # build and process table if asked
174 if (defined $fn_table) {
175
176 # calculate sliding errors
177 my @avg_errors;
178 my $l = scalar(@errors);
179 $sliding_window = $l if ($l < $sliding_window);
180 my $left = floor(($sliding_window-1)/2);
181 my $right = ceil(($sliding_window-1)/2);
182 my $lower = $left;
183 my $upper = $l - $right;
184 for my $i (0..$#errors) {
185 my @pool;
186 if ($i < $lower) {
187 @pool = (@errors[0..$i-1] ,@errors[$i+1..$sliding_window-1]);
188 }
189 elsif ($i >= $upper) {
190 @pool = (@errors[$l-$sliding_window..$i-1], @errors[$i+1..$l-1]);
191 }
192 else {
193 @pool = (@errors[$i-$left..$i-1], @errors[$i+1..$i+$right]);
194 }
195 die "bad pool size @pool at $i" if (scalar(@pool)+1 != $sliding_window);
196
197 # calc trimmed mean
198 @pool = sort {$a <=> $b} @pool;
199 my $l = @pool;
200 my @trimmed
201 = @pool[ int($l*$trim_fraction), int($l*(1-$trim_fraction))+0.5 ];
202 my $tm = scalar(@trimmed) > 0 ? sum(@trimmed)/scalar(@trimmed) : 'NA';
203 push @avg_errors, $tm;
204 }
205
206 open my $fh_table, '>', $fn_table;
207
208 # print table header
209 print {$fh_table} join( "\t", (
210 'id',
211 'loc',
212 'ref',
213 'called',
214 'total_depth',
215 'counted_depth',
216 'mm_rate',
217 'A_count',
218 'T_count',
219 'G_count',
220 'C_count',
221 'N_count',
222 'gap_count',
223 'A_freq',
224 'T_freq',
225 'G_freq',
226 'C_freq',
227 'N_freq',
228 'gap_freq',
229 'A_sb',
230 'T_sb',
231 'G_sb',
232 'C_sb',
233 'bgnd_err',
234 'insertions'
235 ) ) . "\n";
236
237 my $iter = 0;
238 POS:
239 for (0..$#lines) {
240 my @parts = @{ $lines[$_] };
241 @parts[23] = sprintf '%.3f', $avg_errors[$_];
242 print {$fh_table} join( "\t",@parts), "\n";
243 }
244 close $fh_table;
245 }
246
247 sub process {
248
249 my $ln_ref = shift;
250
251 my $last_chr;
252 $last_depth = undef;
253 $last_loc = 0;
254 $bg_start = 0;
255 $bg_loc = 0;
256
257 LINE:
258 for my $line (@$ln_ref) {
259 chomp $line;
260 my @parts = split "\t", $line;
261 my $chr = $parts[0];
262 my $loc = $parts[1];
263 my $ref = uc $parts[2];
264 my $depth = $parts[3];
265 my $read_string = $parts[4];
266 my $qual_string = $parts[5];
267
268 # check that chr hasn't changed (don't supported multiple refs)
269 $last_chr = $last_chr // $chr;
270 if ($chr ne $last_chr) {
271 #process current, start new
272 }
273
274 # simulate missing rows
275 my $t = $last_loc + 1;
276 while ($t < $loc) {
277 handle_entry(
278 $chr,
279 $t,
280 $seqs->fetch_seq($chr, $t, $t),
281 #substr($ref_seq, $t-1, 1),
282 0,
283 '',
284 '',
285 );
286 ++$t;
287 }
288
289 # send entry for handling
290 handle_entry(
291 $chr,
292 $loc,
293 $ref,
294 $depth,
295 $read_string,
296 $qual_string,
297 );
298
299 $last_loc = $loc;
300
301 }
302
303 # simulate missing rows at end
304 my $t = $last_loc + 1;
305 my $l = $seqs->length($last_chr);
306 while ($t <= $l) {
307 handle_entry(
308 $last_chr,
309 $t,
310 $seqs->fetch_seq($last_chr, $t, $t),
311 #substr($ref_seq, $t-1, 1),
312 0,
313 '',
314 '',
315 );
316 ++$t;
317 }
318
319 if (defined $fn_bedgraph) {
320
321 $bg .= join("\t", $last_chr, $bg_start, $bg_loc, $last_depth) . "\n";
322 }
323
324 }
325
326
327 sub handle_entry {
328
329 my ($chr, $loc, $ref, $depth, $read_string, $qual_string) = @_;
330
331 my $called = '';
332
333 # handle zero-depth positions separately
334 if ($depth < 1) {
335 $called = 'N';
336 print "Missing coverage at $chr pos $loc\n" if ($verbose);
337 if (defined $fn_table) {
338 push @lines, [
339 $chr,
340 $loc,
341 $ref,
342 'N',
343 (0) x 19,
344 undef,
345 '',
346 ];
347 }
348 push @errors, 0;
349 }
350
351 # everything else
352 else {
353
354 # handle insertions
355 my %inserts;
356 my $insert_count = 0;
357 while ($read_string =~ /\+(\d+)((??{"[ATGCNatgcnRYSWKMBDHVryswkmbdhv-]{$^N}"}))/g) {
358 $inserts{$2} += 1;
359 ++$insert_count;
360 }
361
362 # ...and strip extra characters
363 $read_string =~ s/\^.//g;
364 $read_string =~ s/[\+\-](\d+)(??{"[ATGCNatgcnRYSWKMBDHVryswkmbdhv-]{$^N}"})//g;
365 $read_string =~ s/[^\.\,\w\*]//g;
366
367 # simple parse check
368 my $l1 = length($read_string);
369 my $l2 = length($qual_string);
370 die "read/qual mismatch ($l1 v $l2)" if ($l1 != $l2);
371
372 die "unexpected char at $chr pos $loc\n"
373 if ($read_string =~ /[^.,ATGCNatgcn*]/);
374
375 my $lc = lc $ref;
376 $read_string =~ s/\./$ref/g;
377 $read_string =~ s/\,/$lc/g;
378 $read_string =~ s/n/N/g;
379
380 # split into arrays
381 my %counts = map {$_ => 0} qw/A T G C N a t g c */;
382 my %cons_counts = map {$_ => 0} qw/A T G C N a t g c */;
383 my @chars = split '', $read_string;
384 my @quals = map {ord($_) - 33} split('', $qual_string);
385
386 READ:
387 for my $i (0..$#chars) {
388 ++$cons_counts{ uc($chars[$i]) };
389 ++$counts{ $chars[$i] } if ($quals[$i] >= $min_qual);
390 }
391
392 # calculate strand bias and collapse counts
393 my %sb;
394 for my $b (qw/A T G C/) {
395 my $n = $counts{$b} + $counts{lc($b)};
396 $sb{$b} = $n > 0
397 ? ($n-1)/$n*(2*max($counts{$b}/$n, ($n-$counts{$b})/$n)-1)
398 : 0;
399 $counts{$b} += $counts{lc($b)};
400 delete $counts{lc($b)};
401 }
402
403 $counts{$ref} = $counts{$ref} // 0; # some IUPAC codes not defined above
404 $cons_counts{$ref} = $cons_counts{$ref} // 0; # some IUPAC codes not defined above
405 my $mismatches = sum map {$counts{$_}} grep {$_ ne $ref} keys %counts;
406 my $counted_depth = $counts{$ref} + $mismatches;
407 my $cons_depth = sum map {$cons_counts{$_}} keys %counts;
408 my $error_rate = $counted_depth == 0
409 ? 0
410 : sprintf '%.4f', $mismatches/$counted_depth;
411 push @errors, $error_rate;
412
413 my @insert_strings = ();
414 my $consensus_insert = '';
415
416 #create case-insensitive insert hash
417 my %combined_inserts;
418 for (keys %inserts) {
419 $combined_inserts{uc($_)} += $inserts{$_};
420 }
421
422 if (scalar(keys %combined_inserts) > 0) {
423 my @sorted_inserts = sort {
424 $combined_inserts{$b} <=> $combined_inserts{$a}
425 || $a cmp $b
426 } keys %combined_inserts;
427 for (@sorted_inserts) {
428 my $f_count = $inserts{$_} // 0;
429 my $r_count = $inserts{lc($_)} // 0;
430 my $n = $combined_inserts{$_};
431 my $sb = sprintf '%.3f', ($n-1)/$n*max($f_count/$n, ($n-$f_count)/$n);
432 push @insert_strings, "$_($f_count,$r_count:$sb)";
433 }
434
435 # decide whether to include insert in consensus
436 if ($insert_count/$l1 > 0.5) {
437 my @realigned = realign(\%combined_inserts);
438 for my $i (0..$#realigned) {
439 my @b = sort {
440 $realigned[$i]->{$b} <=> $realigned[$i]->{$a}
441 } keys %{ $realigned[$i] };
442 if ($realigned[$i]->{$b[0]}/$l1 > 0.5) {
443 $consensus_insert .= uc $b[0];
444 }
445 }
446 }
447
448 }
449 if ($cons_depth < $min_depth) {
450 $called = 'N';
451 }
452 else {
453 my @sorted
454 = sort {$cons_counts{$b} <=> $cons_counts{$a}} keys %cons_counts;
455
456 # get all top hits that aren't gaps
457 my @equal_hits = grep {
458 $cons_counts{$_} eq $cons_counts{$sorted[0]} && $_ ne '*'
459 } @sorted;
460
461 if (scalar(@equal_hits)) {
462 my $tag = join('',sort {$a cmp $b} @equal_hits);
463 die "bad tag $tag" if (! defined $iupac{$tag});
464 $called = $iupac{$tag};
465 }
466 }
467 $called .= $consensus_insert;
468
469 print "consensus/reference difference at $chr pos $loc (ref: $ref cons: $called)\n"
470 if ($verbose && $called ne $ref);
471
472 if (defined $fn_table) {
473 push @lines, [
474 $chr,
475 $loc,
476 $ref,
477 $called eq '' ? '-' : $called,
478 $depth,
479 $counted_depth,
480 sprintf('%.3f',$error_rate),
481 $counts{A},
482 $counts{T},
483 $counts{G},
484 $counts{C},
485 $counts{N},
486 $counts{'*'},
487 sprintf('%.3f',$counts{A}/$counted_depth),
488 sprintf('%.3f',$counts{T}/$counted_depth),
489 sprintf('%.3f',$counts{G}/$counted_depth),
490 sprintf('%.3f',$counts{C}/$counted_depth),
491 sprintf('%.3f',$counts{N}/$counted_depth),
492 sprintf('%.3f',$counts{'*'}/$counted_depth),
493 sprintf('%.3f',$sb{A}),
494 sprintf('%.3f',$sb{T}),
495 sprintf('%.3f',$sb{G}),
496 sprintf('%.3f',$sb{C}),
497 undef,
498 join(':',@insert_strings)
499 ];
500 }
501 }
502
503 my $consensus = first {$_->id eq $chr} @consensi;
504 if (! defined $consensus) {
505 $consensus = BioX::Seq->new('',$chr);
506 push @consensi, $consensus;
507 }
508 $consensus->seq .= $called;
509
510 my $cons_len = length($called);
511
512 # Generate bedgraph output
513 if (defined $fn_bedgraph && $cons_len > 0) {
514
515 # bin depth if requested
516 if ($bg_bin_figs > 0) {
517 my $divisor = 10**max(0, length($depth)-$bg_bin_figs);
518 $depth = int($depth/$divisor) * $divisor;
519 }
520
521 # output on depth change
522 if (! defined $last_depth || $depth != $last_depth) {
523 $bg .= join("\t",$last_chr, $bg_start, $bg_loc, $last_depth) . "\n"
524 if (defined $last_depth);
525 $last_depth = $depth;
526 $bg_start = $bg_loc;
527 }
528
529 $bg_loc += $cons_len;
530
531 }
532
533 }
534
535
536 sub realign {
537
538 # calculate a local realignment at indel using MAFFT
539 # TODO: reimplement using native code
540
541 my ($hash) = @_;
542
543 my @seqs = keys %{ $hash };
544 my @weights = map {$hash->{$_}} @seqs;
545 my @scores;
546 if (scalar(@seqs) > 1) {
547 my ($fh, $fn) = tempfile;
548 for (0..$#seqs) {
549 my $n = $_ + 1;
550 print {$fh} ">$n\n$seqs[$_]\n";
551 }
552 close $fh;
553 open my $stream, '-|', $MAFFT,
554 '--auto',
555 '--quiet',
556 '--op' => 0,
557 '--lop' => 0,
558 $fn;
559 my $p = BioX::Seq::Stream->new($stream);
560 while (my $seq = $p->next_seq) {
561 my $w = shift @weights;
562 for (0..length($seq)-1) {
563 my $base = substr $seq, $_, 1;
564 next if ($base eq '-');
565 $scores[$_] = {} if (! defined $scores[$_]);
566 $scores[$_]->{$base} += $w;
567 }
568 }
569 }
570 else {
571 my $seq = $seqs[0];
572 my $w = $weights[0];
573 for (0..length($seq)-1) {
574 my $base = substr $seq, $_, 1;
575 next if ($base eq '-');
576 $scores[$_] = {} if (! defined $scores[$_]);
577 $scores[$_]->{$base} += $w;
578 }
579 }
580 return @scores;
581
582 }
583
584
585 __END__
586
587 =head1 NAME
588
589 bam2consensus - consensus calling (etc) from BAM alignment
590
591 =head1 SYNOPSIS
592
593 bam2consensus --bam <in.bam> --ref <in.fasta> [options] --consensus <out.fasta>
594
595 =head1 DESCRIPTION
596
597 Re-calls a consensus sequence based on a BAM alignment to reference, with
598 various knobs and optional output formats
599
600 =head1 PREREQUISITES
601
602 Requires the following non-core Perl libraries:
603
604 =over 1
605
606 =item * BioX::Seq
607
608 =back
609
610 as well as the following binaries:
611
612 =over 1
613
614 =item * samtools (>= 1.3.1)
615
616 =item * mafft
617
618 =back
619
620 =head1 OPTIONS
621
622 =head2 Input (required)
623
624 =over 4
625
626 =item B<--bam> I<filename>
627
628 Path to input BAM alignments
629
630 =item B<--ref> I<filename>
631
632 Path to reference sequence used to generate BAM alignments
633
634 =back
635
636 =head2 Output (at least one is required, can specify more than one)
637
638 =over 4
639
640 =item B<--consensus>
641
642 Path to write consensus sequence to (as FASTA)
643
644 =item B<--bedgraph>
645
646 Path to write coverage file to (as bedgraph)
647
648 =item B<--table>
649
650 Path to write coverage file to (as tab-separated table)
651
652 =back
653
654 =head2 Configuration
655
656 =over 4
657
658 =item B<--min_qual>
659
660 Minimum quality for a base to be considered in consensus calling. Default: 10.
661
662 =item B<--min_depth>
663
664 Minimum read depth for consensus to be called (otherwise called as "N"). Default: 3.
665
666 =item B<--trim>
667
668 Fraction to trim from each end when calculating trimmed mean of error window.
669 Default: 0.2.
670
671 =item B<--window>
672
673 Size of sliding window used to calculate local error rates. Default: 30.
674
675 =item B<--bg_bin_figs> <integer>
676
677 If greater than zero, the number of significant figures used to bin depths in
678 bedgraph output. If zero, no binning is applied. This option is useful to
679 reduce the size of bedgraph output by binning similar depth values when high
680 resolution is not important. Default: 0 (disabled).
681
682 =back
683
684 =head1 CAVEATS AND BUGS
685
686 Please submit bug reports to the issue tracker in the distribution repository.
687
688 =head1 AUTHOR
689
690 Jeremy Volkening (jdv@base2bio.com)
691
692 =head1 LICENSE AND COPYRIGHT
693
694 Copyright 2014-17 Jeremy Volkening
695
696 This program is free software: you can redistribute it and/or modify
697 it under the terms of the GNU General Public License as published by
698 the Free Software Foundation, either version 3 of the License, or
699 (at your option) any later version.
700
701 This program is distributed in the hope that it will be useful,
702 but WITHOUT ANY WARRANTY; without even the implied warranty of
703 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
704 GNU General Public License for more details.
705
706 You should have received a copy of the GNU General Public License
707 along with this program. If not, see <http://www.gnu.org/licenses/>.
708
709 =cut
710