# HG changeset patch # User jdv # Date 1632809492 0 # Node ID 20d941ecc1bb532cfde8d30010f021a7b2abfaa3 # Parent 6fb0eff4697207502561326dacbb51c921576cf6 "planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty" diff -r 6fb0eff46972 -r 20d941ecc1bb bam2consensus --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bam2consensus Tue Sep 28 06:11:32 2021 +0000 @@ -0,0 +1,710 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use 5.012; + +use BioX::Seq; +use BioX::Seq::Stream; +use BioX::Seq::Fetch; +use File::Temp qw/tempfile/; +use IPC::Cmd qw/can_run/; +use Getopt::Long; +use List::Util qw/sum max first/; +use Pod::Usage; +use POSIX qw/floor ceil/; + +#-inputs---------------------------------------------------------------------# +my $fn_bam; +my $fn_ref; +#-outputs--------------------------------------------------------------------# +my $fn_table; +my $fn_consensus; +my $fn_bedgraph; +#-knobs----------------------------------------------------------------------# +my $min_qual = 10; +my $min_depth = 3; +my $trim_fraction = 0.2; +my $sliding_window = 30; +my $bg_bin_figs = 0; +my $verbose = 0; + +my $PROGRAM = 'bam2consensus'; +my $VERSION = 0.004; + +GetOptions( + + #-inputs-----------------------------------------------------------------# + 'bam=s' => \$fn_bam, + 'ref=s' => \$fn_ref, + #-outputs----------------------------------------------------------------# + 'table=s' => \$fn_table, + 'consensus=s' => \$fn_consensus, + 'bedgraph=s' => \$fn_bedgraph, + #-knobs------------------------------------------------------------------# + 'min_qual=i' => \$min_qual, + 'min_depth=i' => \$min_depth, + 'trim=f' => \$trim_fraction, + 'window=i' => \$sliding_window, + 'bg_bin_figs=i' => \$bg_bin_figs, + 'verbose' => \$verbose, + 'help' => sub{ pod2usage(-verbose => 2); }, + 'version' => sub{ print "This is $PROGRAM v$VERSION\n";exit; }, + +) or pod2usage( -verbose => 1); + +# check for recent version of samtools +my $SAMTOOLS = can_run('samtools') + // die "Samtools is required but not found\n"; +my $v_string = `$SAMTOOLS --version`; +if ($v_string =~ /^samtools (\d+)\.(\d+)/m) { + die "Requires samtools >= 1.3.0\n" if ($1 < 1 || $2 < 3); +} else { + die "Error parsing samtools version string\n"; +} + +# check for mafft +my $MAFFT = can_run('mafft') + // die "MAFFT is required but not found\n"; + + +# misc param checking +die "Error: must specify at least one output target" if (! ( + defined $fn_table + || defined $fn_consensus + || defined $fn_bedgraph +)); +die "Error: missing reference parameter" + if (! defined $fn_ref); +die "Error reading reference" + if (! -r $fn_ref); + + +# globals +my @errors; +my @lines = () if (defined $fn_table); + +my %iupac = ( + A => 'A', + C => 'C', + G => 'G', + T => 'T', + AG => 'R', + CT => 'Y', + CG => 'S', + AT => 'W', + GT => 'K', + AC => 'M', + CGT => 'B', + AGT => 'D', + ACT => 'H', + ACG => 'V', + ACGT => 'N', +); + +my @consensi; +my $bg = ''; + +my @curr_lines; +my $last_chr; + +my $last_depth = undef; +my $last_loc = 0; +my $bg_start = 0; +my $bg_loc = 0; + + +# initialize random-access sequence collection +my $seqs = BioX::Seq::Fetch->new($fn_ref) or die "Error loading reference"; + + +# pipe from samtools mpileup command +# (note: this is much faster in testing than using Perl bindings, e.g. +# Bio::DB::HTS or the like) + +$fn_bam //= '-'; # use stdin if BAM file not given + +open my $fh, '-|', $SAMTOOLS, + 'mpileup', + '-d' => 1000000, + '-B', + '-f' => $fn_ref, + $fn_bam ; + + + +LINE: +while (my $line = <$fh>) { + + chomp $line; + my ($chr, @other) = split "\t", $line; + $last_chr //= $chr; + + if ($chr ne $last_chr) { + process(\@curr_lines); + @curr_lines = (); + $last_chr = $chr; + } + + push @curr_lines, $line; +} + +process(\@curr_lines); # don't forget last call + +# output bedgraph if asked +if (defined $fn_bedgraph) { + open my $fh_bedgraph, '>', $fn_bedgraph; + print {$fh_bedgraph} + "track type=bedGraph name=read_coverage maxHeightPixels=1000:80:20\n"; + print {$fh_bedgraph} $bg; + close $fh_bedgraph; + +} + +# output fasta if asked +if (defined $fn_consensus) { + + open my $out, '>', $fn_consensus; + print {$out} $_->as_fasta for (@consensi); + close $out; + +} + +# build and process table if asked +if (defined $fn_table) { + + # calculate sliding errors + my @avg_errors; + my $l = scalar(@errors); + $sliding_window = $l if ($l < $sliding_window); + my $left = floor(($sliding_window-1)/2); + my $right = ceil(($sliding_window-1)/2); + my $lower = $left; + my $upper = $l - $right; + for my $i (0..$#errors) { + my @pool; + if ($i < $lower) { + @pool = (@errors[0..$i-1] ,@errors[$i+1..$sliding_window-1]); + } + elsif ($i >= $upper) { + @pool = (@errors[$l-$sliding_window..$i-1], @errors[$i+1..$l-1]); + } + else { + @pool = (@errors[$i-$left..$i-1], @errors[$i+1..$i+$right]); + } + die "bad pool size @pool at $i" if (scalar(@pool)+1 != $sliding_window); + + # calc trimmed mean + @pool = sort {$a <=> $b} @pool; + my $l = @pool; + my @trimmed + = @pool[ int($l*$trim_fraction), int($l*(1-$trim_fraction))+0.5 ]; + my $tm = scalar(@trimmed) > 0 ? sum(@trimmed)/scalar(@trimmed) : 'NA'; + push @avg_errors, $tm; + } + + open my $fh_table, '>', $fn_table; + + # print table header + print {$fh_table} join( "\t", ( + 'id', + 'loc', + 'ref', + 'called', + 'total_depth', + 'counted_depth', + 'mm_rate', + 'A_count', + 'T_count', + 'G_count', + 'C_count', + 'N_count', + 'gap_count', + 'A_freq', + 'T_freq', + 'G_freq', + 'C_freq', + 'N_freq', + 'gap_freq', + 'A_sb', + 'T_sb', + 'G_sb', + 'C_sb', + 'bgnd_err', + 'insertions' + ) ) . "\n"; + + my $iter = 0; + POS: + for (0..$#lines) { + my @parts = @{ $lines[$_] }; + @parts[23] = sprintf '%.3f', $avg_errors[$_]; + print {$fh_table} join( "\t",@parts), "\n"; + } + close $fh_table; +} + +sub process { + + my $ln_ref = shift; + + my $last_chr; + $last_depth = undef; + $last_loc = 0; + $bg_start = 0; + $bg_loc = 0; + + LINE: + for my $line (@$ln_ref) { + chomp $line; + my @parts = split "\t", $line; + my $chr = $parts[0]; + my $loc = $parts[1]; + my $ref = uc $parts[2]; + my $depth = $parts[3]; + my $read_string = $parts[4]; + my $qual_string = $parts[5]; + + # check that chr hasn't changed (don't supported multiple refs) + $last_chr = $last_chr // $chr; + if ($chr ne $last_chr) { + #process current, start new + } + + # simulate missing rows + my $t = $last_loc + 1; + while ($t < $loc) { + handle_entry( + $chr, + $t, + $seqs->fetch_seq($chr, $t, $t), + #substr($ref_seq, $t-1, 1), + 0, + '', + '', + ); + ++$t; + } + + # send entry for handling + handle_entry( + $chr, + $loc, + $ref, + $depth, + $read_string, + $qual_string, + ); + + $last_loc = $loc; + + } + + # simulate missing rows at end + my $t = $last_loc + 1; + my $l = $seqs->length($last_chr); + while ($t <= $l) { + handle_entry( + $last_chr, + $t, + $seqs->fetch_seq($last_chr, $t, $t), + #substr($ref_seq, $t-1, 1), + 0, + '', + '', + ); + ++$t; + } + + if (defined $fn_bedgraph) { + + $bg .= join("\t", $last_chr, $bg_start, $bg_loc, $last_depth) . "\n"; + } + +} + + +sub handle_entry { + + my ($chr, $loc, $ref, $depth, $read_string, $qual_string) = @_; + + my $called = ''; + + # handle zero-depth positions separately + if ($depth < 1) { + $called = 'N'; + print "Missing coverage at $chr pos $loc\n" if ($verbose); + if (defined $fn_table) { + push @lines, [ + $chr, + $loc, + $ref, + 'N', + (0) x 19, + undef, + '', + ]; + } + push @errors, 0; + } + + # everything else + else { + + # handle insertions + my %inserts; + my $insert_count = 0; + while ($read_string =~ /\+(\d+)((??{"[ATGCNatgcnRYSWKMBDHVryswkmbdhv-]{$^N}"}))/g) { + $inserts{$2} += 1; + ++$insert_count; + } + + # ...and strip extra characters + $read_string =~ s/\^.//g; + $read_string =~ s/[\+\-](\d+)(??{"[ATGCNatgcnRYSWKMBDHVryswkmbdhv-]{$^N}"})//g; + $read_string =~ s/[^\.\,\w\*]//g; + + # simple parse check + my $l1 = length($read_string); + my $l2 = length($qual_string); + die "read/qual mismatch ($l1 v $l2)" if ($l1 != $l2); + + die "unexpected char at $chr pos $loc\n" + if ($read_string =~ /[^.,ATGCNatgcn*]/); + + my $lc = lc $ref; + $read_string =~ s/\./$ref/g; + $read_string =~ s/\,/$lc/g; + $read_string =~ s/n/N/g; + + # split into arrays + my %counts = map {$_ => 0} qw/A T G C N a t g c */; + my %cons_counts = map {$_ => 0} qw/A T G C N a t g c */; + my @chars = split '', $read_string; + my @quals = map {ord($_) - 33} split('', $qual_string); + + READ: + for my $i (0..$#chars) { + ++$cons_counts{ uc($chars[$i]) }; + ++$counts{ $chars[$i] } if ($quals[$i] >= $min_qual); + } + + # calculate strand bias and collapse counts + my %sb; + for my $b (qw/A T G C/) { + my $n = $counts{$b} + $counts{lc($b)}; + $sb{$b} = $n > 0 + ? ($n-1)/$n*(2*max($counts{$b}/$n, ($n-$counts{$b})/$n)-1) + : 0; + $counts{$b} += $counts{lc($b)}; + delete $counts{lc($b)}; + } + + $counts{$ref} = $counts{$ref} // 0; # some IUPAC codes not defined above + $cons_counts{$ref} = $cons_counts{$ref} // 0; # some IUPAC codes not defined above + my $mismatches = sum map {$counts{$_}} grep {$_ ne $ref} keys %counts; + my $counted_depth = $counts{$ref} + $mismatches; + my $cons_depth = sum map {$cons_counts{$_}} keys %counts; + my $error_rate = $counted_depth == 0 + ? 0 + : sprintf '%.4f', $mismatches/$counted_depth; + push @errors, $error_rate; + + my @insert_strings = (); + my $consensus_insert = ''; + + #create case-insensitive insert hash + my %combined_inserts; + for (keys %inserts) { + $combined_inserts{uc($_)} += $inserts{$_}; + } + + if (scalar(keys %combined_inserts) > 0) { + my @sorted_inserts = sort { + $combined_inserts{$b} <=> $combined_inserts{$a} + || $a cmp $b + } keys %combined_inserts; + for (@sorted_inserts) { + my $f_count = $inserts{$_} // 0; + my $r_count = $inserts{lc($_)} // 0; + my $n = $combined_inserts{$_}; + my $sb = sprintf '%.3f', ($n-1)/$n*max($f_count/$n, ($n-$f_count)/$n); + push @insert_strings, "$_($f_count,$r_count:$sb)"; + } + + # decide whether to include insert in consensus + if ($insert_count/$l1 > 0.5) { + my @realigned = realign(\%combined_inserts); + for my $i (0..$#realigned) { + my @b = sort { + $realigned[$i]->{$b} <=> $realigned[$i]->{$a} + } keys %{ $realigned[$i] }; + if ($realigned[$i]->{$b[0]}/$l1 > 0.5) { + $consensus_insert .= uc $b[0]; + } + } + } + + } + if ($cons_depth < $min_depth) { + $called = 'N'; + } + else { + my @sorted + = sort {$cons_counts{$b} <=> $cons_counts{$a}} keys %cons_counts; + + # get all top hits that aren't gaps + my @equal_hits = grep { + $cons_counts{$_} eq $cons_counts{$sorted[0]} && $_ ne '*' + } @sorted; + + if (scalar(@equal_hits)) { + my $tag = join('',sort {$a cmp $b} @equal_hits); + die "bad tag $tag" if (! defined $iupac{$tag}); + $called = $iupac{$tag}; + } + } + $called .= $consensus_insert; + + print "consensus/reference difference at $chr pos $loc (ref: $ref cons: $called)\n" + if ($verbose && $called ne $ref); + + if (defined $fn_table) { + push @lines, [ + $chr, + $loc, + $ref, + $called eq '' ? '-' : $called, + $depth, + $counted_depth, + sprintf('%.3f',$error_rate), + $counts{A}, + $counts{T}, + $counts{G}, + $counts{C}, + $counts{N}, + $counts{'*'}, + sprintf('%.3f',$counts{A}/$counted_depth), + sprintf('%.3f',$counts{T}/$counted_depth), + sprintf('%.3f',$counts{G}/$counted_depth), + sprintf('%.3f',$counts{C}/$counted_depth), + sprintf('%.3f',$counts{N}/$counted_depth), + sprintf('%.3f',$counts{'*'}/$counted_depth), + sprintf('%.3f',$sb{A}), + sprintf('%.3f',$sb{T}), + sprintf('%.3f',$sb{G}), + sprintf('%.3f',$sb{C}), + undef, + join(':',@insert_strings) + ]; + } + } + + my $consensus = first {$_->id eq $chr} @consensi; + if (! defined $consensus) { + $consensus = BioX::Seq->new('',$chr); + push @consensi, $consensus; + } + $consensus->seq .= $called; + + my $cons_len = length($called); + + # Generate bedgraph output + if (defined $fn_bedgraph && $cons_len > 0) { + + # bin depth if requested + if ($bg_bin_figs > 0) { + my $divisor = 10**max(0, length($depth)-$bg_bin_figs); + $depth = int($depth/$divisor) * $divisor; + } + + # output on depth change + if (! defined $last_depth || $depth != $last_depth) { + $bg .= join("\t",$last_chr, $bg_start, $bg_loc, $last_depth) . "\n" + if (defined $last_depth); + $last_depth = $depth; + $bg_start = $bg_loc; + } + + $bg_loc += $cons_len; + + } + +} + + +sub realign { + + # calculate a local realignment at indel using MAFFT + # TODO: reimplement using native code + + my ($hash) = @_; + + my @seqs = keys %{ $hash }; + my @weights = map {$hash->{$_}} @seqs; + my @scores; + if (scalar(@seqs) > 1) { + my ($fh, $fn) = tempfile; + for (0..$#seqs) { + my $n = $_ + 1; + print {$fh} ">$n\n$seqs[$_]\n"; + } + close $fh; + open my $stream, '-|', $MAFFT, + '--auto', + '--quiet', + '--op' => 0, + '--lop' => 0, + $fn; + my $p = BioX::Seq::Stream->new($stream); + while (my $seq = $p->next_seq) { + my $w = shift @weights; + for (0..length($seq)-1) { + my $base = substr $seq, $_, 1; + next if ($base eq '-'); + $scores[$_] = {} if (! defined $scores[$_]); + $scores[$_]->{$base} += $w; + } + } + } + else { + my $seq = $seqs[0]; + my $w = $weights[0]; + for (0..length($seq)-1) { + my $base = substr $seq, $_, 1; + next if ($base eq '-'); + $scores[$_] = {} if (! defined $scores[$_]); + $scores[$_]->{$base} += $w; + } + } + return @scores; + +} + + +__END__ + +=head1 NAME + +bam2consensus - consensus calling (etc) from BAM alignment + +=head1 SYNOPSIS + +bam2consensus --bam --ref [options] --consensus + +=head1 DESCRIPTION + +Re-calls a consensus sequence based on a BAM alignment to reference, with +various knobs and optional output formats + +=head1 PREREQUISITES + +Requires the following non-core Perl libraries: + +=over 1 + +=item * BioX::Seq + +=back + +as well as the following binaries: + +=over 1 + +=item * samtools (>= 1.3.1) + +=item * mafft + +=back + +=head1 OPTIONS + +=head2 Input (required) + +=over 4 + +=item B<--bam> I + +Path to input BAM alignments + +=item B<--ref> I + +Path to reference sequence used to generate BAM alignments + +=back + +=head2 Output (at least one is required, can specify more than one) + +=over 4 + +=item B<--consensus> + +Path to write consensus sequence to (as FASTA) + +=item B<--bedgraph> + +Path to write coverage file to (as bedgraph) + +=item B<--table> + +Path to write coverage file to (as tab-separated table) + +=back + +=head2 Configuration + +=over 4 + +=item B<--min_qual> + +Minimum quality for a base to be considered in consensus calling. Default: 10. + +=item B<--min_depth> + +Minimum read depth for consensus to be called (otherwise called as "N"). Default: 3. + +=item B<--trim> + +Fraction to trim from each end when calculating trimmed mean of error window. +Default: 0.2. + +=item B<--window> + +Size of sliding window used to calculate local error rates. Default: 30. + +=item B<--bg_bin_figs> + +If greater than zero, the number of significant figures used to bin depths in +bedgraph output. If zero, no binning is applied. This option is useful to +reduce the size of bedgraph output by binning similar depth values when high +resolution is not important. Default: 0 (disabled). + +=back + +=head1 CAVEATS AND BUGS + +Please submit bug reports to the issue tracker in the distribution repository. + +=head1 AUTHOR + +Jeremy Volkening (jdv@base2bio.com) + +=head1 LICENSE AND COPYRIGHT + +Copyright 2014-17 Jeremy Volkening + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . + +=cut + diff -r 6fb0eff46972 -r 20d941ecc1bb frag_lens --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/frag_lens Tue Sep 28 06:11:32 2021 +0000 @@ -0,0 +1,197 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use 5.012; + +use Cwd qw/abs_path/; +use File::Temp qw/tempdir tempfile/; +use IPC::Cmd qw/can_run/; +use List::Util qw/sum/; +use Getopt::Long; +use Pod::Usage; + +my @good_codes = ( 0x0002, 0x0040 ); +my @bad_codes = ( 0x0004, 0x0100, 0x0800 ); + +#-inputs---------------------------------------------------------------------# +my $fasta; +my $forward; +my $reverse; +my $sam; +#-knobs----------------------------------------------------------------------# +my $threads = 1; +my $max_align = 10000; + +my $PROGRAM = 'frag_lens'; +my $VERSION = 0.001; + +GetOptions( + #-inputs-----------------------------------------------------------------# + 'sam=s' => \$sam, + 'forward=s' => \$forward, + 'reverse=s' => \$reverse, + 'ref=s' => \$fasta, + #-knobs------------------------------------------------------------------# + 'threads=i' => \$threads, + 'max_aln=i' => \$max_align, + 'help' => sub{ pod2usage(-verbose => 2); }, + 'version' => sub{ say "This is $PROGRAM v$VERSION";exit; }, + +) or pod2usage( -verbose => 1); + +my $fh_sam; +my $tmp_fasta; + +if (defined $sam) { + open $fh_sam, '<', $sam or die "failed to open SAM\n"; +} + +else { + + my $BWA = can_run('bwa') + // die "BWA is required but not found\n"; + + my ($tmp_dir) = tempdir( CLEANUP => 1); + + die "specify forward and reverse read files and reference\n" + if (! defined $forward || ! defined $reverse || ! defined $fasta); + + $fasta = abs_path($fasta); + + my $res = system( + 'ln', + '-s', + $fasta, + "$tmp_dir/tmp.fasta" + ); + die "link failed" if ($res); + $res = system( + $BWA, + 'index', + "$tmp_dir/tmp.fasta" + ); + die "index failed" if ($res); + open $fh_sam, '-|', $BWA, + 'mem', + '-t' => $threads, + '-v' => 1, + "$tmp_dir/tmp.fasta", + $forward, + $reverse + ; +} + +my $c = 0; +while (my $line = <$fh_sam>) { + next if ($line =~ /^\@/); + chomp $line; + my @parts = split "\t", $line; + my $flags = $parts[1]; + my $sum1 = sum map {$_ & $flags ? 1 : 0} @good_codes; + my $sum2 = sum map {$_ & $flags ? 1 : 0} @bad_codes; + if ($sum1 == scalar @good_codes && $sum2 == 0) { + say abs($parts[8]); + last if (++$c >= $max_align); + } +} +close $fh_sam; + +__END__ + +=head1 NAME + +frag_lens - Calculate paired end fragment lengths from read alignment + +=head1 SYNOPSIS + +frag_lens [--sam ] OR [--ref --forward --reverse ] [options] > frag_lens.txt + +=head1 DESCRIPTION + +Calculates library fragment lengths based on paired-end read alignment. +Takes as input either a preprepared SAM alignment or a reference and read +files from which it produces an alignment. Outputs calculated fragment +lengths, one per line. + +=head1 PREREQUISITES + +Requires the following binaries: + +=over 1 + +=item * bwa + +=back + +=head1 OPTIONS + +=head2 Input option one + +=over 4 + +=item B<--sam> I + +Path to input SAM alignment. + +=back + +=head2 Input option two + +=over 4 + +=item B<--ref> I + +Path to reference sequence (e.g. assembly) + +=item B<--forward> I + +Forward reads in FASTQ format + +=item B<--reverse> I + +Reverse reads in FASTQ format + +=back + +=head2 Configuration + +=over 4 + +=item B<--max_align> + +Maximum number of alignment records to read as input. Used to limit run times. + +=item B<--threads> + +Number of threads to use for alignment (ignored if --sam is given) + +=back + +=head1 CAVEATS AND BUGS + +Please submit bug reports to the issue tracker in the distribution repository. + +=head1 AUTHOR + +Jeremy Volkening (jdv@base2bio.com) + +=head1 LICENSE AND COPYRIGHT + +Copyright 2014-19 Jeremy Volkening + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . + +=cut + diff -r 6fb0eff46972 -r 20d941ecc1bb summarize_assembly --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/summarize_assembly Tue Sep 28 06:11:32 2021 +0000 @@ -0,0 +1,208 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use 5.012; + +use Getopt::Long; +use Pod::Usage; +use List::Util qw/sum max min/; +use List::MoreUtils qw/uniq/; +use BioX::Seq::Stream; + +my $PROGRAM = 'summarize_assembly'; +my $VERSION = 0.002; + +my $fn_fasta; +my @cutoffs; +my $strip = 0; +my $split = 0; + +# Collect command-line parameters +my $err_msg = 'Syntax error: please check your syntax'; +GetOptions( + 'fasta=s' => \$fn_fasta, + 'cutoffs:i{,}' => \@cutoffs, + 'strip_N' => \$strip, + 'split_N' => \$split, + 'help' => sub{ pod2usage( -verbose => 2 ) }, + 'version' => sub{ print "This is $PROGRAM v$VERSION\n";exit; }, +) or pod2usage( -msg => $err_msg, -verbose => 1 ); + +# Set default cutoffs if necessary and sort +if (! scalar @cutoffs) { + warn "No cutoff supplied, defaulting to N50\n"; + push @cutoffs, 50; +} +@cutoffs = sort {$a <=> $b} uniq @cutoffs; + +# Only one of 'strip_N' or 'split_N' is valid +if ($strip && $split) { + warn "Only one of --strip_N or --split_N is valid, ignoring --strip_N\n"; + $strip = 0; +} + +# Check for a few necessary conditions +die "Can't open FASTA file for reading" + if (defined $fn_fasta && ! -r $fn_fasta); +die "One or more cutoffs outside valid range (1-99)" + if (grep {$_ < 1 || $_ > 99} @cutoffs); +die "Cutoffs must be integer values" + if (grep {$_ ne int($_)} @cutoffs); +my @lens; +my $N_sum = 0; +my $GC_sum = 0; + +# Read in sequences and calculate descriptive stats +my $stream = BioX::Seq::Stream->new( $fn_fasta ); #STDIN if undefined + +SEQ: +while (my $seq = $stream->next_seq) { + my @parts = ($seq); + @parts = split(/n+/i, $seq) if $split; + for (@parts) { + my $Ns = ($_ =~ tr/Nn//); + $N_sum += $Ns; + $GC_sum += ($_ =~ tr/GCgc//); + push @lens, length($_) - $Ns * $strip; + } +} +@lens = sort {$b <=> $a} @lens; + +# Calculate basic stats +my $scaffold_count = scalar @lens; +my $total_len = sum @lens; +my $N_fraction = round( $N_sum/($total_len + $N_sum*$strip), 2 )*100; +my $max_length = max @lens; +my $min_length = min @lens; +my $mean_length = round($total_len/$scaffold_count, 0); + +# GC percentage calculated from non-ambiguous bases only +my $GC_fraction = round( $GC_sum/($total_len + ($strip - 1)*$N_sum), 2 )*100; + +# Calculate Nx (N50, N80, etc) values +# For example, N50 is the size of the smallest contig for which it and all +# larger contigs contain 50% of the total nucleotides in the database +my $cum_length = 0; +my @fractions = map {$_/100} @cutoffs; +my @Nx; + +LEN: +for (@lens) { + $cum_length += $_; + if ($cum_length/$total_len >= $fractions[0]) { + push @Nx, $_; + shift @fractions; + last LEN if (@fractions < 1); + } +} + +# Print summary +print '-' x 40 . "\n" + . "Summary\n" + . '-' x 40 . "\n" + . "number of scaffolds: $scaffold_count\n" + . "total length: $total_len\n" + . "average length: $mean_length\n" + . "G/C content: $GC_fraction\%\n" + . "ambiguous content: $N_fraction\%\n" + . "longest scaffold: $max_length\n"; +for (0..$#Nx) { + my $label = sprintf "N%02d", $cutoffs[$_]; + print "$label: $Nx[$_]\n"; +} +print "shortest scaffold: $min_length\n"; +print "NOTE: Ns were stripped for above calculations\n" if ($strip); +print "NOTE: Scaffolds were split on Ns for above calculations\n" if ($split); +print '-' x 40 . "\n"; + +exit; + +sub round { + + my ($val,$places) = @_; + return int($val*10**$places+0.5)/10**$places; + +} + + +__END__ + +=head1 NAME + +summarize_assembly - print basic summary info for a file of assembly scaffolds + +=head1 SYNOPSIS + +summarize_assembly [--cutoffs I I .. I --strip_N --split_N ] --fasta I] + +=head1 DESCRIPTION + +This program takes a FASTA file and optionally a list of cutoff values as +input and prints out summary information about the contigs/scaffolds contained +in the file. You can, of course, supply a FASTA file of any sort of nucleic +acid sequences, but the summary information makes most sense for contigs from +genomic sequencing assemblies. + +=head1 OPTIONS + +=over + +=item B<--fasta> I + +Specify contig/scaffold file from which to read input (default: STDIN) + +=item B<--cutoffs> + +Space-separated integer list of cutoffs to calculate (e.g. '--cutoffs 50 90' +will output N50 and N90 values) (default: 50) + +=item B<--strip_N> + +If specified, Ns will be stripped from scaffold sequences before statistics +are calculated (default: FALSE) + +=item B<--split_N> + +If specified, scaffold sequences will be split at regions of one or more Ns +before statistics are calculated (e.g. to get contig-level stats from a +scaffold file). Note that if this flag is specified, the value of '--strip_N' +will be ignored. (default: FALSE) + +=item B<--help> + +Display this usage page + +=item B<--version> + +Print version information + +=back + +=head1 CAVEATS AND BUGS + +Please submit bug reports to the issue tracker in the distribution repository. + +=head1 AUTHOR + +Jeremy Volkening (jdv@base2bio.com) + +=head1 LICENSE AND COPYRIGHT + +Copyright 2014-16 Jeremy Volkening + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . + +=cut + diff -r 6fb0eff46972 -r 20d941ecc1bb summarize_run.pl --- a/summarize_run.pl Wed Jul 17 17:47:15 2019 -0400 +++ b/summarize_run.pl Tue Sep 28 06:11:32 2021 +0000 @@ -17,7 +17,6 @@ my $fn_consensus; my $fn_out; my $n_threads = 1; -my $max_aln = 100000; GetOptions( 'raw_1=s' => \$fn_raw1, @@ -30,7 +29,6 @@ 'consensus=s' => \$fn_consensus, 'out=s' => \$fn_out, 'threads=i' => \$n_threads, - 'max_aln=i' => \$max_aln, ); @@ -50,14 +48,12 @@ die "raw pair count mismatch\n" if ($counts[0] != $counts[1]); die "filtered pair count mismatch\n" if ($counts[2] != $counts[3]); -#warn "calculating fragment length stats...\n"; +# read fragment stats from STDIN my @lens; -open my $stream, '-|', "frag_lens","--forward",$fn_filt1,"--reverse",$fn_filt2,"--ref",$fn_consensus,"--threads",$n_threads,"--max_aln",$max_aln; -while (<$stream>) { +while () { chomp $_; push @lens, $_; } -close $stream; my $frag_mean = int( sum(@lens)/scalar(@lens)+0.5 ); my $frag_sd = int( sqrt( sum( map {($_ - $frag_mean)**2} @lens)/(scalar(@lens)-1) )+0.5 ); diff -r 6fb0eff46972 -r 20d941ecc1bb sync_reads --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sync_reads Tue Sep 28 06:11:32 2021 +0000 @@ -0,0 +1,300 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use 5.012; + +use autodie qw/open close/; +use BioX::Seq::Stream; +use Getopt::Long; +use Pod::Usage; + +my $reads1; +my $reads2; +my $singles; +my $sync_suffix = 'sync'; +my $singles_suffix = 'singles'; +my $compress = ''; # one of 'gzip' or 'dsrc' +my $outname1; +my $outname2; +my $singles1_name; +my $singles2_name; + +use constant DSRC_BIN => 'dsrc'; +use constant GZIP_BIN => 'gzip'; +use constant NAME => 'sync_reads'; +use constant VERSION => '0.005'; + +GetOptions( + + 'fwd=s' => \$reads1, + 'rev=s' => \$reads2, + 'singles' => \$singles, + 'fwd_out=s' => \$outname1, + 'rev_out=s' => \$outname2, + 'fwd_singles_out=s' => \$singles1_name, + 'rev_singles_out=s' => \$singles2_name, + 'sync_suffix=s' => \$sync_suffix, + 'singles_suffix=s' => \$singles_suffix, + 'compress:s' => \$compress, + 'help' => sub{ pod2usage(-verbose => 2); }, + 'version' => sub{ print 'This is ', NAME, ' v', VERSION, "\n";exit; }, + +) or pod2usage( -verbose => 1 ); + +pod2usage(-verbose => 1, -msg => "Error: input files can\'t be read") + if (! -r $reads1 || ! -r $reads2); + +# open output filehandles, possibly with compression +if (! defined $outname1) { + $outname1 = $reads1; + $outname1 =~ s/([^\.]+)$/$sync_suffix\.$1/; +} +if (! defined $outname2) { + $outname2 = $reads2; + $outname2 =~ s/([^\.]+)$/$sync_suffix\.$1/; +} +my $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>'; +my $cmd = $compress eq 'gzip' ? GZIP_BIN . ' -c > ' + : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s ' + : ''; +my $suffix = $compress eq 'gzip' ? '.gz' + : $compress eq 'dsrc' ? '.dsrc' + : ''; +open my $s1, $mode, $cmd . $outname1 . $suffix; +open my $s2, $mode, $cmd . $outname2 . $suffix; + + +# open singles output filehandles if requested, possibly with compression +my $up1; +my $up2; +if ($singles) { + if (! defined $singles1_name) { + $singles1_name = $reads1; + $singles1_name =~ s/([^\.]+)$/$singles_suffix\.$1/; + } + if (! defined $singles2_name) { + $singles2_name = $reads2; + $singles2_name =~ s/([^\.]+)$/$singles_suffix\.$1/; + } + $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>'; + $cmd = $compress eq 'gzip' ? GZIP_BIN . ' -c > ' + : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s ' + : ''; + my $suffix = $compress eq 'gzip' ? '.gz' + : $compress eq 'dsrc' ? '.dsrc' + : ''; + open $up1, $mode, $cmd . $singles1_name . $suffix; + open $up2, $mode, $cmd . $singles2_name . $suffix; +} + +my $ll_f1 = {}; +my $ll_f2 = {}; +my $f1_prev; +my $f2_prev; +my $f1_open = 1; +my $f2_open = 1; + +my $parser1 = BioX::Seq::Stream->new($reads1); +my $parser2 = BioX::Seq::Stream->new($reads2); + +while ($f1_open || $f2_open) { + + # process next read for file 1 + if ($f1_open && defined (my $seq = $parser1->next_seq)) { + my $name = $seq->id; + $name =~ s/\/[12]$//; #remove directional tag if present + if (defined $ll_f2->{$name}) { + my $prev = $ll_f2->{$name}->{prev} // undef; + purge_cache( $ll_f2, $prev, $up2 ); + purge_cache( $ll_f1, $f1_prev, $up1 ); + print {$s1} $seq->as_fastq; + print {$s2} $ll_f2->{$name}->{read}; + delete $ll_f2->{$name}; + $f1_prev = undef; + } + else { + $ll_f1->{$name}->{read} = $seq->as_fastq; + $ll_f1->{$name}->{prev} = $f1_prev // undef; + $f1_prev = $name; + } + } + else { + $f1_open = 0; + } + + # process next read for file 2 + if ($f2_open && defined (my $seq = $parser2->next_seq)) { + my $name = $seq->id; + $name =~ s/\/[12]$//; #remove directional tag if present + if (defined $ll_f1->{$name}) { + my $prev = $ll_f1->{$name}->{prev} // undef; + purge_cache( $ll_f1, $prev, $up1 ); + purge_cache( $ll_f2, $f2_prev, $up2 ); + print {$s2} $seq->as_fastq; + print {$s1} $ll_f1->{$name}->{read}; + delete $ll_f1->{$name}; + $f2_prev = undef; + } + else { + $ll_f2->{$name}->{read} = $seq->as_fastq; + $ll_f2->{$name}->{prev} = $f2_prev // undef; + $f2_prev = $name; + } + } + else { + $f2_open = 0; + } +} + +#handle remaining unpaired reads if necessary +if ($singles) { + purge_cache( $ll_f1, $f1_prev, $up1 ); + purge_cache( $ll_f2, $f2_prev, $up2 ); + close $up1; + close $up2; +} + +exit; + +sub purge_cache { + + my ($ll, $prev, $fh) = @_; + while (defined $prev && defined $ll->{$prev}) { + my $prev2 = $ll->{$prev}->{prev} // undef; + print {$fh} $ll->{$prev}->{read} if ($singles); + delete $ll->{$prev}; + $prev = $prev2; + } + +} + + +__END__ + +=head1 NAME + +sync_reads - resynchronize paired FASTQ files + +=head1 SYNOPSIS + +sync_reads [options] --fwd I --rev I + +=head1 DESCRIPTION + +B will re-synchronize two FASTQ files containing paired reads which +are no longer in sync due to individual removal of reads during pre-processing +(trimming, filtering, etc). In this case, "in sync" means that both files have +the same number of reads and, at any given read position in the files, the +corresponding reads represent proper pairs. The resulting files will contain +matching reads in order (assuming the input files were properly ordered). It +will optionally print out unpaired reads to separate files. Memory usage is +not dependent on the input file size but rather the maximum distance between +paired reads in the two files, as the read cache is flushed each time paired +reads are identified. In the worst-case scenario (one file has a single read +that pairs with the last read in the matching file) memory usage can approach +the largest file size, but in typical usage it rarely exceeds a few MB +regardless of file size. + +B Reads in input files MUST be in the same order, aside from +missing reads, or the output will report many valid pairs as singletons. + +=head1 OPTIONS + +=head2 Mandatory + +=over 4 + +=item B<--fwd> I + +Specify FASTQ file containing the first of the trimmed read pairs + +=item B<--rev> I + +Specify FASTQ file containing the second of the trimmed read pairs + +=back + +=head2 Optional + +=over 4 + +=item B<--fwd_out> I + +Specify output name for synced forward reads + +=item B<--rev_out> I + +Specify output name for synced reverse reads + +=item B<--fwd_singles_out> I + +Specify output name for forward singleton reads + +=item B<--rev_singles_out> I + +Specify output name for reverse singleton reads + +=item B<--sync_suffix> I + +Specify suffix to add to synced read output files. This will be added to the +input read name before the final suffix (i.e. after the last period). Default +is 'sync'. + +=item B<--compress> I + +Specify type of compression for output files (will compress all output files) + +=item B<--singles> + +If given, unpaired reads will be written to separate output files. Default is +FALSE. + +=item B<--singles_suffix> I + +Specify suffix to add to singles read output files. This will be added to the +input read name before the final suffix (i.e. after the last period). Default +is 'singles'. + +=item B<--help> + +Display this usage page + +=item B<--version> + +Print version information + +=back + +=head1 CAVEATS AND BUGS + +Currently no input validation is performed on the input files. Files are +assumed to be standard FASTQ file format with each read represented by four +lines and no other extraneous information present. B, they are also +assumed to be in the same input order after accounting for deleted reads +(the software will fail miserably if this is not the case). + +Please submit bug reports to the issue tracker in the distribution repository. + +=head1 AUTHOR + +Jeremy Volkening + +=head1 COPYRIGHT AND LICENSE + +Copyright 2014-16 Jeremy Volkening + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . + +=cut