changeset 1:5def63878840 draft default tip

"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author jdv
date Tue, 28 Sep 2021 06:16:07 +0000
parents 114353d77370
children
files bam2consensus frag_lens summarize_assembly summarize_assembly.xml summarize_run.pl sync_reads
diffstat 6 files changed, 1421 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bam2consensus	Tue Sep 28 06:16:07 2021 +0000
@@ -0,0 +1,710 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use 5.012;
+
+use BioX::Seq;
+use BioX::Seq::Stream;
+use BioX::Seq::Fetch;
+use File::Temp qw/tempfile/;
+use IPC::Cmd qw/can_run/;
+use Getopt::Long;
+use List::Util qw/sum max first/;
+use Pod::Usage;
+use POSIX qw/floor ceil/;
+
+#-inputs---------------------------------------------------------------------#
+my $fn_bam;
+my $fn_ref;
+#-outputs--------------------------------------------------------------------#
+my $fn_table;
+my $fn_consensus;
+my $fn_bedgraph;
+#-knobs----------------------------------------------------------------------#
+my $min_qual       = 10;
+my $min_depth      = 3;
+my $trim_fraction  = 0.2;
+my $sliding_window = 30;
+my $bg_bin_figs    = 0;
+my $verbose        = 0;
+
+my $PROGRAM = 'bam2consensus';
+my $VERSION = 0.004;
+
+GetOptions(
+
+    #-inputs-----------------------------------------------------------------#
+    'bam=s'         => \$fn_bam,
+    'ref=s'         => \$fn_ref,
+    #-outputs----------------------------------------------------------------#
+    'table=s'       => \$fn_table,
+    'consensus=s'   => \$fn_consensus,
+    'bedgraph=s'    => \$fn_bedgraph,
+    #-knobs------------------------------------------------------------------#
+    'min_qual=i'    => \$min_qual,
+    'min_depth=i'   => \$min_depth,
+    'trim=f'        => \$trim_fraction,
+    'window=i'      => \$sliding_window,
+    'bg_bin_figs=i' => \$bg_bin_figs,
+    'verbose'       => \$verbose,
+    'help'          => sub{ pod2usage(-verbose => 2); },
+    'version'       => sub{ print "This is $PROGRAM v$VERSION\n";exit; },
+
+) or pod2usage( -verbose => 1);
+
+# check for recent version of samtools
+my $SAMTOOLS = can_run('samtools')
+    // die "Samtools is required but not found\n";
+my $v_string = `$SAMTOOLS --version`;
+if ($v_string =~ /^samtools (\d+)\.(\d+)/m) {
+    die "Requires samtools >= 1.3.0\n" if ($1 < 1 || $2 < 3);
+} else {
+    die "Error parsing samtools version string\n";
+}
+
+# check for mafft
+my $MAFFT = can_run('mafft')
+    // die "MAFFT is required but not found\n";
+
+
+# misc param checking
+die "Error: must specify at least one output target" if (! (
+       defined $fn_table
+    || defined $fn_consensus
+    || defined $fn_bedgraph
+));
+die "Error: missing reference parameter"
+    if (! defined $fn_ref);
+die "Error reading reference"
+    if (! -r $fn_ref);
+
+
+# globals
+my @errors;
+my @lines;    # populated only when table output is requested
+
+my %iupac = (
+    A    => 'A',
+    C    => 'C',
+    G    => 'G',
+    T    => 'T',
+    AG   => 'R',
+    CT   => 'Y',
+    CG   => 'S',
+    AT   => 'W',
+    GT   => 'K',
+    AC   => 'M',
+    CGT  => 'B',
+    AGT  => 'D',
+    ACT  => 'H',
+    ACG  => 'V',
+    ACGT => 'N',
+);
+
+my @consensi;
+my $bg = '';
+
+my @curr_lines;
+my $last_chr;
+
+my $last_depth = undef;
+my $last_loc   = 0;
+my $bg_start   = 0;
+my $bg_loc     = 0;
+
+
+# initialize random-access sequence collection
+my $seqs = BioX::Seq::Fetch->new($fn_ref) or die "Error loading reference";
+
+
+# pipe from samtools mpileup command
+# (note: this is much faster in testing than using Perl bindings, e.g.
+# Bio::DB::HTS or the like)
+
+$fn_bam //= '-'; # use stdin if BAM file not given
+ 
+open my $fh, '-|', $SAMTOOLS,
+    'mpileup',
+    '-d' => 1000000,
+    '-B',
+    '-f' => $fn_ref,
+    $fn_bam;
+
+LINE:
+while (my $line = <$fh>) {
+
+    chomp $line;
+    my ($chr, @other) = split "\t", $line;
+    $last_chr //= $chr;
+
+    if ($chr ne $last_chr) {
+        process(\@curr_lines);
+        @curr_lines = ();
+        $last_chr = $chr;
+    }
+
+    push @curr_lines, $line;
+}
+
+process(\@curr_lines); # don't forget last call
+
+# output bedgraph if asked
+if (defined $fn_bedgraph) {
+    open my $fh_bedgraph, '>', $fn_bedgraph;
+    print {$fh_bedgraph}
+        "track type=bedGraph name=read_coverage maxHeightPixels=1000:80:20\n";
+    print {$fh_bedgraph} $bg;
+    close $fh_bedgraph;
+
+}
+
+# output fasta if asked
+if (defined $fn_consensus) {
+
+    open my $out, '>', $fn_consensus;
+    print {$out} $_->as_fasta for (@consensi);
+    close $out;
+
+}
+
+# build and process table if asked
+if (defined $fn_table) {
+
+    # calculate sliding errors
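+    # For each position, pool the mismatch rates of the surrounding window
+    # (excluding the position itself, clamped at the sequence ends) and take
+    # a trimmed mean as the local background error estimate.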
+    my @avg_errors;
+    my $l = scalar(@errors);
+    $sliding_window = $l if ($l < $sliding_window);
+    my $left  = floor(($sliding_window-1)/2);
+    my $right = ceil(($sliding_window-1)/2);
+    my $lower = $left;
+    my $upper = $l - $right;
+    for my $i (0..$#errors) {
+        my @pool;
+        if ($i < $lower) {
+            @pool = (@errors[0..$i-1] ,@errors[$i+1..$sliding_window-1]);
+        }
+        elsif ($i >= $upper) {
+            @pool = (@errors[$l-$sliding_window..$i-1], @errors[$i+1..$l-1]);
+        }
+        else {
+            @pool = (@errors[$i-$left..$i-1], @errors[$i+1..$i+$right]);
+        }
+        die "bad pool size @pool at $i" if (scalar(@pool)+1 != $sliding_window);
+
+        # calc trimmed mean
+        @pool = sort {$a <=> $b} @pool;
+        my $l = @pool;
+        my @trimmed
+            = @pool[ int($l*$trim_fraction) .. $l - 1 - int($l*$trim_fraction) ];
+        my $tm = scalar(@trimmed) > 0 ? sum(@trimmed)/scalar(@trimmed) : 'NA';
+        push @avg_errors, $tm;
+    }
+
+    open my $fh_table, '>', $fn_table;
+
+    # print table header
+    print {$fh_table} join( "\t", (
+        'id',
+        'loc',
+        'ref',
+        'called',
+        'total_depth',
+        'counted_depth',
+        'mm_rate',
+        'A_count',
+        'T_count',
+        'G_count',
+        'C_count',
+        'N_count',
+        'gap_count',
+        'A_freq',
+        'T_freq',
+        'G_freq',
+        'C_freq',
+        'N_freq',
+        'gap_freq',
+        'A_sb',
+        'T_sb',
+        'G_sb',
+        'C_sb',
+        'bgnd_err',
+        'insertions'
+    ) ) . "\n";
+
+    my $iter = 0;
+    POS:
+    for (0..$#lines) {
+        my @parts = @{ $lines[$_] };
+        $parts[23] = sprintf '%.3f', $avg_errors[$_];
+        print {$fh_table} join( "\t",@parts), "\n";
+    }
+    close $fh_table;
+}
+
+sub process {
+
+    my $ln_ref = shift;
+
+    my $last_chr;
+    $last_depth = undef;
+    $last_loc = 0;
+    $bg_start = 0;
+    $bg_loc = 0;
+
+    LINE:
+    for my $line (@$ln_ref) {
+        chomp $line;
+        my @parts = split "\t", $line;
+        my $chr         = $parts[0];
+        my $loc         = $parts[1];
+        my $ref         = uc $parts[2];
+        my $depth       = $parts[3];
+        my $read_string = $parts[4];
+        my $qual_string = $parts[5];
+
+        # check that chr hasn't changed (multiple references per call are not supported)
+        $last_chr   = $last_chr   // $chr;
+        if ($chr ne $last_chr) {
+            #process current, start new
+        }
+
+        # simulate missing rows
+        my $t = $last_loc + 1;
+        while ($t < $loc) {
+            handle_entry(
+                $chr,
+                $t,
+                $seqs->fetch_seq($chr, $t, $t),
+                #substr($ref_seq, $t-1, 1),
+                0,
+                '',
+                '',
+            );
+            ++$t;
+        }
+
+        # send entry for handling
+        handle_entry(
+            $chr,
+            $loc,
+            $ref,
+            $depth,
+            $read_string,
+            $qual_string,
+        );
+        
+        $last_loc = $loc;
+
+    }
+
+    # simulate missing rows at end
+    my $t = $last_loc + 1;
+    my $l = $seqs->length($last_chr);
+    while ($t <= $l) {
+        handle_entry(
+            $last_chr,
+            $t,
+            $seqs->fetch_seq($last_chr, $t, $t),
+            #substr($ref_seq, $t-1, 1),
+            0,
+            '',
+            '',
+        );
+        ++$t;
+    }
+
+    if (defined $fn_bedgraph) {
+
+        $bg .= join("\t", $last_chr, $bg_start, $bg_loc, $last_depth) . "\n";
+    }
+
+}
+
+
+sub handle_entry {
+
+    my ($chr, $loc, $ref, $depth, $read_string, $qual_string) = @_;
+
+    my $called = '';
+
+    # handle zero-depth positions separately
+    if ($depth < 1) {
+        $called = 'N';
+        print "Missing coverage at $chr pos $loc\n" if ($verbose);
+        if (defined $fn_table) {
+            push @lines, [
+                $chr,
+                $loc,
+                $ref,
+                'N',
+                (0) x 19,
+                undef,
+                '',
+            ];
+        }
+        push @errors, 0;
+    }
+
+    # everything else
+    else {
+
+        # handle insertions
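+        # mpileup encodes an insertion as "+<len><bases>"; the embedded-code
+        # pattern (??{...}) matches exactly <len> base characters, with $^N
+        # holding the most recently captured group (the length digits)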
+        my %inserts;
+        my $insert_count = 0;
+        while ($read_string =~ /\+(\d+)((??{"[ATGCNatgcnRYSWKMBDHVryswkmbdhv-]{$^N}"}))/g) {
+            $inserts{$2} += 1;
+            ++$insert_count;
+        }
+
+        # ...and strip extra characters
+        $read_string =~ s/\^.//g;
+        $read_string =~ s/[\+\-](\d+)(??{"[ATGCNatgcnRYSWKMBDHVryswkmbdhv-]{$^N}"})//g;
+        $read_string =~ s/[^\.\,\w\*]//g;
+    
+        # simple parse check
+        my $l1 = length($read_string);
+        my $l2 = length($qual_string);
+        die "read/qual mismatch ($l1 v $l2)" if ($l1 != $l2);
+
+        die "unexpected char at $chr pos $loc\n"
+            if ($read_string =~ /[^.,ATGCNatgcn*]/);
+
+        my $lc = lc $ref;
+        $read_string =~ s/\./$ref/g;
+        $read_string =~ s/\,/$lc/g;
+        $read_string =~ s/n/N/g;
+
+        # split into arrays
+        my %counts = map {$_ => 0} qw/A T G C N a t g c */;
+        my %cons_counts = map {$_ => 0} qw/A T G C N a t g c */;
+        my @chars  = split '', $read_string;
+        my @quals  = map {ord($_) - 33} split('', $qual_string);
+
+        READ:
+        for my $i (0..$#chars) {
+            ++$cons_counts{ uc($chars[$i]) };
+            ++$counts{ $chars[$i] } if ($quals[$i] >= $min_qual);
+        }
+
+        # calculate strand bias and collapse counts
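+        # The bias statistic below rescales the dominant strand's share of the
+        # quality-filtered depth from [0.5,1] to [0,1], with an (n-1)/n
+        # correction for low counts: 0 = balanced, ~1 = one strand only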
+        my %sb;
+        for my $b (qw/A T G C/) {
+            my $n = $counts{$b} + $counts{lc($b)}; 
+            $sb{$b} = $n > 0
+                ? ($n-1)/$n*(2*max($counts{$b}/$n, ($n-$counts{$b})/$n)-1)
+                : 0;
+            $counts{$b} += $counts{lc($b)};
+            delete $counts{lc($b)};
+        }
+
+        $counts{$ref} = $counts{$ref} // 0; # some IUPAC codes not defined above
+        $cons_counts{$ref} = $cons_counts{$ref} // 0; # some IUPAC codes not defined above
+        my $mismatches    = sum map {$counts{$_}} grep {$_ ne $ref} keys %counts;
+        my $counted_depth = $counts{$ref} + $mismatches;
+        my $cons_depth    = sum map {$cons_counts{$_}} keys %counts;
+        my $error_rate    = $counted_depth == 0
+            ? 0
+            : sprintf '%.4f', $mismatches/$counted_depth;
+        push @errors, $error_rate;
+
+        my @insert_strings = ();
+        my $consensus_insert = '';
+
+        #create case-insensitive insert hash
+        my %combined_inserts;
+        for (keys %inserts) {
+            $combined_inserts{uc($_)} += $inserts{$_};
+        }
+
+        if (scalar(keys %combined_inserts) > 0) {
+            my @sorted_inserts = sort {
+                $combined_inserts{$b} <=> $combined_inserts{$a}
+             || $a cmp $b
+            } keys %combined_inserts;
+            for (@sorted_inserts) {
+                my $f_count = $inserts{$_} // 0;
+                my $r_count = $inserts{lc($_)} // 0;
+                my $n = $combined_inserts{$_};
+                my $sb = sprintf '%.3f', ($n-1)/$n*max($f_count/$n, ($n-$f_count)/$n);
+                push @insert_strings, "$_($f_count,$r_count:$sb)";
+            }
+
+            # decide whether to include insert in consensus
+            if ($insert_count/$l1 > 0.5) {
+                my @realigned = realign(\%combined_inserts);
+                for my $i (0..$#realigned) {
+                    my @b = sort {
+                        $realigned[$i]->{$b} <=> $realigned[$i]->{$a}
+                    } keys %{ $realigned[$i] };
+                    if ($realigned[$i]->{$b[0]}/$l1 > 0.5) {
+                        $consensus_insert .= uc $b[0];
+                    }
+                }
+            }
+                    
+        }
+        if ($cons_depth < $min_depth) {
+            $called = 'N';
+        }
+        else {
+            my @sorted
+                = sort {$cons_counts{$b} <=> $cons_counts{$a}} keys %cons_counts;
+
+            # get all top hits that aren't gaps
+            my @equal_hits = grep {
+                $cons_counts{$_} == $cons_counts{$sorted[0]} && $_ ne '*'
+            } @sorted;
+            
+            if (scalar(@equal_hits)) {
+                my $tag = join('',sort {$a cmp $b} @equal_hits);
+                die "bad tag $tag" if (! defined $iupac{$tag});
+                $called = $iupac{$tag};
+            }
+        }
+        $called .= $consensus_insert;
+
+        print "consensus/reference difference at $chr pos $loc (ref: $ref cons: $called)\n"
+            if ($verbose && $called ne $ref);
+
+        if (defined $fn_table) {
+            push @lines, [
+                $chr,
+                $loc,
+                $ref,
+                $called eq '' ? '-' : $called,
+                $depth,
+                $counted_depth,
+                sprintf('%.3f',$error_rate),
+                $counts{A},
+                $counts{T},
+                $counts{G},
+                $counts{C},
+                $counts{N},
+                $counts{'*'},
+                sprintf('%.3f',$counts{A}/$counted_depth),
+                sprintf('%.3f',$counts{T}/$counted_depth),
+                sprintf('%.3f',$counts{G}/$counted_depth),
+                sprintf('%.3f',$counts{C}/$counted_depth),
+                sprintf('%.3f',$counts{N}/$counted_depth),
+                sprintf('%.3f',$counts{'*'}/$counted_depth),
+                sprintf('%.3f',$sb{A}),
+                sprintf('%.3f',$sb{T}),
+                sprintf('%.3f',$sb{G}),
+                sprintf('%.3f',$sb{C}),
+                undef,
+                join(':',@insert_strings)
+            ];
+        }
+    }
+
+    my $consensus = first {$_->id eq $chr} @consensi;
+    if (! defined $consensus) {
+        $consensus = BioX::Seq->new('',$chr);
+        push @consensi, $consensus;
+    }
+    $consensus->seq .= $called;
+
+    my $cons_len = length($called);
+
+    # Generate bedgraph output
+    if (defined $fn_bedgraph && $cons_len > 0) {
+
+        # bin depth if requested
+        if ($bg_bin_figs > 0) {
+            my $divisor = 10**max(0, length($depth)-$bg_bin_figs);
+            $depth = int($depth/$divisor) * $divisor;
+        }
+
+        # output on depth change
+        if (! defined $last_depth || $depth != $last_depth) {
+            $bg .= join("\t",$last_chr, $bg_start, $bg_loc, $last_depth) . "\n"
+                if (defined $last_depth);
+            $last_depth = $depth;
+            $bg_start = $bg_loc;
+        }
+
+        $bg_loc += $cons_len;
+
+    }
+
+}
+
+
+sub realign {
+
+    # calculate a local realignment at indel using MAFFT
+    # TODO: reimplement using native code
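+    # Input: hashref of insert sequence => observed count. The inserts are
+    # aligned with MAFFT (gap opening penalties zeroed via --op/--lop) and a
+    # per-column, count-weighted tally of bases is returned as a list of hashrefs.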
+
+    my ($hash) = @_;
+
+    my @seqs = keys %{ $hash };
+    my @weights = map {$hash->{$_}} @seqs;
+    my @scores;
+    if (scalar(@seqs) > 1) {
+        my ($fh, $fn) = tempfile;
+        for (0..$#seqs) {
+            my $n = $_ + 1;
+            print {$fh} ">$n\n$seqs[$_]\n";
+        }
+        close $fh;
+        open my $stream, '-|', $MAFFT,
+            '--auto',
+            '--quiet',
+            '--op'  => 0,
+            '--lop' => 0,
+            $fn;
+        my $p = BioX::Seq::Stream->new($stream);
+        while (my $seq = $p->next_seq) {
+            my $w = shift @weights;
+            for (0..length($seq)-1) {
+                my $base = substr $seq, $_, 1;
+                next if ($base eq '-');
+                $scores[$_] = {} if (! defined $scores[$_]);
+                $scores[$_]->{$base} += $w;
+            }
+        }
+    }
+    else {
+        my $seq = $seqs[0];
+        my $w   = $weights[0];
+        for (0..length($seq)-1) {
+            my $base = substr $seq, $_, 1;
+            next if ($base eq '-');
+            $scores[$_] = {} if (! defined $scores[$_]);
+            $scores[$_]->{$base} += $w;
+        }
+    }
+    return @scores;
+        
+}
+
+
+__END__
+
+=head1 NAME
+
+bam2consensus - consensus calling and per-position statistics from a BAM alignment
+
+=head1 SYNOPSIS
+
+bam2consensus --bam <in.bam> --ref <in.fasta> [options] --consensus <out.fasta>
+
+=head1 DESCRIPTION
+
+Re-calls a consensus sequence based on a BAM alignment to a reference, with
+various knobs and optional output formats.
+
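+A typical invocation (file names are illustrative) writes all three outputs:
+
+    bam2consensus --bam aln.bam --ref ref.fasta --consensus cons.fasta \
+        --bedgraph coverage.bg --table stats.tsv
+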
+=head1 PREREQUISITES
+
+Requires the following non-core Perl libraries:
+
+=over 1
+
+=item * BioX::Seq
+
+=back
+
+as well as the following binaries:
+
+=over 1
+
+=item * samtools (>= 1.3)
+
+=item * mafft
+
+=back
+
+=head1 OPTIONS
+
+=head2 Input (required)
+
+=over 4
+
+=item B<--bam> I<filename>
+
+Path to input BAM alignments
+
+=item B<--ref> I<filename>
+
+Path to reference sequence used to generate BAM alignments
+
+=back
+
+=head2 Output (at least one is required, can specify more than one)
+
+=over 4
+
+=item B<--consensus>
+
+Path to write consensus sequence to (as FASTA)
+
+=item B<--bedgraph>
+
+Path to write coverage file to (as bedgraph)
+
+=item B<--table>
+
+Path to write per-position statistics to (as tab-separated table)
+
+=back
+
+=head2 Configuration 
+
+=over 4
+
+=item B<--min_qual>
+
+Minimum quality for a base to be considered in consensus calling. Default: 10.
+
+=item B<--min_depth>
+
+Minimum read depth for consensus to be called (otherwise called as "N").  Default: 3.
+
+=item B<--trim>
+
+Fraction to trim from each end when calculating trimmed mean of error window.
+Default: 0.2.
+
+=item B<--window>
+
+Size of sliding window used to calculate local error rates. Default: 30.
+
+=item B<--bg_bin_figs> I<integer>
+
+If greater than zero, the number of significant figures used to bin depths in
+bedgraph output. If zero, no binning is applied. This option is useful to
+reduce the size of bedgraph output by binning similar depth values when high
+resolution is not important. Default: 0 (disabled).
+
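+For example, with C<--bg_bin_figs 2> a depth of 1234 is binned to 1200, while a
+depth of 87 (fewer digits than the requested precision) is left unchanged.
+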
+=back
+
+=head1 CAVEATS AND BUGS
+
+Please submit bug reports to the issue tracker in the distribution repository.
+
+=head1 AUTHOR
+
+Jeremy Volkening (jdv@base2bio.com)
+
+=head1 LICENSE AND COPYRIGHT
+
+Copyright 2014-17 Jeremy Volkening
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+=cut
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/frag_lens	Tue Sep 28 06:16:07 2021 +0000
@@ -0,0 +1,197 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use 5.012;
+
+use Cwd qw/abs_path/;
+use File::Temp qw/tempdir tempfile/;
+use IPC::Cmd qw/can_run/;
+use List::Util qw/sum/;
+use Getopt::Long;
+use Pod::Usage;
+
+my @good_codes = ( 0x0002, 0x0040 );
+my @bad_codes  = ( 0x0004, 0x0100, 0x0800 );
+
+#-inputs---------------------------------------------------------------------#
+my $fasta;
+my $forward;
+my $reverse;
+my $sam;
+#-knobs----------------------------------------------------------------------#
+my $threads = 1;
+my $max_align = 10000;
+
+my $PROGRAM = 'frag_lens';
+my $VERSION = 0.001;
+
+GetOptions(
+    #-inputs-----------------------------------------------------------------#
+    'sam=s'     => \$sam,
+    'forward=s' => \$forward,
+    'reverse=s' => \$reverse,
+    'ref=s'     => \$fasta,
+    #-knobs------------------------------------------------------------------#
+    'threads=i' => \$threads,
+    'max_aln=i' => \$max_align,
+    'help'      => sub{ pod2usage(-verbose => 2); },
+    'version'   => sub{ say "This is $PROGRAM v$VERSION";exit; },
+
+) or pod2usage( -verbose => 1);
+
+my $fh_sam;
+my $tmp_fasta;
+
+if (defined $sam) {
+    open $fh_sam, '<', $sam or die "failed to open SAM\n";
+}
+
+else {
+
+    my $BWA = can_run('bwa')
+        // die "BWA is required but not found\n";
+
+    my ($tmp_dir) = tempdir( CLEANUP => 1);
+
+    die "specify forward and reverse read files and reference\n"
+        if (! defined $forward || ! defined $reverse || ! defined $fasta); 
+
+    $fasta = abs_path($fasta);
+    
+    my $res = system(
+        'ln',
+        '-s',
+        $fasta,
+        "$tmp_dir/tmp.fasta"
+    );
+    die "link failed" if ($res);
+    $res = system(
+        $BWA,
+        'index',
+        "$tmp_dir/tmp.fasta"
+    );
+    die "index failed" if ($res);
+    open $fh_sam, '-|', $BWA,
+        'mem',
+        '-t' => $threads,
+        '-v' => 1,
+        "$tmp_dir/tmp.fasta",
+        $forward,
+        $reverse
+    ;
+}
+
+my $c = 0;
+while (my $line = <$fh_sam>) {
+    next if ($line =~ /^\@/);
+    chomp $line;
+    my @parts = split "\t", $line;
+    my $flags = $parts[1];
+    my $sum1 = sum map {$_ & $flags ? 1 : 0} @good_codes;
+    my $sum2 = sum map {$_ & $flags ? 1 : 0} @bad_codes;
+    if ($sum1 == scalar @good_codes && $sum2 == 0) {
+        say abs($parts[8]);
+        last if (++$c >= $max_align);
+    }
+}
+close $fh_sam;
+
+__END__
+
+=head1 NAME
+
+frag_lens - Calculate paired end fragment lengths from read alignment
+
+=head1 SYNOPSIS
+
+frag_lens [--sam <in.sam>] OR [--ref <cons.fa> --forward <R1.fq> --reverse <R2.fq>] [options] > frag_lens.txt
+
+=head1 DESCRIPTION
+
+Calculates library fragment lengths based on paired-end read alignment.
+Takes as input either an existing SAM alignment or a reference and read
+files from which it produces an alignment. Outputs calculated fragment
+lengths, one per line.
+
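+A typical invocation from reads and a reference (file names are illustrative):
+
+    frag_lens --ref assembly.fa --forward R1.fq --reverse R2.fq \
+        --threads 4 --max_aln 10000 > frag_lens.txt
+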
+=head1 PREREQUISITES
+
+Requires the following binaries:
+
+=over 1
+
+=item * bwa
+
+=back
+
+=head1 OPTIONS
+
+=head2 Input option one
+
+=over 4
+
+=item B<--sam> I<filename>
+
+Path to input SAM alignment.
+
+=back
+
+=head2 Input option two
+
+=over 4
+
+=item B<--ref> I<filename>
+
+Path to reference sequence (e.g. assembly)
+
+=item B<--forward> I<filename>
+
+Forward reads in FASTQ format
+
+=item B<--reverse> I<filename>
+
+Reverse reads in FASTQ format
+
+=back
+
+=head2 Configuration 
+
+=over 4
+
+=item B<--max_aln>
+
+Maximum number of alignment records to use as input. Used to limit run time.
+Default: 10000.
+
+=item B<--threads>
+
+Number of threads to use for alignment (ignored if --sam is given)
+
+=back
+
+=head1 CAVEATS AND BUGS
+
+Please submit bug reports to the issue tracker in the distribution repository.
+
+=head1 AUTHOR
+
+Jeremy Volkening (jdv@base2bio.com)
+
+=head1 LICENSE AND COPYRIGHT
+
+Copyright 2014-19 Jeremy Volkening
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+=cut
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/summarize_assembly	Tue Sep 28 06:16:07 2021 +0000
@@ -0,0 +1,208 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use 5.012;
+
+use Getopt::Long;
+use Pod::Usage;
+use List::Util qw/sum max min/;
+use List::MoreUtils qw/uniq/;
+use BioX::Seq::Stream;
+
+my $PROGRAM = 'summarize_assembly';
+my $VERSION = 0.002;
+
+my $fn_fasta;
+my @cutoffs;
+my $strip = 0;
+my $split = 0;
+
+# Collect command-line parameters
+my $err_msg = 'Syntax error: please check the command-line syntax';
+GetOptions(
+    'fasta=s'      => \$fn_fasta,
+    'cutoffs:i{,}' => \@cutoffs,
+    'strip_N'      => \$strip,
+    'split_N'      => \$split,
+    'help'         => sub{ pod2usage( -verbose => 2 ) },
+    'version'      => sub{ print "This is $PROGRAM v$VERSION\n";exit; },
+) or pod2usage( -msg => $err_msg, -verbose => 1 );
+
+# Set default cutoffs if necessary and sort
+if (! scalar @cutoffs) {
+    warn "No cutoff supplied, defaulting to N50\n";
+    push @cutoffs, 50;
+}
+@cutoffs = sort {$a <=> $b} uniq @cutoffs;
+
+# Only one of 'strip_N' or 'split_N' is valid
+if ($strip && $split) {
+    warn "Only one of --strip_N or --split_N is valid, ignoring --strip_N\n";
+    $strip = 0;
+}
+
+# Check for a few necessary conditions
+die "Can't open FASTA file for reading"
+    if (defined $fn_fasta && ! -r $fn_fasta);
+die "One or more cutoffs outside valid range (1-99)"
+    if (grep {$_ < 1 || $_ > 99} @cutoffs);
+die "Cutoffs must be integer values"
+    if (grep {$_ ne int($_)} @cutoffs);
+my @lens;
+my $N_sum  = 0;
+my $GC_sum = 0;
+
+# Read in sequences and calculate descriptive stats
+my $stream = BioX::Seq::Stream->new( $fn_fasta ); #STDIN if undefined
+
+SEQ:
+while (my $seq = $stream->next_seq) {
+    my @parts = ($seq);
+    @parts = split(/n+/i, $seq) if $split;
+    for (@parts) {
+        my $Ns   = ($_ =~ tr/Nn//);
+        $N_sum  += $Ns;
+        $GC_sum += ($_ =~ tr/GCgc//);
+        push @lens, length($_) - $Ns * $strip;
+    }
+}
+@lens = sort {$b <=> $a} @lens;
+
+# Calculate basic stats
+my $scaffold_count = scalar @lens;
+my $total_len   = sum @lens;
+my $N_fraction  = round( $N_sum/($total_len + $N_sum*$strip), 2 )*100;
+my $max_length  = max @lens;
+my $min_length  = min @lens;
+my $mean_length = round($total_len/$scaffold_count, 0);
+
+# GC percentage calculated from non-ambiguous bases only
+my $GC_fraction = round( $GC_sum/($total_len + ($strip - 1)*$N_sum), 2 )*100;
+
+# Calculate Nx (N50, N80, etc) values
+# For example, N50 is the size of the smallest contig for which it and all
+# larger contigs contain 50% of the total nucleotides in the database
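+# e.g. for contig lengths 40, 30, 20 and 10 (total 100), N50 = 30, since the
+# 40 bp and 30 bp contigs together account for >= 50% of the total length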
+my $cum_length = 0;
+my @fractions  = map {$_/100} @cutoffs;
+my @Nx;
+
+LEN:
+for (@lens) {
+    $cum_length += $_;
+    # a single contig may satisfy more than one cutoff
+    while (@fractions && $cum_length/$total_len >= $fractions[0]) {
+        push @Nx, $_;
+        shift @fractions;
+    }
+    last LEN if (! @fractions);
+}
+
+# Print summary
+print '-' x 40 . "\n"
+    . "Summary\n"
+    . '-' x 40 . "\n"
+    . "number of scaffolds: $scaffold_count\n"
+    . "total length:        $total_len\n"
+    . "average length:      $mean_length\n"
+    . "G/C content:         $GC_fraction\%\n"
+    . "ambiguous content:   $N_fraction\%\n"
+    . "longest scaffold:    $max_length\n";
+for (0..$#Nx) {
+    my $label = sprintf "N%02d", $cutoffs[$_];
+    print "$label:                 $Nx[$_]\n";
+}
+print "shortest scaffold:   $min_length\n";
+print "NOTE: Ns were stripped for above calculations\n" if ($strip);
+print "NOTE: Scaffolds were split on Ns for above calculations\n" if ($split);
+print '-' x 40 . "\n";
+
+exit;
+
+sub round {
+
+    my ($val,$places) = @_;
+    return int($val*10**$places+0.5)/10**$places;
+
+}
+    
+
+__END__
+
+=head1 NAME
+
+summarize_assembly - print basic summary info for a file of assembly scaffolds
+
+=head1 SYNOPSIS
+
+summarize_assembly [--cutoffs I<cutoff_1> I<cutoff_2> ... I<cutoff_N>] [--strip_N | --split_N] [--fasta I<input_file>]
+
+=head1 DESCRIPTION
+
+This program takes a FASTA file and optionally a list of cutoff values as
+input and prints out summary information about the contigs/scaffolds contained
+in the file. You can, of course, supply a FASTA file of any sort of nucleic
+acid sequences, but the summary information makes most sense for contigs from
+genomic sequencing assemblies.
+
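+A typical invocation (file name is illustrative):
+
+    summarize_assembly --fasta scaffolds.fa --cutoffs 50 90 --split_N
+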
+=head1 OPTIONS
+
+=over
+
+=item B<--fasta> I<filename>
+
+Specify contig/scaffold file from which to read input (default: STDIN)
+
+=item B<--cutoffs>
+
+Space-separated integer list of cutoffs to calculate (e.g. '--cutoffs 50 90'
+will output N50 and N90 values) (default: 50)
+
+=item B<--strip_N>
+
+If specified, Ns will be stripped from scaffold sequences before statistics
+are calculated (default: FALSE)
+
+=item B<--split_N>
+
+If specified, scaffold sequences will be split at regions of one or more Ns
+before statistics are calculated (e.g. to get contig-level stats from a
+scaffold file). Note that if this flag is specified, the value of '--strip_N'
+will be ignored. (default: FALSE)
+
+=item B<--help>
+
+Display this usage page
+
+=item B<--version>
+
+Print version information
+
+=back
+
+=head1 CAVEATS AND BUGS
+
+Please submit bug reports to the issue tracker in the distribution repository.
+
+=head1 AUTHOR
+
+Jeremy Volkening (jdv@base2bio.com)
+
+=head1 LICENSE AND COPYRIGHT
+
+Copyright 2014-16 Jeremy Volkening
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+=cut
+
--- a/summarize_assembly.xml	Wed Jul 17 17:49:39 2019 -0400
+++ b/summarize_assembly.xml	Tue Sep 28 06:16:07 2021 +0000
@@ -8,19 +8,20 @@
     <!--
         <requirement type="package" version="0.003">b2b-utils</requirement>
     -->
-        <requirement type="package" version="0.006007">perl-biox-seq</requirement>
+        <requirement type="package" version="0.008002">perl-biox-seq</requirement>
+        <requirement type="package">perl-list-moreutils</requirement>
     </requirements>
 
     <!-- ***************************************************************** -->
 
-    <version_command>summarize_assembly --version | perl -wnE'print "$1\n" for /summarize_assembly v(.+)/g'</version_command>
+    <version_command>perl $__tool_directory__/summarize_assembly --version | perl -wnE'print "$1\n" for /summarize_assembly v(.+)/g'</version_command>
 
     <!-- ***************************************************************** -->
 
     <command detect_errors="aggressive">
     <![CDATA[
 
-    summarize_assembly
+    perl $__tool_directory__/summarize_assembly
     --fasta $input1
     --cutoffs $cutoffs
     #if $split_N:
--- a/summarize_run.pl	Wed Jul 17 17:49:39 2019 -0400
+++ b/summarize_run.pl	Tue Sep 28 06:16:07 2021 +0000
@@ -17,7 +17,6 @@
 my $fn_consensus;
 my $fn_out;
 my $n_threads = 1;
-my $max_aln = 100000;
 
 GetOptions(
     'raw_1=s'     => \$fn_raw1,
@@ -30,7 +29,6 @@
     'consensus=s' => \$fn_consensus,
     'out=s'       => \$fn_out,
     'threads=i'   => \$n_threads,
-    'max_aln=i'   => \$max_aln,
 );
 
 
@@ -50,14 +48,12 @@
 die "raw pair count mismatch\n" if ($counts[0] != $counts[1]);
 die "filtered pair count mismatch\n" if ($counts[2] != $counts[3]);
 
-#warn "calculating fragment length stats...\n";
+# read fragment stats from STDIN
 my @lens;
-open my $stream, '-|', "frag_lens","--forward",$fn_filt1,"--reverse",$fn_filt2,"--ref",$fn_consensus,"--threads",$n_threads,"--max_aln",$max_aln;
-while (<$stream>) {
+while (<STDIN>) {
     chomp $_;
     push @lens, $_;
 }
-close $stream;
 
 my $frag_mean = int( sum(@lens)/scalar(@lens)+0.5 );
 my $frag_sd = int( sqrt( sum( map {($_ - $frag_mean)**2} @lens)/(scalar(@lens)-1) )+0.5 );
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sync_reads	Tue Sep 28 06:16:07 2021 +0000
@@ -0,0 +1,300 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use 5.012;
+
+use autodie qw/open close/;
+use BioX::Seq::Stream;
+use Getopt::Long;
+use Pod::Usage;
+
+my $reads1;
+my $reads2;
+my $singles;
+my $sync_suffix    = 'sync';
+my $singles_suffix = 'singles';
+my $compress       = ''; # one of 'gzip' or 'dsrc'
+my $outname1;
+my $outname2;
+my $singles1_name;
+my $singles2_name;
+
+use constant DSRC_BIN => 'dsrc';
+use constant GZIP_BIN => 'gzip';
+use constant NAME     => 'sync_reads';
+use constant VERSION  => '0.005';
+
+GetOptions(
+
+    'fwd=s'             => \$reads1,
+    'rev=s'             => \$reads2,
+    'singles'           => \$singles,
+    'fwd_out=s'         => \$outname1,
+    'rev_out=s'         => \$outname2,
+    'fwd_singles_out=s' => \$singles1_name,
+    'rev_singles_out=s' => \$singles2_name,
+    'sync_suffix=s'     => \$sync_suffix,
+    'singles_suffix=s'  => \$singles_suffix,
+    'compress:s'        => \$compress,
+    'help'              => sub{ pod2usage(-verbose => 2); },
+    'version'           => sub{ print 'This is ', NAME, ' v', VERSION, "\n";exit; },
+
+) or pod2usage( -verbose => 1 );
+
+pod2usage(-verbose => 1, -msg => "Error: input files can't be read")
+    if (! -r $reads1 || ! -r $reads2);
+
+# open output filehandles, possibly with compression
+if (! defined $outname1) {
+    $outname1 = $reads1;
+    $outname1 =~ s/([^\.]+)$/$sync_suffix\.$1/;
+}
+if (! defined $outname2) {
+    $outname2 = $reads2;
+    $outname2 =~ s/([^\.]+)$/$sync_suffix\.$1/;
+}
+my $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
+my $cmd  = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
+      : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
+      : '';
+my $suffix  = $compress eq 'gzip' ? '.gz' 
+            : $compress eq 'dsrc' ? '.dsrc'
+            : '';
+open my $s1, $mode, $cmd . $outname1 . $suffix;
+open my $s2, $mode, $cmd . $outname2 . $suffix;
+
+
+# open singles output filehandles if requested, possibly with compression
+my $up1;
+my $up2;
+if ($singles) {
+    if (! defined $singles1_name) {
+        $singles1_name = $reads1;
+        $singles1_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
+    }
+    if (! defined $singles2_name) {
+        $singles2_name = $reads2;
+        $singles2_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
+    }
+    $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
+    $cmd  = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
+          : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
+        : '';
+    my $suffix  = $compress eq 'gzip' ? '.gz' 
+                : $compress eq 'dsrc' ? '.dsrc'
+                : '';
+    open $up1, $mode, $cmd . $singles1_name . $suffix;
+    open $up2, $mode, $cmd . $singles2_name . $suffix;
+}
+
+my $ll_f1 = {};
+my $ll_f2 = {};
+my $f1_prev;
+my $f2_prev;
+my $f1_open = 1;
+my $f2_open = 1;
+
+my $parser1 = BioX::Seq::Stream->new($reads1);
+my $parser2 = BioX::Seq::Stream->new($reads2);
+
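+# Read both files in lock-step. Each unmatched read is cached along with the
+# name of the previously cached read, forming a backward-linked list; when a
+# mate is found, all reads cached before it are flushed as singletons and the
+# pair is written to the synced outputs.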
+while ($f1_open || $f2_open) {
+
+    # process next read for file 1
+    if ($f1_open && defined (my $seq = $parser1->next_seq)) {
+        my $name = $seq->id;
+        $name =~ s/\/[12]$//; #remove directional tag if present
+        if (defined $ll_f2->{$name}) {
+            my $prev = $ll_f2->{$name}->{prev} // undef;
+            purge_cache( $ll_f2, $prev,    $up2 );
+            purge_cache( $ll_f1, $f1_prev, $up1 );
+            print {$s1} $seq->as_fastq;
+            print {$s2} $ll_f2->{$name}->{read};
+            delete $ll_f2->{$name};
+            $f1_prev = undef;
+        }
+        else {
+            $ll_f1->{$name}->{read} = $seq->as_fastq;
+            $ll_f1->{$name}->{prev} = $f1_prev // undef;
+            $f1_prev = $name;
+        }
+    }
+    else {
+        $f1_open = 0;
+    }
+
+    # process next read for file 2
+    if ($f2_open && defined (my $seq = $parser2->next_seq)) {
+        my $name = $seq->id;
+        $name =~ s/\/[12]$//; #remove directional tag if present
+        if (defined $ll_f1->{$name}) {
+            my $prev = $ll_f1->{$name}->{prev} // undef;
+            purge_cache( $ll_f1, $prev,    $up1 );
+            purge_cache( $ll_f2, $f2_prev, $up2 );
+            print {$s2} $seq->as_fastq;
+            print {$s1} $ll_f1->{$name}->{read};
+            delete $ll_f1->{$name};
+            $f2_prev = undef;
+        }
+        else {
+            $ll_f2->{$name}->{read} = $seq->as_fastq;
+            $ll_f2->{$name}->{prev} = $f2_prev // undef;
+            $f2_prev = $name;
+        }
+    }
+    else {
+        $f2_open = 0;
+    }
+}
+
+#handle remaining unpaired reads if necessary
+if ($singles) {
+    purge_cache( $ll_f1, $f1_prev, $up1 );
+    purge_cache( $ll_f2, $f2_prev, $up2 );
+    close $up1;
+    close $up2;
+}
+
+exit;
+
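+# Walk the backward-linked cache starting at $prev, writing each orphaned read
+# to the singles output (when requested) and removing it from the cache.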
+sub purge_cache {
+
+    my ($ll, $prev, $fh) = @_;
+    while (defined $prev && defined $ll->{$prev}) {
+        my $prev2 = $ll->{$prev}->{prev} // undef;
+        print {$fh} $ll->{$prev}->{read} if ($singles);
+        delete $ll->{$prev};
+        $prev = $prev2;
+    }
+
+}
+
+
+__END__
+
+=head1 NAME
+
+sync_reads - resynchronize paired FASTQ files
+
+=head1 SYNOPSIS
+
+sync_reads [options] --fwd I<left_reads> --rev I<right_reads>
+
+=head1 DESCRIPTION
+
+B<sync_reads> will re-synchronize two FASTQ files containing paired reads which
+are no longer in sync due to individual removal of reads during pre-processing
+(trimming, filtering, etc). In this case, "in sync" means that both files have
+the same number of reads and, at any given read position in the files, the
+corresponding reads represent proper pairs. The resulting files will contain
+matching reads in order (assuming the input files were properly ordered). It
+will optionally print out unpaired reads to separate files. Memory usage is
+not dependent on the input file size but rather the maximum distance between
+paired reads in the two files, as the read cache is flushed each time paired
+reads are identified. In the worst-case scenario (one file has a single read
+that pairs with the last read in the matching file) memory usage can approach
+the largest file size, but in typical usage it rarely exceeds a few MB
+regardless of file size.
+
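+A typical invocation (file names are illustrative):
+
+    sync_reads --fwd trimmed_R1.fastq --rev trimmed_R2.fastq \
+        --singles --compress gzip
+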
+B<IMPORTANT:> Reads in input files MUST be in the same order, aside from
+missing reads, or the output will report many valid pairs as singletons.
+
+=head1 OPTIONS
+
+=head2 Mandatory
+
+=over 4
+
+=item B<--fwd> I<forward_fastq>
+
+Specify FASTQ file containing the first of the trimmed read pairs
+
+=item B<--rev> I<reverse_fastq>
+
+Specify FASTQ file containing the second of the trimmed read pairs
+
+=back
+
+=head2 Optional
+
+=over 4 
+
+=item B<--fwd_out> I<filename>
+
+Specify output name for synced forward reads
+
+=item B<--rev_out> I<filename>
+
+Specify output name for synced reverse reads
+
+=item B<--fwd_singles_out> I<filename>
+
+Specify output name for forward singleton reads
+
+=item B<--rev_singles_out> I<filename>
+
+Specify output name for reverse singleton reads
+
+=item B<--sync_suffix> I<suffix>
+
+Specify suffix to add to synced read output files. This will be added to the
+input read name before the final suffix (i.e. after the last period). Default
+is 'sync'.
+
+=item B<--compress> I<gzip|dsrc>
+
+Specify type of compression for output files (will compress all output files)
+
+=item B<--singles>
+
+If given, unpaired reads will be written to separate output files. Default is
+FALSE.
+
+=item B<--singles_suffix> I<suffix>
+
+Specify suffix to add to singles read output files. This will be added to the
+input read name before the final suffix (i.e. after the last period). Default
+is 'singles'.
+
+=item B<--help>
+
+Display this usage page
+
+=item B<--version>
+
+Print version information
+
+=back
+
+=head1 CAVEATS AND BUGS
+
+Currently no input validation is performed on the input files. Files are
+assumed to be standard FASTQ file format with each read represented by four
+lines and no other extraneous information present. B<CRITICALLY>, they are also
+assumed to be in the same input order after accounting for deleted reads
+(the software will fail miserably if this is not the case).
+
+Please submit bug reports to the issue tracker in the distribution repository.
+
+=head1 AUTHOR
+
+Jeremy Volkening <jdv@base2bio.com>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2014-16 Jeremy Volkening
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+=cut