view summarize_assembly @ 1:20d941ecc1bb draft default tip

"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author jdv
date Tue, 28 Sep 2021 06:11:32 +0000
parents
children
line wrap: on
line source

#!/usr/bin/env perl

use strict;
use warnings;
use 5.012;

use Getopt::Long;
use Pod::Usage;
use List::Util qw/sum max min/;
use List::MoreUtils qw/uniq/;
use BioX::Seq::Stream;

my $PROGRAM = 'summarize_assembly';
my $VERSION = 0.002;

my $fn_fasta;
my @cutoffs;
my $strip = 0;
my $split = 0;

# Collect command-line parameters
my $err_msg = 'Syntax error: please check your syntax';
GetOptions(
    'fasta=s'      => \$fn_fasta,
    'cutoffs:i{,}' => \@cutoffs,
    'strip_N'      => \$strip,
    'split_N'      => \$split,
    'help'         => sub{ pod2usage( -verbose => 2 ) },
    'version'      => sub{ print "This is $PROGRAM v$VERSION\n";exit; },
) or pod2usage( -msg => $err_msg, -verbose => 1 );

# Set default cutoffs if necessary and sort
if (! scalar @cutoffs) {
    warn "No cutoff supplied, defaulting to N50\n";
    push @cutoffs, 50;
}
@cutoffs = sort {$a <=> $b} uniq @cutoffs;

# Only one of 'strip_N' or 'split_N' is valid
if ($strip && $split) {
    warn "Only one of --strip_N or --split_N is valid, ignoring --strip_N\n";
    $strip = 0;
}

# Check for a few necessary conditions
die "Can't open FASTA file for reading"
    if (defined $fn_fasta && ! -r $fn_fasta);
die "One or more cutoffs outside valid range (1-99)"
    if (grep {$_ < 1 || $_ > 99} @cutoffs);
die "Cutoffs must be integer values"
    if (grep {$_ ne int($_)} @cutoffs);
my @lens;
my $N_sum  = 0;
my $GC_sum = 0;

# Read in sequences and calculate descriptive stats
my $stream = BioX::Seq::Stream->new( $fn_fasta ); #STDIN if undefined

SEQ:
while (my $seq = $stream->next_seq) {
    my @parts = ($seq);
    @parts = split(/n+/i, $seq) if $split;
    for (@parts) {
        my $Ns   = ($_ =~ tr/Nn//);
        $N_sum  += $Ns;
        $GC_sum += ($_ =~ tr/GCgc//);
        push @lens, length($_) - $Ns * $strip;
    }
}
@lens = sort {$b <=> $a} @lens;

# Calculate basic stats
my $scaffold_count = scalar @lens;
my $total_len   = sum @lens;
my $N_fraction  = round( $N_sum/($total_len + $N_sum*$strip), 2 )*100;
my $max_length  = max @lens;
my $min_length  = min @lens;
my $mean_length = round($total_len/$scaffold_count, 0);

# GC percentage calculated from non-ambiguous bases only
my $GC_fraction = round( $GC_sum/($total_len + ($strip - 1)*$N_sum), 2 )*100;

# Calculate Nx (N50, N80, etc) values
# For example, N50 is the size of the smallest contig for which it and all
# larger contigs contain 50% of the total nucleotides in the database
my $cum_length = 0;
my @fractions  = map {$_/100} @cutoffs;
my @Nx;

LEN:
for (@lens) {
    $cum_length += $_;
    if ($cum_length/$total_len >= $fractions[0]) {
        push @Nx, $_;
        shift @fractions;
        last LEN if (@fractions < 1);
    }
}

# Print summary
print '-' x 40 . "\n"
    . "Summary\n"
    . '-' x 40 . "\n"
    . "number of scaffolds: $scaffold_count\n"
    . "total length:        $total_len\n"
    . "average length:      $mean_length\n"
    . "G/C content:         $GC_fraction\%\n"
    . "ambiguous content:   $N_fraction\%\n"
    . "longest scaffold:    $max_length\n";
for (0..$#Nx) {
    my $label = sprintf "N%02d", $cutoffs[$_];
    print "$label:                 $Nx[$_]\n";
}
print "shortest scaffold:   $min_length\n";
print "NOTE: Ns were stripped for above calculations\n" if ($strip);
print "NOTE: Scaffolds were split on Ns for above calculations\n" if ($split);
print '-' x 40 . "\n";

exit;

sub round {

    my ($val,$places) = @_;
    return int($val*10**$places+0.5)/10**$places;

}
    

__END__

=head1 NAME

summarize_assembly - print basic summary info for a file of assembly scaffolds

=head1 SYNOPSIS

summarize_assembly [--cutoffs I<cutoff_1> I<cutoff_2> .. I<cutoff_N> --strip_N --split_N ] --fasta I<input_file>]

=head1 DESCRIPTION

This program takes a FASTA file and optionally a list of cutoff values as
input and prints out summary information about the contigs/scaffolds contained
in the file. You can, of course, supply a FASTA file of any sort of nucleic
acid sequences, but the summary information makes most sense for contigs from
genomic sequencing assemblies.

=head1 OPTIONS

=over

=item B<--fasta> I<filename>

Specify contig/scaffold file from which to read input (default: STDIN)

=item B<--cutoffs>

Space-separated integer list of cutoffs to calculate (e.g. '--cutoffs 50 90'
will output N50 and N90 values) (default: 50)

=item B<--strip_N>

If specified, Ns will be stripped from scaffold sequences before statistics
are calculated (default: FALSE)

=item B<--split_N>

If specified, scaffold sequences will be split at regions of one or more Ns
before statistics are calculated (e.g. to get contig-level stats from a
scaffold file). Note that if this flag is specified, the value of '--strip_N'
will be ignored. (default: FALSE)

=item B<--help>

Display this usage page

=item B<--version>

Print version information

=back

=head1 CAVEATS AND BUGS

Please submit bug reports to the issue tracker in the distribution repository.

=head1 AUTHOR

Jeremy Volkening (jdv@base2bio.com)

=head1 LICENSE AND COPYRIGHT

Copyright 2014-16 Jeremy Volkening

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

=cut