b2b_bam2consensus: summarize_assembly comparison

comparison summarize_assembly @ 1:2367d00c5182 draft default tip

"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"

author	jdv
date	Tue, 28 Sep 2021 06:12:40 +0000
parents
children

comparison

equal deleted inserted replaced

-:d32139014ec3
+:2367d00c5182
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use 5.012;
+use Getopt::Long;
+use Pod::Usage;
+use List::Util qw/sum max min/;
+use List::MoreUtils qw/uniq/;
+use BioX::Seq::Stream;
+my $PROGRAM = 'summarize_assembly';
+my $VERSION = 0.002;
+my $fn_fasta;
+my @cutoffs;
+my $strip = 0;
+my $split = 0;
+# Collect command-line parameters
+my $err_msg = 'Syntax error: please check your syntax';
+GetOptions(
+'fasta=s'      => \$fn_fasta,
+'cutoffs:i{,}' => \@cutoffs,
+'strip_N'      => \$strip,
+'split_N'      => \$split,
+'help'         => sub{ pod2usage( -verbose => 2 ) },
+'version'      => sub{ print "This is $PROGRAM v$VERSION\n";exit; },
+) or pod2usage( -msg => $err_msg, -verbose => 1 );
+# Set default cutoffs if necessary and sort
+if (! scalar @cutoffs) {
+warn "No cutoff supplied, defaulting to N50\n";
+push @cutoffs, 50;
+}
+@cutoffs = sort {$a <=> $b} uniq @cutoffs;
+# Only one of 'strip_N' or 'split_N' is valid
+if ($strip && $split) {
+warn "Only one of --strip_N or --split_N is valid, ignoring --strip_N\n";
+$strip = 0;
+}
+# Check for a few necessary conditions
+die "Can't open FASTA file for reading"
+if (defined $fn_fasta && ! -r $fn_fasta);
+die "One or more cutoffs outside valid range (1-99)"
+if (grep {$_ < 1 || $_ > 99} @cutoffs);
+die "Cutoffs must be integer values"
+if (grep {$_ ne int($_)} @cutoffs);
+my @lens;
+my $N_sum  = 0;
+my $GC_sum = 0;
+# Read in sequences and calculate descriptive stats
+my $stream = BioX::Seq::Stream->new( $fn_fasta ); #STDIN if undefined
+SEQ:
+while (my $seq = $stream->next_seq) {
+my @parts = ($seq);
+@parts = split(/n+/i, $seq) if $split;
+for (@parts) {
+my $Ns   = ($_ =~ tr/Nn//);
+$N_sum  += $Ns;
+$GC_sum += ($_ =~ tr/GCgc//);
+push @lens, length($_) - $Ns * $strip;
+}
+}
+@lens = sort {$b <=> $a} @lens;
+# Calculate basic stats
+my $scaffold_count = scalar @lens;
+my $total_len   = sum @lens;
+my $N_fraction  = round( $N_sum/($total_len + $N_sum*$strip), 2 )*100;
+my $max_length  = max @lens;
+my $min_length  = min @lens;
+my $mean_length = round($total_len/$scaffold_count, 0);
+# GC percentage calculated from non-ambiguous bases only
+my $GC_fraction = round( $GC_sum/($total_len + ($strip - 1)*$N_sum), 2 )*100;
+# Calculate Nx (N50, N80, etc) values
+# For example, N50 is the size of the smallest contig for which it and all
+# larger contigs contain 50% of the total nucleotides in the database
+my $cum_length = 0;
+my @fractions  = map {$_/100} @cutoffs;
+my @Nx;
+LEN:
+for (@lens) {
+$cum_length += $_;
+if ($cum_length/$total_len >= $fractions[0]) {
+push @Nx, $_;
+shift @fractions;
+last LEN if (@fractions < 1);
+}
+}
+# Print summary
+print '-' x 40 . "\n"
+. "Summary\n"
+. '-' x 40 . "\n"
+. "number of scaffolds: $scaffold_count\n"
+. "total length:        $total_len\n"
+. "average length:      $mean_length\n"
+. "G/C content:         $GC_fraction\%\n"
+. "ambiguous content:   $N_fraction\%\n"
+. "longest scaffold:    $max_length\n";
+for (0..$#Nx) {
+my $label = sprintf "N%02d", $cutoffs[$_];
+print "$label:                 $Nx[$_]\n";
+}
+print "shortest scaffold:   $min_length\n";
+print "NOTE: Ns were stripped for above calculations\n" if ($strip);
+print "NOTE: Scaffolds were split on Ns for above calculations\n" if ($split);
+print '-' x 40 . "\n";
+exit;
+sub round {
+my ($val,$places) = @_;
+return int($val*10**$places+0.5)/10**$places;
+}
+__END__
+=head1 NAME
+summarize_assembly - print basic summary info for a file of assembly scaffolds
+=head1 SYNOPSIS
+summarize_assembly [--cutoffs I<cutoff_1> I<cutoff_2> .. I<cutoff_N> --strip_N --split_N ] --fasta I<input_file>]
+=head1 DESCRIPTION
+This program takes a FASTA file and optionally a list of cutoff values as
+input and prints out summary information about the contigs/scaffolds contained
+in the file. You can, of course, supply a FASTA file of any sort of nucleic
+acid sequences, but the summary information makes most sense for contigs from
+genomic sequencing assemblies.
+=head1 OPTIONS
+=over
+=item B<--fasta> I<filename>
+Specify contig/scaffold file from which to read input (default: STDIN)
+=item B<--cutoffs>
+Space-separated integer list of cutoffs to calculate (e.g. '--cutoffs 50 90'
+will output N50 and N90 values) (default: 50)
+=item B<--strip_N>
+If specified, Ns will be stripped from scaffold sequences before statistics
+are calculated (default: FALSE)
+=item B<--split_N>
+If specified, scaffold sequences will be split at regions of one or more Ns
+before statistics are calculated (e.g. to get contig-level stats from a
+scaffold file). Note that if this flag is specified, the value of '--strip_N'
+will be ignored. (default: FALSE)
+=item B<--help>
+Display this usage page
+=item B<--version>
+Print version information
+=back
+=head1 CAVEATS AND BUGS
+Please submit bug reports to the issue tracker in the distribution repository.
+=head1 AUTHOR
+Jeremy Volkening (jdv@base2bio.com)
+=head1 LICENSE AND COPYRIGHT
+Copyright 2014-16 Jeremy Volkening
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+=cut

Mercurial > repos > jdv > b2b_bam2consensus

comparison summarize_assembly @ 1:2367d00c5182 draft default tip