Mercurial > repos > jdv > b2b_bam2consensus
comparison summarize_assembly @ 1:2367d00c5182 draft default tip
"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
| author | jdv |
|---|---|
| date | Tue, 28 Sep 2021 06:12:40 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:d32139014ec3 | 1:2367d00c5182 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 use 5.012; | |
| 6 | |
| 7 use Getopt::Long; | |
| 8 use Pod::Usage; | |
| 9 use List::Util qw/sum max min/; | |
| 10 use List::MoreUtils qw/uniq/; | |
| 11 use BioX::Seq::Stream; | |
| 12 | |
| 13 my $PROGRAM = 'summarize_assembly'; | |
| 14 my $VERSION = 0.002; | |
| 15 | |
| 16 my $fn_fasta; | |
| 17 my @cutoffs; | |
| 18 my $strip = 0; | |
| 19 my $split = 0; | |
| 20 | |
| 21 # Collect command-line parameters | |
| 22 my $err_msg = 'Syntax error: please check your syntax'; | |
| 23 GetOptions( | |
| 24 'fasta=s' => \$fn_fasta, | |
| 25 'cutoffs:i{,}' => \@cutoffs, | |
| 26 'strip_N' => \$strip, | |
| 27 'split_N' => \$split, | |
| 28 'help' => sub{ pod2usage( -verbose => 2 ) }, | |
| 29 'version' => sub{ print "This is $PROGRAM v$VERSION\n";exit; }, | |
| 30 ) or pod2usage( -msg => $err_msg, -verbose => 1 ); | |
| 31 | |
| 32 # Set default cutoffs if necessary and sort | |
| 33 if (! scalar @cutoffs) { | |
| 34 warn "No cutoff supplied, defaulting to N50\n"; | |
| 35 push @cutoffs, 50; | |
| 36 } | |
| 37 @cutoffs = sort {$a <=> $b} uniq @cutoffs; | |
| 38 | |
| 39 # Only one of 'strip_N' or 'split_N' is valid | |
| 40 if ($strip && $split) { | |
| 41 warn "Only one of --strip_N or --split_N is valid, ignoring --strip_N\n"; | |
| 42 $strip = 0; | |
| 43 } | |
| 44 | |
| 45 # Check for a few necessary conditions | |
| 46 die "Can't open FASTA file for reading" | |
| 47 if (defined $fn_fasta && ! -r $fn_fasta); | |
| 48 die "One or more cutoffs outside valid range (1-99)" | |
| 49 if (grep {$_ < 1 || $_ > 99} @cutoffs); | |
| 50 die "Cutoffs must be integer values" | |
| 51 if (grep {$_ ne int($_)} @cutoffs); | |
| 52 my @lens; | |
| 53 my $N_sum = 0; | |
| 54 my $GC_sum = 0; | |
| 55 | |
| 56 # Read in sequences and calculate descriptive stats | |
| 57 my $stream = BioX::Seq::Stream->new( $fn_fasta ); #STDIN if undefined | |
| 58 | |
| 59 SEQ: | |
| 60 while (my $seq = $stream->next_seq) { | |
| 61 my @parts = ($seq); | |
| 62 @parts = split(/n+/i, $seq) if $split; | |
| 63 for (@parts) { | |
| 64 my $Ns = ($_ =~ tr/Nn//); | |
| 65 $N_sum += $Ns; | |
| 66 $GC_sum += ($_ =~ tr/GCgc//); | |
| 67 push @lens, length($_) - $Ns * $strip; | |
| 68 } | |
| 69 } | |
| 70 @lens = sort {$b <=> $a} @lens; | |
| 71 | |
| 72 # Calculate basic stats | |
| 73 my $scaffold_count = scalar @lens; | |
| 74 my $total_len = sum @lens; | |
| 75 my $N_fraction = round( $N_sum/($total_len + $N_sum*$strip), 2 )*100; | |
| 76 my $max_length = max @lens; | |
| 77 my $min_length = min @lens; | |
| 78 my $mean_length = round($total_len/$scaffold_count, 0); | |
| 79 | |
| 80 # GC percentage calculated from non-ambiguous bases only | |
| 81 my $GC_fraction = round( $GC_sum/($total_len + ($strip - 1)*$N_sum), 2 )*100; | |
| 82 | |
| 83 # Calculate Nx (N50, N80, etc) values | |
| 84 # For example, N50 is the size of the smallest contig for which it and all | |
| 85 # larger contigs contain 50% of the total nucleotides in the database | |
| 86 my $cum_length = 0; | |
| 87 my @fractions = map {$_/100} @cutoffs; | |
| 88 my @Nx; | |
| 89 | |
| 90 LEN: | |
| 91 for (@lens) { | |
| 92 $cum_length += $_; | |
| 93 if ($cum_length/$total_len >= $fractions[0]) { | |
| 94 push @Nx, $_; | |
| 95 shift @fractions; | |
| 96 last LEN if (@fractions < 1); | |
| 97 } | |
| 98 } | |
| 99 | |
| 100 # Print summary | |
| 101 print '-' x 40 . "\n" | |
| 102 . "Summary\n" | |
| 103 . '-' x 40 . "\n" | |
| 104 . "number of scaffolds: $scaffold_count\n" | |
| 105 . "total length: $total_len\n" | |
| 106 . "average length: $mean_length\n" | |
| 107 . "G/C content: $GC_fraction\%\n" | |
| 108 . "ambiguous content: $N_fraction\%\n" | |
| 109 . "longest scaffold: $max_length\n"; | |
| 110 for (0..$#Nx) { | |
| 111 my $label = sprintf "N%02d", $cutoffs[$_]; | |
| 112 print "$label: $Nx[$_]\n"; | |
| 113 } | |
| 114 print "shortest scaffold: $min_length\n"; | |
| 115 print "NOTE: Ns were stripped for above calculations\n" if ($strip); | |
| 116 print "NOTE: Scaffolds were split on Ns for above calculations\n" if ($split); | |
| 117 print '-' x 40 . "\n"; | |
| 118 | |
| 119 exit; | |
| 120 | |
| 121 sub round { | |
| 122 | |
| 123 my ($val,$places) = @_; | |
| 124 return int($val*10**$places+0.5)/10**$places; | |
| 125 | |
| 126 } | |
| 127 | |
| 128 | |
| 129 __END__ | |
| 130 | |
| 131 =head1 NAME | |
| 132 | |
| 133 summarize_assembly - print basic summary info for a file of assembly scaffolds | |
| 134 | |
| 135 =head1 SYNOPSIS | |
| 136 | |
| 137 summarize_assembly [--cutoffs I<cutoff_1> I<cutoff_2> .. I<cutoff_N> --strip_N --split_N ] --fasta I<input_file>] | |
| 138 | |
| 139 =head1 DESCRIPTION | |
| 140 | |
| 141 This program takes a FASTA file and optionally a list of cutoff values as | |
| 142 input and prints out summary information about the contigs/scaffolds contained | |
| 143 in the file. You can, of course, supply a FASTA file of any sort of nucleic | |
| 144 acid sequences, but the summary information makes most sense for contigs from | |
| 145 genomic sequencing assemblies. | |
| 146 | |
| 147 =head1 OPTIONS | |
| 148 | |
| 149 =over | |
| 150 | |
| 151 =item B<--fasta> I<filename> | |
| 152 | |
| 153 Specify contig/scaffold file from which to read input (default: STDIN) | |
| 154 | |
| 155 =item B<--cutoffs> | |
| 156 | |
| 157 Space-separated integer list of cutoffs to calculate (e.g. '--cutoffs 50 90' | |
| 158 will output N50 and N90 values) (default: 50) | |
| 159 | |
| 160 =item B<--strip_N> | |
| 161 | |
| 162 If specified, Ns will be stripped from scaffold sequences before statistics | |
| 163 are calculated (default: FALSE) | |
| 164 | |
| 165 =item B<--split_N> | |
| 166 | |
| 167 If specified, scaffold sequences will be split at regions of one or more Ns | |
| 168 before statistics are calculated (e.g. to get contig-level stats from a | |
| 169 scaffold file). Note that if this flag is specified, the value of '--strip_N' | |
| 170 will be ignored. (default: FALSE) | |
| 171 | |
| 172 =item B<--help> | |
| 173 | |
| 174 Display this usage page | |
| 175 | |
| 176 =item B<--version> | |
| 177 | |
| 178 Print version information | |
| 179 | |
| 180 =back | |
| 181 | |
| 182 =head1 CAVEATS AND BUGS | |
| 183 | |
| 184 Please submit bug reports to the issue tracker in the distribution repository. | |
| 185 | |
| 186 =head1 AUTHOR | |
| 187 | |
| 188 Jeremy Volkening (jdv@base2bio.com) | |
| 189 | |
| 190 =head1 LICENSE AND COPYRIGHT | |
| 191 | |
| 192 Copyright 2014-16 Jeremy Volkening | |
| 193 | |
| 194 This program is free software: you can redistribute it and/or modify | |
| 195 it under the terms of the GNU General Public License as published by | |
| 196 the Free Software Foundation, either version 3 of the License, or | |
| 197 (at your option) any later version. | |
| 198 | |
| 199 This program is distributed in the hope that it will be useful, | |
| 200 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 201 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 202 GNU General Public License for more details. | |
| 203 | |
| 204 You should have received a copy of the GNU General Public License | |
| 205 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 206 | |
| 207 =cut | |
| 208 |
