Mercurial > repos > charles_s_test > seqsero2
diff libs/sratoolkit.2.8.0-centos_linux64/example/perl/base-stats.pl @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libs/sratoolkit.2.8.0-centos_linux64/example/perl/base-stats.pl Mon Nov 27 11:21:07 2017 -0500 @@ -0,0 +1,304 @@ +#!/usr/bin/env perl +# ============================================================================= +# +# PUBLIC DOMAIN NOTICE +# National Center for Biotechnology Information +# +# This software/database is a "United States Government Work" under the +# terms of the United States Copyright Act. It was written as part of +# the author's official duties as a United States Government employee and +# thus cannot be copyrighted. This software/database is freely available +# to the public for use. The National Library of Medicine and the U.S. +# Government have not placed any restriction on its use or reproduction. +# +# Although all reasonable efforts have been taken to ensure the accuracy +# and reliability of the software and data, the NLM and the U.S. +# Government do not and cannot warrant the performance or results that +# may be obtained by using this software or data. The NLM and the U.S. +# Government disclaim all warranties, express or implied, including +# warranties of performance, merchantability or fitness for any particular +# purpose. +# +# Please cite the author in any work or product based on this material. +# +# ============================================================================= + +=head1 NAME + +base-stats - report base statistics for an SRA run + +=head1 DESCRIPTION + +This script computes base and GC content statistics for an SRA run. The output +is summarized by mate and position within the mate. + +=head1 PURPOSE + +This script demonstrates driving the SRA Toolkit tool 'vdb-dump' from Perl for +accessing data in the SRA. It requires that the toolkit be properly configured. +'vdb-dump' is the Jack-of-all-trades of the SRA Toolkit. + +=head1 EXAMPLE + +base-stats.pl SRR797646 + +=head1 SEE ALSO + +base-stats.pl --help + +vdb-dump --help + +=cut + +use strict; +use warnings; + +sub usage() +{ + print <<"HELP"; +report base statistics for an SRA run + +Usage: +$0 [<options>...] <accession>... +options are + -h | -? | --help shows this message + --row-range <row-range-spec> default is spots 1-1000000 + row-range-spec a comma seperated list of row numbers and/or + ranges; a range is <first>-<last> + + --output-type <output-spec> set the type of data output; proportion is + the default + output-spec one of 'count', 'percent', or 'proportion' + + accession an SRA accession or path to an SRA file + +Example: + $0 --row-range "5,7,11-15,25-37" --output-type count SRR797646 + +HELP + exit 0; +} + +usage() if scalar @ARGV == 0; +foreach (@ARGV) { + usage() if (/^-h$/ || /^-\?$/ || /^--help$/); +} + +use constant MAX_READ_COUNT => 2; + +# the columns to be read from the input +use constant COLUMN_LIST => 'READ_START,READ_LEN,READ_TYPE,READ'; + +# need vdb-dump to read input files +my $VDB_DUMP = `which vdb-dump` or die "Please put path to vdb-dump in PATH"; +chomp $VDB_DUMP; + +my %opts = ( + 'row-range' => '1-1000000', # first million spots + 'output-type' => 'proportion' +); + +sub foreach_row($$) +{ + my ($source, $callback) = @_; + my $cmd = $VDB_DUMP + . ' --rows "' . $opts{'row-range'} . '"' # set the row range + . ' --columns "' . COLUMN_LIST . '"' # set the columns to be read + . ' --format tab' # set output format to tab delimited + . ' "' . $source . '"'; # the file/accession to be read + open CMD, '-|', $cmd or die "failed to exec vdb-dump: $!"; + + while (defined(local $_ = <CMD>)) { +# print; # uncomment to print raw output from vdb-dump (e.g. for debugging) + chomp; + + &$callback(\split(/\t/, $_)); + } + close CMD; +} + +sub sra_object_type($) +{ + open CMD, '-|', $VDB_DUMP .' --obj_type "' . $_[0] . '"' or die "failed to exec vdb-dump: $!"; + local $_ = <CMD>; + + close CMD; + chomp; + return $_; +} + +sub sra_list_tables($) +{ + my @rslt = (); + + if (sra_object_type($_[0]) eq 'Database') { + open CMD, '-|', $VDB_DUMP . ' --table_enum "' . $_[0] . '"' or die "failed to exec vdb-dump: $!"; + while (defined(local $_ = <CMD>)) { + push @rslt, $1 if /^tbl #\d+:\s*(\w+)/; + } + close CMD; + } + return @rslt; +} + +sub warn_if_aligned($); +sub print_result(); +sub process; + +my @stats = (); +my $max_read = 0; +my $max_pos = 0; + +for (my $i = 0; $i < scalar @ARGV; ++$i) { + $_ = $ARGV[$i]; + + if (/^-/) { + if (/^--row-range$/) { + $opts{'row-range'} = $ARGV[++$i]; + next; + } + if (/^--output-type$/) { + my $type = $ARGV[++$i]; + + if ( $type eq 'count' + || $type eq 'proportion' + || $type eq 'percent') + { + $opts{'output-type'} = $type; + next; + } + } + usage(); + } + warn_if_aligned $_; + foreach_row($_, \&process); +} + +print_result(); +exit 0; + +sub warn_if_aligned($) +{ + local $_; + + foreach (sra_list_tables($_[0])) { + if (/PRIMARY_ALIGNMENT/) { + print "$_[0] is aligned and is likely to be mostly ordered; results may be biased to the reference\n"; + last; + } + } +} + +sub get_stat($$$) +{ + # (pos * 5 + base) * MAX_READ_COUNT + read_number; + return $stats[($_[1] * 5 + $_[2]) * MAX_READ_COUNT + $_[0]]; +} + +sub inc_stat($$$) # this is called a zillion times and needs to be fast +{ + # (pos * 5 + base) * MAX_READ_COUNT + read_number; + ++$stats[($_[1] * 5 + $_[2]) * MAX_READ_COUNT + $_[0]]; +} + +sub resize_stats($) +{ + my $length = $_[0]; + my $need = MAX_READ_COUNT * 5 * $length; + my $have = scalar(@stats); + + return if $have >= $need; + splice @stats, $have, 0, map { 0 } (1..($need - $have)); +} + +sub convert_bases($) +{ + local $_ = ${$_[0]}; + + tr/ACGTN/01234/; + return $_; +} + +# accumulate statistics for one spot +sub process +{ + # this matches COLUMN_LIST above + my ($READ_START, $READ_LEN, $READ_TYPE, $READ) = @_; + + # the data in these fields is comma delimited + # and describe how to split READ up into its pieces + my @start = split(/,\s*/, $$READ_START); + my @length = split(/,\s*/, $$READ_LEN); + my @type = split(/,\s*/, $$READ_TYPE); + + # READ contains the bases for the whole spot (i.e. all the mates concatenated) + my @bases = split(//, convert_bases($READ)); + my $reads = scalar(@start); + + die "too many reads: $reads reads; adjust MAX_READ_COUNT and rerun" unless $reads <= MAX_READ_COUNT; + + $max_read = $reads if ($max_read < $reads); + for (my $read = 0; $read != $reads; ++$read) { + # only count biological bases (i.e. no adapters, linkers, etc.) + next unless $type[$read] =~ /SRA_READ_TYPE_BIOLOGICAL/; + my $pos = $start[$read]; + my $len = $length[$read]; + + resize_stats($len); + $max_pos = $len if ($max_pos < $len); + for (my $i = 0; $i != $len; ++$i) { + inc_stat($read, $i, $bases[$pos + $i]); + } + } +} + +sub print_result() { + my $GCp_mean; + my $GCp_stdev; + my $sum = 0; + my $ssum = 0; + my $format = + $opts{'output-type'} eq 'count' ? sub { $_[0] } : + $opts{'output-type'} eq 'percent' ? sub { int($_[0] * 100.0/$_[1] + 0.5).'%' } : + sub { $_[0]/$_[1] }; + + for (my $read = 0; $read != $max_read; ++$read) { + for (my $pos = 0; $pos != $max_pos; ++$pos) { + my $gc = (get_stat($read, $pos, 1) + get_stat($read, $pos, 2)) * 1.0; + my $n = $gc + get_stat($read, $pos, 0) + get_stat($read, $pos, 3); + my $x = $gc/$n; + + $sum += $x; + $ssum += $x * $x; + } + } + $GCp_mean = $sum/($max_pos*$max_read); + $GCp_stdev = sqrt(($ssum - $sum*$GCp_mean)/($max_pos*$max_read)); + + printf join("\t", ('Read', 'Pos', 'A', 'C', 'G', 'T', 'N', 'GC', 'Suspect'))."\n"; + for (my $read = 0; $read != $max_read; ++$read) { + for (my $pos = 0; $pos != $max_pos; ++$pos) { + my $cntA = get_stat($read, $pos, 0); + my $cntC = get_stat($read, $pos, 1); + my $cntG = get_stat($read, $pos, 2); + my $cntT = get_stat($read, $pos, 3); + my $cntN = get_stat($read, $pos, 4); + my $GC = $cntC + $cntG; + my $tot = $cntA + $GC + $cntT; + + # keep N out for GC + my $GC_score = ($GC/$tot - $GCp_mean)/$GCp_stdev; + + # add N in + $tot += $cntN; + print join("\t", ($read + 1, $pos + 1, + &$format($cntA, $tot), + &$format($cntC, $tot), + &$format($cntG, $tot), + &$format($cntT, $tot), + &$format($cntN, $tot), + &$format($GC, $tot), + '*'x(int(abs($GC_score)))))."\n"; + } + } +}