diff libs/sratoolkit.2.8.0-centos_linux64/example/perl/quality-stats.pl @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libs/sratoolkit.2.8.0-centos_linux64/example/perl/quality-stats.pl	Mon Nov 27 11:21:07 2017 -0500
@@ -0,0 +1,224 @@
+#!/usr/bin/env perl
+# ===========================================================================
+#
+#                            PUBLIC DOMAIN NOTICE
+#               National Center for Biotechnology Information
+#
+#  This software/database is a "United States Government Work" under the
+#  terms of the United States Copyright Act.  It was written as part of
+#  the author's official duties as a United States Government employee and
+#  thus cannot be copyrighted.  This software/database is freely available
+#  to the public for use. The National Library of Medicine and the U.S.
+#  Government have not placed any restriction on its use or reproduction.
+#
+#  Although all reasonable efforts have been taken to ensure the accuracy
+#  and reliability of the software and data, the NLM and the U.S.
+#  Government do not and cannot warrant the performance or results that
+#  may be obtained by using this software or data. The NLM and the U.S.
+#  Government disclaim all warranties, express or implied, including
+#  warranties of performance, merchantability or fitness for any particular
+#  purpose.
+#
+#  Please cite the author in any work or product based on this material.
+#
+# ===========================================================================
+
+use warnings;
+
+use constant MAX_SCORE_COUNT => 64;
+use constant MAX_READ_COUNT => 2;
+use Data::Dumper;
+
+my %opts = (
+    'row-range' => '-1000000',
+);
+
+sub usage()
+{
+    print <<"HELP";
+report QUALITY statistics for an SRA
+
+Usage:
+$0 [<options>...] <accession>...
+options are
+    -h | -? | --help
+    --row-range <row-range-spec>    default is first 1000000 spots
+
+Example:
+    $0 --row-range "5,7,11-15,25-37" SRR797646
+
+HELP
+    exit 0;
+}
+
+usage if scalar @ARGV == 0;
+foreach (@ARGV) {
+    usage() if (/^-h$/ || /^-\?$/ || /^--help$/);
+}
+
+my $VDB_DUMP = `which vdb-dump` or die "Please put path to vdb-dump in PATH";
+chomp $VDB_DUMP;
+
+my @stats = ();
+my $max_read = 0;
+my $max_len = 0;
+my $max_score = 0;
+
+sub split_record($$$$)
+{
+    my @fld = \split(/\t/, ${$_[0]});
+    @{$_[1]} = split(/,\s*/, ${$fld[0]});
+    @{$_[2]} = split(/,\s*/, ${$fld[1]});
+    @{$_[3]} = split(/,\s*/, ${$fld[2]});
+    
+    return [ split(/,\s*/, ${$fld[3]}) ];
+}
+
+sub resize_stats($)
+{
+    my $need = scalar(@{$_[0]}) * MAX_SCORE_COUNT * MAX_READ_COUNT;
+    my $have = scalar(@stats);
+    
+    return unless ($have < $need);
+    splice @stats, $have, 0, map { 0 } (1..($need - $have));
+}
+
+sub expand_and_summarize($)
+{
+    my @rs;
+    my @rl;
+    my @rt;
+    my $qv = split_record $_[0], \@rs, \@rl, \@rt;
+    my $reads = scalar @rs;
+
+    die "inconsistent data" unless scalar(@rl) == $reads;
+    die "inconsistent data" unless scalar(@rt) == $reads;
+
+    resize_stats $qv;
+    $max_read = $reads if $max_read < $reads;
+    for (my $read = 0; $read != $reads; ++$read) {
+        next unless $rt[$read] =~ /SRA_READ_TYPE_BIOLOGICAL/;
+        my $pos = $rs[$read];
+        my $len = $rl[$read];
+        
+        $max_len = $len if ($max_len < $len);
+        for (my $i = 0; $i != $len; ++$i) {
+            my $score = $qv->[$pos + $i] + 0;
+            $score = MAX_SCORE_COUNT - 1 if $score >= MAX_SCORE_COUNT;
+            my $idx = ($i * MAX_SCORE_COUNT + $score) * MAX_READ_COUNT + $read;
+            
+            $max_score = $score if $max_score < $score;
+            ++$stats[$idx];
+        }
+    }
+}
+
+sub warn_if_aligned($)
+{
+    return unless `$VDB_DUMP -y "$_[0]"` =~ /Database/i;
+    open CMD, '-|', "$VDB_DUMP -E \"$_[0]\"" or die "$!";
+
+    while (defined(local $_ = <CMD>)) {
+        chomp;
+        if (/PRIMARY_ALIGNMENT/) {
+            print "$_[0] is aligned; results may be biased to the reference\n";
+            last;
+        }
+    }
+    close CMD;
+}
+
+sub process($)
+{
+    my $cmd = "$VDB_DUMP -f tab -C \"READ_START,READ_LEN,READ_TYPE,QUALITY\" -R $opts{'row-range'} \"$_[0]\"";
+    open CMD, '-|', $cmd or die "$!";
+
+    while (defined($_ = <CMD>)) {
+        chomp;
+        expand_and_summarize \$_;
+    }
+    close CMD;
+}
+
+for (my $i = 0; $i < scalar @ARGV; ++$i) {
+    $_ = $ARGV[$i];
+
+    if (/^-/) {
+        if (/^--row-range$/) {
+            $opts{'row-range'} = $ARGV[++$i];
+            next;
+        }
+        usage();
+    }
+    warn_if_aligned $_;
+    process $_;
+}
+
+print join("\t", ('Read', 'Pos', 'Min', '25%', '50%', '75%', 'Max', 'Avg', 'St.Dev.', 'Mode'))."\n";
+
+sub partition($$$)
+{
+    my ($array, $cnt, $le) = @_;
+    my $N = scalar(@{$array});
+    my $ge = 1.0 - $le;
+    my $i;
+    my $rsum = 0;
+    
+    for ($i = 0; $i != $N; ++$i) {
+        next if $i == 2;
+        my $n = $array->[$i];
+        
+        next unless $n;
+        
+        my $n_le = $rsum + $n;
+        my $n_ge = $cnt - $rsum + $n;
+
+        $rsum = $n_le;
+        return $i if ($n_le/$cnt >= $le && $n_ge/$cnt >= $ge);
+    }
+    return undef;
+}
+
+for (my $read = 0; $read != $max_read; ++$read) {
+    for (my $pos = 0; $pos != $max_len; ++$pos) {
+        my $sum = 0;
+        my $ssum = 0;
+        my $cnt = 0;
+        my $most = -1;
+        my @part = ( MAX_SCORE_COUNT, 0, 0, 0, 0 );
+        my @mode = ();
+        my $array = [ map { ($stats[($pos * MAX_SCORE_COUNT + $_) * MAX_READ_COUNT + $read] || 0) } (0..$max_score) ];
+        
+        for (my $i = 0; $i != $max_score + 1; ++$i) {
+            my $n = $array->[$i];
+
+            $most = $n if ($most < $n);
+            if ($i != 2) {
+                my $x = $n * $i;
+                
+                $cnt += $n;
+                $sum += $x;
+                $ssum += $i * $x;
+            }
+        }
+        if ($cnt) {
+            my $i;
+            
+            for ($i = 0; $i != $max_score + 1; ++$i) {
+                my $n = $array->[$i];
+            
+                if ($n) {
+                    push @mode, $i if ($n == $most);
+                    if ($i != 2) {
+                        $part[0] = $i if $part[0] > $i;
+                        $part[4] = $i if $part[4] < $i;
+                    }
+                }
+            }
+            $part[1] = partition $array, $cnt, 0.25;
+            $part[2] = partition $array, $cnt, 0.5;
+            $part[3] = partition $array, $cnt, 0.75;
+            printf join("\t", ($read + 1, $pos + 1, @part, $sum/$cnt, sqrt(($ssum - $sum*$sum/$cnt)/$cnt), @mode))."\n";
+        }
+    }
+}