comparison libs/sratoolkit.2.8.0-centos_linux64/example/perl/quality-stats.pl @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
comparison
equal deleted inserted replaced
2:0d65b71ff8df 3:38ad1130d077
1 #!/usr/bin/env perl
2 # ===========================================================================
3 #
4 # PUBLIC DOMAIN NOTICE
5 # National Center for Biotechnology Information
6 #
7 # This software/database is a "United States Government Work" under the
8 # terms of the United States Copyright Act. It was written as part of
9 # the author's official duties as a United States Government employee and
10 # thus cannot be copyrighted. This software/database is freely available
11 # to the public for use. The National Library of Medicine and the U.S.
12 # Government have not placed any restriction on its use or reproduction.
13 #
14 # Although all reasonable efforts have been taken to ensure the accuracy
15 # and reliability of the software and data, the NLM and the U.S.
16 # Government do not and cannot warrant the performance or results that
17 # may be obtained by using this software or data. The NLM and the U.S.
18 # Government disclaim all warranties, express or implied, including
19 # warranties of performance, merchantability or fitness for any particular
20 # purpose.
21 #
22 # Please cite the author in any work or product based on this material.
23 #
24 # ===========================================================================
25
26 use warnings;
27
28 use constant MAX_SCORE_COUNT => 64;
29 use constant MAX_READ_COUNT => 2;
30 use Data::Dumper;
31
# Option values keyed by option name (without the leading dashes).
# 'row-range' is passed to vdb-dump's -R switch when a run is processed.
my %opts = ('row-range' => '-1000000');
35
# Print the command-line help to standard output, then terminate the script
# with exit status 0.  Never returns.
sub usage()
{
    # $0 is interpolated so the help shows the name the script was run as.
    my $help_text = <<"HELP";
report QUALITY statistics for an SRA

Usage:
$0 [<options>...] <accession>...
options are
-h | -? | --help
--row-range <row-range-spec> default is first 1000000 spots

Example:
$0 --row-range "5,7,11-15,25-37" SRR797646

HELP

    print $help_text;
    exit 0;
}
53
# Show the help (and exit) when there are no arguments at all, or when any
# argument is one of the help switches, before any real work is started.
usage() unless @ARGV;
usage() if grep { /^-h$/ || /^-\?$/ || /^--help$/ } @ARGV;
58
# Ask the shell where vdb-dump lives; an empty answer means it is not in PATH.
my $VDB_DUMP = `which vdb-dump`;
die "Please put path to vdb-dump in PATH" unless $VDB_DUMP;
chomp $VDB_DUMP;

# Accumulated counts, stored flat and indexed as
#   (position * MAX_SCORE_COUNT + score) * MAX_READ_COUNT + read
my @stats;

# Largest read count, read length and quality score seen so far; they bound
# the loops that print the final report.
my ($max_read, $max_len, $max_score) = (0, 0, 0);
66
# Split one tab-separated record into its first four columns, each of which
# is a comma-separated list.
#
#   $_[0]  reference to the record string
#   $_[1]  array reference, overwritten with the items of column 1
#   $_[2]  array reference, overwritten with the items of column 2
#   $_[3]  array reference, overwritten with the items of column 3
#
# Returns a new array reference holding the items of column 4.
sub split_record($$$$)
{
    my ($record, $first, $second, $third) = @_;
    my @column = split(/\t/, ${$record});

    @{$first}  = split(/,\s*/, $column[0]);
    @{$second} = split(/,\s*/, $column[1]);
    @{$third}  = split(/,\s*/, $column[2]);

    my @fourth = split(/,\s*/, $column[3]);
    return \@fourth;
}
76
# Make sure @stats has a zero-initialised slot for every
# (position, score, read) combination of a spot.
#
#   $_[0]  array reference; its element count is used as the number of
#          positions to make room for
#
# @stats is only ever grown, never shrunk.
sub resize_stats($)
{
    my ($values) = @_;
    my $wanted = scalar(@{$values}) * MAX_SCORE_COUNT * MAX_READ_COUNT;
    my $missing = $wanted - scalar(@stats);

    # Append the zeros that are missing at the end of the table.
    push @stats, (0) x $missing if $missing > 0;
    return;
}
85
{
    # Set once a spot with more than MAX_READ_COUNT reads has been reported,
    # so the warning is printed a single time instead of once per spot.
    my $warned_extra_reads = 0;

    # Parse one vdb-dump record and add its quality scores to @stats.
    #
    #   $_[0]  reference to the (chomped) tab-separated record holding
    #          READ_START, READ_LEN, READ_TYPE and QUALITY
    #
    # Only reads whose type matches SRA_READ_TYPE_BIOLOGICAL are counted.
    # Updates $max_read, $max_len and $max_score as a side effect.
    # Dies with "inconsistent data" when the per-read columns disagree in
    # length.
    sub expand_and_summarize($)
    {
        my (@rs, @rl, @rt);
        my $qv = split_record($_[0], \@rs, \@rl, \@rt);
        my $reads = scalar @rs;

        die "inconsistent data" unless scalar(@rl) == $reads;
        die "inconsistent data" unless scalar(@rt) == $reads;

        # The flat @stats index reserves only MAX_READ_COUNT slots per
        # (position, score) pair.  A read index beyond that would land in
        # the slot of another read and a neighbouring score, silently
        # corrupting the counts, so such reads are left out.
        if ($reads > MAX_READ_COUNT) {
            warn "spot has $reads reads; only the first ".MAX_READ_COUNT." are counted\n"
                unless $warned_extra_reads++;
            $reads = MAX_READ_COUNT;
        }

        resize_stats($qv);
        $max_read = $reads if $max_read < $reads;

        for my $read (0 .. $reads - 1) {
            next unless $rt[$read] =~ /SRA_READ_TYPE_BIOLOGICAL/;

            my $pos = $rs[$read];
            my $len = $rl[$read];

            $max_len = $len if ($max_len < $len);
            for my $i (0 .. $len - 1) {
                my $score = $qv->[$pos + $i] + 0;

                # Scores too large for the table share its last bucket.
                $score = MAX_SCORE_COUNT - 1 if $score >= MAX_SCORE_COUNT;

                my $idx = ($i * MAX_SCORE_COUNT + $score) * MAX_READ_COUNT + $read;

                $max_score = $score if $max_score < $score;
                ++$stats[$idx];
            }
        }
    }
}
115
# Print a notice when the given run contains aligned data.
#
#   $_[0]  accession (or path) to inspect
#
# vdb-dump is started with an argument list instead of a shell command line,
# so characters in the accession can never be interpreted by a shell.
# Dies with the system error when the table listing cannot be started.
sub warn_if_aligned($)
{
    my ($accession) = @_;

    # First probe: only objects that vdb-dump describes as a database are
    # examined further.  This probe is best effort; if it cannot be run the
    # check is skipped.
    # NOTE(review): -y presumably reports the object type - confirm against
    # the vdb-dump help.
    open(my $type_fh, '-|', $VDB_DUMP, '-y', $accession) or return;
    my $type = do { local $/; <$type_fh> };
    close $type_fh;
    return unless defined($type) && $type =~ /Database/i;

    # Second probe: look for a PRIMARY_ALIGNMENT entry in the -E output.
    # NOTE(review): -E presumably enumerates the tables - confirm.
    open(my $tables_fh, '-|', $VDB_DUMP, '-E', $accession) or die "$!";
    while (defined(my $line = <$tables_fh>)) {
        if ($line =~ /PRIMARY_ALIGNMENT/) {
            print "$accession is aligned; results may be biased to the reference\n";
            last;
        }
    }
    close $tables_fh;
}
130
# Dump READ_START, READ_LEN, READ_TYPE and QUALITY of one run with vdb-dump
# and feed every output line to expand_and_summarize.
#
#   $_[0]  accession (or path) to process
#
# The rows to dump are taken from $opts{'row-range'}.
# Dies with the system error when vdb-dump cannot be started.
sub process($)
{
    my ($accession) = @_;

    # An argument list (rather than one command string) keeps the shell out
    # of the picture, so neither the accession nor the row range needs
    # quoting.
    my @cmd = (
        $VDB_DUMP,
        '-f', 'tab',
        '-C', 'READ_START,READ_LEN,READ_TYPE,QUALITY',
        '-R', $opts{'row-range'},
        $accession,
    );

    open(my $dump_fh, '-|', @cmd) or die "$!";

    # A lexical line variable leaves the caller's $_ untouched.
    while (defined(my $line = <$dump_fh>)) {
        chomp $line;
        expand_and_summarize(\$line);
    }
    close $dump_fh;
}
142
# Walk the command line in order.  --row-range applies to the accessions
# that follow it; every other argument that does not start with '-' is a
# run to analyse.
for (my $i = 0; $i < scalar @ARGV; ++$i) {
    # A lexical copy: the subs called below are free to use $_ themselves.
    my $arg = $ARGV[$i];

    if ($arg =~ /^-/) {
        if ($arg =~ /^--row-range$/) {
            # --row-range as the very last argument has no value to consume;
            # show the help instead of storing undef.
            usage() if $i + 1 >= scalar @ARGV;
            $opts{'row-range'} = $ARGV[++$i];
            next;
        }
        # Any other switch is unknown: show the help and exit.
        usage();
    }
    warn_if_aligned($arg);
    process($arg);
}

# Column headings of the report that follows.
print join("\t", ('Read', 'Pos', 'Min', '25%', '50%', '75%', 'Max', 'Avg', 'St.Dev.', 'Mode'))."\n";
158
# Find the score at which the cumulative count reaches a given fraction.
#
#   $_[0]  array reference of counts, indexed by score
#   $_[1]  total count to divide by (must be non-zero)
#   $_[2]  fraction to reach, e.g. 0.25, 0.5 or 0.75
#
# Index 2 is never considered.  Returns the first index whose running total
# satisfies both tests below, or undef when no index does.
sub partition($$$)
{
    my ($counts, $total, $fraction) = @_;
    my $complement = 1.0 - $fraction;
    my $below = 0;    # sum of the counts at the indexes already visited

    for my $score (0 .. $#{$counts}) {
        next if $score == 2;

        my $here = $counts->[$score];
        next unless $here;

        my $at_or_below = $below + $here;
        # NOTE(review): $here is added on top of ($total - $below); kept
        # exactly as the original computes it - confirm this is intended.
        my $at_or_above = $total - $below + $here;

        $below = $at_or_below;
        if ($at_or_below / $total >= $fraction && $at_or_above / $total >= $complement) {
            return $score;
        }
    }
    return undef;
}
181
# The flat @stats index only has room for MAX_READ_COUNT reads per
# (position, score) pair, so never report more reads than that even when
# $max_read is larger.
my $report_reads = $max_read < MAX_READ_COUNT ? $max_read : MAX_READ_COUNT;

# One report line per (read, position) that has at least one counted score.
for (my $read = 0; $read != $report_reads; ++$read) {
    for (my $pos = 0; $pos != $max_len; ++$pos) {
        my $sum = 0;     # sum of scores
        my $ssum = 0;    # sum of squared scores
        my $cnt = 0;     # number of scores counted (score 2 excluded)
        my $most = -1;   # highest count of any single score
        my @part = ( MAX_SCORE_COUNT, 0, 0, 0, 0 );   # min, 25%, 50%, 75%, max
        my @mode = ();

        # Counts for this read and position, indexed by score.
        my $array = [ map { ($stats[($pos * MAX_SCORE_COUNT + $_) * MAX_READ_COUNT + $read] || 0) } (0..$max_score) ];

        for (my $i = 0; $i != $max_score + 1; ++$i) {
            my $n = $array->[$i];

            $most = $n if ($most < $n);
            # Score 2 takes part in the mode but not in the other figures.
            if ($i != 2) {
                my $x = $n * $i;

                $cnt += $n;
                $sum += $x;
                $ssum += $i * $x;
            }
        }
        next unless $cnt;

        for (my $i = 0; $i != $max_score + 1; ++$i) {
            my $n = $array->[$i];

            next unless $n;
            push @mode, $i if ($n == $most);
            if ($i != 2) {
                $part[0] = $i if $part[0] > $i;
                $part[4] = $i if $part[4] < $i;
            }
        }
        $part[1] = partition($array, $cnt, 0.25);
        $part[2] = partition($array, $cnt, 0.5);
        $part[3] = partition($array, $cnt, 0.75);

        # Rounding can push a zero variance slightly below zero, and sqrt
        # of a negative number is a fatal error in Perl.
        my $variance = ($ssum - $sum*$sum/$cnt)/$cnt;
        $variance = 0 if $variance < 0;

        # print, not printf: the joined line is data, not a format string.
        print join("\t", ($read + 1, $pos + 1, @part, $sum/$cnt, sqrt($variance), @mode))."\n";
    }
}